troggle/parsers/QMs.py

import csv
import os
import re
from pathlib import Path

from django.conf import settings

from troggle.core.models.caves import Cave
from troggle.core.models.logbooks import QM
from troggle.core.models.troggle import DataIssue

"""Reads the CSV files containg QMs for a select few caves
See parsers/survex.py for the parser which extracts QMs from the survex files
"""


def deleteQMs():
    """Delete every QM and every DataIssue that was recorded with parser="QMs"."""
    all_qms = QM.objects.all()
    all_qms.delete()

    qm_issues = DataIssue.objects.filter(parser="QMs")
    qm_issues.delete()


def parseCaveQMs(cave, inputFile, ticked=False):
    """Import the QMs of one cave from its legacy data file and save each as a QM.

    cave      -- one of the keys "204-steinBH", "234-Hauch" or "161-KH".
    inputFile -- path of the data file, relative to settings.EXPOWEB.
    ticked    -- only used for "161-KH", where it is passed on to parse_KH_QMs().
                 For the CSV files a QM is ticked when its "Completion description"
                 column is not empty.

    Returns the number of QMs saved. Returns None, after recording a DataIssue, when
    nothing can be imported because the cave key is not recognised or the cave is not
    in the database.

    "161-KH" is kept in HTML pages rather than CSV, so it is delegated to parse_KH_QMs().

    The CSV columns are:
    "Number","Grade","Area","Description","Page reference","Nearest survey station","Completion description","Comment"
    The delimiter is sniffed, so both TAB and COMMA separated files are accepted.

    Rows that raise KeyError or IndexError (e.g. too few columns) are recorded as a
    DataIssue and skipped. Any other error on a row is recorded and then re-raised.

    Linking to a passage in a SVX file might be more interesting as the QM does sometimes
    have the passage name, e.g. in 204/qm.csv
    C2000-204-39	B	Tree	Pitch in Cave Tree		treeumphant.28	Gosser Streamway
    The CSV file does not have the exact date for the QM, only the year, so links to
    survex files might be ambiguous. But potentially useful?

    All QMs are created afresh: Load_QMs() deletes all QMs before calling this, so finding
    a pre-existing QM with the same number, year and cave is reported as a DataIssue.
    """

    def report(message):
        # Every problem is echoed to the console and also kept as a DataIssue.
        print(message)
        DataIssue.objects.create(parser="QMs", message=message)

    # Resolved before the cave lookup because the lookup's error messages quote it.
    qmPath = Path(settings.EXPOWEB, inputFile)

    # cave key -> (Cave.official_name to look up, name used in the error message)
    known_caves = {
        "204-steinBH": ("Steinbr&uuml;ckenh&ouml;hle", "Steinbruckenhoehle"),
        "234-Hauch": ("Hauchh&ouml;hle", "Hauchhoehle"),
        "161-KH": ("Kaninchenh&ouml;hle", "KH"),
    }
    if cave not in known_caves:
        # Without a Cave object no row could be saved, so stop here.
        report(f" ! - {qmPath} '{cave}' is not a cave known to the QMs parser. No QMs imported")
        return None

    official_name, label = known_caves[cave]
    try:
        caveid = Cave.objects.get(official_name=official_name)
    except Cave.DoesNotExist:
        report(f" ! - {qmPath} {label} is not in the database. Please run cave parser")
        return None

    if cave == "161-KH":
        return parse_KH_QMs(caveid, inputFile=inputFile, ticked=ticked)

    nqms = 0
    with open(qmPath, "r") as qmCSVContents:
        # Sniff the whole file to find the delimiter, then rewind to parse it.
        dialect = csv.Sniffer().sniff(qmCSVContents.read())
        qmCSVContents.seek(0, 0)
        qmReader = csv.reader(qmCSVContents, dialect=dialect)
        next(qmReader)  # Skip header row
        for line in qmReader:
            try:
                # The first column looks like C2000-204-39: year is characters 1-4,
                # the QM number is the digits after the second hyphen (optional X prefix).
                year = int(line[0][1:5])
                QMnum = re.match(r".*?-\d*?-X?(?P<numb>\d*)", line[0]).group("numb")

                newQM = QM()  # creates python object, does not touch db yet
                newQM.number = QMnum
                newQM.cave = caveid
                newQM.expoyear = year
                newQM.blockname = ""
                newQM.grade = "D" if line[1] == "Dig" else line[1]
                newQM.area = line[2]
                newQM.location_description = line[3]
                newQM.page_ref = line[4]
                newQM.nearest_station_name = line[5]
                # In the table, completion is indicated by the presence of a completion description.
                newQM.completion_description = line[6]
                newQM.ticked = bool(newQM.completion_description)
                newQM.comment = line[7]

                try:
                    # year and number are unique for a cave in CSV imports
                    preexistingQM = QM.objects.get(number=QMnum, expoyear=year, cave=caveid)
                except QM.DoesNotExist:
                    pass  # the expected case: nothing to replace
                else:
                    report(f" ! - {qmPath} PRE-EXISTING QM - should not exist ! {str(line)} ")
                    preexistingQM.delete()
                newQM.save()
                nqms += 1
            except KeyError:  # check on this one
                report(f" ! - {qmPath} KeyError {str(line)} ")
                continue
            except IndexError:
                report(f" ! - {qmPath} IndexError {str(line)} ")
                continue
            except Exception:
                # Record which row failed, then let the error propagate.
                report(f" ! - {qmPath} UNKNOWN error {str(line)} ")
                raise
    return nqms


def parse_KH_QMs(kh, inputFile, ticked):
    """Import QMs from one of the 1623-161 (Kaninchenhohle) html pages, which are not CSV.

    kh        -- the Cave object every created QM is attached to.
    inputFile -- path of the html page, relative to settings.EXPOWEB.
    ticked    -- stored on every QM created from this page. It also decides where a
                 bracketed station name goes: resolution_station_name when ticked,
                 nearest_station_name otherwise.

    Every line matching the QM pattern creates one QM. A non-matching line that starts
    with "<dt><a href" is counted as a parse error and recorded as a DataIssue.
    Relative links in the copied text are rewritten to start with /1623/161/.

    Returns the number of QMs created.
    """

    def rooted(html):
        # Make hrefs copied out of the page resolve from the site root.
        return html.replace("<a href=\"", "<a href=\"/1623/161/")

    # Example of a line this matches:
    # <dt><a href="sibria.htm#qC1997-161-27" name="C1997-161-27">C1997-161-27</a> A<dd>Sib: pitch at end of Fuzzy Logic [Paradox Rift - continues] [sep.fuzzy.13]
    qm_pattern = re.compile(
        r"name=\"[CB](?P<year>\d*)-(?P<cave>\d*)-(?P<number>\d*).*</a>\s*(?P<grade>[ABCDX?V])<dd>(?P<location_description>[^[]*)(\[\s*(?P<completion>[^]]*)\s*\])?\s*(\[\s*(?P<station_name>[^]]*)\s*\])?"
    )

    imported = 0
    unparsed = 0
    with open(os.path.join(settings.EXPOWEB, inputFile), "r") as khQMfile:
        for lineno, dataline in enumerate(khQMfile.readlines(), start=1):
            found = qm_pattern.search(dataline)
            if not found:
                # Only lines that look like a QM entry count as failures.
                if dataline.startswith("<dt><a href"):
                    unparsed += 1
                    message = f" ! - {inputFile} line {lineno} Parse error \n{str(dataline)} "
                    print(message)
                    DataIssue.objects.create(parser="QMs", message=message)
                continue

            fields = found.groupdict()
            year = int(fields["year"])

            done_text = fields["completion"]
            station = fields["station_name"]
            # A lone second bracket holding a link is really the completion text.
            if station and not done_text and station.startswith("<a href"):
                done_text, station = station, ""
            # On the not-done page a lone bracket is also used as the station name.
            if done_text and not station and not ticked:
                station = done_text

            if done_text:
                done_text = rooted(done_text)

            nearest = ""
            resolution = ""
            if station:
                linked_station = rooted(station)
                if ticked:
                    resolution = linked_station
                else:
                    nearest = linked_station

            # Create new. We know it doesn't exist as we deleted everything when we started.
            QM.objects.create(
                ticked=ticked,
                page_ref="",
                completion_description=done_text,
                nearest_station_name=nearest,
                resolution_station_name=resolution,
                location_description=rooted(fields["location_description"]),
                # found_by=placeholder,
                blockname="",
                expoyear=year,
                number=fields["number"],
                cave=kh,
                grade=fields["grade"],
            )
            imported += 1
    print(f" - {unparsed:2g} parsing errors in {inputFile}")
    return imported


def Load_QMs():
    """Delete all existing QMs, then re-import them for caves 204, 234 and 161.

    Prints a one-line summary of how many QMs each import returned.
    """
    deleteQMs()

    # 204: TAB separated values
    # Number	Grade	Area	Description	Page reference	Nearest station	Completion description	Comment
    count_204 = parseCaveQMs(cave="204-steinBH", inputFile=r"1623/204/qm-204.csv")

    # 234: COMMA separated values, with quotes.
    # "Number","Grade","Area","Description","Page reference","Nearest survey station","Completion description","Comment"
    count_234 = parseCaveQMs(cave="234-Hauch", inputFile=r"1623/234/qm-234.csv")

    # 161: two html pages, one of open QMs and one of completed QMs
    todo_161 = parseCaveQMs(cave="161-KH", inputFile="1623/161/qmtodo.htm", ticked=False)
    done_161 = parseCaveQMs(cave="161-KH", inputFile="1623/161/qmdone.htm", ticked=True)

    # Not imported: parseCaveQMs(cave='balkonhoehle',inputFile=r"1623/264/qm.csv")

    summary = f" - Imported: {count_204} QMs for 204, {count_234} QMs for 234, {done_161} QMs for 161 done, {todo_161} QMs for 161 not done."
    print(summary)

    print()