2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2024-12-12 11:32:18 +00:00
troggle/parsers/QMs.py

218 lines
9.6 KiB
Python
Raw Normal View History

import csv
2020-05-28 02:20:50 +01:00
import os
import re
from pathlib import Path
2020-05-28 02:20:50 +01:00
from django.conf import settings
2023-01-29 16:47:46 +00:00
from troggle.core.models.caves import Cave
from troggle.core.models.logbooks import QM
2023-01-19 18:33:04 +00:00
from troggle.core.models.troggle import DataIssue
2020-05-28 02:20:50 +01:00
2023-01-19 21:18:42 +00:00
"""Reads the CSV files containing QMs for a select few caves
See parsers/survex.py for the parser which extracts QMs from the survex files
2023-01-19 21:18:42 +00:00
"""
def deleteQMs():
    """Delete every QM, then every DataIssue that was recorded with parser="QMs"."""
    # QuerySets are lazy: nothing touches the database until .delete() is called,
    # so the QMs are still removed before the DataIssues.
    for stale in (QM.objects.all(), DataIssue.objects.filter(parser="QMs")):
        stale.delete()
def parseCaveQMs(cave, inputFile, ticked=False):
    """Import the QMs for one cave from a file below settings.EXPOWEB.

    cave      -- one of the keys "204-steinBH", "234-Hauch" or "161-KH".
    inputFile -- path of the QM file, relative to settings.EXPOWEB.
    ticked    -- only passed on to parse_KH_QMs() for "161-KH". For the CSV caves the
                 ticked flag of each QM is derived from the "Completion description"
                 column instead.

    Returns the number of QMs saved. Returns 0, after recording a DataIssue, if the
    cave key is not recognised or the cave is not in the database.

    "161-KH" is kept in html pages, not CSV, and is handed to parse_KH_QMs().
    The other caves are CSV files (the delimiter is sniffed) with a header row and
    these columns:
    "Number","Grade","Area","Description","Page reference","Nearest survey station",
    "Completion description","Comment"
    e.g. for 204:
    C2000-204-39 B Tree Pitch in Cave Tree treeumphant.28 Gosser Streamway
    The CSV file does not have the exact date for the QM, only the year (taken from
    characters 1-4 of the Number column).

    Rows which raise KeyError or IndexError (e.g. too few columns) are recorded as a
    DataIssue and skipped. Any other error on a row is recorded as a DataIssue and
    then re-raised.

    Load_QMs() deletes all QMs before calling this, so a QM with the same number, year
    and cave should never already exist; if one does it is reported and replaced.
    """
    # Resolve the path first: the error messages below interpolate it.
    qmPath = Path(settings.EXPOWEB, inputFile)

    # cave key -> (official_name stored in the db, name used in the error message)
    known_caves = {
        "204-steinBH": ("Steinbrückenhöhle", "Steinbruckenhoehle"),
        "234-Hauch": ("Hauchhöhle", "Hauchhoehle"),
        "161-KH": ("Kaninchenhöhle", "KH"),
    }
    if cave not in known_caves:
        message = f" ! - {qmPath} '{cave}' is not a cave this QM parser knows about"
        print(message)
        DataIssue.objects.create(parser="QMs", message=message)
        return 0

    official_name, label = known_caves[cave]
    try:
        caveid = Cave.objects.get(official_name=official_name)
    except Cave.DoesNotExist:
        message = f" ! - {qmPath} {label} is not in the database. Please run cave parser"
        print(message)
        DataIssue.objects.create(parser="QMs", message=message)
        return 0

    if cave == "161-KH":
        # html pages, different format
        return parse_KH_QMs(caveid, inputFile=inputFile, ticked=ticked)

    nqms = 0
    with open(qmPath, "r") as qmCSVContents:
        # 204 is TAB separated, 234 is COMMA separated with quotes: let csv work it out.
        dialect = csv.Sniffer().sniff(qmCSVContents.read())
        qmCSVContents.seek(0, 0)
        qmReader = csv.reader(qmCSVContents, dialect=dialect)
        next(qmReader, None)  # Skip header row
        for line in qmReader:
            try:
                year = int(line[0][1:5])
                # The QM number is the digits after the second hyphen, ignoring a leading X
                QMnum = re.match(r".*?-\d*?-X?(?P<numb>\d*)", line[0]).group("numb")

                newQM = QM()  # creates python object, does not touch db yet
                newQM.number = QMnum
                newQM.cave = caveid
                newQM.expoyear = year
                newQM.blockname = ""
                newQM.grade = "D" if line[1] == "Dig" else line[1]
                newQM.area = line[2]
                newQM.location_description = line[3]
                newQM.page_ref = line[4]
                newQM.nearest_station_name = line[5]
                # In the table, completion is indicated by the presence of a completion description.
                newQM.completion_description = line[6]
                newQM.ticked = bool(newQM.completion_description)
                newQM.comment = line[7]

                try:
                    # year and number are unique for a cave in CSV imports
                    preexistingQM = QM.objects.get(number=QMnum, expoyear=year, cave=caveid)
                except QM.DoesNotExist:
                    pass  # the normal case: nothing to replace
                else:
                    message = f" ! - {qmPath} PRE-EXISTING QM - should not exist ! {str(line)} "
                    print(message)
                    DataIssue.objects.create(parser="QMs", message=message)
                    preexistingQM.delete()
                newQM.save()
                nqms += 1
            except KeyError:  # check on this one
                message = f" ! - {qmPath} KeyError {str(line)} "
                print(message)
                DataIssue.objects.create(parser="QMs", message=message)
                continue
            except IndexError:
                message = f" ! - {qmPath} IndexError {str(line)} "
                print(message)
                DataIssue.objects.create(parser="QMs", message=message)
                continue
            except Exception:
                # Record which row failed, then let the error propagate.
                message = f" ! - {qmPath} UNKNOWN error {str(line)} "
                print(message)
                DataIssue.objects.create(parser="QMs", message=message)
                raise
    return nqms
2023-01-19 21:18:42 +00:00
def parse_KH_QMs(kh, inputFile, ticked):
    """import QMs from the 1623-161 (Kaninchenhohle) html pages, different format

    kh        -- the Cave object every created QM is attached to.
    inputFile -- path of the html page, relative to settings.EXPOWEB.
    ticked    -- stored on every QM created; also decides whether the station name is
                 saved as resolution_station_name (ticked) or nearest_station_name.

    Each QM is expected on one line of the form
    <dt><a href="sibria.htm#qC1997-161-27" name="C1997-161-27">C1997-161-27</a> A<dd>Sib: pitch at end of Fuzzy Logic [Paradox Rift - continues] [sep.fuzzy.13]
    i.e. a description followed by up to two optional [bracketed] items, read as the
    completion text and the station name.

    Lines starting with "<dt><a href" which do not match are recorded as a DataIssue.
    Returns the number of QMs created.
    """
    with open(os.path.join(settings.EXPOWEB, inputFile), "r") as khQMfile:
        khQMs = khQMfile.readlines()

    qm_pattern = re.compile(
        r"name=\"[CB](?P<year>\d*)-(?P<cave>\d*)-(?P<number>\d*).*</a>\s*(?P<grade>[ABCDX?V])<dd>(?P<location_description>[^[]*)(\[\s*(?P<completion>[^]]*)\s*\])?\s*(\[\s*(?P<station_name>[^]]*)\s*\])?"
    )

    def rebase_links(html):
        """Point the relative links of the 161 pages at /1623/161/."""
        return html.replace("<a href=\"", "<a href=\"/1623/161/")

    nqms = 0
    fails = 0
    for line, dataline in enumerate(khQMs, start=1):
        found = qm_pattern.search(dataline)
        if not found:
            # Only complain about lines which look like they were meant to be a QM
            if dataline.startswith("<dt><a href"):
                fails += 1
                message = f" ! - {inputFile} line {line} Parse error \n{str(dataline)} "
                print(message)
                DataIssue.objects.create(parser="QMs", message=message)
            continue

        year = int(found.group("year"))
        completion = found.group("completion")
        station_name = found.group("station_name")

        # A lone second item which is a link is treated as the completion text
        if station_name and not completion and station_name.startswith("<a href"):
            completion, station_name = station_name, ""
        # On an unticked page a lone item is used as the station name as well
        if completion and not station_name and not ticked:
            station_name = completion
        if completion:
            completion = rebase_links(completion)

        nearest_station_name = ""
        resolution_station_name = ""
        if station_name:
            if ticked:
                resolution_station_name = rebase_links(station_name)
            else:
                nearest_station_name = rebase_links(station_name)

        # Create new. We know it doesn't exist as we deleted everything when we started.
        QM.objects.create(
            blockname="",
            expoyear=year,
            number=found.group("number"),
            cave=kh,
            grade=found.group("grade"),
            ticked=ticked,
            page_ref="",
            completion_description=completion,
            nearest_station_name=nearest_station_name,
            resolution_station_name=resolution_station_name,
            location_description=rebase_links(found.group("location_description")),
        )
        nqms += 1

    print(f" - {fails:2g} parsing errors in {inputFile}")
    return nqms
2023-01-19 21:18:42 +00:00
2020-06-06 22:51:55 +01:00
def Load_QMs():
    """Delete all stored QMs, then re-import them for caves 204, 234 and 161.

    204 and 234 are read from CSV files and 161 from its two html pages (one imported
    with ticked=False, one with ticked=True). Prints the number of QMs imported from
    each source.
    """
    deleteQMs()

    # TAB separated values:
    # Number Grade Area Description Page reference Nearest station Completion description Comment
    count_204 = parseCaveQMs(cave="204-steinBH", inputFile=r"1623/204/qm-204.csv")

    # COMMA separated values, with quotes:
    # "Number","Grade","Area","Description","Page reference","Nearest survey station","Completion description","Comment"
    count_234 = parseCaveQMs(cave="234-Hauch", inputFile=r"1623/234/qm-234.csv")

    # html pages
    todo_161 = parseCaveQMs(cave="161-KH", inputFile="1623/161/qmtodo.htm", ticked=False)
    done_161 = parseCaveQMs(cave="161-KH", inputFile="1623/161/qmdone.htm", ticked=True)

    # Not imported:
    # parseCaveQMs(cave='balkonhoehle',inputFile=r"1623/264/qm.csv")

    print(f" - Imported: {count_204} QMs for 204, {count_234} QMs for 234, {done_161} QMs checked for 161, {todo_161} QMs not checked for 161.")

    print()