forked from expo/troggle
695 lines
30 KiB
Python
695 lines
30 KiB
Python
import csv
|
|
import os
|
|
import re
|
|
# import pickle
|
|
# import shelve
|
|
import time
|
|
from random import randint
|
|
from datetime import datetime, date
|
|
from pathlib import Path
|
|
|
|
from django.conf import settings
|
|
from django.template.defaultfilters import slugify
|
|
from django.utils.timezone import get_current_timezone, make_aware
|
|
|
|
from troggle.core.models.troggle import DataIssue, Expedition
|
|
from troggle.core.utils import TROG, save_carefully
|
|
from troggle.core.models.caves import Cave, LogbookEntry, PersonTrip, GetCaveLookup
|
|
from parsers.people import GetPersonExpeditionNameLookup
|
|
|
|
'''
|
|
Parses and imports logbooks in all their wonderful confusion
|
|
|
|
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
|
|
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc)
|
|
'''
|
|
todo='''
|
|
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
|
|
|
|
- remove the TROG and lbo things since we need the database for multiuser access? Or not?
|
|
|
|
- profile the code to find bad repetitive things, of which there are many.
|
|
|
|
- far too many uses of Django field dereferencing to get values, which is SLOW
|
|
|
|
- Logbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
|
|
|
|
- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
|
|
we keep only a modern HTML05 format. Then we can retire the old parsers and reduce the
|
|
volume of code here substantially.
|
|
|
|
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
|
|
|
|
- We should ensure logbook.html is utf-8 and stop this crap:
|
|
file_in = open(logbookfile,'rb')
|
|
txt = file_in.read().decode("latin1")
|
|
|
|
- this is a slow and uncertain function: cave = getCaveByReference(caveRef)
|
|
|
|
- use Fixtures https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-loaddata to cache
|
|
data for old logbooks. New design needed, with a mechanism for flagging fixtures as outdated after edits.
|
|
|
|
'''
|
|
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
# Logbook file name and parser function name used for any year that has no entry
# in LOGBOOK_PARSER_SETTINGS.
DEFAULT_LOGBOOK_PARSER = "parser_html"
DEFAULT_LOGBOOK_FILE = "logbook.html"
# Per-year overrides: year -> (logbook file name, name of the parser function in this module).
# All years since 2010 use the default value for Logbook parser
# but several don't work, and are skipped by the parsing code, e.g. 1983
LOGBOOK_PARSER_SETTINGS = {
    "2010": ("logbook.html", "parser_html"),
    "2009": ("2009logbook.txt", "wiki_parser"),
    "2008": ("2008logbook.txt", "wiki_parser"),
    "2007": ("logbook.html", "parser_html"),
    # "2006": ("logbook/logbook_06.txt", "wiki_parser"),
    # The "2006" key used to be listed twice with the same value; one entry is enough.
    "2006": ("logbook.html", "parser_html"),
    "2005": ("logbook.html", "parser_html"),
    "2004": ("logbook.html", "parser_html"),
    "2003": ("logbook.html", "parser_html"),
    "2002": ("logbook.html", "parser_html"),
    "2001": ("log.htm", "parser_html_01"),
    "2000": ("log.htm", "parser_html_01"),
    "1999": ("log.htm", "parser_html_01"),
    "1998": ("log.htm", "parser_html_01"),
    "1997": ("log.htm", "parser_html_01"),
    "1996": ("log.htm", "parser_html_01"),
    "1995": ("log.htm", "parser_html_01"),
    "1994": ("log.htm", "parser_html_01"),
    "1993": ("log.htm", "parser_html_01"),
    "1992": ("log.htm", "parser_html_01"),
    "1991": ("log.htm", "parser_html_01"),
    "1990": ("log.htm", "parser_html_01"),
    "1989": ("log.htm", "parser_html_01"), #crashes MySQL
    "1988": ("log.htm", "parser_html_01"), #crashes MySQL
    "1987": ("log.htm", "parser_html_01"), #crashes MySQL
    "1985": ("log.htm", "parser_html_01"),
    "1984": ("log.htm", "parser_html_01"),
    "1983": ("log.htm", "parser_html_01"),
    "1982": ("log.htm", "parser_html_01"),
}
|
|
|
|
# Expected number of logbook entries for each expedition year.
# LoadLogbookForExpedition() prints a "Mismatch" line when the parsed count differs.
entries = {
    "2022": 64,
    "2019": 56,
    "2018": 74,
    "2017": 60,
    "2016": 81,
    "2015": 79,
    "2014": 65,
    "2013": 51,
    "2012": 75,
    "2011": 68,
    "2010": 22,
    "2009": 52,
    "2008": 49,
    "2007": 111,
    "2006": 60,
    "2005": 55,
    "2004": 76,
    "2003": 42,
    "2002": 31,
    "2001": 48,
    "2000": 54,
    "1999": 79,
    "1998": 43,
    "1997": 53,
    "1996": 94,
    "1995": 41,
    "1994": 32,
    "1993": 41,
    "1992": 61,
    "1991": 38,
    "1990": 87,
    "1989": 1,
    "1988": 1,
    "1987": 1,
    "1985": 24,
    "1984": 32,
    "1983": 52,
    "1982": 42,
}
# Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
|
|
|
|
# Accumulator filled by the parser functions below with one tuple per trip;
# LoadLogbookForExpedition() resets it before each year is parsed.
logentries = [] # the entire logbook for one year is a single object: a list of entries
# Place names that are not caves; EnterLogIntoDbase() skips the cave lookup for these.
noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
                  'base camp', 'basecamp', 'top camp', 'topcamp' ]
# Parse problems, usually keyed by trip id. This starts as the dict held in TROG, but
# LoadLogbooks() rebinds the name to a fresh dict.
# NOTE(review): after that rebinding the TROG dict is no longer updated - confirm this is intended.
logdataissues = TROG['issues']['logdataissues']
# Not referenced by any code visible in this file.
trips ={}
|
|
|
|
#
|
|
# the logbook loading section
|
|
#
|
|
def set_trip_id(year, seq):
    """Return the trip identifier for entry number ``seq`` of ``year``.

    The identifier is the year, the literal ``_s`` and the sequence number
    zero-padded to at least two digits, e.g. ``2019_s03``.
    """
    return "{}_s{:02d}".format(year, seq)
|
|
|
|
# A name wrapped in <u>...</u> marks the author of the logbook entry.
rx_tripperson = re.compile(r'(?i)<u>(.*?)</u>$')
# Anything in round or square brackets is a remark, not part of the name.
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
# Separators between names: comma, plus, "&amp;", " and ", or a bare "&" that does
# not start an HTML entity (so "Andr&eacute;" stays in one piece). "&amp;" has to be
# tried before the bare "&", otherwise "A &amp; B" yields the bogus name "amp; B".
_rx_people_separator = re.compile(r",|\+|&amp;|&(?!\w+;)| and ")
# Nicknames used in the logbooks, mapped to the name they are looked up under.
_NICKNAMES = {
    "Wiggy": "Phil Wigglesworth",
    "Animal": "Mike Richardson",
    "MikeTA": "Mike Richardson",
}


def _split_trip_people(trippeople):
    """Split the raw people string into a list of (name, is_author) pairs.

    Empty pieces and names starting with '*' are dropped. Bracketed remarks are
    removed and known nicknames expanded. is_author is True for underlined names.
    """
    people = []
    for piece in _rx_people_separator.split(trippeople):
        name = piece.strip()
        underlined = rx_tripperson.match(name)
        if underlined:
            name = underlined.group(1).strip()
        if not name or name[0] == '*':
            continue
        name = rx_round_bracket.sub("", name).strip()
        people.append((_NICKNAMES.get(name, name), underlined is not None))
    return people


def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
    """Resolve the people named on a logbook entry for one expedition.

    Args:
        trippeople: raw text listing the people on the trip.
        expedition: expedition whose name lookup is used (its ``year`` is read for messages).
        logtime_underground: value paired with every person in the result.
        tid: trip id used in messages and as the key in ``logdataissues``.

    Returns:
        ``(res, author)`` where ``res`` is a list of ``(personyear, logtime_underground)``
        tuples and ``author`` is the last underlined person that was matched, or failing
        that the last person in ``res``. Returns ``(None, None)`` when no names are found.

    A name with no match is reported as a DataIssue and still appended with
    ``personyear`` set to ``None``, so callers must cope with ``None`` in ``res``.
    """
    people = _split_trip_people(trippeople)
    if not people:
        return None, None

    # Fetch the lookup once per entry instead of once per person.
    lookup = GetPersonExpeditionNameLookup(expedition)
    res = []
    author = None
    for tripperson, is_author in people:
        personyear = lookup.get(tripperson.lower())
        if not personyear:
            message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this expedition year."
            print(message)
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid]=message
        res.append((personyear, logtime_underground))
        if is_author:
            author = personyear

    if not author:
        # No (matched) underlined name: fall back to the last person listed.
        author = res[-1][0]
    return res, author
|
|
|
|
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
    """Save one logbook entry and its related persontrips.

    Does NOT save the expeditionday_id - all NULLs - because expeditionday is being
    deprecated.

    troggle.log shows that lots of duplicates are created. That is no problem for the
    database as they just overwrite, but the same thing is saved too many times.

    Args:
        date: date of the trip; its first four characters as a string give the year
            used to rewrite image paths.
        place: place name, looked up as a cave unless it is one of ``noncaveplaces``.
        title: entry title; with ``date`` it identifies an existing entry.
        text: HTML text of the entry.
        trippeople: raw people string, resolved by GetTripPersons().
        expedition: expedition the entry belongs to.
        logtime_underground: passed through to each PersonTrip.
        tid: trip id used as the slug; a random slug is generated when it is None.

    Raises:
        Whatever GetTripPersons() raises, after the failure has been recorded.
    """
    try:
        trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
    except Exception:
        message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
        DataIssue.objects.create(parser='logbooks', message=message)
        # NOTE(review): the literal key "title" is overwritten by every failure - probably meant tid.
        logdataissues["title"]=message
        print(message)
        raise

    if not author:
        message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues["title"]=message
        print(message)
        # Deliberately no return: the entry itself is still saved without an author.

    # This needs attention. The slug field is derived from 'title'
    # both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
    lplace = place.lower()
    cave = None
    # noncaveplaces contains mixed-case names, so compare case-insensitively.
    if lplace not in {noncave.lower() for noncave in noncaveplaces}:
        cave = GetCaveLookup().get(lplace)

    # Image paths in the logbook are relative to the year's directory.
    y = str(date)[:4]
    text = text.replace('src="', f'src="/years/{y}/' )
    text = text.replace("src='", f"src='/years/{y}/" )

    # The result is not stored on the entry (see docstring).
    # NOTE(review): call kept in case it has side effects - confirm whether it can be dropped.
    expedition.get_expedition_day(date)

    # Check for an existing copy of the current entry, and save
    lookupAttribs={'date':date, 'title':title}
    if tid is not None:
        slug = tid
    else:
        slug = str(randint(1000,9999)) + "_" + slugify(title)[:10].replace('-','_')
    # 'cave' is converted to a string here, which renders as the cave slug
    # (or the text "None" when there is no cave).
    nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug}

    # This creates the lbo instance of LogbookEntry
    lbo, _ = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)

    # GetTripPersons() returns None instead of a list when the entry names nobody.
    for tripperson, time_underground in trippersons or []:
        lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
        nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
        # this creates the PersonTrip instance.
        save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
|
|
|
|
def ParseDate(tripdate, year):
    """Interpret a date from the expo logbooks and return a datetime.date object.

    Two layouts are understood: ``YYYY-MM-DD`` and ``D/M/YY`` or ``D/M/YYYY``.

    Args:
        tripdate: the date text from the logbook.
        year: the expedition year as a four-character string.

    Returns:
        The parsed date. ``date(1970, 1, 1)`` is returned, and a DataIssue recorded,
        when the text cannot be parsed or its year disagrees with ``year``.
    """
    dummydate = date(1970, 1, 1)

    def _report(message):
        """Record a date problem in the database and in logdataissues."""
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues["tripdate"]=message

    try:
        mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
        # One or two digits for the month, so that October to December are accepted too.
        mdategoof = re.match(r"(\d\d?)/(\d\d?)/(20|19)?(\d\d)", tripdate)
        if mdatestandard:
            if mdatestandard.group(1) != year:
                _report(f" ! - Bad date (year) in logbook: {tripdate} - {year}")
                return dummydate
            return date(int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3)))

        if mdategoof:
            century = mdategoof.group(3)
            if century and century != year[:2]:
                _report(" ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + century)
                return dummydate
            # A two-digit year takes its century from the expedition year.
            yadd = int(year[:2]) * 100
            return date(int(mdategoof.group(4)) + yadd, int(mdategoof.group(2)), int(mdategoof.group(1)))

        # Report the expedition year (not 1970) so the issue can be traced to its logbook.
        _report(f" ! - Bad date in logbook: {tripdate} - {year}")
        return dummydate
    except Exception:
        # e.g. an impossible day or month such as 31/2
        _report(f" ! - Failed to parse date in logbook: {tripdate} - {year}")
        return dummydate
|
|
|
|
# (2006 - not any more), 2008 - 2009
def wiki_parser(year, expedition, txt):
    """Parse a wiki-format logbook and append one tuple per trip to ``logentries``.

    Each trip is a ``===date|place|people===`` header followed by its text, up to the
    next ``===``. An entry whose header does not have exactly three ``|``-separated
    items is reported as a DataIssue and skipped.

    Args:
        year: expedition year as a string, used for trip ids and date checking.
        expedition: expedition object stored in each entry tuple.
        txt: the whole logbook text.
    """
    trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
    for logbook_entry_count, (triphead, triptext) in enumerate(trippara, start=1):
        tid = set_trip_id(year, logbook_entry_count)

        tripheadp = triphead.split("|")
        if len(tripheadp) != 3:
            # The date is not known yet, so identify the entry by its trip id.
            message = f" ! - Bad no of items in tripdate in logbook: {tid} - {tripheadp}"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid]=message
            continue

        tripdate, tripplace, trippeople = tripheadp
        # "cave - place": the first part is the cave; the second part, if any, is the place.
        place_parts = tripplace.split(" - ")
        tripcave = place_parts[0].strip()
        tripsplace = place_parts[0] if len(place_parts) == 1 else place_parts[1]

        # Time underground: the first "T/U ..." figure found in the text, if any.
        mtu = re.search(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
        tu = mtu.group(1) if mtu else ""

        ldate = ParseDate(tripdate.strip(), year)

        # Store the generated trip id (not an empty string), so each entry gets its own id.
        entrytuple = (ldate, tripcave, tripsplace, triptext,
                      trippeople, expedition, tu, tid)
        logentries.append(entrytuple)
|
|
|
|
|
|
# 2002, 2004, 2005, 2007, 2010 - now
# 2006 wiki text is incomplete, but the html all there. So using this parser now.
def parser_html(year, expedition, txt):
    """Parse an HTML logbook whose trips are separated by ``<hr />`` tags.

    Each trip is first matched with the people div before the title div. If that
    fails the problem is recorded and a second pattern with the two divs swapped is
    tried. Trips matching neither are skipped, and reported unless they mention
    "Rigging Guide". Every parsed trip is appended to ``logentries`` as a tuple.
    """
    global logentries
    global logdataissues

    # date, people, title, text, optional time underground
    people_then_title = re.compile(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
        \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
        \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
        \s*<div\s+class="trippeople">\s*(.*?)</div>
        \s*<div\s+class="triptitle">\s*(.*?)</div>
        ([\s\S]*?)
        \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
        \s*$
        ''')
    # the same, but with title and people swapped in order
    title_then_people = re.compile(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
        \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
        \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
        \s*<div\s+class="triptitle">\s*(.*?)</div>
        \s*<div\s+class="trippeople">\s*(.*?)</div>
        ([\s\S]*?)
        \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
        \s*$
        ''')

    paragraphs = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for logbook_entry_count, trippara in enumerate(paragraphs, start=1):
        tid = set_trip_id(year, logbook_entry_count)

        first = people_then_title.match(trippara)
        if first:
            _anchor_id, tripid1, tripdate, trippeople, triptitle, triptext, tu = first.groups()
        else:
            # allow title and people to be swapped in order
            msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:40]}'..."
            print(msg)
            DataIssue.objects.create(parser='logbooks', message=msg)
            logdataissues[tid]=msg

            second = title_then_people.match(trippara)
            if not second:
                if re.search(r"Rigging Guide", trippara) is None:
                    msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:40]}'..."
                    print(msg)
                    DataIssue.objects.create(parser='logbooks', message=msg)
                    logdataissues[tid]=msg
                continue
            _anchor_id, tripid1, tripdate, triptitle, trippeople, triptext, tu = second.groups()

        ldate = ParseDate(tripdate.strip(), year)

        # A "cave - something" title names the cave in its first part.
        title_parts = triptitle.split(" - ")
        tripcave = title_parts[0] if len(title_parts) >= 2 else "UNKNOWN"

        # Flatten the text: drop </p>, join lines, turn <p> into a double line break.
        ltriptext = re.sub(r"</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()

        logentries.append((ldate, tripcave, triptitle, ltriptext,
                           trippeople, expedition, tu, tripid1))
|
|
|
|
|
|
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
def parser_html_01(year, expedition, txt):
    """Parse an old-style HTML logbook whose trips are separated by ``<hr>`` tags.

    The first paragraph of each trip is a ``date|title|people`` header. A header with
    only ``date|title`` is accepted with the people set to "GUESS ANON". A trip whose
    header cannot be found, or cannot be split, is reported and parsing of this
    logbook stops there. Every parsed trip is appended to ``logentries``.

    Raises:
        Any exception raised while handling a trip, after it has been recorded.
    """
    errorcount = 0

    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    for logbook_entry_count, trippara in enumerate(tripparas, start=1):
        tid = set_trip_id(year, logbook_entry_count)
        try:
            s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
            if not s:
                # This message used to lack the f prefix, so "{year}" was printed literally.
                message = f" ! - Skipping logentry {year} failure to parse header: {tid}{trippara[:300]}..."
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid]=message
                print(message)
                break
            tripheader, triptext = s.group(1), s.group(2)

            # Strip <a>, <b> and <span> tags from the header.
            tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
            header_parts = tripheader.split("|")
            if len(header_parts) == 3:
                tripdate, triptitle, trippeople = header_parts
            else:
                message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{header_parts}'"
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid]=message
                print(message)
                if len(header_parts) == 2:
                    tripdate, triptitle = header_parts
                    trippeople = "GUESS ANON"
                else:
                    message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{header_parts}' CRASHES MySQL !"
                    DataIssue.objects.create(parser='logbooks', message=message)
                    logdataissues[tid]=message
                    print(message)
                    break

            ldate = ParseDate(tripdate.strip(), year)

            # Pull the time-underground paragraph out of the text.
            mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
            if mtu:
                tu = mtu.group(1)
                triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
            else:
                tu = ""

            triptitles = triptitle.split(" - ")
            tripcave = triptitles[0].strip()

            ltriptext = triptext

            # Trim trailing links, whitespace, punctuation and "(same day)" style remarks.
            mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext)
            if mtail:
                ltriptext = ltriptext[:mtail.start(0)]
            ltriptext = re.sub(r"</p>", "", ltriptext)
            ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
            ltriptext = re.sub(r"</?u>", "_", ltriptext)
            ltriptext = re.sub(r"</?i>", "''", ltriptext)
            ltriptext = re.sub(r"</?b>", "'''", ltriptext)
            ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()

            if ltriptext == "":
                message = " ! - Zero content for logbook entry!: " + tid
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid]=message
                print(message)

            entrytuple = (ldate, tripcave, triptitle, ltriptext,
                          trippeople, expedition, tu, tid)
            logentries.append(entrytuple)

        except Exception:
            message = f" ! - Skipping logentry {year} due to exception in: {tid}"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid]=message
            print(message)
            errorcount += 1
            raise
        # NOTE(review): because the handler above re-raises, errorcount is always 0 here;
        # this limit only takes effect if the re-raise is removed.
        if errorcount >5 :
            message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid]=message
            print(message)
            return
|
|
|
|
# parser for 2003. Retired after conversion of the logbook.html
|
|
# KEEP THIS COMMENTED-OUT example until after we have done the same thing with the html_01 parser
|
|
# def parser_html_03(year, expedition, txt):
|
|
# global logentries
|
|
# global logdataissues
|
|
|
|
# tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
|
|
# logbook_entry_count = 0
|
|
# for trippara in tripparas:
|
|
# logbook_entry_count += 1
|
|
# tid = set_trip_id(year,logbook_entry_count) # default trip id, before we read the date
|
|
|
|
# s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
|
|
# if not ( s ) :
|
|
# message = " ! - Skipping logentry {year} on failure to parse parser_html_03: {} {} {}...".format(tid,s,trippara[:300])
|
|
# DataIssue.objects.create(parser='logbooks', message=message)
|
|
# logdataissues[tid]=message
|
|
# print(message)
|
|
# break
|
|
|
|
# tripheader, triptext = s.group(1), s.group(2)
|
|
# tripheader = re.sub(r" ", " ", tripheader)
|
|
# tripheader = re.sub(r"\s+", " ", tripheader).strip()
|
|
# sheader = tripheader.split(" -- ")
|
|
# tu = ""
|
|
# if re.match("T/U|Time underwater", sheader[-1]):
|
|
# tu = sheader.pop() # not a number in 2003 usually
|
|
# # print(f" - {logbook_entry_count} '{tu}' ")
|
|
# if len(sheader) != 3:
|
|
# print(" ! Header not three pieces for parser_html_03() ", sheader)
|
|
# tripdate, triptitle, trippeople = sheader
|
|
# ldate = ParseDate(tripdate.strip(), year)
|
|
# # print(f" - {logbook_entry_count} '{ldate}' from '{tripdate.strip()}' ")
|
|
# # print(f" - {logbook_entry_count} '{trippeople}' ")
|
|
# titlelist = triptitle.split(" , ")
|
|
# if len(titlelist) >= 2:
|
|
# location, *namelist = titlelist # list unpacking operator
|
|
# tripname = ", ".join(namelist) # concatenate strings
|
|
# # print(f" - {logbook_entry_count} {location} '{tripname}'")
|
|
# else:
|
|
# location = "UNKNOWN"
|
|
|
|
# ltriptext = triptext + "<br /><br />\n\n" + tu
|
|
# ltriptext = re.sub(r"</p>", "", ltriptext)
|
|
# #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
|
|
# ltriptext = re.sub(r"<p>", "<br /><br />\n\n", ltriptext).strip()
|
|
# #ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
|
|
|
|
|
|
# entrytuple = (ldate, location, tripname, ltriptext,
|
|
# trippeople, expedition, tu, tid)
|
|
# logentries.append(entrytuple)
|
|
|
|
|
|
def LoadLogbookForExpedition(expedition):
    """Parse all logbook entries for one expedition and save them to the database.

    Steps: clear the issues recorded earlier for this year, record the logbook file
    name on the expedition, delete the expedition's existing LogbookEntry rows, read
    the logbook file as UTF-8, run the parser configured for the year and save every
    parsed entry with EnterLogIntoDbase().

    Args:
        expedition: the expedition to load; ``expedition.year`` is a string.

    Returns:
        The number of entries parsed (0 when the logbook file could not be read).
    """
    # absolutely horrid. REFACTOR THIS (all my fault..)
    global logentries

    yearlinks = LOGBOOK_PARSER_SETTINGS
    expologbase = os.path.join(settings.EXPOWEB, "years")
    logentries = []

    year = expedition.year
    # None when no expected count is recorded, e.g. while preparing for a new expo.
    expect = entries.get(year)

    def cleanerrors(year):
        """Delete the logbook issues recorded for this year, in the database and in logdataissues."""
        for di in DataIssue.objects.filter(parser='logbooks'):
            if re.search(year, di.message) is not None:
                di.delete()

        # Keys are usually trip ids starting with the year, but a key can be None.
        stale = [key for key in logdataissues if isinstance(key, str) and key.startswith(year)]
        for key in stale:
            del logdataissues[key]

    cleanerrors(year)

    if year in yearlinks:
        logbookfile, parsefunc = yearlinks[year]
    else:
        logbookfile, parsefunc = DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER
    logbookpath = Path(expologbase) / year / logbookfile
    expedition.logbookfile = logbookfile
    expedition.save()

    for lbe in LogbookEntry.objects.filter(expedition=expedition):
        lbe.delete()

    txt = None
    try:
        # Read bytes and decode explicitly, so line endings are kept exactly as on disk.
        txt = logbookpath.read_bytes().decode("utf-8")
    except OSError:
        print(f" ! Couldn't open logbook as UTF-8 {logbookpath}")
    except Exception:
        # e.g. the file is not valid UTF-8
        print(f" ! Very Bad Error opening {logbookpath}")

    if txt is not None:
        parser = globals()[parsefunc]
        print(f' - {year} parsing with {parsefunc}')
        parser(year, expedition, txt) # this launches the right parser for this year

        for entrytuple in logentries:
            try:
                ldate, tripcave, triptitle, text, trippeople, entry_expedition, logtime_underground, tripid1 = entrytuple
            except ValueError: # cope with removal of entry_type but still in cache files. Remove in Dec. 2022.
                ldate, tripcave, triptitle, text, trippeople, entry_expedition, logtime_underground, entry_type, tripid1 = entrytuple
                print(f' - Exception entry_type "{entry_type}" {tripid1}')
            # Time underground is saved as 0, not the parsed value.
            EnterLogIntoDbase(ldate, tripcave, triptitle, text, trippeople, entry_expedition, 0,
                              tripid1)

    if expect is not None and len(logentries) != expect:
        print(f"Mismatch {year} {len(logentries):5d} is not {expect}\n")

    return len(logentries)
|
|
|
|
def LoadLogbook(year):
    """Reload the logbook of the single expedition held in ``year``.

    The cached expedition page for that year is cleared first.
    """
    TROG['pagecache']['expedition'][year] = None # clear cache

    expedition = Expedition.objects.get(year=year)
    LoadLogbookForExpedition(expedition) # this actually loads the logbook for one expo
|
|
|
|
|
|
def LoadLogbooks():
    """Master function for parsing all logbooks into the Troggle database.

    This should be rewritten to use coroutines to load all logbooks from disc in
    parallel, but the database writes must stay serialised as sqlite is single-user.
    """
    global logdataissues
    global entries

    # Start from a clean slate: no recorded issues in memory or in the database.
    logdataissues = {}
    DataIssue.objects.filter(parser='logbooks').delete()

    expos = Expedition.objects.all()
    if len(expos) <= 1:
        message = " ! - No expeditions found. Load 'people' first"
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues["sqlfail 0000"]=message
        print(message)

    noexpo = ["1986", "2020", "2021",] #no expo
    lostlogbook = ["1976", "1977", "1978", "1979", "1980", "1981"]
    sqlfail = ["1987", "1988", "1989"] # breaks mysql with db constraint fail - debug locally first]
    nologbook = noexpo + lostlogbook + sqlfail

    actuals = []
    for expo in expos: # pointless as we explicitly know the years in this code.
        year = expo.year
        TROG['pagecache']['expedition'][year] = None # clear cache
        if year in sqlfail:
            print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
            message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[f"sqlfail {year}"]=message
            print(message)

        if year in nologbook:
            continue
        if year in entries:
            actuals.append(expo)
        else:
            print(" - No Logbook yet for: " + year) # catch case when preparing for next expo

    # tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
    # this actually loads the logbook for each expo, one at a time
    nlbe = {ex: LoadLogbookForExpedition(ex) for ex in actuals}

    yt = sum(nlbe.values())
    print(f"total {yt:,} log entries parsed in all expeditions")
|
|
|
|
|
|
|
|
# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
|
|
# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
|
|
# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
|
|
# reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
|
|
# personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
|
|
# nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
|
|
# TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
|
|
# locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
|
|
# caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
|
|
|