mirror of https://expo.survex.com/repositories/troggle/.git synced 2025-04-03 17:31:47 +01:00

Remove logdataissues from TROG

Philip Sargent 2023-01-27 17:24:31 +00:00
parent 75834902f2
commit 2fee216e80
2 changed files with 38 additions and 68 deletions

View File

@@ -33,15 +33,15 @@ save_carefully() - core function that saves troggle objects in the database
 various git add/commit functions that need refactoring together
+NOTE that TROG is not serialized! Two users can update it and conflict !!
+This needs to be in a multi-user database with transactions.
 '''
 TROG = {
     'pagecache' : {
         'expedition' : {}
     },
-    'issues' : {
-        'logdataissues' : {}
-    },
     'caves' : {
         'gcavelookup' : {},
         'gcavecount' : {}
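
The NOTE added above flags that the remaining TROG dict is shared mutable state with no locking or serialization. A hedged sketch of one direction it suggests: Django's cache framework delegates storage to a configurable backend, so concurrent access becomes the backend's problem rather than troggle's. The key names below are invented for illustration; troggle does not define them:

    # Sketch only: a cache-backed replacement for TROG["pagecache"]["expedition"].
    from django.core.cache import cache

    def cache_expedition_page(year, html):
        cache.set(f"pagecache:expedition:{year}", html)  # backend handles concurrency

    def get_expedition_page(year):
        return cache.get(f"pagecache:expedition:{year}")  # None on a cache miss

    def clear_expedition_page(year):
        cache.delete(f"pagecache:expedition:{year}")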

View File

@@ -10,7 +10,7 @@ from django.template.defaultfilters import slugify
 from parsers.people import GetPersonExpeditionNameLookup
 from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
 from troggle.core.models.troggle import DataIssue, Expedition
-from troggle.core.utils import TROG, save_carefully
+from troggle.core.utils import save_carefully
 """
 Parses and imports logbooks in all their wonderful confusion
@@ -18,12 +18,16 @@ Parses and imports logbooks in all their wonderful confusion
 https://expo.survex.com/handbook/computing/logbooks-parsing.html
 """
 todo = """
-- refactor everything with some urgency, esp. LoadLogbookForExpedition()
-- remove the TROG things since we need the database for multiuser access? Or not?
+- refactor everything with some urgency, esp. parse_logbook_for_expedition()
+- break out the code that hits the database from that which parses the logbook
+  so that the file-reading and parsing can be parallelized, while writing to the
+  database remains serialized (sqlite is single-user).
 - profile the code to find bad repetitive things, of which there are many.
+- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted
 - far too many uses of Django field dereferencing to get values, which is SLOW
 - rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
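
For the generator rewrite in the last todo item, a parser would yield each entry tuple as it is produced instead of appending to the module-level logentries list, so a whole year's logbook never sits in memory at once. The splitter and per-entry parser below are hypothetical stand-ins, but the tuple shape and the EnterLogIntoDbase signature are the ones in this file:

    # Hypothetical generator rewrite: entries stream out one at a time.
    def parse_entries(year, expedition, txt):
        for trippara in split_into_entries(txt):                   # hypothetical splitter
            entry = parse_one_entry(year, expedition, trippara)    # hypothetical per-entry parser
            if entry is not None:
                # (tripdate, location, tripname, tripcontent, trippeople, expedition, tu, tid)
                yield entry

    # the consumer is the serialized, database-writing half of the split:
    # for entrytuple in parse_entries(year, expedition, txt):
    #     EnterLogIntoDbase(*entrytuple)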
@@ -96,12 +100,8 @@ entries = {
 logentries = [] # the entire logbook for one year is a single object: a list of entries
 noncaveplaces = ["Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
-logdataissues = TROG["issues"]["logdataissues"]
 trips = {}

-#
-# the logbook loading section
-#
 def set_trip_id(year, seq):
     tid = f"{year}_s{seq:02d}"
     return tid
@@ -149,7 +149,6 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
             message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this expedition year."
             print(message)
             DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[tid] = message
         res.append((personyear, logtime_underground))
         if mul:
             author = personyear
@@ -163,7 +162,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
 def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
-    """saves a logbook entry and related persontrips
+    """saves a single logbook entry and related persontrips
     Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !

     troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite but we are saving the same thing too many times..
@@ -193,7 +192,6 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
     except:
         message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
         DataIssue.objects.create(parser="logbooks", message=message)
-        logdataissues["title"] = message
         print(message)
         raise
         return
@@ -201,7 +199,6 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
     if not author:
         message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
         DataIssue.objects.create(parser="logbooks", message=message)
-        logdataissues["title"] = message
         print(message)
         # return
@@ -261,7 +258,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
 def ParseDate(tripdate, year):
     """Interprets dates in the expo logbooks and returns a correct datetime.date object"""
-    dummydate = date(1970, 1, 1)
+    dummydate = date(1970, 1, 1) # replace with _EPOCH
     month = 1
     day = 1
     # message = f" ! - Trying to parse date in logbook: {tripdate} - {year}"
@@ -273,7 +270,6 @@ def ParseDate(tripdate, year):
         if not (mdatestandard.group(1) == year):
             message = f" ! - Bad date (year) in logbook: {tripdate} - {year}"
             DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues["tripdate"] = message
             return dummydate
         else:
             year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
@@ -281,23 +277,20 @@ def ParseDate(tripdate, year):
         if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]):
             message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
             DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues["tripdate"] = message
             return dummydate
         else:
             yadd = int(year[:2]) * 100
             day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
         else:
-            year = 1970
+            year = 1970 # replace with _EPOCH
             message = f" ! - Bad date in logbook: {tripdate} - {year}"
             DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues["tripdate"] = message
         return date(year, month, day)
     except:
         message = f" ! - Failed to parse date in logbook: {tripdate} - {year}"
         DataIssue.objects.create(parser="logbooks", message=message)
-        logdataissues["tripdate"] = message
-        return datetime.date(1970, 1, 1)
+        return datetime.date(1970, 1, 1) # replace with _EPOCH

 def parser_html(year, expedition, txt, seq=""):
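
Both "# replace with _EPOCH" comments in this hunk point at the same magic sentinel value. One way to honour them, assuming a single module-level constant; the name comes from the diff's own comments, but its definition is not part of this commit:

    from datetime import date

    # Assumed constant: the sentinel returned when a logbook date cannot be parsed.
    _EPOCH = date(1970, 1, 1)

    # ParseDate() could then read:
    #     dummydate = _EPOCH
    #     ...
    #     return _EPOCH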
@@ -309,7 +302,6 @@ def parser_html(year, expedition, txt, seq=""):
     from parser_html_01 format logfiles, believe me.
     """
     global logentries
-    global logdataissues

     # extract front material and stash for later use when rebuilding from list of entries
     headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
@@ -356,7 +348,6 @@ def parser_html(year, expedition, txt, seq=""):
             msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:50]}'..."
             print(msg)
             DataIssue.objects.create(parser="logbooks", message=msg)
-            logdataissues[tid] = msg

         s2 = re.match(
             r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
@@ -377,7 +368,6 @@ def parser_html(year, expedition, txt, seq=""):
             msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:50]}'..."
             print(msg)
             DataIssue.objects.create(parser="logbooks", message=msg)
-            logdataissues[tid] = msg
             continue

         ldate = ParseDate(tripdate.strip(), year)
@@ -413,7 +403,6 @@ def parser_blog(year, expedition, txt, sq=""):
     So the content is nested inside the header. Attachments (images) come after the content.
     """
     global logentries
-    global logdataissues

     tripheads = re.findall(
         r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
@@ -455,7 +444,6 @@ def parser_blog(year, expedition, txt, sq=""):
         if not (match_author):
             message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
             DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[tid] = message
             print(message)
             break
         trippeople = match_author.group(1)
@@ -465,7 +453,6 @@ def parser_blog(year, expedition, txt, sq=""):
         if not (match_datetime):
             message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
             DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[tid] = message
             print(message)
             break
         datestamp = match_datetime.group(1)
@@ -475,7 +462,6 @@ def parser_blog(year, expedition, txt, sq=""):
         except:
             message = f" ! - FROMISOFORMAT fail logentry {year}:{logbook_entry_count} {tid} '{datestamp}'"
             DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[tid] = message
             print(message)
             # fallback, ignore the timestamp bits:
             tripdate = datetime.fromisoformat(datestamp[0:10])
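
The fallback above exists because datetime.fromisoformat() is strict: before Python 3.11 it rejects common real-world suffixes such as a trailing "Z" or an offset without a colon ("+0100"), while the first ten characters ("YYYY-MM-DD") always parse. A self-contained illustration of the same pattern:

    from datetime import datetime

    def parse_datestamp(datestamp):
        """Parse an ISO-ish timestamp, falling back to the date part alone."""
        try:
            return datetime.fromisoformat(datestamp)
        except ValueError:
            # e.g. "2019-07-11T12:00:00+0100" fails on Python < 3.11;
            # the first 10 characters, "2019-07-11", always succeed
            return datetime.fromisoformat(datestamp[0:10])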
@@ -494,14 +480,30 @@
         entrytuple = (tripdate, location, tripname, tripcontent, trippeople, expedition, tu, tid)
         logentries.append(entrytuple)

-def LoadLogbookForExpedition(expedition, clean=True):
+def clean_logbook_for_expedition(expedition):
+    def cleanerrors(year):
+        dataissues = DataIssue.objects.filter(parser="logbooks")
+        for di in dataissues:
+            ph = year
+            if re.search(ph, di.message) is not None: # SLOW just to delete issues for one year
+                # print(f' - CLEANING dataissue {di.message}')
+                di.delete()
+
+    year = expedition.year
+    cleanerrors(year)
+    lbes = LogbookEntry.objects.filter(expedition=expedition) # must be a quicker way
+    for lbe in lbes:
+        lbe.delete()
+
+def parse_logbook_for_expedition(expedition):
     """Parses all logbook entries for one expedition
-    if clean==True then it deletes all entries for this year first.
     """
     global logentries
     # absolutely horrid. REFACTOR THIS (all my fault..)
-    global logdataissues
     global entries

     logbook_parseable = False
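
The new clean_logbook_for_expedition() deletes DataIssues and LogbookEntries one row at a time, and its own comments ("SLOW", "must be a quicker way") say so. A sketch of the quicker way using queryset bulk deletes (relying on the models this file already imports), with the caveat that QuerySet.delete() issues one SQL DELETE per model and bypasses any per-instance delete() overrides or signals:

    # Sketch: bulk-delete equivalent of the loops above, one query per model.
    def clean_logbook_for_expedition_fast(expedition):
        year = expedition.year
        # substring match replaces re.search(year, di.message)
        DataIssue.objects.filter(parser="logbooks", message__contains=year).delete()
        LogbookEntry.objects.filter(expedition=expedition).delete()

The todo item above about attaching a DataIssue to an individual expo would turn that substring filter into an exact lookup.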
@@ -513,28 +515,6 @@ def LoadLogbookForExpedition(expedition, clean=True):
     expect = entries[year]
     # print(" - Logbook for: " + year)

-    def cleanerrors(year):
-        global logdataissues
-        dataissues = DataIssue.objects.filter(parser="logbooks")
-        for di in dataissues:
-            ph = year
-            if re.search(ph, di.message) is not None:
-                # print(f' - CLEANING dataissue {di.message}')
-                di.delete()
-        # print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year')
-        dellist = []
-        for key, value in logdataissues.items():
-            # print(f' - CLEANING logdataissues [{key}]: {value}')
-            if key.startswith(year):
-                # print(f' - CLEANING logdataissues [{key:12}]: {value} ')
-                dellist.append(key)
-        for i in dellist:
-            del logdataissues[i]
-
-    if clean:
-        cleanerrors(year)
-
     if year in yearlinks:
         yearfile, yearparser = yearlinks[year]
         logbookpath = Path(yearfile)
@@ -549,11 +529,6 @@ def LoadLogbookForExpedition(expedition, clean=True):
         expedition.save()

-    lbes = LogbookEntry.objects.filter(expedition=expedition)
-    if clean:
-        for lbe in lbes:
-            lbe.delete()
-
     for sq in ["", "2", "3", "4"]: # cope with blog saved as many separate files
         lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
         if not (lb.is_file()):
@@ -603,15 +578,15 @@ def LoadLogbook(year):
     global LOGBOOK_PARSER_SETTINGS

     nlbe = {}
-    TROG["pagecache"]["expedition"][year] = None # clear cache

     expo = Expedition.objects.get(year=year)
     year = expo.year # some type funny
-    nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
+    clean_logbook_for_expedition(expo)
+    nlbe[expo] = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo

     if year in BLOG_PARSER_SETTINGS:
         print("BLOG parsing")
         LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year]
-        nlbe[expo] = LoadLogbookForExpedition(expo, clean=False) # this loads the blog logbook for one expo
+        nlbe[expo] = parse_logbook_for_expedition(expo) # this loads the blog logbook for one expo
     else:
         print(
             f"Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
@@ -623,16 +598,13 @@ def LoadLogbooks():
     This should be rewritten to use coroutines to load all logbooks from disc in parallel,
     but must be serialised to write to database as sqlite is single-user.
     """
-    global logdataissues
     global entries

-    logdataissues = {}
     DataIssue.objects.filter(parser="logbooks").delete()

     expos = Expedition.objects.all()
     if len(expos) <= 1:
         message = " ! - No expeditions found. Load 'people' first"
         DataIssue.objects.create(parser="logbooks", message=message)
-        logdataissues["sqlfail 0000"] = message
         print(message)
         return
@@ -651,12 +623,10 @@ def LoadLogbooks():
     for expo in expos: # pointless as we explicitly know the years in this code.
         year = expo.year
-        TROG["pagecache"]["expedition"][year] = None # clear cache
         if year in sqlfail:
             print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
             message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
             DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[f"sqlfail {year}"] = message
             print(message)

         if year not in nologbook:
@@ -669,7 +639,7 @@ def LoadLogbooks():
             bloglist.append(expo)

     for ex in loglist:
-        nlbe[ex] = LoadLogbookForExpedition(ex) # this loads the logbook for one expo
+        nlbe[ex] = parse_logbook_for_expedition(ex) # this loads the logbook for one expo

     for b in bloglist:
         if str(b) in LOGBOOK_PARSER_SETTINGS:
@@ -678,12 +648,12 @@ def LoadLogbooks():
             orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
             LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
             print(f" - BLOG: {b}")
-            nlbe[b] = LoadLogbookForExpedition(b, clean=False) # this loads the blog logbook for one expo
+            nlbe[b] = parse_logbook_for_expedition(b, clean=False) # this loads the blog logbook for one expo
             LOGBOOK_PARSER_SETTINGS[str(b)] = orig

     # tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
     # yt = 0
-    # for r in map(LoadLogbookForExpedition, loglist):
+    # for r in map(parse_logbook_for_expedition, loglist):
     #     yt = r
     yt = 0
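
The commented-out map() above failed because each worker both parsed and wrote in the same call, so concurrent writers hit sqlite's single-writer lock. Once parse_logbook_for_expedition() no longer touches the database (the split this commit works toward but does not finish), the same idea becomes workable: parse in parallel, write serially. A sketch under that assumption, with store_entries() as a hypothetical serial writer:

    from concurrent.futures import ThreadPoolExecutor

    def load_logbooks_parallel(loglist):
        # file reading and parsing can fan out across threads
        with ThreadPoolExecutor() as pool:
            results = list(pool.map(parse_logbook_for_expedition, loglist))
        # sqlite is single-user: all writes stay in one serial loop
        for entries in results:
            store_entries(entries)  # hypothetical database-writing half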