diff --git a/core/utils.py b/core/utils.py index 5b1599b..8bb7c2a 100644 --- a/core/utils.py +++ b/core/utils.py @@ -33,15 +33,15 @@ save_carefully() - core function that saves troggle objects in the database various git add/commit functions that need refactoring together +NOTE that TROG is not serialized! Two users can update it and conflict !! +This needs to be in a multi-user database with transactions. + ''' TROG = { 'pagecache' : { 'expedition' : {} }, - 'issues' : { - 'logdataissues' : {} - }, 'caves' : { 'gcavelookup' : {}, 'gcavecount' : {} diff --git a/parsers/logbooks.py b/parsers/logbooks.py index 3b01eed..838c253 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -10,7 +10,7 @@ from django.template.defaultfilters import slugify from parsers.people import GetPersonExpeditionNameLookup from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip from troggle.core.models.troggle import DataIssue, Expedition -from troggle.core.utils import TROG, save_carefully +from troggle.core.utils import save_carefully """ Parses and imports logbooks in all their wonderful confusion @@ -18,12 +18,16 @@ Parses and imports logbooks in all their wonderful confusion https://expo.survex.com/handbook/computing/logbooks-parsing.html """ todo = """ -- refactor everything with some urgency, esp. LoadLogbookForExpedition() +- refactor everything with some urgency, esp. parse_logbook_for_expedition() -- remove the TROG things since we need the database for multiuser access? Or not? +- break out the code that hits the database from that which parses the logbook +so that the file-reading and parsing can be parallelized, while writing to the +database remains serialized (sqlite is single-user). - profile the code to find bad repetitive things, of which there are many. +- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted + - far too many uses of Django field dereferencing to get values, which is SLOW - rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact. @@ -96,12 +100,8 @@ entries = { logentries = [] # the entire logbook for one year is a single object: a list of entries noncaveplaces = ["Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"] -logdataissues = TROG["issues"]["logdataissues"] trips = {} -# -# the logbook loading section -# def set_trip_id(year, seq): tid = f"{year}_s{seq:02d}" return tid @@ -149,7 +149,6 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this expedition year." print(message) DataIssue.objects.create(parser="logbooks", message=message) - logdataissues[tid] = message res.append((personyear, logtime_underground)) if mul: author = personyear @@ -163,7 +162,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None): - """saves a logbook entry and related persontrips + """saves a single logbook entry and related persontrips Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday ! troggle.log shows that we are creating lots of duplicates, which is no no problem with SQL as they just overwrite but we are saving the same thing too many times.. 
@@ -193,7 +192,6 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
     except:
         message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
         DataIssue.objects.create(parser="logbooks", message=message)
-        logdataissues["title"] = message
         print(message)
         raise
         return
@@ -201,7 +199,6 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
     if not author:
         message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
         DataIssue.objects.create(parser="logbooks", message=message)
-        logdataissues["title"] = message
         print(message)
         # return
 
@@ -261,7 +258,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
 
 def ParseDate(tripdate, year):
     """Interprets dates in the expo logbooks and returns a correct datetime.date object"""
-    dummydate = date(1970, 1, 1)
+    dummydate = date(1970, 1, 1)  # replace with _EPOCH
     month = 1
     day = 1
     # message = f" ! - Trying to parse date in logbook: {tripdate} - {year}"
@@ -273,7 +270,6 @@ def ParseDate(tripdate, year):
             if not (mdatestandard.group(1) == year):
                 message = f" ! - Bad date (year) in logbook: {tripdate} - {year}"
                 DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues["tripdate"] = message
                 return dummydate
             else:
                 year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
@@ -281,23 +277,20 @@ def ParseDate(tripdate, year):
             if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]):
                 message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
                 DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues["tripdate"] = message
                 return dummydate
             else:
                 yadd = int(year[:2]) * 100
                 day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
         else:
-            year = 1970
+            year = 1970  # replace with _EPOCH
            message = f" ! - Bad date in logbook: {tripdate} - {year}"
            DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues["tripdate"] = message
 
         return date(year, month, day)
     except:
         message = f" ! - Failed to parse date in logbook: {tripdate} - {year}"
         DataIssue.objects.create(parser="logbooks", message=message)
-        logdataissues["tripdate"] = message
-        return datetime.date(1970, 1, 1)
+        return date(1970, 1, 1)  # replace with _EPOCH
 
 
 def parser_html(year, expedition, txt, seq=""):
@@ -309,7 +302,6 @@ def parser_html(year, expedition, txt, seq=""):
     from parser_html_01 format logfiles, believe me.
     """
     global logentries
-    global logdataissues
 
     # extract front material and stash for later use when rebuilding from list of entries
     headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
@@ -356,7 +348,6 @@ def parser_html(year, expedition, txt, seq=""):
             msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:50]}'..."
             print(msg)
             DataIssue.objects.create(parser="logbooks", message=msg)
-            logdataissues[tid] = msg
 
        s2 = re.match(
            r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
@@ -377,7 +368,6 @@ def parser_html(year, expedition, txt, seq=""):
            msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:50]}'..."
            print(msg)
            DataIssue.objects.create(parser="logbooks", message=msg)
-            logdataissues[tid] = msg
            continue

        ldate = ParseDate(tripdate.strip(), year)
@@ -413,7 +403,6 @@ def parser_blog(year, expedition, txt, sq=""):
    So the content is nested inside the header.
    Attachments (images) come after the content.
    """
    global logentries
-    global logdataissues

    tripheads = re.findall(
        r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
    )
@@ -455,7 +444,6 @@ def parser_blog(year, expedition, txt, sq=""):
        if not (match_author):
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[tid] = message
            print(message)
            break
        trippeople = match_author.group(1)
@@ -465,7 +453,6 @@ def parser_blog(year, expedition, txt, sq=""):
        if not (match_datetime):
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[tid] = message
            print(message)
            break
        datestamp = match_datetime.group(1)
@@ -475,7 +462,6 @@ def parser_blog(year, expedition, txt, sq=""):
        except:
            message = f" ! - FROMISOFORMAT fail logentry {year}:{logbook_entry_count} {tid} '{datestamp}'"
            DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[tid] = message
            print(message)
            # fallback, ignore the timestamp bits:
            tripdate = datetime.fromisoformat(datestamp[0:10])
@@ -494,14 +480,30 @@ def parser_blog(year, expedition, txt, sq=""):
        entrytuple = (tripdate, location, tripname, tripcontent, trippeople, expedition, tu, tid)
        logentries.append(entrytuple)
 
+def clean_logbook_for_expedition(expedition):
+    def cleanerrors(year):
+        dataissues = DataIssue.objects.filter(parser="logbooks")
+        for di in dataissues:
+            ph = year
+            if re.search(ph, di.message) is not None:  # SLOW just to delete issues for one year
+                # print(f'  - CLEANING dataissue {di.message}')
+                di.delete()
-def LoadLogbookForExpedition(expedition, clean=True):
+
+
+    year = expedition.year
+    cleanerrors(year)
+
+    lbes = LogbookEntry.objects.filter(expedition=expedition) # must be a quicker way
+    for lbe in lbes:
+        lbe.delete()
+
+def parse_logbook_for_expedition(expedition):
    """Parses all logbook entries for one expedition
-    if clean==True then it deletes all entries for this year first.
+    Deletion of this year's entries and DataIssues now happens in clean_logbook_for_expedition().
    """
    global logentries # absolutely horrid. REFACTOR THIS (all my fault..)
- global logdataissues global entries logbook_parseable = False @@ -513,28 +515,6 @@ def LoadLogbookForExpedition(expedition, clean=True): expect = entries[year] # print(" - Logbook for: " + year) - def cleanerrors(year): - global logdataissues - dataissues = DataIssue.objects.filter(parser="logbooks") - for di in dataissues: - ph = year - if re.search(ph, di.message) is not None: - # print(f' - CLEANING dataissue {di.message}') - di.delete() - - # print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year') - dellist = [] - for key, value in logdataissues.items(): - # print(f' - CLEANING logdataissues [{key}]: {value}') - if key.startswith(year): - # print(f' - CLEANING logdataissues [{key:12}]: {value} ') - dellist.append(key) - for i in dellist: - del logdataissues[i] - - if clean: - cleanerrors(year) - if year in yearlinks: yearfile, yearparser = yearlinks[year] logbookpath = Path(yearfile) @@ -549,11 +529,6 @@ def LoadLogbookForExpedition(expedition, clean=True): expedition.save() - lbes = LogbookEntry.objects.filter(expedition=expedition) - if clean: - for lbe in lbes: - lbe.delete() - for sq in ["", "2", "3", "4"]: # cope with blog saved as many separate files lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix) if not (lb.is_file()): @@ -603,15 +578,15 @@ def LoadLogbook(year): global LOGBOOK_PARSER_SETTINGS nlbe = {} - TROG["pagecache"]["expedition"][year] = None # clear cache expo = Expedition.objects.get(year=year) year = expo.year # some type funny - nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo + clean_logbook_for_expedition(expo) + nlbe[expo] = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo if year in BLOG_PARSER_SETTINGS: print("BLOG parsing") LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year] - nlbe[expo] = LoadLogbookForExpedition(expo, clean=False) # this loads the blog logbook for one expo + nlbe[expo] = parse_logbook_for_expedition(expo) # this loads the blog logbook for one expo else: print( f"Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}" @@ -623,16 +598,13 @@ def LoadLogbooks(): This should be rewritten to use coroutines to load all logbooks from disc in parallel, but must be serialised to write to database as sqlite is single-user. """ - global logdataissues global entries - logdataissues = {} DataIssue.objects.filter(parser="logbooks").delete() expos = Expedition.objects.all() if len(expos) <= 1: message = " ! - No expeditions found. Load 'people' first" DataIssue.objects.create(parser="logbooks", message=message) - logdataissues["sqlfail 0000"] = message print(message) return @@ -651,12 +623,10 @@ def LoadLogbooks(): for expo in expos: # pointless as we explicitly know the years in this code. year = expo.year - TROG["pagecache"]["expedition"][year] = None # clear cache if year in sqlfail: print(" - Logbook for: " + year + " NO parsing attempted - known sql failures") message = f" ! 
- Not even attempting to parse logbook for {year} until code fixed"
                DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues[f"sqlfail {year}"] = message
                print(message)

            if year not in nologbook:
@@ -669,7 +639,7 @@ def LoadLogbooks():
                    bloglist.append(expo)

    for ex in loglist:
-        nlbe[ex] = LoadLogbookForExpedition(ex)  # this loads the logbook for one expo
+        nlbe[ex] = parse_logbook_for_expedition(ex)  # this loads the logbook for one expo

    for b in bloglist:
        if str(b) in LOGBOOK_PARSER_SETTINGS:
@@ -678,12 +648,12 @@ def LoadLogbooks():
            orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
        LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
        print(f" - BLOG: {b}")
-        nlbe[b] = LoadLogbookForExpedition(b, clean=False)  # this loads the blog logbook for one expo
+        nlbe[b] = parse_logbook_for_expedition(b)  # this loads the blog logbook for one expo
        LOGBOOK_PARSER_SETTINGS[str(b)] = orig

    # tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
    # yt = 0
-    # for r in map(LoadLogbookForExpedition, loglist):
+    # for r in map(parse_logbook_for_expedition, loglist):
    #     yt = r

    yt = 0
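
Notes on the todo items above -- sketches only, not part of this patch; any helper or field names below are invented for illustration.

The NOTE added to core/utils.py says the TROG page cache needs to move into a multi-user database with transactions. Assuming a small key-value table were acceptable (no such model exists yet), the shape could be roughly:

from django.db import models, transaction

class PageCache(models.Model):
    # Hypothetical replacement for TROG["pagecache"]: one row per cached page,
    # so concurrent users go through the database instead of a process-global dict.
    key = models.CharField(max_length=255, unique=True)
    content = models.TextField()

def cache_page(key, content):
    # update_or_create inside atomic() supplies the "transactions" part of the NOTE.
    with transaction.atomic():
        PageCache.objects.update_or_create(key=key, defaults={"content": content})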
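
The todo item about breaking out the database code from the parsing code -- which is also what the commented-out map(parse_logbook_for_expedition, loglist) attempt in LoadLogbooks() was reaching for -- could split each year's work into a pure parsing half and a writing half. parse_logbook_file() and write_entries() are hypothetical names for that split:

from concurrent.futures import ThreadPoolExecutor

def parse_logbook_file(expo):
    # Hypothetical pure half: read the year's file, run the regexes,
    # return a list of entry tuples. No ORM calls in here.
    return []

def write_entries(expo, entrytuples):
    # Hypothetical database half: create LogbookEntry / PersonTrip rows for one expedition.
    for entrytuple in entrytuples:
        pass

def load_all_logbooks(expos):
    # Parsing runs in parallel because it never touches sqlite ...
    with ThreadPoolExecutor() as pool:
        parsed = list(pool.map(parse_logbook_file, expos))
    # ... writing stays serialised because sqlite allows only one writer.
    for expo, entrytuples in zip(expos, parsed):
        write_entries(expo, entrytuples)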
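
The three "# replace with _EPOCH" comments in ParseDate() mark the same fallback value spelled three different ways. A single module-level constant covers them all; the name _EPOCH comes from those comments, the rest is a sketch:

from datetime import date

_EPOCH = date(1970, 1, 1)   # one sentinel meaning "could not parse this logbook date"

# ParseDate() would then return _EPOCH from each of its error branches instead of
# constructing date(1970, 1, 1) or setting year = 1970 inline.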
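
clean_logbook_for_expedition() finds the year's DataIssues by re.search-ing every logbooks message, which the code itself flags as SLOW, and the todo asks for a DataIssue to be attached to an individual expo so it can be found and deleted directly. Assuming a nullable ForeignKey were added to DataIssue (it does not exist today), the cleanup becomes two indexed deletes, and the bulk queryset .delete() also answers the "must be a quicker way" comment on the per-entry loop:

# Assumed field on the DataIssue model, not in the current schema:
#     expedition = models.ForeignKey(Expedition, null=True, blank=True,
#                                    on_delete=models.SET_NULL)

def clean_logbook_for_expedition(expedition):
    # Two queryset deletes instead of a regex scan plus a Python loop.
    DataIssue.objects.filter(parser="logbooks", expedition=expedition).delete()
    LogbookEntry.objects.filter(expedition=expedition).delete()

One caveat: QuerySet.delete() bypasses any per-object delete() override, so this assumes LogbookEntry does not rely on one.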
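
Lastly, the existing todo about generators: every parser currently appends to the module-global logentries list (marked "absolutely horrid" in the source), so a whole year is held in memory before anything is written. A generator keeps the same (date, place, title, text, people, expedition, tu, tid) tuple shape but hands entries over one at a time. The paragraph split below is deliberately naive and only illustrates the shape; the real parsers would keep their existing regexes:

import re

def iter_logbook_entries(year, expedition, txt):
    # Yield one entry tuple at a time instead of growing a global list.
    for seq, para in enumerate(re.split(r"\n\s*\n", txt)):
        tid = f"{year}_s{seq:02d}"          # same id scheme as set_trip_id()
        yield (None, "UNKNOWN", f"entry {tid}", para, "", expedition, 0.0, tid)

# The caller drives the generator, so only one entry is alive between parsing
# and the database write:
#     for entrytuple in iter_logbook_entries(year, expedition, txt):
#         EnterLogIntoDbase(*entrytuple)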