From 0853bbdd19f93ae5e4d7615843e99380d6dab437 Mon Sep 17 00:00:00 2001
From: Philip Sargent <philip.sargent@klebos.com>
Date: Tue, 30 Aug 2022 17:58:49 +0300
Subject: [PATCH] Many fixes and speedups

---
 parsers/logbooks.py | 129 +++++++++++++++++++++++---------------------
 1 file changed, 68 insertions(+), 61 deletions(-)

diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 41d0895..40311b6 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -27,33 +27,36 @@ An idea which no longer seems sensible given that we rely on the database to do
 # it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc)
 '''
 todo='''
-- Put the object store 'trips' and the 'logdataissues' into TROG global object
-
 - Use the .shelve.db cache for all logbooks, not just individually
 
 - refactor everything with some urgency, esp. LoadLogbookForExpedition()
 
--- far too many uses of Django field dereferencing to get values, which is SLOW
+- profile the code to find bad repetitive things, of which there are many.
 
-- Loogbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
+- far too many uses of Django field dereferencing to get values, which is SLOW
+
+- Logbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
 
 - import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
-  we keep only a modern HTML05 format. Then we can retiure the old parsers and reduce the
+  we keep only a modern HTML5 format. Then we can retire the old parsers and reduce the
   volume of code here substantially.
 
 - rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
-
-- the object store will need additional functions to replicate the persontrip calculation
-  and storage. For the moment we leave all that to be done in the django db
-  Concurrent synchronisation would be nice..
-
-- DB lock currently prevents multiple threads for loading logbooks. But asyncio might work..?
 
 - We should ensure logbook.html is utf-8 and stop this crap:
   file_in = open(logbookfile,'rb')
   txt = file_in.read().decode("latin1")
 
 - this is a slow and uncertain function: cave = getCaveByReference(caveRef)
+
+- the object store will need additional functions to replicate the persontrip calculation
+  and storage. For the moment we leave all that to be done in the django db
+  Concurrent synchronisation would be nice..
+
+- DB lock currently prevents multiple threads for loading logbooks. But asyncio might work..?
+
+- Put the object store 'trips' and the 'logdataissues' into TROG global object
+
 '''
 
 logentries = [] # the entire logbook for one year is a single object: a list of entries
@@ -62,7 +65,7 @@ noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plate
 logdataissues = TROG['issues']['logdataissues']
 trips ={}
 
-entries = { "2022": 42, "2019": 44, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
+entries = { "2022": 62, "2019": 44, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
     "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
     "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
     "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
@@ -101,7 +104,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
         personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
         if not personyear:
-            message = f" ! - {expedition.year} No name match for: '{tripperson}' "
+            message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this expedition year."
             print(message)
             DataIssue.objects.create(parser='logbooks', message=message)
             logdataissues[tid]=message
@@ -115,7 +118,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
     return res, author
 
 
-def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None):
+def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
     """ saves a logbook entry and related persontrips
     Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
@@ -132,7 +135,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
         return
 
     if not author:
-        message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry in year "
+        message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
         DataIssue.objects.create(parser='logbooks', message=message)
         logdataissues["title"]=message
         print(message)
@@ -163,9 +166,9 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
         slug = tid + "_" + slugify(title)[:10].replace('-','_')
     else:
         slug = str(randint(1000,10000)) + "_" + slugify(title)[:10].replace('-','_')
-    nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug, 'entry_type':entry_type}
+    nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug}
 
-    # This cretes the lbo instance of LogbookEntry
+    # This creates the lbo instance of LogbookEntry
     lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
 
@@ -253,14 +256,14 @@ def Parselogwikitxt(year, expedition, txt):
                                 trippeople, expedition, tu, "wiki", tripid)
         logentries.append(entrytuple)
 
-        EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople,
-                          expedition=expedition, logtime_underground=0, tid=tid)
+        # EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople,
+        #                   expedition=expedition, logtime_underground=0, tid=tid)
 
-        EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople,
-                             tu, "wiki", tripid, logbook_entry_count, tid=tid)
+        # EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople,
+        #                      tu, "wiki", tripid, logbook_entry_count, tid=tid)
 
 
-def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq, tid=None):
+def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, tripid1, seq, tid=None):
     '''Called once for each logbook entry as the logbook is parsed
     '''
     # This will need additional functions to replicate the persontrip calculation and storage. For the
@@ -280,7 +283,7 @@ def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu,
 
     if not tid:
         tid = set_trip_id(str(date),seq)
-    trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu, formattype)
+    trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu)
 
     ## copy a lot of checking functionality here from EnterLogIntoDbase()
     # GetTripPersons is a db query, so this will need to be put in ObjStore before this will work..
@@ -353,12 +356,12 @@ def Parseloghtmltxt(year, expedition, txt):
                                 trippeople, expedition, tu, "html", tripid1)
         logentries.append(entrytuple)
 
-        EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
-                          trippeople=trippeople, expedition=expedition, logtime_underground=0,
-                          entry_type="html", tid=tid)
+        # EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
+        #                   trippeople=trippeople, expedition=expedition, logtime_underground=0,
+        #                   entry_type="html", tid=tid)
 
-        EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
-                             "html", tripid1, logbook_entry_count, tid=tid)
+        # EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
+        #                      "html", tripid1, logbook_entry_count, tid=tid)
 
 # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
 # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
@@ -455,24 +458,24 @@ def Parseloghtml01(year, expedition, txt):
             entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, "html01", tid)
             logentries.append(entrytuple)
 
-            try:
-                EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
-                                  trippeople=trippeople, expedition=expedition, logtime_underground=0,
-                                  entry_type="html", tid=tid)
-            except:
-                message = " ! - Enter log entry into database FAIL exception in: " + tid
-                DataIssue.objects.create(parser='logbooks', message=message)
-                logdataissues[tid]=message
-                print(message)
+            # try:
+            #     EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
+            #                       trippeople=trippeople, expedition=expedition, logtime_underground=0,
+            #                       entry_type="html", tid=tid)
+            # except:
+            #     message = " ! - Enter log entry into database FAIL exception in: " + tid
+            #     DataIssue.objects.create(parser='logbooks', message=message)
+            #     logdataissues[tid]=message
+            #     print(message)
 
-            try:
-                EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
-                                     "html01", tid, logbook_entry_count, tid=tid)
-            except:
-                message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid
-                DataIssue.objects.create(parser='logbooks', message=message)
-                logdataissues[tid]=message
-                print(message)
+            # try:
+            #     EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
+            #                          "html01", tid, logbook_entry_count, tid=tid)
+            # except:
+            #     message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid
+            #     DataIssue.objects.create(parser='logbooks', message=message)
+            #     logdataissues[tid]=message
+            #     print(message)
 
         except:
             message = f" ! - Skipping logentry {year} due to exception in: {tid}"
@@ -514,7 +517,7 @@ def Parseloghtml03(year, expedition, txt):
         if re.match("T/U|Time underwater", sheader[-1]):
             tu = sheader.pop()
         if len(sheader) != 3:
-            print((" ! Header not three pieces", sheader))
+            print(" ! Header not three pieces", sheader)
         tripdate, triptitle, trippeople = sheader
         ldate = ParseDate(tripdate.strip(), year)
         triptitles = triptitle.split(" , ")
@@ -532,12 +535,12 @@ def Parseloghtml03(year, expedition, txt):
                                 trippeople, expedition, tu, "html03", tid)
         logentries.append(entrytuple)
 
-        EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
-                          text = ltriptext, trippeople=trippeople, expedition=expedition,
-                          logtime_underground=0, entry_type="html", tid=tid)
+        # EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
+        #                   text = ltriptext, trippeople=trippeople, expedition=expedition,
+        #                   logtime_underground=0, entry_type="html", tid=tid)
 
-        EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
-                             "html03", tid, logbook_entry_count, tid=tid)
+        # EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
+        #                      "html03", tid, logbook_entry_count, tid=tid)
 
 
 def SetDatesFromLogbookEntries(expedition):
@@ -618,7 +621,7 @@ def LoadLogbookForExpedition(expedition):
         expedition.logbookfile = yearlinks[year][0]
         parsefunc = yearlinks[year][1]
     else:
-        logbookpath = os.path.join(expologbase, year, settings.DEFAULT_LOGBOOK_FILE)
+        logbookpath = Path(expologbase) / year / settings.DEFAULT_LOGBOOK_FILE
         expedition.logbookfile = settings.DEFAULT_LOGBOOK_FILE
         parsefunc = settings.DEFAULT_LOGBOOK_PARSER
     cache_filename = Path(str(logbookpath) + ".cache")
@@ -639,13 +642,13 @@ def LoadLogbookForExpedition(expedition):
                 print(" - ! Cache is > 30 days old")
                 bad_cache= True
             if bad_cache:
-                print(" - ! Cache is either stale or more than 30 days old. Deleting it.")
+                print(" - so cache is either stale or more than 30 days old. Deleting it.")
                 os.remove(cache_filename)
                 logentries=[]
-                print(" ! Removed stale or corrupt cache file")
+                print(" - Deleted stale or corrupt cache file")
                 raise
-            # print(" - Reading cache: " + str(cache_filename), end='')
             try:
+                # print(" - Reading cache: " + str(cache_filename), end='')
                 with open(cache_filename, "rb") as f:
                     year,n,logentries = pickle.load(f)
                 if validcache(year,n):
@@ -660,21 +663,22 @@ def LoadLogbookForExpedition(expedition):
                     logentries=[]
                     raise
         except :
-            print(" - Cache de-pickle failure \"" + str(cache_filename) +"\"")
+            print(" - Cache is stale or failed to de-pickle \"" + str(cache_filename) +"\"")
             try:
                 file_in = open(logbookpath,'rb')
-                txt = file_in.read().decode("latin1")
+                txt = file_in.read().decode("utf-8")
                 file_in.close()
                 logbook_parseable = True
             except (IOError):
                 logbook_parseable = False
-                print((" ! Couldn't open logbook " + logbookpath))
+                print(" ! Couldn't open logbook as UTF-8 " + str(logbookpath))
 
             if logbook_parseable:
                 parser = globals()[parsefunc]
                 print(f' - Using parser {parsefunc}')
                 parser(year, expedition, txt) # this launches the right parser for this year
+            print(" - Setting dates from logbook entries")
             SetDatesFromLogbookEntries(expedition)
             if len(logentries) >0:
                 print(" - Cacheing " , len(logentries), " log entries")
@@ -686,11 +690,14 @@ def LoadLogbookForExpedition(expedition):
 
     i=0
     for entrytuple in logentries:
-        date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
+        try:
+            date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+        except ValueError: # cope with removal of entry_type, which may still be present in old cache files. Remove in Sept. 2022.
+            date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
         EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
-                          entry_type, tripid1)
+                          tripid1)
        EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, logtime_underground,
-                             entry_type, tripid1, i)
+                             tripid1, i)
         i +=1
 
     SetDatesFromLogbookEntries(expedition)
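
For reference, the caching scheme used by LoadLogbookForExpedition() above (pickle the parsed entries to "<logbook>.cache" next to the logbook, throw the cache away if the logbook file is newer or the cache is more than 30 days old, otherwise trust it after a validcache()-style check, and fall back to a full re-parse that rewrites the cache) can be summarised as the minimal sketch below. This is illustrative only and not code from the patch; load_logentries(), parse_logbook() and CACHE_MAX_AGE_DAYS are hypothetical names.

import pickle
import time
from pathlib import Path

CACHE_MAX_AGE_DAYS = 30   # matches the "> 30 days old" check above


def load_logentries(logbookpath: Path, year: str, parse_logbook):
    """Return cached entry tuples when the cache is valid, else re-parse and re-cache."""
    cache_filename = Path(str(logbookpath) + ".cache")
    try:
        if logbookpath.stat().st_mtime > cache_filename.stat().st_mtime:
            raise ValueError("logbook is newer than its cache")          # stale cache
        if time.time() - cache_filename.stat().st_mtime > CACHE_MAX_AGE_DAYS * 86400:
            raise ValueError("cache is more than 30 days old")
        with open(cache_filename, "rb") as f:
            cached_year, n, logentries = pickle.load(f)
        if cached_year == year and n == len(logentries):                 # cf. validcache()
            return logentries
        raise ValueError("cache failed its validity check")
    except (OSError, ValueError, pickle.UnpicklingError, EOFError):
        # Any problem with the cache: delete it and fall through to a full parse.
        if cache_filename.is_file():
            cache_filename.unlink()

    logentries = parse_logbook(logbookpath)   # e.g. one of the Parselog* functions
    if logentries:
        with open(cache_filename, "wb") as f:
            pickle.dump((year, len(logentries), logentries), f)
    return logentries

Keeping the (year, count, entries) triple in the pickle is what lets the validity check be done cheaply, without re-reading or re-parsing the logbook itself.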