From 9ccf5912d4f77b6eaf083f1cee680bb22c4b3ceb Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Wed, 23 Mar 2022 22:55:59 +0000 Subject: [PATCH] restored logbook cacheing --- parsers/logbooks.py | 125 +++++++++++++++++++++++++------------------- 1 file changed, 70 insertions(+), 55 deletions(-) diff --git a/parsers/logbooks.py b/parsers/logbooks.py index ea7c27f..5ebc671 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -109,6 +109,9 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None): """ saves a logbook entry and related persontrips Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday ! + + troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite + but we are saving the same thing too many times.. Also seen in the ObjStore mimic """ try: trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid) @@ -153,12 +156,14 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_ slug = str(randint(1000,10000)) + "_" + slugify(title)[:10].replace('-','_') nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug, 'entry_type':entry_type} + # This creates the lbo instance of LogbookEntry lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs) for tripperson, time_underground in trippersons: lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo} nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)} + # this creates the PersonTrip instance. 
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs) # PersonTrip also saved in SetDatesFromLogbookEntries def ParseDate(tripdate, year): @@ -243,19 +248,25 @@ def Parselogwikitxt(year, expedition, txt): def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq, tid=None): + '''Called once for each logbook entry as the logbook is parsed + ''' # This will need additional functions to replicate the persontrip calculation and storage. For the # moment we leave all that to be done in the django db global trips # should be a singleton TROG eventually global logdataissues + if tid in trips: tyear, tdate, *trest = trips[tid] - msg = f" ! DUPLICATE on {tdate} id: '{tid}'" + msg = f" ! DUPLICATE tid: '{tid}' on date:{tdate} " print(msg) DataIssue.objects.create(parser='logbooks', message=msg) tid = set_trip_id(str(date),seq) #print(" - De-dup ",seq, tid) logdataissues[tid]=msg + + if not tid: + tid = set_trip_id(str(date),seq) trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu, formattype) ## copy a lot of checking functionality here from EnterLogIntoDbase() @@ -267,7 +278,9 @@ def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, # message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year) # DataIssue.objects.create(parser='logbooks', message=message) # logdataissues[tid+"author"]=message - pass + return + + # 2002, 2004, 2005, 2007, 2010 - now # 2006 wiki text is incomplete, but the html all there. So using this parser now. @@ -280,6 +293,7 @@ def Parseloghtmltxt(year, expedition, txt): for trippara in tripparas: logbook_entry_count += 1 tid = set_trip_id(year,logbook_entry_count) + print(f' - new tid:{tid} lbe count: {logbook_entry_count}') s = re.match(r'''(?x)(?:\s*.*?\s*

)? # second date \s*(?:\s*)? @@ -517,7 +531,7 @@ def SetDatesFromLogbookEntries(expedition): lprevpersontrip.save() persontrip.persontrip_next = None lprevpersontrip = persontrip - persontrip.save() # also saved in EnterLogIntoDbase. MERGE these to speed up import. + #persontrip.save() # also saved in EnterLogIntoDbase. MERGE these to speed up import. def LoadLogbookForExpedition(expedition, expect): @@ -579,51 +593,54 @@ def LoadLogbookForExpedition(expedition, expect): print(" - Cache file does not exist \"" + str(cache_filename) +"\"") expedition.save() - now = time.time() - bad_cache = True # emporarily disable reading the cache - buggy - try: - cache_t = os.path.getmtime(cache_filename) - if os.path.getmtime(logbookfile) - cache_t > 2: # at least 2 secs later - bad_cache= True - if now - cache_t > 30*24*60*60: - bad_cache= True - if bad_cache: - print(" - ! Cache is either stale or more than 30 days old. Deleting it.") - os.remove(cache_filename) - logentries=[] - print(" ! Removed stale or corrupt cache file") - raise - print(" - Reading cache: " + str(cache_filename), end='') + logbook_cached = False + if True: # enable cache system + now = time.time() + bad_cache = False # temporarily disable reading the cache - buggy try: - with open(cache_filename, "rb") as f: - year,n,logentries = pickle.load(f) - if validcache(year,n): - print(" -- Loaded ", len(logentries), " log entries") - logbook_cached = True - else: - print(" !- Told to expect ", expect, " but ", len(logentries), " found in cache") + cache_t = os.path.getmtime(cache_filename) + if os.path.getmtime(logbookfile) - cache_t > 2: # at least 2 secs later + print(" - ! Cache is older than the logbook file") + bad_cache= True + if now - cache_t > 30*24*60*60: + print(" - ! Cache is > 30 days old") + bad_cache= True + if bad_cache: + print(" - ! Cache is either stale or more than 30 days old. Deleting it.") + os.remove(cache_filename) + logentries=[] + print(" ! 
Removed stale or corrupt cache file") raise - except: - print(" ! Failed to load corrupt cache. (Or I was told to ignore it). Deleting it.") - os.remove(cache_filename) - logentries=[] - raise - except : - print(" - Cache de-pickle failure \"" + str(cache_filename) +"\"") - try: - file_in = open(logbookfile,'rb') - txt = file_in.read().decode("latin1") - file_in.close() - logbook_parseable = True - print((" - Using: " + parsefunc + " to parse " + logbookfile)) - except (IOError): - logbook_parseable = False - print((" ! Couldn't open logbook " + logbookfile)) + # print(" - Reading cache: " + str(cache_filename), end='') + try: + with open(cache_filename, "rb") as f: + year,n,logentries = pickle.load(f) + if validcache(year,n): + print(" -- Loaded ", len(logentries), " log entries") + logbook_cached = True + else: + print(" !- Told to expect ", expect, " but ", len(logentries), " found in cache") + raise + except: + print(" ! Failed to load corrupt cache. (Or I was told to ignore it). Deleting it.") + os.remove(cache_filename) + logentries=[] + raise + except : + print(" - Cache de-pickle failure \"" + str(cache_filename) +"\"") + try: + file_in = open(logbookfile,'rb') + txt = file_in.read().decode("latin1") + file_in.close() + logbook_parseable = True + except (IOError): + logbook_parseable = False + print((" ! Couldn't open logbook " + logbookfile)) if logbook_parseable: parser = globals()[parsefunc] - - parser(expedition.year, expedition, txt) # this launches the parser + print(f' - Using parser {parsefunc}') + parser(expedition.year, expedition, txt) # this launches the right parser for this year SetDatesFromLogbookEntries(expedition) if len(logentries) >0: @@ -634,17 +651,15 @@ def LoadLogbookForExpedition(expedition, expect): else: print(" ! NO TRIP entries found in logbook, check the syntax.") - if logbook_cached: # working on this bit... 
- i=0 - for entrytuple in range(len(logentries)): - date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = logentries[i] - #print(" - entry tuple " , i, " tid", tripid1) - EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0, - entry_type, tripid1) - EnterLogIntoObjStore(expedition.year, date, tripcave, triptitle, text, trippeople, logtime_underground, - entry_type, tripid1, i) - i +=1 - SetDatesFromLogbookEntries(expedition) + i=0 + for entrytuple in logentries: + date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple + EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0, + entry_type, tripid1) + EnterLogIntoObjStore(expedition.year, date, tripcave, triptitle, text, trippeople, logtime_underground, + entry_type, tripid1, i) + i +=1 + SetDatesFromLogbookEntries(expedition) return len(logentries) def LoadLogbooks(): @@ -672,7 +687,7 @@ def LoadLogbooks(): "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31, "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41, "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1, - "1985": 22,"1984": 32,"1983": 52,"1982": 42,} + "1985": 24,"1984": 32,"1983": 52,"1982": 42,} # Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing. try: os.remove("loadlogbk.log")