From 0853bbdd19f93ae5e4d7615843e99380d6dab437 Mon Sep 17 00:00:00 2001
From: Philip Sargent <philip.sargent@klebos.com>
Date: Tue, 30 Aug 2022 17:58:49 +0300
Subject: [PATCH] Many fixes and speedups

---
 parsers/logbooks.py | 129 +++++++++++++++++++++++---------------------
 1 file changed, 68 insertions(+), 61 deletions(-)

diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 41d0895..40311b6 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -27,33 +27,36 @@ An idea which no longer seems sensible given that we rely on the database to do
 # it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc)
 '''
 todo='''
-- Put the object store 'trips' and the 'logdataissues' into TROG global object
-
 - Use the .shelve.db cache for all logbooks, not just individually
 
 - refactor everything with some urgency, esp. LoadLogbookForExpedition()
 
--- far too many uses of Django field dereferencing to get values, which is SLOW
+- profile the code to find bad repetitive things, of which there are many.
 
-- Loogbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
+- far too many uses of Django field dereferencing to get values, which is SLOW
+
+- Logbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
 
 - import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
-  we keep only a modern HTML05 format. Then we can retiure the old parsers and reduce the
+  we keep only a modern HTML5 format. Then we can retire the old parsers and reduce the
   volume of code here substantially.
 
 - rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
-
-- the object store will need additional functions to replicate the persontrip calculation 
-  and storage. For the moment we leave all that to be done in the django db
-  Concurrent synchronisation would be nice.. 
-  
-- DB lock currently prevents multiple threads for loading logbooks. But asyncio might work..?
   
 - We should ensure logbook.html is utf-8 and stop this crap:             
             file_in = open(logbookfile,'rb')
             txt = file_in.read().decode("latin1")
             
 - this is a slow and uncertain function:  cave = getCaveByReference(caveRef)
+
+- the object store will need additional functions to replicate the persontrip calculation 
+  and storage. For the moment we leave all that to be done in the django db
+  Concurrent synchronisation would be nice.. 
+  
+- DB lock currently prevents multiple threads for loading logbooks. But asyncio might work..?
+
+- Put the object store 'trips' and the 'logdataissues' into TROG global object
+
 '''
 
 logentries = [] # the entire logbook for one year is a single object: a list of entries
@@ -62,7 +65,7 @@ noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plate
 logdataissues = TROG['issues']['logdataissues']
 trips ={}
 
-entries = { "2022": 42, "2019": 44, "2018": 74, "2017": 60, "2016": 81, "2015": 79, 
+entries = { "2022": 62, "2019": 44, "2018": 74, "2017": 60, "2016": 81, "2015": 79, 
     "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52, 
     "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31, 
     "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41, 
@@ -101,7 +104,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
                             
             personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
             if not personyear:
-                message = f" ! - {expedition.year} No name match for: '{tripperson}' " 
+                message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this expedition year." 
                 print(message)
                 DataIssue.objects.create(parser='logbooks', message=message)
                 logdataissues[tid]=message
@@ -115,7 +118,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
         
     return res, author
 
-def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None):
+def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
     """ saves a logbook entry and related persontrips 
     Does NOT save the expeditionday_id  - all NULLs. why? Because we are deprecating expeditionday !
     
@@ -132,7 +135,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
         return
         
     if not author:
-        message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry in year "
+        message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
         DataIssue.objects.create(parser='logbooks', message=message)
         logdataissues["title"]=message
         print(message)
@@ -163,9 +166,9 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
         slug = tid + "_" + slugify(title)[:10].replace('-','_')
     else: 
         slug = str(randint(1000,10000)) + "_" + slugify(title)[:10].replace('-','_')
-    nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug, 'entry_type':entry_type}
+    nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug}
     
-    # This cretes the lbo instance of LogbookEntry
+    # This creates the lbo instance of LogbookEntry
     lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
 
     
@@ -253,14 +256,14 @@ def Parselogwikitxt(year, expedition, txt):
                 trippeople, expedition, tu, "wiki", tripid)
         logentries.append(entrytuple)
 
-        EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, 
-                expedition=expedition, logtime_underground=0, tid=tid)
+        # EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, 
+                # expedition=expedition, logtime_underground=0, tid=tid)
         
-        EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople, 
-                tu, "wiki", tripid, logbook_entry_count, tid=tid)
+        # EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople, 
+                # tu, "wiki", tripid, logbook_entry_count, tid=tid)
 
 
-def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq, tid=None):
+def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, tripid1, seq, tid=None):
     '''Called once for each logbook entry as the logbook is parsed
     '''
     # This will need additional functions to replicate the persontrip calculation and storage. For the
@@ -280,7 +283,7 @@ def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu,
     
     if not tid:
         tid = set_trip_id(str(date),seq)
-    trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu, formattype)
+    trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu)
 
     ## copy a lot of checking functionality here from EnterLogIntoDbase()
     # GetTripPersons is a db query, so this will need to be put in ObjStore  before this will work..
@@ -353,12 +356,12 @@ def Parseloghtmltxt(year, expedition, txt):
                 trippeople, expedition, tu, "html", tripid1)
         logentries.append(entrytuple)
 
-        EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
-                          trippeople=trippeople, expedition=expedition, logtime_underground=0,
-                          entry_type="html", tid=tid)
+        # EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
+                          # trippeople=trippeople, expedition=expedition, logtime_underground=0,
+                          # entry_type="html", tid=tid)
 
-        EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, 
-                            "html", tripid1, logbook_entry_count, tid=tid)
+        # EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, 
+                            # "html", tripid1, logbook_entry_count, tid=tid)
 
 # main parser for 1991 - 2001.  simpler because the data has been hacked so much to fit it
 # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
@@ -455,24 +458,24 @@ def Parseloghtml01(year, expedition, txt):
             entrytuple = (ldate, tripcave, triptitle, ltriptext, 
                     trippeople, expedition, tu, "html01", tid)
             logentries.append(entrytuple)
-            try:
-                EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
-                                  trippeople=trippeople, expedition=expedition, logtime_underground=0,
-                                  entry_type="html", tid=tid)
-            except:
-                message = " ! - Enter log entry into database FAIL  exception in: " + tid 
-                DataIssue.objects.create(parser='logbooks', message=message)
-                logdataissues[tid]=message
-                print(message)
+            # try:
+                # EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
+                                  # trippeople=trippeople, expedition=expedition, logtime_underground=0,
+                                  # entry_type="html", tid=tid)
+            # except:
+                # message = " ! - Enter log entry into database FAIL  exception in: " + tid 
+                # DataIssue.objects.create(parser='logbooks', message=message)
+                # logdataissues[tid]=message
+                # print(message)
                 
-            try:
-                EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, 
-                                    "html01", tid, logbook_entry_count, tid=tid)
-            except:
-                message = " ! - Enter log entry into ObjectStore FAIL  exception in: " + tid 
-                DataIssue.objects.create(parser='logbooks', message=message)
-                logdataissues[tid]=message
-                print(message)
+            # try:
+                # EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, 
+                                    # "html01", tid, logbook_entry_count, tid=tid)
+            # except:
+                # message = " ! - Enter log entry into ObjectStore FAIL  exception in: " + tid 
+                # DataIssue.objects.create(parser='logbooks', message=message)
+                # logdataissues[tid]=message
+                # print(message)
                 
         except:
             message = f" ! - Skipping logentry {year} due to exception in: {tid}"
@@ -514,7 +517,7 @@ def Parseloghtml03(year, expedition, txt):
         if re.match("T/U|Time underwater", sheader[-1]):
             tu = sheader.pop()
         if len(sheader) != 3:
-            print((" ! Header not three pieces", sheader))
+            print(" ! Header not three pieces", sheader)
         tripdate, triptitle, trippeople = sheader
         ldate = ParseDate(tripdate.strip(), year)
         triptitles = triptitle.split(" , ")
@@ -532,12 +535,12 @@ def Parseloghtml03(year, expedition, txt):
                 trippeople, expedition, tu, "html03", tid)
         logentries.append(entrytuple)
 
-        EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
-                          text = ltriptext, trippeople=trippeople, expedition=expedition,
-                          logtime_underground=0, entry_type="html", tid=tid)
+        # EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
+                          # text = ltriptext, trippeople=trippeople, expedition=expedition,
+                          # logtime_underground=0, entry_type="html", tid=tid)
 
-        EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, 
-                            "html03", tid, logbook_entry_count, tid=tid)
+        # EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, 
+                            # "html03", tid, logbook_entry_count, tid=tid)
 
 
 def SetDatesFromLogbookEntries(expedition):
@@ -618,7 +621,7 @@ def LoadLogbookForExpedition(expedition):
         expedition.logbookfile = yearlinks[year][0] 
         parsefunc   = yearlinks[year][1]
     else:
-        logbookpath = os.path.join(expologbase, year, settings.DEFAULT_LOGBOOK_FILE)
+        logbookpath = Path(expologbase) /  year / settings.DEFAULT_LOGBOOK_FILE
         expedition.logbookfile = settings.DEFAULT_LOGBOOK_FILE
         parsefunc   = settings.DEFAULT_LOGBOOK_PARSER
     cache_filename = Path(str(logbookpath) + ".cache")
@@ -639,13 +642,13 @@ def LoadLogbookForExpedition(expedition):
                 print(" - ! Cache is > 30 days old")
                 bad_cache= True
             if bad_cache:
-                print(" - ! Cache is either stale or more than 30 days old. Deleting it.")
+                print("   - so cache is either stale or more than 30 days old. Deleting it.")
                 os.remove(cache_filename)
                 logentries=[]
-                print("   ! Removed stale or corrupt cache file")
+                print("   - Deleted stale or corrupt cache file")
                 raise
-            # print("   - Reading cache: " + str(cache_filename), end='')
             try:
+                # print("   - Reading cache: " + str(cache_filename), end='')
                 with open(cache_filename, "rb") as f:
                     year,n,logentries = pickle.load(f) 
                 if validcache(year,n):
@@ -660,21 +663,22 @@ def LoadLogbookForExpedition(expedition):
                 logentries=[]
                 raise
         except : 
-            print("   - Cache de-pickle failure \"" + str(cache_filename) +"\"")
+            print("   - Cache old or de-pickle failure \"" + str(cache_filename) +"\"")
             try:
                 file_in = open(logbookpath,'rb')
-                txt = file_in.read().decode("latin1")
+                txt = file_in.read().decode("utf-8")
                 file_in.close()
                 logbook_parseable = True
             except (IOError):
                 logbook_parseable = False
-                print(("   ! Couldn't open logbook " + logbookpath))
+                print(f"   ! Couldn't open logbook as UTF-8 {logbookpath}")
 
     if logbook_parseable:
         parser = globals()[parsefunc]
         print(f' - Using parser {parsefunc}')
         parser(year, expedition, txt) # this launches the right parser for this year
         
+        print("   - Setting dates from logbook entries")
         SetDatesFromLogbookEntries(expedition)
         if len(logentries) >0:
             print("   - Cacheing " , len(logentries), " log entries")
@@ -686,11 +690,14 @@ def LoadLogbookForExpedition(expedition):
 
     i=0
     for entrytuple in logentries:
-        date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
+        try:
+            date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+        except ValueError: # cope with removal of entry_type but still in cache files. Remove in Sept. 2022.
+            date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
         EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
-                entry_type, tripid1)
+                tripid1)
         EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, logtime_underground, 
-               entry_type, tripid1, i)
+                tripid1, i)
         i +=1
     SetDatesFromLogbookEntries(expedition)