2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2025-12-13 20:27:05 +00:00

make ?reload private and clean old error msgs

This commit is contained in:
Philip Sargent
2021-04-23 03:07:21 +01:00
parent 1a4be0f02e
commit dbd186e299
10 changed files with 172 additions and 85 deletions

View File

@@ -21,6 +21,39 @@ Parses and imports logbooks in all their wonderful confusion
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc)
'''
todo='''
- Put the object store 'trips' and the 'logdataissues' into TROG global object
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
- delete all the autoLogbooKEntry stuff when we are absolutely certain what it does
- Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser,
or it is broken/incomplete and need hand-editing.
- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
we keep only a modern HTML5 format. Then we can retire the old parsers and reduce the
volume of code here substantially.
- edit LoadLogbooks() to use coroutines to speed up import substantially,
but perhaps we had better profile it first?
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
- the object store will need additional functions to replicate the persontrip calculation
and storage. For the moment we leave all that to be done in the django db
- We should ensure logbook.html is utf-8 and stop this crap:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
'''
# The entire logbook for one year is held as a single object: a list of entries.
logentries = []

# Trip "place" values that are known not to be caves, so no cave lookup applies.
noncaveplaces = [
    "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN",
    "plateau", "base camp", "basecamp", "top camp", "topcamp",
]

# Accumulated parsing problems, keyed by trip/entry id.
logdataissues = {}

# Object store of trips, keyed by trip id.
trips = {}
#
# the logbook loading section
@@ -77,12 +110,6 @@ def GetTripCave(place):
return None
logentries = [] # the entire logbook for one year is a single object: a list of entries
noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
'base camp', 'basecamp', 'top camp', 'topcamp' ]
logdataissues = {}
trips ={}
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"):
""" saves a logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why?
@@ -205,8 +232,10 @@ def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu,
#print(" - New id ",tid)
else:
tid= tripid1
if tid in trips:
msg = " ! DUPLICATE id .{}. {} ~{}~".format(tid, trips[tid][0], trips[tid][1])
tyear, tdate, *trest = trips[tid]
msg = f" ! DUPLICATE on {tdate} id: '{tid}'"
print(msg)
DataIssue.objects.create(parser='logbooks', message=msg)
tid= "d{}-s{:02d}".format(str(date),seq)
@@ -427,6 +456,7 @@ def LoadLogbookForExpedition(expedition, expect):
"""
# absolutely horrid. REFACTOR THIS (all my fault..)
global logentries
global logdataissues
logbook_parseable = False
logbook_cached = False
yearlinks = settings.LOGBOOK_PARSER_SETTINGS
@@ -445,6 +475,26 @@ def LoadLogbookForExpedition(expedition, expect):
return False
return True
def cleanerrors(year):
    """Delete stale logbook error records for one year before re-parsing it.

    Removes persisted DataIssue rows (parser='logbooks') whose message
    mentions a trip id for this year, and drops the matching entries from
    the in-memory logdataissues store.

    year -- expedition year as a string, e.g. "1996"
    """
    global logdataissues
    print(f' - CLEAN {year} {len(logdataissues)} data issues in total')

    # NOTE(review): persisted ids are matched as "t<year>-" while the
    # in-memory keys below are matched as "<year>." -- confirm both
    # prefixes are intended; they look inconsistent.
    ph = "t" + year + "-"  # loop-invariant: compute once, not per row
    dataissues = DataIssue.objects.filter(parser='logbooks')
    for di in dataissues:
        if re.search(ph, di.message) is not None:
            print(f' - CLEANING dataissue {di.message}')
            di.delete()

    # The original looped "for te, content in logdataissues:", which unpacks
    # each dict KEY (a string) into two names -- a ValueError for any normal
    # key -- and also popped entries while iterating the dict, which raises
    # RuntimeError in Python 3. Iterate a snapshot of the keys instead.
    for te in list(logdataissues):
        print(f' - CLEAN {te}')
        if te.startswith(year + "."):
            print(f' - CLEANING logdataissue {te}')
            logdataissues.pop(te)
cleanerrors(expedition.year)
if expedition.year in yearlinks:
logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0])
expedition.logbookfile = yearlinks[expedition.year][0]
@@ -478,10 +528,10 @@ def LoadLogbookForExpedition(expedition, expect):
print(" -- Loaded ", len(logentries), " log entries")
logbook_cached = True
else:
print(" !- Should be ", expect, " but ", len(logentries), " found in cache")
print(" !- Told to expect ", expect, " but ", len(logentries), " found in cache")
raise
except:
print(" ! Failed to load corrupt cache. Deleting it.")
print(" ! Failed to load corrupt cache. (Or I was told to ignore it). Deleting it.")
os.remove(cache_filename)
logentries=[]
raise
@@ -554,7 +604,7 @@ def LoadLogbooks():
TROG['pagecache']['expedition'][expo.year] = None # clear cache
if expo.year not in nologbook:
print((" - Logbook for: " + expo.year))
numentries = LoadLogbookForExpedition(expo, entries[expo.year])
numentries = LoadLogbookForExpedition(expo, entries[expo.year]) # this actually loads the logbook for one year
log.write("{} {:5d} should be {}\n".format(expo.year, numentries, entries[expo.year]))
nlbe[expo.year]=numentries
expd[expo.year]= 0
@@ -588,6 +638,12 @@ locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
def parseAutoLogBookEntry(filename):
'''An AutoLogBookEntry appears to be one that was created online using a form, for a single trip,
which is then stored in a separate location to the usual logbook.html
But when importing logbook.html all these individual entries also need to be parsed.
This is all redundant as we are getting rid of the whole individual trip entry system
'''
errors = []
f = open(filename, "r")
contents = f.read()