From 8128870d57c751d0589b58f1ce88824261d36335 Mon Sep 17 00:00:00 2001
From: Philip Sargent <philip.sargent@klebos.com>
Date: Sat, 24 Apr 2021 01:23:55 +0100
Subject: [PATCH] more robust logbooks parsing

---
 core/views/logbooks.py | 16 +++++++----
 parsers/logbooks.py    | 64 +++++++++++++++++++++++++++++-------------
 2 files changed, 55 insertions(+), 25 deletions(-)

diff --git a/core/views/logbooks.py b/core/views/logbooks.py
index ecf0f6b..82dbf61 100644
--- a/core/views/logbooks.py
+++ b/core/views/logbooks.py
@@ -176,13 +176,17 @@ def personexpedition(request, first_name='',  last_name='', year=''):
 
 def logbookentry(request, date, slug):
     this_logbookentry = LogbookEntry.objects.filter(date=date, slug=slug)
-
-    if len(this_logbookentry)>1:
-        return render(request, 'object_list.html',{'object_list':this_logbookentry})
+    
+    if this_logbookentry:
+        if len(this_logbookentry)>1:
+            return render(request, 'object_list.html',{'object_list':this_logbookentry})
+        else:
+            this_logbookentry=this_logbookentry[0]
+            return render(request, 'logbookentry.html', {'logbookentry': this_logbookentry})
     else:
-        this_logbookentry=this_logbookentry[0]
-        return render(request, 'logbookentry.html', {'logbookentry': this_logbookentry})
-
+        msg =(f' Logbook entry slug:"{slug}" not found in database on date:"{date}" ')
+        print(msg)
+        return render(request, 'errors/generic.html',{'message':msg})
 
 def logbookSearch(request, extra):
     query_string = ''
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index c9d7796..8237bdc 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -71,13 +71,18 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
        
     for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
         tripperson = tripperson.strip()
-        if not tid:
-            tid = expedition.year + "." + tripperson + datetime.now().strftime("%S%f") # no good. Should be getting the tid
         mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson)
         if mul:
             tripperson = mul.group(1).strip()
         if tripperson and tripperson[0] != '*':
             tripperson = re.sub(round_bracket_regex, "", tripperson).strip()
+            
+            if tripperson =="Wiggy":
+                tripperson = "Phil Wigglesworth"
+            if tripperson =="Animal":
+                tripperson = "Mike Richardson"
+
+                            
             personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
             if not personyear:
                 message = f" ! - {expedition.year} No name match for: '{tripperson}' " 
@@ -91,6 +96,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
         if not res:
             return None, None
         author = res[-1][0]
+        
     return res, author
 
 def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None):
@@ -107,11 +113,11 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
         return
         
     if not author:
-        message = f" ! - {expedition.year} Skipping logentry: {title} - - no author for entry in year "
+        message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry in year "
         DataIssue.objects.create(parser='logbooks', message=message)
         logdataissues["title"]=message
         print(message)
-        return
+        #return
 
     # This needs attention. The slug field is derived from 'title'
     # both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
@@ -298,7 +304,7 @@ def Parseloghtmltxt(year, expedition, txt):
                             "html", tripid1, logbook_entry_count, tid=tid)
 
 # main parser for 1991 - 2001.  simpler because the data has been hacked so much to fit it
-# trying it out for years 1982 - 1990 too. Some logbook editing required by hand..
+# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.
 def Parseloghtml01(year, expedition, txt):
     global logentries
     global logdataissues
@@ -309,7 +315,9 @@ def Parseloghtml01(year, expedition, txt):
     for trippara in tripparas:
         logbook_entry_count += 1
         tid = set_trip_id(year,logbook_entry_count)
+        # print(f" #0 - tid: {tid}")
         try:
+            #print(f" #1 - tid: {tid}")
             s = re.match(r"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
             if not s:
                 message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
@@ -317,22 +325,40 @@ def Parseloghtml01(year, expedition, txt):
                 logdataissues[tid]=message
                 print(message)
                 break
-            tripheader, triptext = s.group(1), s.group(2)
-            mtripid = re.search(r'<a id="(.*?)"', tripheader)
+            try:
+                tripheader, triptext = s.group(1), s.group(2)
+            except:
+                message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'" 
+                DataIssue.objects.create(parser='logbooks', message=message)
+                logdataissues[tid]=message
+                print(message)
+                
+
+            # mtripid = re.search(r'<a id="(.*?)"', tripheader)
             # if not mtripid:
-                # # not an error, this is probabluy jusyt a different year
-                # message = f" ! - Fail id trip:{tid} header:'{tripheader}'" 
+                # message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'" 
                 # DataIssue.objects.create(parser='logbooks', message=message)
                 # logdataissues[tid]=message
                 # print(message)
                 
-            tripid = mtripid and mtripid.group(1) or ""
-            #print(f" # - mtripid: {mtripid}")
+            # tripid = mtripid and mtripid.group(1) or ""
+            # print(f" # - mtripid: {mtripid}")
             tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
-
-            tripdate, triptitle, trippeople = tripheader.split("|")
+            #print(f" #2 - tid: {tid}")
+            try:
+                tripdate, triptitle, trippeople = tripheader.split("|")
+            except:
+                message = f" ! - Fail to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'" 
+                DataIssue.objects.create(parser='logbooks', message=message)
+                logdataissues[tid]=message
+                print(message)
+                tripdate, triptitle = tripheader.split("|")
+                trippeople = "anon"
+            #print(f" #3 - tid: {tid}")
             ldate = ParseDate(tripdate.strip(), year)
-        
+            #print(f" # -             tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
+            #print(f" #4 - tid: {tid}")
+
             mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
             if mtu:
                 tu = mtu.group(1)
@@ -363,7 +389,7 @@ def Parseloghtml01(year, expedition, txt):
                 
 
             entrytuple = (ldate, tripcave, triptitle, ltriptext, 
-                    trippeople, expedition, tu, "html01", tripid)
+                    trippeople, expedition, tu, "html01", tid)
             logentries.append(entrytuple)
             try:
                 EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
@@ -377,7 +403,7 @@ def Parseloghtml01(year, expedition, txt):
                 
             try:
                 EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, 
-                                    "html01", tripid, logbook_entry_count, tid=tid)
+                                    "html01", tid, logbook_entry_count, tid=tid)
             except:
                 message = " ! - Enter log entry into ObjectStore FAIL  exception in: " + tid 
                 DataIssue.objects.create(parser='logbooks', message=message)
@@ -577,7 +603,7 @@ def LoadLogbookForExpedition(expedition, expect):
         SetDatesFromLogbookEntries(expedition)
         if len(logentries) >0:
             print("   - Cacheing " , len(logentries), " log entries")
-            with open(cache_filename, "wb") as fc:
+            with open(cache_filename, "wb") as fc: # we must check that permissions are g+w, or expo can't delete the cache
                 logbk=(expedition,len(logentries),logentries)
                 pickle.dump(logbk, fc, protocol=4)
         else:
@@ -608,14 +634,14 @@ def LoadLogbooks():
     if len(expos) <= 1:
         print(" ! No expeditions found. Load 'people' first.\n")
     nologbook = ["1976", "1977", "1978", "1979", "1980", "1981", 
-        "1987", "1988", "1989",
+        "1987", "1988", "1989", # needs more hand-editing of log.htm
         "1986", "2020",]
     entries = {"2021": 0, "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79, 
         "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52, 
         "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31, 
         "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41, 
         "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
-        "1985": 21,"1984": 19,"1983": 22,"1982": 42,}
+        "1985": 22,"1984": 32,"1983": 52,"1982": 42,}
     # Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
     try:
         os.remove("loadlogbk.log")