From 3264b6edefd28b07372518dc866d27da8f1e81ea Mon Sep 17 00:00:00 2001
From: Philip Sargent
Date: Sat, 30 May 2020 20:31:20 +0100
Subject: [PATCH] bug fix in logbook parser

---
 parsers/logbooks.py | 199 ++++++++++++++++++--------------------------
 1 file changed, 83 insertions(+), 116 deletions(-)

diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index cfc1a20..ce78e6d 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,5 +1,4 @@
 #.-*- coding: utf-8 -*-
-
 import csv
 import datetime
 import os
@@ -110,7 +109,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
     trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
     if not author:
-        print((" - Skipping logentry: " + title + " - no author for entry"))
+        print(" * Skipping logentry: " + title + " - no author for entry")
         message = "Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
         DataIssue.objects.create(parser='logbooks', message=message)
         return
@@ -153,7 +152,6 @@ def Parselogwikitxt(year, expedition, txt):
     trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
     for triphead, triptext in trippara:
         tripheadp = triphead.split("|")
-        #print "ttt", tripheadp
         assert len(tripheadp) == 3, (tripheadp, triptext)
         tripdate, tripplace, trippeople = tripheadp
         tripsplace = tripplace.split(" - ")
@@ -161,19 +159,14 @@ def Parselogwikitxt(year, expedition, txt):
         tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
         if tul:
-            #assert len(tul) <= 1, (triphead, triptext)
-            #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
             tu = tul[0][0]
         else:
             tu = ""
-        #assert tripcave == "Journey", (triphead, triptext)
-        #print tripdate
         ldate = ParseDate(tripdate.strip(), year)
-        #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
         EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
 
-# 2002, 2004, 2005, 2007, 2010 - 2018
+# 2002, 2004, 2005, 2007, 2010 - now
 def Parseloghtmltxt(year, expedition, txt):
     #print(" - Starting log html parser")
     tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
@@ ... @@ def Parseloghtmltxt(year, expedition, txt):
         if len(triptitles) >= 2:
             tripcave = triptitles[0]
         else:
             tripcave = "UNKNOWN"
-        #print("\n", tripcave, "--- ppp", trippeople, len(triptext))
         ltriptext = re.sub(r"</p>", "", triptext)

", "", triptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"

", "

", ltriptext).strip() EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0, entry_type="html") - if logbook_entry_count == 0: - print(" - No trip entries found in logbook, check the syntax matches htmltxt format") # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it def Parseloghtml01(year, expedition, txt): @@ -227,9 +213,6 @@ def Parseloghtml01(year, expedition, txt): tripid = mtripid and mtripid.group(1) or "" tripheader = re.sub(r"]*>", "", tripheader) - #print " ", [tripheader] - #continue - tripdate, triptitle, trippeople = tripheader.split("|") ldate = ParseDate(tripdate.strip(), year) @@ -247,19 +230,14 @@ def Parseloghtml01(year, expedition, txt): mtail = re.search(r'(?:[^<]*|\s|/|-|&||\((?:same day|\d+)\))*$', ltriptext) if mtail: - #print mtail.group(0) ltriptext = ltriptext[:mtail.start(0)] ltriptext = re.sub(r"

", "", ltriptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"

|
", "\n\n", ltriptext).strip() - #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext) ltriptext = re.sub(r"", "_", ltriptext) ltriptext = re.sub(r"", "''", ltriptext) ltriptext = re.sub(r"", "'''", ltriptext) - - #print ldate, trippeople.strip() - # could includ the tripid (url link for cross referencing) EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0, entry_type="html") @@ -286,7 +264,6 @@ def Parseloghtml03(year, expedition, txt): tripcave = triptitles[0] else: tripcave = "UNKNOWN" - #print tripcave, "--- ppp", triptitle, trippeople, len(triptext) ltriptext = re.sub(r"

", "", triptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"

", "\n\n", ltriptext).strip() @@ -316,104 +293,94 @@ def SetDatesFromLogbookEntries(expedition): def LoadLogbookForExpedition(expedition): - """ Parses all logbook entries for one expedition """ - + """ Parses all logbook entries for one expedition + """ global logentries - - expowebbase = os.path.join(settings.EXPOWEB, "years") - yearlinks = settings.LOGBOOK_PARSER_SETTINGS - logbook_parseable = False logbook_cached = False + yearlinks = settings.LOGBOOK_PARSER_SETTINGS + expologbase = os.path.join(settings.EXPOWEB, "years") if expedition.year in yearlinks: - # print " - Valid logbook year: ", expedition.year - year_settings = yearlinks[expedition.year] + logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0]) + parsefunc = yearlinks[expedition.year][1] + else: + logbookfile = os.path.join(expologbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE) + parsefunc = settings.DEFAULT_LOGBOOK_PARSER + cache_filename = logbookfile + ".cache" + + try: + bad_cache = False + now = time.time() + cache_t = os.path.getmtime(cache_filename) + if os.path.getmtime(logbookfile) - cache_t > 2: # at least 2 secs later + bad_cache= True + if now - cache_t > 30*24*60*60: + bad_cache= True + if bad_cache: + print(" - ! Cache is either stale or more than 30 days old. Deleting it.") + os.remove(cache_filename) + logentries=[] + print(" ! Removed stale or corrupt cache file") + raise + print(" - Reading cache: " + cache_filename, end='') try: - bad_cache = False - cache_filename = os.path.join(expowebbase, year_settings[0])+".cache" - now = time.time() - cache_t = os.path.getmtime(cache_filename) - file_t = os.path.getmtime(os.path.join(expowebbase, year_settings[0])) - if file_t - cache_t > 2: # at least 2 secs later - #print " - Cache is stale." - bad_cache= True - if now - cache_t > 30*24*60*60: - #print " - Cache is more than 30 days old." - bad_cache= True - if bad_cache: - print(" - Cache is either stale or more than 30 days old. Deleting it.") - os.remove(cache_filename) - logentries=[] - raise - print((" - Reading cache: " + cache_filename )) - try: - with open(cache_filename, "rb") as f: - logentries = pickle.load(f) - print(" - Loaded ", len(logentries), " objects") - logbook_cached = True - except: - print(" - Failed to load corrupt cache. Deleting it.\n") - os.remove(cache_filename) - logentries=[] - raise + with open(cache_filename, "rb") as f: + logentries = pickle.load(f) + print(" -- Loaded ", len(logentries), " log entries") + logbook_cached = True except: - print(" - Opening logbook: ") - file_in = open(os.path.join(expowebbase, year_settings[0]),'rb') + print("\n ! Failed to load corrupt cache. Deleting it.\n") + os.remove(cache_filename) + logentries=[] + raise + except : # no cache found + #print(" - No cache \"" + cache_filename +"\"") + try: + file_in = open(logbookfile,'rb') txt = file_in.read().decode("latin1") file_in.close() - parsefunc = year_settings[1] logbook_parseable = True - print((" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1])) + print((" - Using: " + parsefunc + " to parse " + logbookfile)) + except (IOError): + logbook_parseable = False + print((" ! 
-        if logbook_parseable:
-            parser = globals()[parsefunc]
-            parser(expedition.year, expedition, txt)
-            SetDatesFromLogbookEntries(expedition)
-            # and this has also stored all the objects in logentries[]
-            print(" - Storing " , len(logentries), " log entries")
-            cache_filename = os.path.join(expowebbase, year_settings[0])+".cache"
-            with open(cache_filename, "wb") as f:
-                pickle.dump(logentries, f, 2)
-            logentries=[] # flush for next year
+    if logbook_parseable:
+        parser = globals()[parsefunc]
+        parser(expedition.year, expedition, txt)
+        SetDatesFromLogbookEntries(expedition)
+        # and this has also stored all the log entries in logentries[]
+        if len(logentries) >0:
+            print(" - Cacheing " , len(logentries), " log entries")
+            with open(cache_filename, "wb") as fc:
+                pickle.dump(logentries, fc, 2)
+        else:
+            print(" ! NO TRIP entries found in logbook, check the syntax.")
 
-        if logbook_cached:
-            i=0
-            for entrytuple in range(len(logentries)):
-                date, place, title, text, trippeople, expedition, logtime_underground, \
-                    entry_type = logentries[i]
-                #print " - - obj ", i, date, title
-                EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
-                    entry_type)
-                i +=1
-        else:
-            try:
-                file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE),'rb')
-                txt = file_in.read().decode("latin1")
-                file_in.close()
-                logbook_parseable = True
-                print("No set parser found using default")
-                parsefunc = settings.DEFAULT_LOGBOOK_PARSER
-            except (IOError):
-                logbook_parseable = False
-                print(("Couldn't open default logbook file and nothing in settings for expo " + expedition.year))
+    logentries=[] # flush for next year
-
-    #return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
+    if logbook_cached:
+        i=0
+        for entrytuple in range(len(logentries)):
+            date, place, title, text, trippeople, expedition, logtime_underground, \
+                entry_type = logentries[i]
+            EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
+                entry_type)
+            i +=1
 
 
 def LoadLogbooks():
-    """ This is the master function for parsing all logbooks into the Troggle database. """
-
-    # Clear the logbook data issues as we are reloading
+    """ This is the master function for parsing all logbooks into the Troggle database.
+ """ DataIssue.objects.filter(parser='logbooks').delete() - # Fetch all expos expos = Expedition.objects.all() + nologbook = ["1976", "1977","1978","1979","1980","1980","1981","1983","1984", + "1985","1986","1987","1988","1989","1990",] for expo in expos: - print(("\nLoading Logbook for: " + expo.year)) - - # Load logbook for expo - LoadLogbookForExpedition(expo) + if expo.year not in nologbook: + print((" - Logbook for: " + expo.year)) + LoadLogbookForExpedition(expo) dateRegex = re.compile(r'(\d\d\d\d)-(\d\d)-(\d\d)', re.S) @@ -437,7 +404,7 @@ def parseAutoLogBookEntry(filename): year, month, day = [int(x) for x in dateMatch.groups()] date = datetime.date(year, month, day) else: - errors.append("Date could not be found") + errors.append(" - Date could not be found") expeditionYearMatch = expeditionYearRegex.search(contents) if expeditionYearMatch: @@ -445,17 +412,17 @@ def parseAutoLogBookEntry(filename): expedition = Expedition.objects.get(year = expeditionYearMatch.groups()[0]) personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition) except Expedition.DoesNotExist: - errors.append("Expedition not in database") + errors.append(" - Expedition not in database") else: - errors.append("Expedition Year could not be parsed") + errors.append(" - Expedition Year could not be parsed") titleMatch = titleRegex.search(contents) if titleMatch: title, = titleMatch.groups() if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH: - errors.append("Title too long") + errors.append(" - Title too long") else: - errors.append("Title could not be found") + errors.append(" - Title could not be found") caveMatch = caveRegex.search(contents) if caveMatch: @@ -464,7 +431,7 @@ def parseAutoLogBookEntry(filename): cave = getCaveByReference(caveRef) except AssertionError: cave = None - errors.append("Cave not found in database") + errors.append(" - Cave not found in database") else: cave = None @@ -475,13 +442,13 @@ def parseAutoLogBookEntry(filename): location = None if cave is None and location is None: - errors.append("Location nor cave could not be found") + errors.append(" - Location nor cave could not be found") reportMatch = reportRegex.search(contents) if reportMatch: report, = reportMatch.groups() else: - errors.append("Contents could not be found") + errors.append(" - Contents could not be found") if errors: return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from. people = [] @@ -492,20 +459,20 @@ def parseAutoLogBookEntry(filename): if name.lower() in personExpeditionNameLookup: personExpo = personExpeditionNameLookup[name.lower()] else: - errors.append("Person could not be found in database") + errors.append(" - Person could not be found in database") author = bool(author) else: - errors.append("Persons name could not be found") + errors.append(" - Persons name could not be found") TUMatch = TURegex.search(contents) if TUMatch: TU, = TUMatch.groups() else: - errors.append("TU could not be found") + errors.append(" - TU could not be found") if not errors: people.append((name, author, TU)) if errors: - return errors # Bail out before commiting to the database + return errors # Bail out before committing to the database logbookEntry = LogbookEntry(date = date, expedition = expedition, title = title, cave = cave, place = location,