bug fix in logbook parser

2020-05-30 20:31:20 +01:00
parent 58c2650162
commit 3264b6edef
1 changed files with 83 additions and 116 deletions
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,5 +1,4 @@
 #.-*- coding: utf-8 -*-
 import csv
 import datetime
 import os
@@ -110,7 +109,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
    trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
    if not author:
-        print(("   - Skipping logentry: " + title + " - no author for entry"))
+        print("   * Skipping logentry: " + title + " - no author for entry")
        message = "Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
        DataIssue.objects.create(parser='logbooks', message=message)
        return
@@ -153,7 +152,6 @@ def Parselogwikitxt(year, expedition, txt):
    trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
    for triphead, triptext in trippara:
        tripheadp = triphead.split("|")
        #print "ttt", tripheadp
        assert len(tripheadp) == 3, (tripheadp, triptext)
        tripdate, tripplace, trippeople = tripheadp
        tripsplace = tripplace.split(" - ")
@@ -161,19 +159,14 @@ def Parselogwikitxt(year, expedition, txt):
        tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
        if tul:
            #assert len(tul) <= 1, (triphead, triptext)
            #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
            tu = tul[0][0]
        else:
            tu = ""
            #assert tripcave == "Journey", (triphead, triptext)
        #print tripdate
        ldate = ParseDate(tripdate.strip(), year)
        #print "\n", tripcave, "---   ppp", trippeople, len(triptext)
        EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
-# 2002, 2004, 2005, 2007, 2010 - 2018
+# 2002, 2004, 2005, 2007, 2010 - now
 def Parseloghtmltxt(year, expedition, txt):
    #print(" - Starting log html parser")
    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
@@ -194,27 +187,20 @@ def Parseloghtmltxt(year, expedition, txt):
        if not s:
            if not re.search(r"Rigging Guide", trippara):
                print(("can't parse: ", trippara))  # this is 2007 which needs editing
            #assert s, trippara
            continue
        tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
        ldate = ParseDate(tripdate.strip(), year)
        #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
        #trippeople = re.sub(r"Ol(?!l)", "Olly", trippeople)
        #trippeople = re.sub(r"Wook(?!e)", "Wookey", trippeople)
        triptitles = triptitle.split(" - ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"
        #print("\n", tripcave, "---   ppp", trippeople, len(triptext))
        ltriptext = re.sub(r"</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip()
        EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
                          trippeople=trippeople, expedition=expedition, logtime_underground=0,
                          entry_type="html")
    if logbook_entry_count == 0:
        print(" - No trip entries found in logbook, check the syntax matches htmltxt format")
 # Main parser for 1991 - 2001. Simpler because the data has been hacked so heavily to fit this parser.
 def Parseloghtml01(year, expedition, txt):
@@ -227,9 +213,6 @@ def Parseloghtml01(year, expedition, txt):
        tripid = mtripid and mtripid.group(1) or ""
        tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
        #print "   ", [tripheader]
        #continue
        tripdate, triptitle, trippeople = tripheader.split("|")
        ldate = ParseDate(tripdate.strip(), year)
@@ -247,19 +230,14 @@ def Parseloghtml01(year, expedition, txt):
        mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
        if mtail:
            #print mtail.group(0)
            ltriptext = ltriptext[:mtail.start(0)]
        ltriptext = re.sub(r"</p>", "", ltriptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
        #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
        ltriptext = re.sub(r"</?u>", "_", ltriptext)
        ltriptext = re.sub(r"</?i>", "''", ltriptext)
        ltriptext = re.sub(r"</?b>", "'''", ltriptext)
        #print ldate, trippeople.strip()
            # could include the tripid (url link for cross referencing)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, logtime_underground=0,
                          entry_type="html")
@@ -286,7 +264,6 @@ def Parseloghtml03(year, expedition, txt):
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"
        #print tripcave, "---   ppp", triptitle, trippeople, len(triptext)
        ltriptext = re.sub(r"</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
@@ -316,65 +293,71 @@ def SetDatesFromLogbookEntries(expedition):
 def LoadLogbookForExpedition(expedition):
-    """ Parses all logbook entries for one expedition """
+    """ Parses all logbook entries for one expedition 
-
+    """
    global logentries
    expowebbase = os.path.join(settings.EXPOWEB, "years")
    yearlinks = settings.LOGBOOK_PARSER_SETTINGS
    logbook_parseable = False
    logbook_cached = False
    yearlinks   = settings.LOGBOOK_PARSER_SETTINGS
    expologbase = os.path.join(settings.EXPOWEB, "years")
    if expedition.year in yearlinks:
-        # print " - Valid logbook year: ", expedition.year
+        logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0])
-        year_settings = yearlinks[expedition.year]
+        parsefunc   = yearlinks[expedition.year][1]
    else:
        logbookfile = os.path.join(expologbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE)
        parsefunc   = settings.DEFAULT_LOGBOOK_PARSER
    cache_filename = logbookfile + ".cache"
    try:
        bad_cache = False
            cache_filename = os.path.join(expowebbase, year_settings[0])+".cache"
        now = time.time()
        cache_t = os.path.getmtime(cache_filename)
-            file_t  = os.path.getmtime(os.path.join(expowebbase, year_settings[0]))
+        if os.path.getmtime(logbookfile) - cache_t > 2: # at least 2 secs later
            if file_t - cache_t > 2: # at least 2 secs later
                #print " - Cache is stale."
            bad_cache= True
        if now - cache_t > 30*24*60*60:
                #print " - Cache is more than 30 days old."
            bad_cache= True
        if bad_cache:
-                print(" - Cache is either stale or more than 30 days old. Deleting it.")
+            print(" - ! Cache is either stale or more than 30 days old. Deleting it.")
            os.remove(cache_filename)
            logentries=[]
            print("   ! Removed stale or corrupt cache file")
            raise
-            print((" - Reading cache: " + cache_filename ))
+        print("   - Reading cache: " + cache_filename, end='')
        try:
            with open(cache_filename, "rb") as f:
                logentries = pickle.load(f) 
-                print(" - Loaded ", len(logentries), " objects")
+            print("  -- Loaded ", len(logentries), " log entries")
            logbook_cached = True
        except:
-                print(" - Failed to load corrupt cache. Deleting it.\n")
+            print("\n   ! Failed to load corrupt cache. Deleting it.\n")
            os.remove(cache_filename)
            logentries=[]
            raise
-        except:
+    except : # no cache found
-            print(" - Opening logbook: ")
+        #print("   - No cache \"" + cache_filename +"\"")
-            file_in = open(os.path.join(expowebbase, year_settings[0]),'rb')
+        try:
            file_in = open(logbookfile,'rb')
            txt = file_in.read().decode("latin1")
            file_in.close()
            parsefunc = year_settings[1]
            logbook_parseable = True
-            print((" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1]))
+            print(("   - Using: " + parsefunc + " to parse " + logbookfile))
        except (IOError):
            logbook_parseable = False
            print(("   ! Couldn't open logbook " + logbookfile))
    if logbook_parseable:
        parser = globals()[parsefunc]
        parser(expedition.year, expedition, txt)
        SetDatesFromLogbookEntries(expedition)
-            # and this has also stored all the objects in logentries[]
+        # and this has also stored all the log entries in logentries[]
-            print(" - Storing " , len(logentries), " log entries")
+        if len(logentries) >0:
-            cache_filename = os.path.join(expowebbase, year_settings[0])+".cache"
+            print("   - Cacheing " , len(logentries), " log entries")
-            with open(cache_filename, "wb") as f:
+            with open(cache_filename, "wb") as fc:
-                pickle.dump(logentries, f, 2)
+                pickle.dump(logentries, fc, 2)
        else:
            print("   ! NO TRIP entries found in logbook, check the syntax.")
        logentries=[] # flush for next year
    if logbook_cached:
@@ -382,37 +365,21 @@ def LoadLogbookForExpedition(expedition):
        for entrytuple in range(len(logentries)):
            date, place, title, text, trippeople, expedition, logtime_underground, \
                entry_type = logentries[i]
                #print " - - obj ", i, date, title
            EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
                entry_type)
            i +=1
    else:
            try:
                file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE),'rb')
                txt = file_in.read().decode("latin1")
                file_in.close()
                logbook_parseable = True
                print("No set parser found using default")
                parsefunc = settings.DEFAULT_LOGBOOK_PARSER
            except (IOError):
                logbook_parseable = False
                print(("Couldn't open default logbook file and nothing in settings for expo " + expedition.year))
    #return "TOLOAD: " + year + "  " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + "  " + str(PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
 def LoadLogbooks():
-    """ This is the master function for parsing all logbooks into the Troggle database. """
+    """ This is the master function for parsing all logbooks into the Troggle database. 
-
+    """
    # Clear the logbook data issues as we are reloading
    DataIssue.objects.filter(parser='logbooks').delete()
    # Fetch all expos
    expos = Expedition.objects.all()
    nologbook = ["1976", "1977","1978","1979","1980","1980","1981","1983","1984",
    "1985","1986","1987","1988","1989","1990",]
    for expo in expos:
-        print(("\nLoading Logbook for: " + expo.year))
+        if expo.year not in nologbook:
-        
+            print((" - Logbook for: " + expo.year))
        # Load logbook for expo
            LoadLogbookForExpedition(expo)
@@ -437,7 +404,7 @@ def parseAutoLogBookEntry(filename):
        year, month, day = [int(x) for x in dateMatch.groups()]
        date = datetime.date(year, month, day)
    else:
-        errors.append("Date could not be found")
+        errors.append(" - Date could not be found")
    expeditionYearMatch = expeditionYearRegex.search(contents)
    if expeditionYearMatch:
@@ -445,17 +412,17 @@ def parseAutoLogBookEntry(filename):
            expedition = Expedition.objects.get(year = expeditionYearMatch.groups()[0])
            personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
        except Expedition.DoesNotExist:
-            errors.append("Expedition not in database")   
+            errors.append(" - Expedition not in database")   
    else:
-        errors.append("Expedition Year could not be parsed")   
+        errors.append(" - Expedition Year could not be parsed")   
    titleMatch = titleRegex.search(contents)
    if titleMatch:
        title, = titleMatch.groups()
        if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
-            errors.append("Title too long")   
+            errors.append(" - Title too long")   
    else:
-        errors.append("Title could not be found") 
+        errors.append(" - Title could not be found") 
    caveMatch = caveRegex.search(contents)
    if caveMatch:
@@ -464,7 +431,7 @@ def parseAutoLogBookEntry(filename):
            cave = getCaveByReference(caveRef)
        except AssertionError:
            cave = None
-            errors.append("Cave not found in database")   
+            errors.append(" - Cave not found in database")   
    else:
        cave = None
@@ -475,13 +442,13 @@ def parseAutoLogBookEntry(filename):
        location = None
    if cave is None and location is None:
-        errors.append("Location nor cave could not be found") 
+        errors.append(" - Location nor cave could not be found") 
    reportMatch = reportRegex.search(contents)
    if reportMatch:
        report, = reportMatch.groups()
    else:
-        errors.append("Contents could not be found") 
+        errors.append(" - Contents could not be found") 
    if errors:
        return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.
    people = []
@@ -492,20 +459,20 @@ def parseAutoLogBookEntry(filename):
           if name.lower() in personExpeditionNameLookup:
               personExpo = personExpeditionNameLookup[name.lower()]
           else:
-               errors.append("Person could not be found in database")
+               errors.append(" - Person could not be found in database")
           author = bool(author)
       else:
-           errors.append("Persons name could not be found")
+           errors.append(" - Persons name could not be found")
       TUMatch = TURegex.search(contents)
       if TUMatch:
           TU, = TUMatch.groups()
       else:
-           errors.append("TU could not be found")
+           errors.append(" - TU could not be found")
       if not errors:
           people.append((name, author, TU))
    if errors:
-        return errors # Bail out before commiting to the database
+        return errors # Bail out before committing to the database
    logbookEntry = LogbookEntry(date = date, 
                                       expedition  = expedition,
                                       title = title, cave = cave, place = location,