From 3264b6edefd28b07372518dc866d27da8f1e81ea Mon Sep 17 00:00:00 2001
From: Philip Sargent
Date: Sat, 30 May 2020 20:31:20 +0100
Subject: [PATCH] bug fix in logbook parser

---
 parsers/logbooks.py | 199 ++++++++++++++++++--------------------------
 1 file changed, 83 insertions(+), 116 deletions(-)

diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index cfc1a20..ce78e6d 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,5 +1,4 @@
 #.-*- coding: utf-8 -*-
-
 import csv
 import datetime
 import os
@@ -110,7 +109,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
     trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
     if not author:
-        print((" - Skipping logentry: " + title + " - no author for entry"))
+        print(" * Skipping logentry: " + title + " - no author for entry")
         message = "Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
         DataIssue.objects.create(parser='logbooks', message=message)
         return
@@ -153,7 +152,6 @@ def Parselogwikitxt(year, expedition, txt):
     trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
     for triphead, triptext in trippara:
         tripheadp = triphead.split("|")
-        #print "ttt", tripheadp
         assert len(tripheadp) == 3, (tripheadp, triptext)
         tripdate, tripplace, trippeople = tripheadp
         tripsplace = tripplace.split(" - ")
@@ -161,19 +159,14 @@ def Parselogwikitxt(year, expedition, txt):
         tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
         if tul:
-            #assert len(tul) <= 1, (triphead, triptext)
-            #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
             tu = tul[0][0]
         else:
             tu = ""
-        #assert tripcave == "Journey", (triphead, triptext)
-        #print tripdate
         ldate = ParseDate(tripdate.strip(), year)
-        #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
         EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
 
-# 2002, 2004, 2005, 2007, 2010 - 2018
+# 2002, 2004, 2005, 2007, 2010 - now
 def Parseloghtmltxt(year, expedition, txt):
     #print(" - Starting log html parser")
     tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
@@ ... @@ def Parseloghtmltxt(year, expedition, txt):
         if len(triptitles) >= 2:
             tripcave = triptitles[0]
         else:
             tripcave = "UNKNOWN"
-        #print("\n", tripcave, "--- ppp", trippeople, len(triptext))
         ltriptext = re.sub(r"</p>", "", triptext)

", "", triptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"

", "

", ltriptext).strip() EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0, entry_type="html") - if logbook_entry_count == 0: - print(" - No trip entries found in logbook, check the syntax matches htmltxt format") # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it def Parseloghtml01(year, expedition, txt): @@ -227,9 +213,6 @@ def Parseloghtml01(year, expedition, txt): tripid = mtripid and mtripid.group(1) or "" tripheader = re.sub(r"]*>", "", tripheader) - #print " ", [tripheader] - #continue - tripdate, triptitle, trippeople = tripheader.split("|") ldate = ParseDate(tripdate.strip(), year) @@ -247,19 +230,14 @@ def Parseloghtml01(year, expedition, txt): mtail = re.search(r'(?:[^<]*|\s|/|-|&||\((?:same day|\d+)\))*$', ltriptext) if mtail: - #print mtail.group(0) ltriptext = ltriptext[:mtail.start(0)] ltriptext = re.sub(r"

", "", ltriptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"

|
", "\n\n", ltriptext).strip() - #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext) ltriptext = re.sub(r"", "_", ltriptext) ltriptext = re.sub(r"", "''", ltriptext) ltriptext = re.sub(r"", "'''", ltriptext) - - #print ldate, trippeople.strip() - # could includ the tripid (url link for cross referencing) EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0, entry_type="html") @@ -286,7 +264,6 @@ def Parseloghtml03(year, expedition, txt): tripcave = triptitles[0] else: tripcave = "UNKNOWN" - #print tripcave, "--- ppp", triptitle, trippeople, len(triptext) ltriptext = re.sub(r"

", "", triptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"

", "\n\n", ltriptext).strip() @@ -316,104 +293,94 @@ def SetDatesFromLogbookEntries(expedition): def LoadLogbookForExpedition(expedition): - """ Parses all logbook entries for one expedition """ - + """ Parses all logbook entries for one expedition + """ global logentries - - expowebbase = os.path.join(settings.EXPOWEB, "years") - yearlinks = settings.LOGBOOK_PARSER_SETTINGS - logbook_parseable = False logbook_cached = False + yearlinks = settings.LOGBOOK_PARSER_SETTINGS + expologbase = os.path.join(settings.EXPOWEB, "years") if expedition.year in yearlinks: - # print " - Valid logbook year: ", expedition.year - year_settings = yearlinks[expedition.year] + logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0]) + parsefunc = yearlinks[expedition.year][1] + else: + logbookfile = os.path.join(expologbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE) + parsefunc = settings.DEFAULT_LOGBOOK_PARSER + cache_filename = logbookfile + ".cache" + + try: + bad_cache = False + now = time.time() + cache_t = os.path.getmtime(cache_filename) + if os.path.getmtime(logbookfile) - cache_t > 2: # at least 2 secs later + bad_cache= True + if now - cache_t > 30*24*60*60: + bad_cache= True + if bad_cache: + print(" - ! Cache is either stale or more than 30 days old. Deleting it.") + os.remove(cache_filename) + logentries=[] + print(" ! Removed stale or corrupt cache file") + raise + print(" - Reading cache: " + cache_filename, end='') try: - bad_cache = False - cache_filename = os.path.join(expowebbase, year_settings[0])+".cache" - now = time.time() - cache_t = os.path.getmtime(cache_filename) - file_t = os.path.getmtime(os.path.join(expowebbase, year_settings[0])) - if file_t - cache_t > 2: # at least 2 secs later - #print " - Cache is stale." - bad_cache= True - if now - cache_t > 30*24*60*60: - #print " - Cache is more than 30 days old." - bad_cache= True - if bad_cache: - print(" - Cache is either stale or more than 30 days old. Deleting it.") - os.remove(cache_filename) - logentries=[] - raise - print((" - Reading cache: " + cache_filename )) - try: - with open(cache_filename, "rb") as f: - logentries = pickle.load(f) - print(" - Loaded ", len(logentries), " objects") - logbook_cached = True - except: - print(" - Failed to load corrupt cache. Deleting it.\n") - os.remove(cache_filename) - logentries=[] - raise + with open(cache_filename, "rb") as f: + logentries = pickle.load(f) + print(" -- Loaded ", len(logentries), " log entries") + logbook_cached = True except: - print(" - Opening logbook: ") - file_in = open(os.path.join(expowebbase, year_settings[0]),'rb') + print("\n ! Failed to load corrupt cache. Deleting it.\n") + os.remove(cache_filename) + logentries=[] + raise + except : # no cache found + #print(" - No cache \"" + cache_filename +"\"") + try: + file_in = open(logbookfile,'rb') txt = file_in.read().decode("latin1") file_in.close() - parsefunc = year_settings[1] logbook_parseable = True - print((" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1])) + print((" - Using: " + parsefunc + " to parse " + logbookfile)) + except (IOError): + logbook_parseable = False + print((" ! 
-        if logbook_parseable:
-            parser = globals()[parsefunc]
-            parser(expedition.year, expedition, txt)
-            SetDatesFromLogbookEntries(expedition)
-            # and this has also stored all the objects in logentries[]
-            print(" - Storing " , len(logentries), " log entries")
-            cache_filename = os.path.join(expowebbase, year_settings[0])+".cache"
-            with open(cache_filename, "wb") as f:
-                pickle.dump(logentries, f, 2)
-            logentries=[] # flush for next year
+    if logbook_parseable:
+        parser = globals()[parsefunc]
+        parser(expedition.year, expedition, txt)
+        SetDatesFromLogbookEntries(expedition)
+        # and this has also stored all the log entries in logentries[]
+        if len(logentries) >0:
+            print(" - Cacheing " , len(logentries), " log entries")
+            with open(cache_filename, "wb") as fc:
+                pickle.dump(logentries, fc, 2)
+        else:
+            print(" ! NO TRIP entries found in logbook, check the syntax.")
 
-        if logbook_cached:
-            i=0
-            for entrytuple in range(len(logentries)):
-                date, place, title, text, trippeople, expedition, logtime_underground, \
-                    entry_type = logentries[i]
-                #print " - - obj ", i, date, title
-                EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
-                    entry_type)
-                i +=1
-        else:
-            try:
-                file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE),'rb')
-                txt = file_in.read().decode("latin1")
-                file_in.close()
-                logbook_parseable = True
-                print("No set parser found using default")
-                parsefunc = settings.DEFAULT_LOGBOOK_PARSER
-            except (IOError):
-                logbook_parseable = False
-                print(("Couldn't open default logbook file and nothing in settings for expo " + expedition.year))
+    logentries=[] # flush for next year
-
-    #return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
+    if logbook_cached:
+        i=0
+        for entrytuple in range(len(logentries)):
+            date, place, title, text, trippeople, expedition, logtime_underground, \
+                entry_type = logentries[i]
+            EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
+                entry_type)
+            i +=1
 
 
 def LoadLogbooks():
-    """ This is the master function for parsing all logbooks into the Troggle database. """
-
-    # Clear the logbook data issues as we are reloading
+    """ This is the master function for parsing all logbooks into the Troggle database.
+ """ DataIssue.objects.filter(parser='logbooks').delete() - # Fetch all expos expos = Expedition.objects.all() + nologbook = ["1976", "1977","1978","1979","1980","1980","1981","1983","1984", + "1985","1986","1987","1988","1989","1990",] for expo in expos: - print(("\nLoading Logbook for: " + expo.year)) - - # Load logbook for expo - LoadLogbookForExpedition(expo) + if expo.year not in nologbook: + print((" - Logbook for: " + expo.year)) + LoadLogbookForExpedition(expo) dateRegex = re.compile(r'(\d\d\d\d)-(\d\d)-(\d\d)', re.S) @@ -437,7 +404,7 @@ def parseAutoLogBookEntry(filename): year, month, day = [int(x) for x in dateMatch.groups()] date = datetime.date(year, month, day) else: - errors.append("Date could not be found") + errors.append(" - Date could not be found") expeditionYearMatch = expeditionYearRegex.search(contents) if expeditionYearMatch: @@ -445,17 +412,17 @@ def parseAutoLogBookEntry(filename): expedition = Expedition.objects.get(year = expeditionYearMatch.groups()[0]) personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition) except Expedition.DoesNotExist: - errors.append("Expedition not in database") + errors.append(" - Expedition not in database") else: - errors.append("Expedition Year could not be parsed") + errors.append(" - Expedition Year could not be parsed") titleMatch = titleRegex.search(contents) if titleMatch: title, = titleMatch.groups() if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH: - errors.append("Title too long") + errors.append(" - Title too long") else: - errors.append("Title could not be found") + errors.append(" - Title could not be found") caveMatch = caveRegex.search(contents) if caveMatch: @@ -464,7 +431,7 @@ def parseAutoLogBookEntry(filename): cave = getCaveByReference(caveRef) except AssertionError: cave = None - errors.append("Cave not found in database") + errors.append(" - Cave not found in database") else: cave = None @@ -475,13 +442,13 @@ def parseAutoLogBookEntry(filename): location = None if cave is None and location is None: - errors.append("Location nor cave could not be found") + errors.append(" - Location nor cave could not be found") reportMatch = reportRegex.search(contents) if reportMatch: report, = reportMatch.groups() else: - errors.append("Contents could not be found") + errors.append(" - Contents could not be found") if errors: return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from. people = [] @@ -492,20 +459,20 @@ def parseAutoLogBookEntry(filename): if name.lower() in personExpeditionNameLookup: personExpo = personExpeditionNameLookup[name.lower()] else: - errors.append("Person could not be found in database") + errors.append(" - Person could not be found in database") author = bool(author) else: - errors.append("Persons name could not be found") + errors.append(" - Persons name could not be found") TUMatch = TURegex.search(contents) if TUMatch: TU, = TUMatch.groups() else: - errors.append("TU could not be found") + errors.append(" - TU could not be found") if not errors: people.append((name, author, TU)) if errors: - return errors # Bail out before commiting to the database + return errors # Bail out before committing to the database logbookEntry = LogbookEntry(date = date, expedition = expedition, title = title, cave = cave, place = location,