bug fix in logbook parser

This commit is contained in:
Philip Sargent 2020-05-30 20:31:20 +01:00
parent 58c2650162
commit 3264b6edef

View File

@ -1,5 +1,4 @@
#.-*- coding: utf-8 -*- #.-*- coding: utf-8 -*-
import csv import csv
import datetime import datetime
import os import os
@ -110,7 +109,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground) trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
if not author: if not author:
print((" - Skipping logentry: " + title + " - no author for entry")) print(" * Skipping logentry: " + title + " - no author for entry")
message = "Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year) message = "Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
DataIssue.objects.create(parser='logbooks', message=message) DataIssue.objects.create(parser='logbooks', message=message)
return return
@ -153,7 +152,6 @@ def Parselogwikitxt(year, expedition, txt):
trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt) trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
for triphead, triptext in trippara: for triphead, triptext in trippara:
tripheadp = triphead.split("|") tripheadp = triphead.split("|")
#print "ttt", tripheadp
assert len(tripheadp) == 3, (tripheadp, triptext) assert len(tripheadp) == 3, (tripheadp, triptext)
tripdate, tripplace, trippeople = tripheadp tripdate, tripplace, trippeople = tripheadp
tripsplace = tripplace.split(" - ") tripsplace = tripplace.split(" - ")
@ -161,19 +159,14 @@ def Parselogwikitxt(year, expedition, txt):
tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext) tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
if tul: if tul:
#assert len(tul) <= 1, (triphead, triptext)
#assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
tu = tul[0][0] tu = tul[0][0]
else: else:
tu = "" tu = ""
#assert tripcave == "Journey", (triphead, triptext)
#print tripdate
ldate = ParseDate(tripdate.strip(), year) ldate = ParseDate(tripdate.strip(), year)
#print "\n", tripcave, "--- ppp", trippeople, len(triptext)
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0) EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
# 2002, 2004, 2005, 2007, 2010 - 2018 # 2002, 2004, 2005, 2007, 2010 - now
def Parseloghtmltxt(year, expedition, txt): def Parseloghtmltxt(year, expedition, txt):
#print(" - Starting log html parser") #print(" - Starting log html parser")
tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt) tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
@ -194,27 +187,20 @@ def Parseloghtmltxt(year, expedition, txt):
if not s: if not s:
if not re.search(r"Rigging Guide", trippara): if not re.search(r"Rigging Guide", trippara):
print(("can't parse: ", trippara)) # this is 2007 which needs editing print(("can't parse: ", trippara)) # this is 2007 which needs editing
#assert s, trippara
continue continue
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups() tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
ldate = ParseDate(tripdate.strip(), year) ldate = ParseDate(tripdate.strip(), year)
#assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
#trippeople = re.sub(r"Ol(?!l)", "Olly", trippeople)
#trippeople = re.sub(r"Wook(?!e)", "Wookey", trippeople)
triptitles = triptitle.split(" - ") triptitles = triptitle.split(" - ")
if len(triptitles) >= 2: if len(triptitles) >= 2:
tripcave = triptitles[0] tripcave = triptitles[0]
else: else:
tripcave = "UNKNOWN" tripcave = "UNKNOWN"
#print("\n", tripcave, "--- ppp", trippeople, len(triptext))
ltriptext = re.sub(r"</p>", "", triptext) ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip() ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip()
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0, trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html") entry_type="html")
if logbook_entry_count == 0:
print(" - No trip entries found in logbook, check the syntax matches htmltxt format")
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
def Parseloghtml01(year, expedition, txt): def Parseloghtml01(year, expedition, txt):
@ -227,9 +213,6 @@ def Parseloghtml01(year, expedition, txt):
tripid = mtripid and mtripid.group(1) or "" tripid = mtripid and mtripid.group(1) or ""
tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader) tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
#print " ", [tripheader]
#continue
tripdate, triptitle, trippeople = tripheader.split("|") tripdate, triptitle, trippeople = tripheader.split("|")
ldate = ParseDate(tripdate.strip(), year) ldate = ParseDate(tripdate.strip(), year)
@ -247,19 +230,14 @@ def Parseloghtml01(year, expedition, txt):
mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext) mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
if mtail: if mtail:
#print mtail.group(0)
ltriptext = ltriptext[:mtail.start(0)] ltriptext = ltriptext[:mtail.start(0)]
ltriptext = re.sub(r"</p>", "", ltriptext) ltriptext = re.sub(r"</p>", "", ltriptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip() ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
#ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
ltriptext = re.sub(r"</?u>", "_", ltriptext) ltriptext = re.sub(r"</?u>", "_", ltriptext)
ltriptext = re.sub(r"</?i>", "''", ltriptext) ltriptext = re.sub(r"</?i>", "''", ltriptext)
ltriptext = re.sub(r"</?b>", "'''", ltriptext) ltriptext = re.sub(r"</?b>", "'''", ltriptext)
#print ldate, trippeople.strip()
# could include the tripid (url link for cross referencing)
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0, trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html") entry_type="html")
@ -286,7 +264,6 @@ def Parseloghtml03(year, expedition, txt):
tripcave = triptitles[0] tripcave = triptitles[0]
else: else:
tripcave = "UNKNOWN" tripcave = "UNKNOWN"
#print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
ltriptext = re.sub(r"</p>", "", triptext) ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip() ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
@ -316,65 +293,71 @@ def SetDatesFromLogbookEntries(expedition):
def LoadLogbookForExpedition(expedition): def LoadLogbookForExpedition(expedition):
""" Parses all logbook entries for one expedition """ """ Parses all logbook entries for one expedition
"""
global logentries global logentries
expowebbase = os.path.join(settings.EXPOWEB, "years")
yearlinks = settings.LOGBOOK_PARSER_SETTINGS
logbook_parseable = False logbook_parseable = False
logbook_cached = False logbook_cached = False
yearlinks = settings.LOGBOOK_PARSER_SETTINGS
expologbase = os.path.join(settings.EXPOWEB, "years")
if expedition.year in yearlinks: if expedition.year in yearlinks:
# print " - Valid logbook year: ", expedition.year logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0])
year_settings = yearlinks[expedition.year] parsefunc = yearlinks[expedition.year][1]
else:
logbookfile = os.path.join(expologbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE)
parsefunc = settings.DEFAULT_LOGBOOK_PARSER
cache_filename = logbookfile + ".cache"
try: try:
bad_cache = False bad_cache = False
cache_filename = os.path.join(expowebbase, year_settings[0])+".cache"
now = time.time() now = time.time()
cache_t = os.path.getmtime(cache_filename) cache_t = os.path.getmtime(cache_filename)
file_t = os.path.getmtime(os.path.join(expowebbase, year_settings[0])) if os.path.getmtime(logbookfile) - cache_t > 2: # at least 2 secs later
if file_t - cache_t > 2: # at least 2 secs later
#print " - Cache is stale."
bad_cache= True bad_cache= True
if now - cache_t > 30*24*60*60: if now - cache_t > 30*24*60*60:
#print " - Cache is more than 30 days old."
bad_cache= True bad_cache= True
if bad_cache: if bad_cache:
print(" - Cache is either stale or more than 30 days old. Deleting it.") print(" - ! Cache is either stale or more than 30 days old. Deleting it.")
os.remove(cache_filename) os.remove(cache_filename)
logentries=[] logentries=[]
print(" ! Removed stale or corrupt cache file")
raise raise
print((" - Reading cache: " + cache_filename )) print(" - Reading cache: " + cache_filename, end='')
try: try:
with open(cache_filename, "rb") as f: with open(cache_filename, "rb") as f:
logentries = pickle.load(f) logentries = pickle.load(f)
print(" - Loaded ", len(logentries), " objects") print(" -- Loaded ", len(logentries), " log entries")
logbook_cached = True logbook_cached = True
except: except:
print(" - Failed to load corrupt cache. Deleting it.\n") print("\n ! Failed to load corrupt cache. Deleting it.\n")
os.remove(cache_filename) os.remove(cache_filename)
logentries=[] logentries=[]
raise raise
except: except : # no cache found
print(" - Opening logbook: ") #print(" - No cache \"" + cache_filename +"\"")
file_in = open(os.path.join(expowebbase, year_settings[0]),'rb') try:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1") txt = file_in.read().decode("latin1")
file_in.close() file_in.close()
parsefunc = year_settings[1]
logbook_parseable = True logbook_parseable = True
print((" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1])) print((" - Using: " + parsefunc + " to parse " + logbookfile))
except (IOError):
logbook_parseable = False
print((" ! Couldn't open logbook " + logbookfile))
if logbook_parseable: if logbook_parseable:
parser = globals()[parsefunc] parser = globals()[parsefunc]
parser(expedition.year, expedition, txt) parser(expedition.year, expedition, txt)
SetDatesFromLogbookEntries(expedition) SetDatesFromLogbookEntries(expedition)
# and this has also stored all the objects in logentries[] # and this has also stored all the log entries in logentries[]
print(" - Storing " , len(logentries), " log entries") if len(logentries) >0:
cache_filename = os.path.join(expowebbase, year_settings[0])+".cache" print(" - Cacheing " , len(logentries), " log entries")
with open(cache_filename, "wb") as f: with open(cache_filename, "wb") as fc:
pickle.dump(logentries, f, 2) pickle.dump(logentries, fc, 2)
else:
print(" ! NO TRIP entries found in logbook, check the syntax.")
logentries=[] # flush for next year logentries=[] # flush for next year
if logbook_cached: if logbook_cached:
@ -382,37 +365,21 @@ def LoadLogbookForExpedition(expedition):
for entrytuple in range(len(logentries)): for entrytuple in range(len(logentries)):
date, place, title, text, trippeople, expedition, logtime_underground, \ date, place, title, text, trippeople, expedition, logtime_underground, \
entry_type = logentries[i] entry_type = logentries[i]
#print " - - obj ", i, date, title
EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\ EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
entry_type) entry_type)
i +=1 i +=1
else:
try:
file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE),'rb')
txt = file_in.read().decode("latin1")
file_in.close()
logbook_parseable = True
print("No set parser found using default")
parsefunc = settings.DEFAULT_LOGBOOK_PARSER
except (IOError):
logbook_parseable = False
print(("Couldn't open default logbook file and nothing in settings for expo " + expedition.year))
#return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
def LoadLogbooks(): def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database. """ """ This is the master function for parsing all logbooks into the Troggle database.
"""
# Clear the logbook data issues as we are reloading
DataIssue.objects.filter(parser='logbooks').delete() DataIssue.objects.filter(parser='logbooks').delete()
# Fetch all expos
expos = Expedition.objects.all() expos = Expedition.objects.all()
nologbook = ["1976", "1977","1978","1979","1980","1980","1981","1983","1984",
"1985","1986","1987","1988","1989","1990",]
for expo in expos: for expo in expos:
print(("\nLoading Logbook for: " + expo.year)) if expo.year not in nologbook:
print((" - Logbook for: " + expo.year))
# Load logbook for expo
LoadLogbookForExpedition(expo) LoadLogbookForExpedition(expo)
@ -437,7 +404,7 @@ def parseAutoLogBookEntry(filename):
year, month, day = [int(x) for x in dateMatch.groups()] year, month, day = [int(x) for x in dateMatch.groups()]
date = datetime.date(year, month, day) date = datetime.date(year, month, day)
else: else:
errors.append("Date could not be found") errors.append(" - Date could not be found")
expeditionYearMatch = expeditionYearRegex.search(contents) expeditionYearMatch = expeditionYearRegex.search(contents)
if expeditionYearMatch: if expeditionYearMatch:
@ -445,17 +412,17 @@ def parseAutoLogBookEntry(filename):
expedition = Expedition.objects.get(year = expeditionYearMatch.groups()[0]) expedition = Expedition.objects.get(year = expeditionYearMatch.groups()[0])
personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition) personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
except Expedition.DoesNotExist: except Expedition.DoesNotExist:
errors.append("Expedition not in database") errors.append(" - Expedition not in database")
else: else:
errors.append("Expedition Year could not be parsed") errors.append(" - Expedition Year could not be parsed")
titleMatch = titleRegex.search(contents) titleMatch = titleRegex.search(contents)
if titleMatch: if titleMatch:
title, = titleMatch.groups() title, = titleMatch.groups()
if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH: if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
errors.append("Title too long") errors.append(" - Title too long")
else: else:
errors.append("Title could not be found") errors.append(" - Title could not be found")
caveMatch = caveRegex.search(contents) caveMatch = caveRegex.search(contents)
if caveMatch: if caveMatch:
@ -464,7 +431,7 @@ def parseAutoLogBookEntry(filename):
cave = getCaveByReference(caveRef) cave = getCaveByReference(caveRef)
except AssertionError: except AssertionError:
cave = None cave = None
errors.append("Cave not found in database") errors.append(" - Cave not found in database")
else: else:
cave = None cave = None
@ -475,13 +442,13 @@ def parseAutoLogBookEntry(filename):
location = None location = None
if cave is None and location is None: if cave is None and location is None:
errors.append("Location nor cave could not be found") errors.append(" - Location nor cave could not be found")
reportMatch = reportRegex.search(contents) reportMatch = reportRegex.search(contents)
if reportMatch: if reportMatch:
report, = reportMatch.groups() report, = reportMatch.groups()
else: else:
errors.append("Contents could not be found") errors.append(" - Contents could not be found")
if errors: if errors:
return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from. return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.
people = [] people = []
@ -492,20 +459,20 @@ def parseAutoLogBookEntry(filename):
if name.lower() in personExpeditionNameLookup: if name.lower() in personExpeditionNameLookup:
personExpo = personExpeditionNameLookup[name.lower()] personExpo = personExpeditionNameLookup[name.lower()]
else: else:
errors.append("Person could not be found in database") errors.append(" - Person could not be found in database")
author = bool(author) author = bool(author)
else: else:
errors.append("Persons name could not be found") errors.append(" - Persons name could not be found")
TUMatch = TURegex.search(contents) TUMatch = TURegex.search(contents)
if TUMatch: if TUMatch:
TU, = TUMatch.groups() TU, = TUMatch.groups()
else: else:
errors.append("TU could not be found") errors.append(" - TU could not be found")
if not errors: if not errors:
people.append((name, author, TU)) people.append((name, author, TU))
if errors: if errors:
return errors # Bail out before commiting to the database return errors # Bail out before committing to the database
logbookEntry = LogbookEntry(date = date, logbookEntry = LogbookEntry(date = date,
expedition = expedition, expedition = expedition,
title = title, cave = cave, place = location, title = title, cave = cave, place = location,