2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2025-01-19 09:22:32 +00:00

bug fix in logbook parser

This commit is contained in:
Philip Sargent 2020-05-30 20:31:20 +01:00
parent 58c2650162
commit 3264b6edef

View File

@ -1,5 +1,4 @@
#.-*- coding: utf-8 -*-
import csv
import datetime
import os
@ -110,7 +109,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
if not author:
print((" - Skipping logentry: " + title + " - no author for entry"))
print(" * Skipping logentry: " + title + " - no author for entry")
message = "Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
DataIssue.objects.create(parser='logbooks', message=message)
return
@ -153,7 +152,6 @@ def Parselogwikitxt(year, expedition, txt):
trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
for triphead, triptext in trippara:
tripheadp = triphead.split("|")
#print "ttt", tripheadp
assert len(tripheadp) == 3, (tripheadp, triptext)
tripdate, tripplace, trippeople = tripheadp
tripsplace = tripplace.split(" - ")
@ -161,19 +159,14 @@ def Parselogwikitxt(year, expedition, txt):
tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
if tul:
#assert len(tul) <= 1, (triphead, triptext)
#assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
tu = tul[0][0]
else:
tu = ""
#assert tripcave == "Journey", (triphead, triptext)
#print tripdate
ldate = ParseDate(tripdate.strip(), year)
#print "\n", tripcave, "--- ppp", trippeople, len(triptext)
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
# 2002, 2004, 2005, 2007, 2010 - 2018
# 2002, 2004, 2005, 2007, 2010 - now
def Parseloghtmltxt(year, expedition, txt):
#print(" - Starting log html parser")
tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
@ -194,27 +187,20 @@ def Parseloghtmltxt(year, expedition, txt):
if not s:
if not re.search(r"Rigging Guide", trippara):
print(("can't parse: ", trippara)) # this is 2007 which needs editing
#assert s, trippara
continue
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
ldate = ParseDate(tripdate.strip(), year)
#assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
#trippeople = re.sub(r"Ol(?!l)", "Olly", trippeople)
#trippeople = re.sub(r"Wook(?!e)", "Wookey", trippeople)
triptitles = triptitle.split(" - ")
if len(triptitles) >= 2:
tripcave = triptitles[0]
else:
tripcave = "UNKNOWN"
#print("\n", tripcave, "--- ppp", trippeople, len(triptext))
ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip()
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html")
if logbook_entry_count == 0:
print(" - No trip entries found in logbook, check the syntax matches htmltxt format")
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
def Parseloghtml01(year, expedition, txt):
@ -227,9 +213,6 @@ def Parseloghtml01(year, expedition, txt):
tripid = mtripid and mtripid.group(1) or ""
tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
#print " ", [tripheader]
#continue
tripdate, triptitle, trippeople = tripheader.split("|")
ldate = ParseDate(tripdate.strip(), year)
@ -247,19 +230,14 @@ def Parseloghtml01(year, expedition, txt):
mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
if mtail:
#print mtail.group(0)
ltriptext = ltriptext[:mtail.start(0)]
ltriptext = re.sub(r"</p>", "", ltriptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
#ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
ltriptext = re.sub(r"</?u>", "_", ltriptext)
ltriptext = re.sub(r"</?i>", "''", ltriptext)
ltriptext = re.sub(r"</?b>", "'''", ltriptext)
#print ldate, trippeople.strip()
# could include the tripid (url link for cross referencing)
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html")
@ -286,7 +264,6 @@ def Parseloghtml03(year, expedition, txt):
tripcave = triptitles[0]
else:
tripcave = "UNKNOWN"
#print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
@ -316,104 +293,94 @@ def SetDatesFromLogbookEntries(expedition):
def LoadLogbookForExpedition(expedition):
""" Parses all logbook entries for one expedition """
""" Parses all logbook entries for one expedition
"""
global logentries
expowebbase = os.path.join(settings.EXPOWEB, "years")
yearlinks = settings.LOGBOOK_PARSER_SETTINGS
logbook_parseable = False
logbook_cached = False
yearlinks = settings.LOGBOOK_PARSER_SETTINGS
expologbase = os.path.join(settings.EXPOWEB, "years")
if expedition.year in yearlinks:
# print " - Valid logbook year: ", expedition.year
year_settings = yearlinks[expedition.year]
logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0])
parsefunc = yearlinks[expedition.year][1]
else:
logbookfile = os.path.join(expologbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE)
parsefunc = settings.DEFAULT_LOGBOOK_PARSER
cache_filename = logbookfile + ".cache"
try:
bad_cache = False
now = time.time()
cache_t = os.path.getmtime(cache_filename)
if os.path.getmtime(logbookfile) - cache_t > 2: # at least 2 secs later
bad_cache= True
if now - cache_t > 30*24*60*60:
bad_cache= True
if bad_cache:
print(" - ! Cache is either stale or more than 30 days old. Deleting it.")
os.remove(cache_filename)
logentries=[]
print(" ! Removed stale or corrupt cache file")
raise
print(" - Reading cache: " + cache_filename, end='')
try:
bad_cache = False
cache_filename = os.path.join(expowebbase, year_settings[0])+".cache"
now = time.time()
cache_t = os.path.getmtime(cache_filename)
file_t = os.path.getmtime(os.path.join(expowebbase, year_settings[0]))
if file_t - cache_t > 2: # at least 2 secs later
#print " - Cache is stale."
bad_cache= True
if now - cache_t > 30*24*60*60:
#print " - Cache is more than 30 days old."
bad_cache= True
if bad_cache:
print(" - Cache is either stale or more than 30 days old. Deleting it.")
os.remove(cache_filename)
logentries=[]
raise
print((" - Reading cache: " + cache_filename ))
try:
with open(cache_filename, "rb") as f:
logentries = pickle.load(f)
print(" - Loaded ", len(logentries), " objects")
logbook_cached = True
except:
print(" - Failed to load corrupt cache. Deleting it.\n")
os.remove(cache_filename)
logentries=[]
raise
with open(cache_filename, "rb") as f:
logentries = pickle.load(f)
print(" -- Loaded ", len(logentries), " log entries")
logbook_cached = True
except:
print(" - Opening logbook: ")
file_in = open(os.path.join(expowebbase, year_settings[0]),'rb')
print("\n ! Failed to load corrupt cache. Deleting it.\n")
os.remove(cache_filename)
logentries=[]
raise
except : # no cache found
#print(" - No cache \"" + cache_filename +"\"")
try:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
file_in.close()
parsefunc = year_settings[1]
logbook_parseable = True
print((" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1]))
print((" - Using: " + parsefunc + " to parse " + logbookfile))
except (IOError):
logbook_parseable = False
print((" ! Couldn't open logbook " + logbookfile))
if logbook_parseable:
parser = globals()[parsefunc]
parser(expedition.year, expedition, txt)
SetDatesFromLogbookEntries(expedition)
# and this has also stored all the objects in logentries[]
print(" - Storing " , len(logentries), " log entries")
cache_filename = os.path.join(expowebbase, year_settings[0])+".cache"
with open(cache_filename, "wb") as f:
pickle.dump(logentries, f, 2)
logentries=[] # flush for next year
if logbook_parseable:
parser = globals()[parsefunc]
parser(expedition.year, expedition, txt)
SetDatesFromLogbookEntries(expedition)
# and this has also stored all the log entries in logentries[]
if len(logentries) >0:
print(" - Cacheing " , len(logentries), " log entries")
with open(cache_filename, "wb") as fc:
pickle.dump(logentries, fc, 2)
else:
print(" ! NO TRIP entries found in logbook, check the syntax.")
if logbook_cached:
i=0
for entrytuple in range(len(logentries)):
date, place, title, text, trippeople, expedition, logtime_underground, \
entry_type = logentries[i]
#print " - - obj ", i, date, title
EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
entry_type)
i +=1
else:
try:
file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE),'rb')
txt = file_in.read().decode("latin1")
file_in.close()
logbook_parseable = True
print("No set parser found using default")
parsefunc = settings.DEFAULT_LOGBOOK_PARSER
except (IOError):
logbook_parseable = False
print(("Couldn't open default logbook file and nothing in settings for expo " + expedition.year))
logentries=[] # flush for next year
#return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
if logbook_cached:
i=0
for entrytuple in range(len(logentries)):
date, place, title, text, trippeople, expedition, logtime_underground, \
entry_type = logentries[i]
EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
entry_type)
i +=1
def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database. """
# Clear the logbook data issues as we are reloading
""" This is the master function for parsing all logbooks into the Troggle database.
"""
DataIssue.objects.filter(parser='logbooks').delete()
# Fetch all expos
expos = Expedition.objects.all()
nologbook = ["1976", "1977","1978","1979","1980","1980","1981","1983","1984",
"1985","1986","1987","1988","1989","1990",]
for expo in expos:
print(("\nLoading Logbook for: " + expo.year))
# Load logbook for expo
LoadLogbookForExpedition(expo)
if expo.year not in nologbook:
print((" - Logbook for: " + expo.year))
LoadLogbookForExpedition(expo)
dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
@ -437,7 +404,7 @@ def parseAutoLogBookEntry(filename):
year, month, day = [int(x) for x in dateMatch.groups()]
date = datetime.date(year, month, day)
else:
errors.append("Date could not be found")
errors.append(" - Date could not be found")
expeditionYearMatch = expeditionYearRegex.search(contents)
if expeditionYearMatch:
@ -445,17 +412,17 @@ def parseAutoLogBookEntry(filename):
expedition = Expedition.objects.get(year = expeditionYearMatch.groups()[0])
personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
except Expedition.DoesNotExist:
errors.append("Expedition not in database")
errors.append(" - Expedition not in database")
else:
errors.append("Expedition Year could not be parsed")
errors.append(" - Expedition Year could not be parsed")
titleMatch = titleRegex.search(contents)
if titleMatch:
title, = titleMatch.groups()
if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
errors.append("Title too long")
errors.append(" - Title too long")
else:
errors.append("Title could not be found")
errors.append(" - Title could not be found")
caveMatch = caveRegex.search(contents)
if caveMatch:
@ -464,7 +431,7 @@ def parseAutoLogBookEntry(filename):
cave = getCaveByReference(caveRef)
except AssertionError:
cave = None
errors.append("Cave not found in database")
errors.append(" - Cave not found in database")
else:
cave = None
@ -475,13 +442,13 @@ def parseAutoLogBookEntry(filename):
location = None
if cave is None and location is None:
errors.append("Location nor cave could not be found")
errors.append(" - Location nor cave could not be found")
reportMatch = reportRegex.search(contents)
if reportMatch:
report, = reportMatch.groups()
else:
errors.append("Contents could not be found")
errors.append(" - Contents could not be found")
if errors:
return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.
people = []
@ -492,20 +459,20 @@ def parseAutoLogBookEntry(filename):
if name.lower() in personExpeditionNameLookup:
personExpo = personExpeditionNameLookup[name.lower()]
else:
errors.append("Person could not be found in database")
errors.append(" - Person could not be found in database")
author = bool(author)
else:
errors.append("Persons name could not be found")
errors.append(" - Persons name could not be found")
TUMatch = TURegex.search(contents)
if TUMatch:
TU, = TUMatch.groups()
else:
errors.append("TU could not be found")
errors.append(" - TU could not be found")
if not errors:
people.append((name, author, TU))
if errors:
return errors # Bail out before commiting to the database
return errors # Bail out before committing to the database
logbookEntry = LogbookEntry(date = date,
expedition = expedition,
title = title, cave = cave, place = location,