mirror of
https://expo.survex.com/repositories/troggle/.git
synced 2025-01-19 09:22:32 +00:00
bug fix in logbook parser
This commit is contained in:
parent
58c2650162
commit
3264b6edef
@ -1,5 +1,4 @@
|
||||
#.-*- coding: utf-8 -*-
|
||||
|
||||
import csv
|
||||
import datetime
|
||||
import os
|
||||
@ -110,7 +109,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
|
||||
|
||||
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
|
||||
if not author:
|
||||
print((" - Skipping logentry: " + title + " - no author for entry"))
|
||||
print(" * Skipping logentry: " + title + " - no author for entry")
|
||||
message = "Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
|
||||
DataIssue.objects.create(parser='logbooks', message=message)
|
||||
return
|
||||
@ -153,7 +152,6 @@ def Parselogwikitxt(year, expedition, txt):
|
||||
trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
|
||||
for triphead, triptext in trippara:
|
||||
tripheadp = triphead.split("|")
|
||||
#print "ttt", tripheadp
|
||||
assert len(tripheadp) == 3, (tripheadp, triptext)
|
||||
tripdate, tripplace, trippeople = tripheadp
|
||||
tripsplace = tripplace.split(" - ")
|
||||
@ -161,19 +159,14 @@ def Parselogwikitxt(year, expedition, txt):
|
||||
|
||||
tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
|
||||
if tul:
|
||||
#assert len(tul) <= 1, (triphead, triptext)
|
||||
#assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
|
||||
tu = tul[0][0]
|
||||
else:
|
||||
tu = ""
|
||||
#assert tripcave == "Journey", (triphead, triptext)
|
||||
|
||||
#print tripdate
|
||||
ldate = ParseDate(tripdate.strip(), year)
|
||||
#print "\n", tripcave, "--- ppp", trippeople, len(triptext)
|
||||
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
|
||||
|
||||
# 2002, 2004, 2005, 2007, 2010 - 2018
|
||||
# 2002, 2004, 2005, 2007, 2010 - now
|
||||
def Parseloghtmltxt(year, expedition, txt):
|
||||
#print(" - Starting log html parser")
|
||||
tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
|
||||
@ -194,27 +187,20 @@ def Parseloghtmltxt(year, expedition, txt):
|
||||
if not s:
|
||||
if not re.search(r"Rigging Guide", trippara):
|
||||
print(("can't parse: ", trippara)) # this is 2007 which needs editing
|
||||
#assert s, trippara
|
||||
continue
|
||||
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
|
||||
ldate = ParseDate(tripdate.strip(), year)
|
||||
#assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
|
||||
#trippeople = re.sub(r"Ol(?!l)", "Olly", trippeople)
|
||||
#trippeople = re.sub(r"Wook(?!e)", "Wookey", trippeople)
|
||||
triptitles = triptitle.split(" - ")
|
||||
if len(triptitles) >= 2:
|
||||
tripcave = triptitles[0]
|
||||
else:
|
||||
tripcave = "UNKNOWN"
|
||||
#print("\n", tripcave, "--- ppp", trippeople, len(triptext))
|
||||
ltriptext = re.sub(r"</p>", "", triptext)
|
||||
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
|
||||
ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip()
|
||||
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
|
||||
trippeople=trippeople, expedition=expedition, logtime_underground=0,
|
||||
entry_type="html")
|
||||
if logbook_entry_count == 0:
|
||||
print(" - No trip entries found in logbook, check the syntax matches htmltxt format")
|
||||
|
||||
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
|
||||
def Parseloghtml01(year, expedition, txt):
|
||||
@ -227,9 +213,6 @@ def Parseloghtml01(year, expedition, txt):
|
||||
tripid = mtripid and mtripid.group(1) or ""
|
||||
tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
|
||||
|
||||
#print " ", [tripheader]
|
||||
#continue
|
||||
|
||||
tripdate, triptitle, trippeople = tripheader.split("|")
|
||||
ldate = ParseDate(tripdate.strip(), year)
|
||||
|
||||
@ -247,19 +230,14 @@ def Parseloghtml01(year, expedition, txt):
|
||||
|
||||
mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext)
|
||||
if mtail:
|
||||
#print mtail.group(0)
|
||||
ltriptext = ltriptext[:mtail.start(0)]
|
||||
ltriptext = re.sub(r"</p>", "", ltriptext)
|
||||
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
|
||||
ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
|
||||
#ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
|
||||
ltriptext = re.sub(r"</?u>", "_", ltriptext)
|
||||
ltriptext = re.sub(r"</?i>", "''", ltriptext)
|
||||
ltriptext = re.sub(r"</?b>", "'''", ltriptext)
|
||||
|
||||
|
||||
#print ldate, trippeople.strip()
|
||||
# could include the tripid (url link for cross referencing)
|
||||
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
|
||||
trippeople=trippeople, expedition=expedition, logtime_underground=0,
|
||||
entry_type="html")
|
||||
@ -286,7 +264,6 @@ def Parseloghtml03(year, expedition, txt):
|
||||
tripcave = triptitles[0]
|
||||
else:
|
||||
tripcave = "UNKNOWN"
|
||||
#print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
|
||||
ltriptext = re.sub(r"</p>", "", triptext)
|
||||
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
|
||||
ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
|
||||
@ -316,104 +293,94 @@ def SetDatesFromLogbookEntries(expedition):
|
||||
|
||||
|
||||
def LoadLogbookForExpedition(expedition):
|
||||
""" Parses all logbook entries for one expedition """
|
||||
|
||||
""" Parses all logbook entries for one expedition
|
||||
"""
|
||||
global logentries
|
||||
|
||||
expowebbase = os.path.join(settings.EXPOWEB, "years")
|
||||
yearlinks = settings.LOGBOOK_PARSER_SETTINGS
|
||||
|
||||
logbook_parseable = False
|
||||
logbook_cached = False
|
||||
yearlinks = settings.LOGBOOK_PARSER_SETTINGS
|
||||
expologbase = os.path.join(settings.EXPOWEB, "years")
|
||||
|
||||
if expedition.year in yearlinks:
|
||||
# print " - Valid logbook year: ", expedition.year
|
||||
year_settings = yearlinks[expedition.year]
|
||||
logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0])
|
||||
parsefunc = yearlinks[expedition.year][1]
|
||||
else:
|
||||
logbookfile = os.path.join(expologbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE)
|
||||
parsefunc = settings.DEFAULT_LOGBOOK_PARSER
|
||||
cache_filename = logbookfile + ".cache"
|
||||
|
||||
try:
|
||||
bad_cache = False
|
||||
now = time.time()
|
||||
cache_t = os.path.getmtime(cache_filename)
|
||||
if os.path.getmtime(logbookfile) - cache_t > 2: # at least 2 secs later
|
||||
bad_cache= True
|
||||
if now - cache_t > 30*24*60*60:
|
||||
bad_cache= True
|
||||
if bad_cache:
|
||||
print(" - ! Cache is either stale or more than 30 days old. Deleting it.")
|
||||
os.remove(cache_filename)
|
||||
logentries=[]
|
||||
print(" ! Removed stale or corrupt cache file")
|
||||
raise
|
||||
print(" - Reading cache: " + cache_filename, end='')
|
||||
try:
|
||||
bad_cache = False
|
||||
cache_filename = os.path.join(expowebbase, year_settings[0])+".cache"
|
||||
now = time.time()
|
||||
cache_t = os.path.getmtime(cache_filename)
|
||||
file_t = os.path.getmtime(os.path.join(expowebbase, year_settings[0]))
|
||||
if file_t - cache_t > 2: # at least 2 secs later
|
||||
#print " - Cache is stale."
|
||||
bad_cache= True
|
||||
if now - cache_t > 30*24*60*60:
|
||||
#print " - Cache is more than 30 days old."
|
||||
bad_cache= True
|
||||
if bad_cache:
|
||||
print(" - Cache is either stale or more than 30 days old. Deleting it.")
|
||||
os.remove(cache_filename)
|
||||
logentries=[]
|
||||
raise
|
||||
print((" - Reading cache: " + cache_filename ))
|
||||
try:
|
||||
with open(cache_filename, "rb") as f:
|
||||
logentries = pickle.load(f)
|
||||
print(" - Loaded ", len(logentries), " objects")
|
||||
logbook_cached = True
|
||||
except:
|
||||
print(" - Failed to load corrupt cache. Deleting it.\n")
|
||||
os.remove(cache_filename)
|
||||
logentries=[]
|
||||
raise
|
||||
with open(cache_filename, "rb") as f:
|
||||
logentries = pickle.load(f)
|
||||
print(" -- Loaded ", len(logentries), " log entries")
|
||||
logbook_cached = True
|
||||
except:
|
||||
print(" - Opening logbook: ")
|
||||
file_in = open(os.path.join(expowebbase, year_settings[0]),'rb')
|
||||
print("\n ! Failed to load corrupt cache. Deleting it.\n")
|
||||
os.remove(cache_filename)
|
||||
logentries=[]
|
||||
raise
|
||||
except : # no cache found
|
||||
#print(" - No cache \"" + cache_filename +"\"")
|
||||
try:
|
||||
file_in = open(logbookfile,'rb')
|
||||
txt = file_in.read().decode("latin1")
|
||||
file_in.close()
|
||||
parsefunc = year_settings[1]
|
||||
logbook_parseable = True
|
||||
print((" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1]))
|
||||
print((" - Using: " + parsefunc + " to parse " + logbookfile))
|
||||
except (IOError):
|
||||
logbook_parseable = False
|
||||
print((" ! Couldn't open logbook " + logbookfile))
|
||||
|
||||
if logbook_parseable:
|
||||
parser = globals()[parsefunc]
|
||||
parser(expedition.year, expedition, txt)
|
||||
SetDatesFromLogbookEntries(expedition)
|
||||
# and this has also stored all the objects in logentries[]
|
||||
print(" - Storing " , len(logentries), " log entries")
|
||||
cache_filename = os.path.join(expowebbase, year_settings[0])+".cache"
|
||||
with open(cache_filename, "wb") as f:
|
||||
pickle.dump(logentries, f, 2)
|
||||
logentries=[] # flush for next year
|
||||
if logbook_parseable:
|
||||
parser = globals()[parsefunc]
|
||||
parser(expedition.year, expedition, txt)
|
||||
SetDatesFromLogbookEntries(expedition)
|
||||
# and this has also stored all the log entries in logentries[]
|
||||
if len(logentries) >0:
|
||||
print(" - Cacheing " , len(logentries), " log entries")
|
||||
with open(cache_filename, "wb") as fc:
|
||||
pickle.dump(logentries, fc, 2)
|
||||
else:
|
||||
print(" ! NO TRIP entries found in logbook, check the syntax.")
|
||||
|
||||
if logbook_cached:
|
||||
i=0
|
||||
for entrytuple in range(len(logentries)):
|
||||
date, place, title, text, trippeople, expedition, logtime_underground, \
|
||||
entry_type = logentries[i]
|
||||
#print " - - obj ", i, date, title
|
||||
EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
|
||||
entry_type)
|
||||
i +=1
|
||||
else:
|
||||
try:
|
||||
file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE),'rb')
|
||||
txt = file_in.read().decode("latin1")
|
||||
file_in.close()
|
||||
logbook_parseable = True
|
||||
print("No set parser found using default")
|
||||
parsefunc = settings.DEFAULT_LOGBOOK_PARSER
|
||||
except (IOError):
|
||||
logbook_parseable = False
|
||||
print(("Couldn't open default logbook file and nothing in settings for expo " + expedition.year))
|
||||
logentries=[] # flush for next year
|
||||
|
||||
|
||||
#return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
|
||||
if logbook_cached:
|
||||
i=0
|
||||
for entrytuple in range(len(logentries)):
|
||||
date, place, title, text, trippeople, expedition, logtime_underground, \
|
||||
entry_type = logentries[i]
|
||||
EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
|
||||
entry_type)
|
||||
i +=1
|
||||
|
||||
|
||||
def LoadLogbooks():
|
||||
""" This is the master function for parsing all logbooks into the Troggle database. """
|
||||
|
||||
# Clear the logbook data issues as we are reloading
|
||||
""" This is the master function for parsing all logbooks into the Troggle database.
|
||||
"""
|
||||
DataIssue.objects.filter(parser='logbooks').delete()
|
||||
# Fetch all expos
|
||||
expos = Expedition.objects.all()
|
||||
nologbook = ["1976", "1977","1978","1979","1980","1980","1981","1983","1984",
|
||||
"1985","1986","1987","1988","1989","1990",]
|
||||
for expo in expos:
|
||||
print(("\nLoading Logbook for: " + expo.year))
|
||||
|
||||
# Load logbook for expo
|
||||
LoadLogbookForExpedition(expo)
|
||||
if expo.year not in nologbook:
|
||||
print((" - Logbook for: " + expo.year))
|
||||
LoadLogbookForExpedition(expo)
|
||||
|
||||
|
||||
dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
|
||||
@ -437,7 +404,7 @@ def parseAutoLogBookEntry(filename):
|
||||
year, month, day = [int(x) for x in dateMatch.groups()]
|
||||
date = datetime.date(year, month, day)
|
||||
else:
|
||||
errors.append("Date could not be found")
|
||||
errors.append(" - Date could not be found")
|
||||
|
||||
expeditionYearMatch = expeditionYearRegex.search(contents)
|
||||
if expeditionYearMatch:
|
||||
@ -445,17 +412,17 @@ def parseAutoLogBookEntry(filename):
|
||||
expedition = Expedition.objects.get(year = expeditionYearMatch.groups()[0])
|
||||
personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
|
||||
except Expedition.DoesNotExist:
|
||||
errors.append("Expedition not in database")
|
||||
errors.append(" - Expedition not in database")
|
||||
else:
|
||||
errors.append("Expedition Year could not be parsed")
|
||||
errors.append(" - Expedition Year could not be parsed")
|
||||
|
||||
titleMatch = titleRegex.search(contents)
|
||||
if titleMatch:
|
||||
title, = titleMatch.groups()
|
||||
if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
|
||||
errors.append("Title too long")
|
||||
errors.append(" - Title too long")
|
||||
else:
|
||||
errors.append("Title could not be found")
|
||||
errors.append(" - Title could not be found")
|
||||
|
||||
caveMatch = caveRegex.search(contents)
|
||||
if caveMatch:
|
||||
@ -464,7 +431,7 @@ def parseAutoLogBookEntry(filename):
|
||||
cave = getCaveByReference(caveRef)
|
||||
except AssertionError:
|
||||
cave = None
|
||||
errors.append("Cave not found in database")
|
||||
errors.append(" - Cave not found in database")
|
||||
else:
|
||||
cave = None
|
||||
|
||||
@ -475,13 +442,13 @@ def parseAutoLogBookEntry(filename):
|
||||
location = None
|
||||
|
||||
if cave is None and location is None:
|
||||
errors.append("Location nor cave could not be found")
|
||||
errors.append(" - Location nor cave could not be found")
|
||||
|
||||
reportMatch = reportRegex.search(contents)
|
||||
if reportMatch:
|
||||
report, = reportMatch.groups()
|
||||
else:
|
||||
errors.append("Contents could not be found")
|
||||
errors.append(" - Contents could not be found")
|
||||
if errors:
|
||||
return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.
|
||||
people = []
|
||||
@ -492,20 +459,20 @@ def parseAutoLogBookEntry(filename):
|
||||
if name.lower() in personExpeditionNameLookup:
|
||||
personExpo = personExpeditionNameLookup[name.lower()]
|
||||
else:
|
||||
errors.append("Person could not be found in database")
|
||||
errors.append(" - Person could not be found in database")
|
||||
author = bool(author)
|
||||
else:
|
||||
errors.append("Persons name could not be found")
|
||||
errors.append(" - Persons name could not be found")
|
||||
|
||||
TUMatch = TURegex.search(contents)
|
||||
if TUMatch:
|
||||
TU, = TUMatch.groups()
|
||||
else:
|
||||
errors.append("TU could not be found")
|
||||
errors.append(" - TU could not be found")
|
||||
if not errors:
|
||||
people.append((name, author, TU))
|
||||
if errors:
|
||||
return errors # Bail out before commiting to the database
|
||||
return errors # Bail out before committing to the database
|
||||
logbookEntry = LogbookEntry(date = date,
|
||||
expedition = expedition,
|
||||
title = title, cave = cave, place = location,
|
||||
|
Loading…
Reference in New Issue
Block a user