2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2025-01-19 01:12:32 +00:00

more robust logbooks parsing

This commit is contained in:
Philip Sargent 2021-04-24 01:23:55 +01:00
parent b979bdb560
commit 8128870d57
2 changed files with 55 additions and 25 deletions

View File

@ -176,13 +176,17 @@ def personexpedition(request, first_name='', last_name='', year=''):
def logbookentry(request, date, slug):
this_logbookentry = LogbookEntry.objects.filter(date=date, slug=slug)
if len(this_logbookentry)>1:
return render(request, 'object_list.html',{'object_list':this_logbookentry})
if this_logbookentry:
if len(this_logbookentry)>1:
return render(request, 'object_list.html',{'object_list':this_logbookentry})
else:
this_logbookentry=this_logbookentry[0]
return render(request, 'logbookentry.html', {'logbookentry': this_logbookentry})
else:
this_logbookentry=this_logbookentry[0]
return render(request, 'logbookentry.html', {'logbookentry': this_logbookentry})
msg =(f' Logbook entry slug:"{slug}" not found in database on date:"{date}" ')
print(msg)
return render(request, 'errors/generic.html',{'message':msg})
def logbookSearch(request, extra):
query_string = ''

View File

@ -71,13 +71,18 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople):
tripperson = tripperson.strip()
if not tid:
tid = expedition.year + "." + tripperson + datetime.now().strftime("%S%f") # no good. Should be getting the tid
mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson)
if mul:
tripperson = mul.group(1).strip()
if tripperson and tripperson[0] != '*':
tripperson = re.sub(round_bracket_regex, "", tripperson).strip()
if tripperson =="Wiggy":
tripperson = "Phil Wigglesworth"
if tripperson =="Animal":
tripperson = "Mike Richardson"
personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
if not personyear:
message = f" ! - {expedition.year} No name match for: '{tripperson}' "
@ -91,6 +96,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
if not res:
return None, None
author = res[-1][0]
return res, author
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None):
@ -107,11 +113,11 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
return
if not author:
message = f" ! - {expedition.year} Skipping logentry: {title} - - no author for entry in year "
message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry in year "
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["title"]=message
print(message)
return
#return
# This needs attention. The slug field is derived from 'title'
# both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
@ -298,7 +304,7 @@ def Parseloghtmltxt(year, expedition, txt):
"html", tripid1, logbook_entry_count, tid=tid)
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand..
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
def Parseloghtml01(year, expedition, txt):
global logentries
global logdataissues
@ -309,7 +315,9 @@ def Parseloghtml01(year, expedition, txt):
for trippara in tripparas:
logbook_entry_count += 1
tid = set_trip_id(year,logbook_entry_count)
# print(f" #0 - tid: {tid}")
try:
#print(f" #1 - tid: {tid}")
s = re.match(r"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
if not s:
message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
@ -317,22 +325,40 @@ def Parseloghtml01(year, expedition, txt):
logdataissues[tid]=message
print(message)
break
tripheader, triptext = s.group(1), s.group(2)
mtripid = re.search(r'<a id="(.*?)"', tripheader)
try:
tripheader, triptext = s.group(1), s.group(2)
except:
message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
# mtripid = re.search(r'<a id="(.*?)"', tripheader)
# if not mtripid:
# # not an error, this is probably just a different year
# message = f" ! - Fail id trip:{tid} header:'{tripheader}'"
# message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'"
# DataIssue.objects.create(parser='logbooks', message=message)
# logdataissues[tid]=message
# print(message)
tripid = mtripid and mtripid.group(1) or ""
#print(f" # - mtripid: {mtripid}")
# tripid = mtripid and mtripid.group(1) or ""
# print(f" # - mtripid: {mtripid}")
tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
tripdate, triptitle, trippeople = tripheader.split("|")
#print(f" #2 - tid: {tid}")
try:
tripdate, triptitle, trippeople = tripheader.split("|")
except:
message = f" ! - Fail to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
tripdate, triptitle = tripheader.split("|")
trippeople = "anon"
#print(f" #3 - tid: {tid}")
ldate = ParseDate(tripdate.strip(), year)
#print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
#print(f" #4 - tid: {tid}")
mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
if mtu:
tu = mtu.group(1)
@ -363,7 +389,7 @@ def Parseloghtml01(year, expedition, txt):
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, "html01", tripid)
trippeople, expedition, tu, "html01", tid)
logentries.append(entrytuple)
try:
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
@ -377,7 +403,7 @@ def Parseloghtml01(year, expedition, txt):
try:
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
"html01", tripid, logbook_entry_count, tid=tid)
"html01", tid, logbook_entry_count, tid=tid)
except:
message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid
DataIssue.objects.create(parser='logbooks', message=message)
@ -577,7 +603,7 @@ def LoadLogbookForExpedition(expedition, expect):
SetDatesFromLogbookEntries(expedition)
if len(logentries) >0:
print(" - Cacheing " , len(logentries), " log entries")
with open(cache_filename, "wb") as fc:
with open(cache_filename, "wb") as fc: # we must check that permissions are g+w, or expo can't delete the cache
logbk=(expedition,len(logentries),logentries)
pickle.dump(logbk, fc, protocol=4)
else:
@ -608,14 +634,14 @@ def LoadLogbooks():
if len(expos) <= 1:
print(" ! No expeditions found. Load 'people' first.\n")
nologbook = ["1976", "1977", "1978", "1979", "1980", "1981",
"1987", "1988", "1989",
"1987", "1988", "1989", # needs more hand-editing of log.htm
"1986", "2020",]
entries = {"2021": 0, "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
"1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
"1985": 21,"1984": 19,"1983": 22,"1982": 42,}
"1985": 22,"1984": 32,"1983": 52,"1982": 42,}
# Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no fully working parser, or need hand-editing.
try:
os.remove("loadlogbk.log")