From 8128870d57c751d0589b58f1ce88824261d36335 Mon Sep 17 00:00:00 2001 From: Philip Sargent <philip.sargent@klebos.com> Date: Sat, 24 Apr 2021 01:23:55 +0100 Subject: [PATCH] more robust logbooks parsing --- core/views/logbooks.py | 16 +++++++---- parsers/logbooks.py | 64 +++++++++++++++++++++++++++++------------- 2 files changed, 55 insertions(+), 25 deletions(-) diff --git a/core/views/logbooks.py b/core/views/logbooks.py index ecf0f6b..82dbf61 100644 --- a/core/views/logbooks.py +++ b/core/views/logbooks.py @@ -176,13 +176,17 @@ def personexpedition(request, first_name='', last_name='', year=''): def logbookentry(request, date, slug): this_logbookentry = LogbookEntry.objects.filter(date=date, slug=slug) - - if len(this_logbookentry)>1: - return render(request, 'object_list.html',{'object_list':this_logbookentry}) + + if this_logbookentry: + if len(this_logbookentry)>1: + return render(request, 'object_list.html',{'object_list':this_logbookentry}) + else: + this_logbookentry=this_logbookentry[0] + return render(request, 'logbookentry.html', {'logbookentry': this_logbookentry}) else: - this_logbookentry=this_logbookentry[0] - return render(request, 'logbookentry.html', {'logbookentry': this_logbookentry}) - + msg =(f' Logbook entry slug:"{slug}" not found in database on date:"{date}" ') + print(msg) + return render(request, 'errors/generic.html',{'message':msg}) def logbookSearch(request, extra): query_string = '' diff --git a/parsers/logbooks.py b/parsers/logbooks.py index c9d7796..8237bdc 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -71,13 +71,18 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople): tripperson = tripperson.strip() - if not tid: - tid = expedition.year + "." + tripperson + datetime.now().strftime("%S%f") # no good. Should be getting the tid mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson) if mul: tripperson = mul.group(1).strip() if tripperson and tripperson[0] != '*': tripperson = re.sub(round_bracket_regex, "", tripperson).strip() + + if tripperson =="Wiggy": + tripperson = "Phil Wigglesworth" + if tripperson =="Animal": + tripperson = "Mike Richardson" + + personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower()) if not personyear: message = f" ! - {expedition.year} No name match for: '{tripperson}' " @@ -91,6 +96,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): if not res: return None, None author = res[-1][0] + return res, author def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None): @@ -107,11 +113,11 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_ return if not author: - message = f" ! - {expedition.year} Skipping logentry: {title} - - no author for entry in year " + message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry in year " DataIssue.objects.create(parser='logbooks', message=message) logdataissues["title"]=message print(message) - return + #return # This needs attention. The slug field is derived from 'title' # both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though? @@ -298,7 +304,7 @@ def Parseloghtmltxt(year, expedition, txt): "html", tripid1, logbook_entry_count, tid=tid) # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it -# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. +# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place def Parseloghtml01(year, expedition, txt): global logentries global logdataissues @@ -309,7 +315,9 @@ def Parseloghtml01(year, expedition, txt): for trippara in tripparas: logbook_entry_count += 1 tid = set_trip_id(year,logbook_entry_count) + # print(f" #0 - tid: {tid}") try: + #print(f" #1 - tid: {tid}") s = re.match(r"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara) if not s: message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..." @@ -317,22 +325,40 @@ def Parseloghtml01(year, expedition, txt): logdataissues[tid]=message print(message) break - tripheader, triptext = s.group(1), s.group(2) - mtripid = re.search(r'<a id="(.*?)"', tripheader) + try: + tripheader, triptext = s.group(1), s.group(2) + except: + message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'" + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tid]=message + print(message) + + + # mtripid = re.search(r'<a id="(.*?)"', tripheader) # if not mtripid: - # # not an error, this is probabluy jusyt a different year - # message = f" ! - Fail id trip:{tid} header:'{tripheader}'" + # message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'" # DataIssue.objects.create(parser='logbooks', message=message) # logdataissues[tid]=message # print(message) - tripid = mtripid and mtripid.group(1) or "" - #print(f" # - mtripid: {mtripid}") + # tripid = mtripid and mtripid.group(1) or "" + # print(f" # - mtripid: {mtripid}") tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader) - - tripdate, triptitle, trippeople = tripheader.split("|") + #print(f" #2 - tid: {tid}") + try: + tripdate, triptitle, trippeople = tripheader.split("|") + except: + message = f" ! - Fail to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'" + DataIssue.objects.create(parser='logbooks', message=message) + logdataissues[tid]=message + print(message) + tripdate, triptitle = tripheader.split("|") + trippeople = "anon" + #print(f" #3 - tid: {tid}") ldate = ParseDate(tripdate.strip(), year) - + #print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>") + #print(f" #4 - tid: {tid}") + mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext) if mtu: tu = mtu.group(1) @@ -363,7 +389,7 @@ def Parseloghtml01(year, expedition, txt): entrytuple = (ldate, tripcave, triptitle, ltriptext, - trippeople, expedition, tu, "html01", tripid) + trippeople, expedition, tu, "html01", tid) logentries.append(entrytuple) try: EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, @@ -377,7 +403,7 @@ def Parseloghtml01(year, expedition, txt): try: EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu, - "html01", tripid, logbook_entry_count, tid=tid) + "html01", tid, logbook_entry_count, tid=tid) except: message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid DataIssue.objects.create(parser='logbooks', message=message) @@ -577,7 +603,7 @@ def LoadLogbookForExpedition(expedition, expect): SetDatesFromLogbookEntries(expedition) if len(logentries) >0: print(" - Cacheing " , len(logentries), " log entries") - with open(cache_filename, "wb") as fc: + with open(cache_filename, "wb") as fc: # we much check that permission are g+w ! or expo can't delete the cache logbk=(expedition,len(logentries),logentries) pickle.dump(logbk, fc, protocol=4) else: @@ -608,14 +634,14 @@ def LoadLogbooks(): if len(expos) <= 1: print(" ! No expeditions found. Load 'people' first.\n") nologbook = ["1976", "1977", "1978", "1979", "1980", "1981", - "1987", "1988", "1989", + "1987", "1988", "1989", # needs more hand-editing of log.htm "1986", "2020",] entries = {"2021": 0, "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79, "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52, "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31, "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41, "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1, - "1985": 21,"1984": 19,"1983": 22,"1982": 42,} + "1985": 22,"1984": 32,"1983": 52,"1982": 42,} # Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing. try: os.remove("loadlogbk.log")