All broken logbooks now parsing correctly.

Philip Sargent 2022-12-20 23:48:56 +00:00
parent 5ee3ebad3e
commit 517da57a0c
2 changed files with 4 additions and 136 deletions


@@ -41,7 +41,7 @@ def import_logbooks():
     with transaction.atomic():
         troggle.parsers.logbooks.LoadLogbooks()
 
-def import_logbook(year=1988):
+def import_logbook(year=1989):
     print(f"-- Importing Logbook {year}")
     with transaction.atomic():
         troggle.parsers.logbooks.LoadLogbook(year)
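Aside, not part of this commit's diff: the wrapper above just runs LoadLogbook() inside a transaction. A rough sketch of re-importing a single year from a Python shell, assuming a configured Django environment for troggle (the DJANGO_SETTINGS_MODULE value below is a placeholder, not taken from this commit):

import os
import django

os.environ.setdefault("DJANGO_SETTINGS_MODULE", "settings")  # placeholder settings module name
django.setup()

from django.db import transaction
import troggle.parsers.logbooks

with transaction.atomic():                      # same pattern as import_logbook() above
    troggle.parsers.logbooks.LoadLogbook(1989)  # the year this commit makes the default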


@@ -74,9 +74,7 @@ LOGBOOK_PARSER_SETTINGS = {
     "1992": ("log.htm", "parser_html_01"),
     "1991": ("log.htm", "parser_html_01"),
     "1990": ("log.htm", "parser_html_01"),
-    "1989": ("log.htm", "parser_html_01"), #crashes MySQL
-    #"1988": ("log.htm", "parser_02"), #crashes MySQL
-    #"1987": ("log.htm", "parser_02"), #crashes MySQL
+    "1989": ("logbook.html", "parser_html"),
     "1988": ("logbook.html", "parser_html"),
     "1987": ("logbook.html", "parser_html"),
     "1985": ("logbook.html", "parser_html"),
@@ -89,7 +87,7 @@ entries = { "2022": 86, "2019": 56, "2018": 100, "2017": 76, "2016": 83, "2015":
     "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53,
     "2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
     "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
-    "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 61,"1987": 34,
+    "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 63,"1988": 61,"1987": 34,
     "1985": 24, "1984": 32, "1983": 52, "1982": 42,}
 
 # Logbooks log.htm exist for 87, 88, 89 but have no full-working parser, or need hand-editing.
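Aside, not part of this commit's diff: the entries table records the expected number of logbook entries per year. A small illustrative check of the kind such a table enables; the function name and behaviour here are assumptions, not troggle's actual code:

EXPECTED_ENTRIES = {"1989": 63, "1988": 61, "1987": 34}   # excerpt of the table in the diff above

def check_entry_count(year, parsed_count):
    """Warn if the number of parsed entries differs from the expected count."""
    expected = EXPECTED_ENTRIES.get(year)
    if expected is None:
        print(f"! no expected entry count recorded for {year}")
        return False
    if parsed_count != expected:
        print(f"! {year}: parsed {parsed_count} entries, expected {expected}")
        return False
    return True

check_entry_count("1989", 63)   # True: matches the count this commit sets for 1989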
@ -369,7 +367,6 @@ def parser_html(year, expedition, txt, seq=""):
logentries.append(entrytuple) logentries.append(entrytuple)
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
def parser_html_01(year, expedition, txt, seq=""): def parser_html_01(year, expedition, txt, seq=""):
global logentries global logentries
global logdataissues global logdataissues
@@ -478,135 +475,6 @@ def parser_html_01(year, expedition, txt, seq=""):
         print(message)
         return
 
-# variant parser for 1987
-def parser_02(year, expedition, txt, seq=""):
-    '''This uses some of the more obscure capabilities of regular expressions,
-    see https://docs.python.org/3/library/re.html
-    '''
-    global logentries
-    global logdataissues
-
-    errorcount = 0
-
-    # extract front material and stash for later use when rebuilding from list of entries
-    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
-    headpara = headmatch.groups()[0].strip()
-
-    #print(f" - headpara:\n'{headpara}'")
-    if(len(headpara)>0):
-        frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
-        with open(frontpath,"w") as front:
-            front.write(headpara+"\n")
-
-    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
-    logbook_entry_count = 0
-    for trippara in tripparas:
-        logbook_entry_count += 1
-        tid = set_trip_id(year,logbook_entry_count)
-        # print(f" #0 - tid: {tid}")
-        try:
-            # print(f" #1 - tid: {tid}")
-            #s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
-            s = re.match(r"(?i)(?s)\s*(<hr[\s/]*>)?.*?<a[^>]*>([\s\S]*?)</a>(.*)$", trippara)
-            if not s:
-                message = f" ! - Skipping logentry {tid} failure to parse header: " + tid + trippara[:300] + "..."
-                DataIssue.objects.create(parser='logbooks', message=message)
-                logdataissues[tid]=message
-                print(message)
-                break
-            #print(s.group(2))
-            #print(s.group(3)[:80])
-            try:
-                tripheader, triptext = s.group(2), s.group(3)
-            except:
-                message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
-                DataIssue.objects.create(parser='logbooks', message=message)
-                logdataissues[tid]=message
-                print(message)
-            print(f" {tid} {tripheader}")
-            if not tripheader:
-                continue
-
-            # mtripid = re.search(r'<a id="(.*?)"', tripheader)
-            # if not mtripid:
-            #     message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'"
-            #     DataIssue.objects.create(parser='logbooks', message=message)
-            #     logdataissues[tid]=message
-            #     print(message)
-
-            # tripid = mtripid and mtripid.group(1) or ""
-            # print(f" # - mtripid: {mtripid}")
-
-            tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
-            # print(f" #2 - tid: {tid}")
-            try:
-                tripdate, triptitle, trippeople = tripheader.split("|")
-            except:
-                message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}>\n '{tripheader.split('|')}'"
-                DataIssue.objects.create(parser='logbooks', message=message)
-                logdataissues[tid]=message
-                print(message)
-                try:
-                    tripdate, triptitle = tripheader.split("|")
-                    trippeople = "GUESS ANON"
-                except:
-                    message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
-                    DataIssue.objects.create(parser='logbooks', message=message)
-                    logdataissues[tid]=message
-                    print(message)
-                    break
-            # print(f" #3 - tid: {tid}")
-            triptitle = triptitle.strip()
-            ldate = ParseDate(tripdate.strip(), year)
-            #print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
-            # print(f" #4 - tid: {tid}")
-
-            mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
-            if mtu:
-                tu = mtu.group(1)
-                triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
-            else:
-                tu = ""
-
-            triptitles = triptitle.split(" - ")
-            tripcave = triptitles[0].strip()
-
-            ltriptext = triptext
-
-            mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
-            if mtail:
-                ltriptext = ltriptext[:mtail.start(0)]
-            ltriptext = re.sub(r"</p>", "", ltriptext)
-            ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-            ltriptext = re.sub(r"</?u>", "_", ltriptext)
-            ltriptext = re.sub(r"</?i>", "''", ltriptext)
-            ltriptext = re.sub(r"</?b>", "'''", ltriptext)
-            ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
-
-            if ltriptext == "":
-                message = " ! - Zero content for logbook entry!: " + tid
-                DataIssue.objects.create(parser='logbooks', message=message)
-                logdataissues[tid]=message
-                print(message)
-
-            entrytuple = (ldate, tripcave, triptitle, ltriptext,
-                          trippeople, expedition, tu, tid)
-            logentries.append(entrytuple)
-        except:
-            message = f" ! - Skipping logentry {year} due to exception in: {tid}"
-            DataIssue.objects.create(parser='logbooks', message=message)
-            logdataissues[tid]=message
-            print(message)
-            errorcount += 1
-            raise
-        if errorcount >5 :
-            message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
-            DataIssue.objects.create(parser='logbooks', message=message)
-            logdataissues[tid]=message
-            print(message)
-            return
def parser_blog(year, expedition, txt, sq=""): def parser_blog(year, expedition, txt, sq=""):
'''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website. '''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
Note that the entries have dates and authors, but no titles. Note that the entries have dates and authors, but no titles.
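Aside, not part of this commit's diff: the deleted parser_02 split the logbook into trip paragraphs with a lookahead, re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt). Each block runs from one <hr> up to, but not including, the next one, because (?=<hr) matches without consuming. A small self-contained demonstration; the sample HTML is made up:

import re

txt = "<body>front matter<hr />Trip one text<hr/>Trip two text<hr>Trip three text"

tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
print(tripparas)   # ['Trip one text', 'Trip two text']

# The final block is dropped because no <hr> follows it, which may be one reason
# these old logbook files needed hand editing before they parsed cleanly.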
@@ -847,7 +715,7 @@ def LoadLogbooks():
     noexpo = ["1986", "2020", "2021",] #no expo
     lostlogbook = ["1976", "1977", "1978", "1979", "1980", "1981"]
-    sqlfail = ["1989"] # breaks mysql with db constraint fail - debug locally first]
+    sqlfail = [""] # breaks mysql with db constraint fail - all now fixed.]
     nologbook = noexpo + lostlogbook + sqlfail
     nlbe={}