converting 1987 logbook

This commit is contained in:
Philip Sargent 2022-12-20 19:59:36 +00:00
parent dc3a61addd
commit 05df2e084c
2 changed files with 134 additions and 19 deletions

View File

@ -41,7 +41,7 @@ def import_logbooks():
with transaction.atomic(): with transaction.atomic():
troggle.parsers.logbooks.LoadLogbooks() troggle.parsers.logbooks.LoadLogbooks()
def import_logbook(year=1982): def import_logbook(year=1987):
print(f"-- Importing Logbook {year}") print(f"-- Importing Logbook {year}")
with transaction.atomic(): with transaction.atomic():
troggle.parsers.logbooks.LoadLogbook(year) troggle.parsers.logbooks.LoadLogbook(year)

View File

@ -76,7 +76,8 @@ LOGBOOK_PARSER_SETTINGS = {
"1990": ("log.htm", "parser_html_01"), "1990": ("log.htm", "parser_html_01"),
"1989": ("log.htm", "parser_html_01"), #crashes MySQL "1989": ("log.htm", "parser_html_01"), #crashes MySQL
"1988": ("log.htm", "parser_html_01"), #crashes MySQL "1988": ("log.htm", "parser_html_01"), #crashes MySQL
"1987": ("log.htm", "parser_html_01"), #crashes MySQL #"1987": ("log.htm", "parser_02"), #crashes MySQL
"1987": ("logbook.html", "parser_html"),
"1985": ("logbook.html", "parser_html"), "1985": ("logbook.html", "parser_html"),
"1984": ("logbook.html", "parser_html"), "1984": ("logbook.html", "parser_html"),
"1983": ("logbook.html", "parser_html"), "1983": ("logbook.html", "parser_html"),
@ -87,7 +88,7 @@ entries = { "2022": 86, "2019": 56, "2018": 100, "2017": 76, "2016": 83, "2015":
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53, "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53,
"2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, "2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42, "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
"1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1, "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 34,
"1985": 24, "1984": 32, "1983": 52, "1982": 42,} "1985": 24, "1984": 32, "1983": 52, "1982": 42,}
# Logbooks log.htm exist for 87, 88, 89 but have no full-working parser, or need hand-editing. # Logbooks log.htm exist for 87, 88, 89 but have no full-working parser, or need hand-editing.
@ -366,27 +367,13 @@ def parser_html(year, expedition, txt, seq=""):
trippeople, expedition, tu, tripid1) trippeople, expedition, tu, tripid1)
logentries.append(entrytuple) logentries.append(entrytuple)
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
def parser_html_01(year, expedition, txt, seq=""): def parser_html_01(year, expedition, txt, seq=""):
'''This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
'''
global logentries global logentries
global logdataissues global logdataissues
errorcount = 0 errorcount = 0
# extract front material and stash for later use when rebuilding from list of entries
headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
headpara = headmatch.groups()[0].strip()
# print(f" - headpara:\n'{headpara}'")
if(len(headpara)>0):
frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
with open(frontpath,"w") as front:
front.write(headpara+"\n")
tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt) tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
logbook_entry_count = 0 logbook_entry_count = 0
for trippara in tripparas: for trippara in tripparas:
@ -439,7 +426,6 @@ def parser_html_01(year, expedition, txt, seq=""):
print(message) print(message)
break break
#print(f" #3 - tid: {tid}") #print(f" #3 - tid: {tid}")
triptitle = triptitle.strip()
ldate = ParseDate(tripdate.strip(), year) ldate = ParseDate(tripdate.strip(), year)
#print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>") #print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
#print(f" #4 - tid: {tid}") #print(f" #4 - tid: {tid}")
@ -491,6 +477,135 @@ def parser_html_01(year, expedition, txt, seq=""):
print(message) print(message)
return return
# variant parser for 1987
def parser_02(year, expedition, txt, seq=""):
    '''Variant HTML logbook parser written for the 1987 logbook.

    The text is split into entries at <hr> tags. Each entry must contain an
    <a>...</a> element whose text is "date|title|people" (or "date|title", in
    which case the people field is set to "GUESS ANON"). Everything after the
    closing </a> is the body of the entry.

    For every entry parsed, a tuple
        (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid)
    is appended to the module-level list logentries. Problems are recorded as
    DataIssue rows, in the module-level dict logdataissues (keyed by trip id),
    and printed.

    Side effect: any material between <body> and the first <hr> is written to
    years/<year>/frontmatter.html under settings.EXPOWEB.

    year        the logbook year as a string (used in paths, ids and messages)
    expedition  passed through unchanged into each entry tuple
    txt         the complete HTML text of the logbook
    seq         unused by this parser; kept so all parsers share one signature

    Returns None. Any exception raised while handling an entry is logged and
    then re-raised, so the first unexpected failure aborts the import.

    This uses some of the more obscure capabilities of regular expressions,
    see https://docs.python.org/3/library/re.html
    '''
    global logentries
    global logdataissues

    # extract front material and stash for later use when rebuilding from list of entries
    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
    if headmatch:
        headpara = headmatch.group(1).strip()
    else:
        # No <body>...<hr> structure at all: nothing to stash, and the
        # findall below will simply produce no entries.
        headpara = ""
        print(f" ! - No front matter found (no <body>..<hr>) in logbook: {year}")

    if headpara:
        frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
        with open(frontpath, "w") as front:
            front.write(headpara + "\n")

    # Each entry is the text between one <hr> and the next; text after the
    # last <hr> is not captured because of the look-ahead.
    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    logbook_entry_count = 0
    for trippara in tripparas:
        logbook_entry_count += 1
        tid = set_trip_id(year, logbook_entry_count)
        try:
            # group(2) is the text of the first <a> element (the header),
            # group(3) is everything after its closing tag (the body).
            s = re.match(r"(?i)(?s)\s*(<hr[\s/]*>)?.*?<a[^>]*>([\s\S]*?)</a>(.*)$", trippara)
            if not s:
                message = f" ! - Skipping logentry {tid} failure to parse header: {trippara[:300]}..."
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid] = message
                print(message)
                break

            # Both groups always participate in a successful match, so no
            # exception handling is needed here.
            tripheader, triptext = s.group(2), s.group(3)
            print(f" {tid} {tripheader}")
            if not tripheader:
                continue

            # Strip <a>, <b> and <span> tags, keeping their text.
            tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
            try:
                tripdate, triptitle, trippeople = tripheader.split("|")
            except ValueError:
                message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}>\n '{tripheader.split('|')}'"
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid] = message
                print(message)
                try:
                    # Fall back to a two-field header with unknown people.
                    tripdate, triptitle = tripheader.split("|")
                    trippeople = "GUESS ANON"
                except ValueError:
                    message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
                    DataIssue.objects.create(parser='logbooks', message=message)
                    logdataissues[tid] = message
                    print(message)
                    break

            triptitle = triptitle.strip()
            ldate = ParseDate(tripdate.strip(), year)

            # Pull the "T/U ..." paragraph out of the body, if there is one.
            mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
            if mtu:
                tu = mtu.group(1)
                triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
            else:
                tu = ""

            # The cave name is whatever precedes the first " - " in the title.
            triptitles = triptitle.split(" - ")
            tripcave = triptitles[0].strip()

            ltriptext = triptext
            # Trim trailing links, whitespace, separators and "(same day)" /
            # "(<number>)" markers from the end of the body.
            mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
            if mtail:
                ltriptext = ltriptext[:mtail.start(0)]
            # Convert the remaining HTML into the simplified stored markup.
            ltriptext = re.sub(r"</p>", "", ltriptext)
            ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
            ltriptext = re.sub(r"</?u>", "_", ltriptext)
            ltriptext = re.sub(r"</?i>", "''", ltriptext)
            ltriptext = re.sub(r"</?b>", "'''", ltriptext)
            ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()

            if ltriptext == "":
                # Reported, but the (empty) entry is still recorded below.
                message = " ! - Zero content for logbook entry!: " + tid
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid] = message
                print(message)

            entrytuple = (ldate, tripcave, triptitle, ltriptext,
                          trippeople, expedition, tu, tid)
            logentries.append(entrytuple)

        except Exception:
            message = f" ! - Skipping logentry {year} due to exception in: {tid}"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid] = message
            print(message)
            # Re-raised deliberately: the first unexpected failure stops the
            # import. (The former "more than 5 errors" abort could never be
            # reached because of this raise, so it has been removed.)
            raise
def parser_blog(year, expedition, txt, sq=""): def parser_blog(year, expedition, txt, sq=""):
'''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website. '''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
Note that the entries have dates and authors, but no titles. Note that the entries have dates and authors, but no titles.