forked from expo/troggle
converting 1987 logbook
This commit is contained in:
parent
dc3a61addd
commit
05df2e084c
@ -41,7 +41,7 @@ def import_logbooks():
|
|||||||
with transaction.atomic():
|
with transaction.atomic():
|
||||||
troggle.parsers.logbooks.LoadLogbooks()
|
troggle.parsers.logbooks.LoadLogbooks()
|
||||||
|
|
||||||
def import_logbook(year=1982):
|
def import_logbook(year=1987):
|
||||||
print(f"-- Importing Logbook {year}")
|
print(f"-- Importing Logbook {year}")
|
||||||
with transaction.atomic():
|
with transaction.atomic():
|
||||||
troggle.parsers.logbooks.LoadLogbook(year)
|
troggle.parsers.logbooks.LoadLogbook(year)
|
||||||
|
@ -76,7 +76,8 @@ LOGBOOK_PARSER_SETTINGS = {
|
|||||||
"1990": ("log.htm", "parser_html_01"),
|
"1990": ("log.htm", "parser_html_01"),
|
||||||
"1989": ("log.htm", "parser_html_01"), #crashes MySQL
|
"1989": ("log.htm", "parser_html_01"), #crashes MySQL
|
||||||
"1988": ("log.htm", "parser_html_01"), #crashes MySQL
|
"1988": ("log.htm", "parser_html_01"), #crashes MySQL
|
||||||
"1987": ("log.htm", "parser_html_01"), #crashes MySQL
|
#"1987": ("log.htm", "parser_02"), #crashes MySQL
|
||||||
|
"1987": ("logbook.html", "parser_html"),
|
||||||
"1985": ("logbook.html", "parser_html"),
|
"1985": ("logbook.html", "parser_html"),
|
||||||
"1984": ("logbook.html", "parser_html"),
|
"1984": ("logbook.html", "parser_html"),
|
||||||
"1983": ("logbook.html", "parser_html"),
|
"1983": ("logbook.html", "parser_html"),
|
||||||
@ -87,7 +88,7 @@ entries = { "2022": 86, "2019": 56, "2018": 100, "2017": 76, "2016": 83, "2015":
|
|||||||
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53,
|
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53,
|
||||||
"2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
|
"2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
|
||||||
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
|
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
|
||||||
"1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
|
"1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 34,
|
||||||
"1985": 24, "1984": 32, "1983": 52, "1982": 42,}
|
"1985": 24, "1984": 32, "1983": 52, "1982": 42,}
|
||||||
# Logbooks log.htm exist for 87, 88, 89 but have no full-working parser, or need hand-editing.
|
# Logbooks log.htm exist for 87, 88, 89 but have no full-working parser, or need hand-editing.
|
||||||
|
|
||||||
@ -366,27 +367,13 @@ def parser_html(year, expedition, txt, seq=""):
|
|||||||
trippeople, expedition, tu, tripid1)
|
trippeople, expedition, tu, tripid1)
|
||||||
logentries.append(entrytuple)
|
logentries.append(entrytuple)
|
||||||
|
|
||||||
|
|
||||||
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
|
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
|
||||||
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
|
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
|
||||||
def parser_html_01(year, expedition, txt, seq=""):
|
def parser_html_01(year, expedition, txt, seq=""):
|
||||||
'''This uses some of the more obscure capabilities of regular expressions,
|
|
||||||
see https://docs.python.org/3/library/re.html
|
|
||||||
'''
|
|
||||||
global logentries
|
global logentries
|
||||||
global logdataissues
|
global logdataissues
|
||||||
errorcount = 0
|
errorcount = 0
|
||||||
|
|
||||||
# extract front material and stash for later use when rebuilding from list of entries
|
|
||||||
headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
|
|
||||||
headpara = headmatch.groups()[0].strip()
|
|
||||||
|
|
||||||
# print(f" - headpara:\n'{headpara}'")
|
|
||||||
if(len(headpara)>0):
|
|
||||||
frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
|
|
||||||
with open(frontpath,"w") as front:
|
|
||||||
front.write(headpara+"\n")
|
|
||||||
|
|
||||||
tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
|
tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
|
||||||
logbook_entry_count = 0
|
logbook_entry_count = 0
|
||||||
for trippara in tripparas:
|
for trippara in tripparas:
|
||||||
@ -439,7 +426,6 @@ def parser_html_01(year, expedition, txt, seq=""):
|
|||||||
print(message)
|
print(message)
|
||||||
break
|
break
|
||||||
#print(f" #3 - tid: {tid}")
|
#print(f" #3 - tid: {tid}")
|
||||||
triptitle = triptitle.strip()
|
|
||||||
ldate = ParseDate(tripdate.strip(), year)
|
ldate = ParseDate(tripdate.strip(), year)
|
||||||
#print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
|
#print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
|
||||||
#print(f" #4 - tid: {tid}")
|
#print(f" #4 - tid: {tid}")
|
||||||
@ -491,6 +477,135 @@ def parser_html_01(year, expedition, txt, seq=""):
|
|||||||
print(message)
|
print(message)
|
||||||
return
|
return
|
||||||
|
|
||||||
|
# variant parser for 1987
|
||||||
|
def parser_02(year, expedition, txt, seq=""):
    '''Variant HTML logbook parser, written for the 1987 logbook.

    The text is processed in two stages:

    1. Front matter: whatever lies between the <body> tag and the first <hr
       is stripped and, if non-empty, written to
       settings.EXPOWEB/years/<year>/frontmatter.html so it can be re-used
       when the logbook is rebuilt from the list of entries.
    2. Entries: every chunk that follows an <hr> and is itself followed by
       another <hr is treated as one trip. Text after the final <hr> is not
       captured, because the pattern requires a following <hr. The content
       of the first <a ...>...</a> in a chunk is the header, expected to be
       "date|title|people" (or "date|title", in which case the people field
       becomes "GUESS ANON"); everything after the </a> is the trip text.

    Each successfully parsed trip is appended to the module-level list
    logentries as the tuple
        (date, cave, title, text, people, expedition, tu, tid)
    Problems are recorded as DataIssue rows, in logdataissues[tid], and
    printed.

    year        -- used for the trip id, the date parse and the frontmatter path
    expedition  -- passed through unchanged into each entry tuple
    txt         -- the complete HTML text of the logbook
    seq         -- not used by this parser

    Returns None. Any exception raised while handling an entry is recorded
    and then re-raised, so a failure aborts the whole parse.

    This uses some of the more obscure capabilities of regular expressions,
    see https://docs.python.org/3/library/re.html
    '''
    global logentries
    global logdataissues

    def report(tid, message):
        '''Record a parsing problem in the database, the in-memory dict and on stdout.'''
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues[tid] = message
        print(message)

    # extract front material and stash for later use when rebuilding from list of entries
    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
    # No <body>...<hr section at all: treat it as "no front matter" instead of
    # failing with AttributeError on None.
    headpara = headmatch.group(1).strip() if headmatch else ""
    if headpara:
        frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
        with open(frontpath, "w") as front:
            front.write(headpara + "\n")

    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    for logbook_entry_count, trippara in enumerate(tripparas, start=1):
        tid = set_trip_id(year, logbook_entry_count)
        try:
            # group(2) is the text inside the first <a> tag (the header),
            # group(3) is everything after its closing </a> (the trip text).
            s = re.match(r"(?i)(?s)\s*(<hr[\s/]*>)?.*?<a[^>]*>([\s\S]*?)</a>(.*)$", trippara)
            if not s:
                report(tid, f" ! - Skipping logentry {tid} failure to parse header: {trippara[:300]}...")
                # NOTE(review): this abandons ALL remaining entries, not just
                # this one, although the message says "Skipping" - confirm
                # whether 'continue' was intended.
                break

            # Both groups are non-optional in the pattern, so they are always
            # strings once the match has succeeded.
            tripheader, triptext = s.group(2), s.group(3)

            print(f" {tid} {tripheader}")
            if not tripheader:
                continue

            # Drop <a>, <b> and <span> tags from the header, keeping their text.
            tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)

            try:
                tripdate, triptitle, trippeople = tripheader.split("|")
            except ValueError:
                report(tid, f" ! - Fail 3 to split out date|title|people. trip:<{tid}>\n '{tripheader.split('|')}'")
                try:
                    tripdate, triptitle = tripheader.split("|")
                    trippeople = "GUESS ANON"
                except ValueError:
                    report(tid, f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !")
                    # NOTE(review): as above, this stops the whole parse.
                    break

            triptitle = triptitle.strip()
            ldate = ParseDate(tripdate.strip(), year)

            # Pull a "T/U ..." paragraph out of the body text, if there is one.
            mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
            if mtu:
                tu = mtu.group(1)
                triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
            else:
                tu = ""

            # The cave name is whatever precedes the first " - " in the title.
            tripcave = triptitle.split(" - ")[0].strip()

            ltriptext = triptext

            # Trim trailing links, whitespace, separators and "(same day)" /
            # "(<number>)" markers from the end of the text.
            mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext)
            if mtail:
                ltriptext = ltriptext[:mtail.start(0)]

            # Convert the remaining HTML markup: newlines collapse to spaces,
            # <u> -> _, <i> -> '', <b> -> ''', <p> -> two line breaks.
            ltriptext = re.sub(r"</p>", "", ltriptext)
            ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
            ltriptext = re.sub(r"</?u>", "_", ltriptext)
            ltriptext = re.sub(r"</?i>", "''", ltriptext)
            ltriptext = re.sub(r"</?b>", "'''", ltriptext)
            ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()

            if ltriptext == "":
                # Reported, but the (empty) entry is still stored below.
                report(tid, " ! - Zero content for logbook entry!: " + tid)

            entrytuple = (ldate, tripcave, triptitle, ltriptext,
                          trippeople, expedition, tu, tid)
            logentries.append(entrytuple)

        except Exception:
            report(tid, f" ! - Skipping logentry {year} due to exception in: {tid}")
            # The original re-raised unconditionally, which made its
            # "more than 5 errors" abort unreachable; that dead code is
            # dropped and the re-raise is kept.
            raise
|
||||||
|
|
||||||
def parser_blog(year, expedition, txt, sq=""):
|
def parser_blog(year, expedition, txt, sq=""):
|
||||||
'''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
|
'''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
|
||||||
Note that the entries have dates and authors, but no titles.
|
Note that the entries have dates and authors, but no titles.
|
||||||
|
Loading…
Reference in New Issue
Block a user