Mirror of https://expo.survex.com/repositories/troggle/.git (synced 2025-03-01 19:10:15 +00:00)
commit 733765802e (parent 1be3a3892c)

    reformatted all old logbook formats

parsers
@@ -40,7 +40,7 @@ def import_logbooks():
         troggle.parsers.logbooks.LoadLogbooks()
 
 
-def import_logbook(year=2022):
+def import_logbook(year=1996):
     print(f"-- Importing Logbook {year}")
     with transaction.atomic():
         troggle.parsers.logbooks.LoadLogbook(year)
@@ -26,12 +26,6 @@ todo = """
 
 - far too many uses of Django field dereferencing to get values, which is SLOW
 
-- Logbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
-
-- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
-we keep only a modern HTML05 format. Then we can retire the old parsers and reduce the
-volume of code here substantially.
-
 - rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
 
 - We should ensure logbook.html is utf-8 and stop this crap:
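The "rewrite to use generators" item in the todo block above can be illustrated with a short sketch (not part of this commit; iter_trip_paragraphs and save_entry are hypothetical names): yielding entries one at a time avoids holding the whole intermediate list in memory, while reusing the same <hr>-splitting regex the parsers already use.

import re

def iter_trip_paragraphs(txt):
    # Yield each <hr>-delimited trip paragraph lazily, instead of
    # materialising the whole list with re.findall.
    for m in re.finditer(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt):
        yield m.group(1)

def iter_logentries(txt):
    # Yield (sequence_number, text) pairs one at a time; the caller can
    # store each entry without an intermediate logentries list.
    for count, trippara in enumerate(iter_trip_paragraphs(txt), start=1):
        yield count, trippara.strip()

# usage (hypothetical consumer):
# for count, entry in iter_logentries(html_text):
#     save_entry(count, entry)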
@@ -46,35 +40,16 @@ data for old logbooks? Not worth it..
 """
 MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
 BLOG_PARSER_SETTINGS = {
     # "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
     # "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
     # "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
     # "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
 }
 DEFAULT_LOGBOOK_FILE = "logbook.html"
 DEFAULT_LOGBOOK_PARSER = "parser_html"
-# All years since 2002 use the default value for Logbook parser
-# dont forget to update expoweb/pubs.htm to match.
+# All years now (Jan.2023) use the default value for Logbook parser
+# dont forget to update expoweb/pubs.htm to match. 1982 left as reminder of expected format.
 LOGBOOK_PARSER_SETTINGS = {
-    "2002": ("logbook.html", "parser_html"),
-    "2001": ("log.htm", "parser_html_01"),
-    "2000": ("log.htm", "parser_html_01"),
-    "1999": ("log.htm", "parser_html_01"),
-    "1998": ("log.htm", "parser_html_01"),
-    "1997": ("log.htm", "parser_html_01"),
-    "1996": ("log.htm", "parser_html_01"),
-    "1995": ("log.htm", "parser_html_01"),
-    "1994": ("logbook.html", "parser_html"),
-    "1993": ("logbook.html", "parser_html"),
-    "1992": ("logbook.html", "parser_html"),
-    "1991": ("logbook.html", "parser_html"),
-    "1990": ("logbook.html", "parser_html"),
-    "1989": ("logbook.html", "parser_html"),
-    "1988": ("logbook.html", "parser_html"),
-    "1987": ("logbook.html", "parser_html"),
-    "1985": ("logbook.html", "parser_html"),
-    "1984": ("logbook.html", "parser_html"),
-    "1983": ("logbook.html", "parser_html"),
     "1982": ("logbook.html", "parser_html"),
 }
 
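For readers of the table above, the intent is that any year absent from LOGBOOK_PARSER_SETTINGS falls back to DEFAULT_LOGBOOK_FILE / DEFAULT_LOGBOOK_PARSER, so after this commit only 1982 keeps an explicit entry as a format reminder. A minimal sketch of such a lookup (illustrative only; LoadLogbook's actual lookup code is not shown in this diff, and parser_settings_for is a hypothetical helper):

DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"
LOGBOOK_PARSER_SETTINGS = {
    "1982": ("logbook.html", "parser_html"),  # left as reminder of expected format
}

def parser_settings_for(year):
    # Fall back to the module defaults when a year has no explicit entry.
    return LOGBOOK_PARSER_SETTINGS.get(str(year), (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER))

print(parser_settings_for(1982))  # ('logbook.html', 'parser_html') from the table
print(parser_settings_for(2001))  # ('logbook.html', 'parser_html') from the defaults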
@@ -325,13 +300,12 @@ def ParseDate(tripdate, year):
         return datetime.date(1970, 1, 1)
 
 
-# 2002 - now
 def parser_html(year, expedition, txt, seq=""):
     """This uses some of the more obscure capabilities of regular expressions,
     see https://docs.python.org/3/library/re.html
 
     You can't see it here, but a round-trip export-then-import will move
-    the endmatter up to the frontmatter. This makes sense when moving
+    the endmatter up to the frontmatter. This made sense when translating
     from parser_html_01 format logfiles, believe me.
     """
     global logentries
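The "more obscure capabilities of regular expressions" mentioned in the docstring include inline flags such as (?i)(?s) and lookahead assertions like (?=<hr), both visible in the parser code further down. A self-contained demonstration using invented sample HTML (the real logbook markup is richer):

import re

sample = """<body>front matter<hr />
<p>1982-07-01 | Stellerweg - rigging | Anon</p><p>Trip text one.</p>
<hr />
<p>1982-07-02 | Top Camp - carry | Anon</p><p>Trip text two.</p>
<hr /></body>"""

# (?=<hr) is a lookahead: each entry ends where the next <hr> begins,
# but the <hr> itself is left unconsumed for the following match.
for trippara in re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", sample):
    # (?i)(?s) are inline flags: ignore case, and let '.' match newlines.
    s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
    if s:
        tripheader, triptext = s.group(1), s.group(2)
        print(tripheader.strip())  # e.g. "1982-07-01 | Stellerweg - rigging | Anon"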
@@ -422,134 +396,134 @@ def parser_html(year, expedition, txt, seq=""):
 
 
 # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
-def parser_html_01(year, expedition, txt, seq=""):
-    global logentries
-    global logdataissues
-    errorcount = 0
+# def parser_html_01(year, expedition, txt, seq=""):
+#     global logentries
+#     global logdataissues
+#     errorcount = 0
 
-    # extract front material and stash for later use when rebuilding from list of entries
-    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
-    headpara = headmatch.groups()[0].strip()
+#     # extract front material and stash for later use when rebuilding from list of entries
+#     headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
+#     headpara = headmatch.groups()[0].strip()
 
-    # print(f" - headpara:\n'{headpara}'")
-    if len(headpara) > 0:
-        frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
-        with open(frontpath, "w") as front:
-            front.write(headpara + "\n")
+#     # print(f" - headpara:\n'{headpara}'")
+#     if len(headpara) > 0:
+#         frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
+#         with open(frontpath, "w") as front:
+#             front.write(headpara + "\n")
 
-    # extract END material and stash for later use when rebuilding from list of entries
-    endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
-    if endmatch:
-        endpara = endmatch.groups()[0].strip()
-    else:
-        print(f" ! - {year} NO endmatch")
-        endpara = ""
+#     # extract END material and stash for later use when rebuilding from list of entries
+#     endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
+#     if endmatch:
+#         endpara = endmatch.groups()[0].strip()
+#     else:
+#         print(f" ! - {year} NO endmatch")
+#         endpara = ""
 
-    # print(f" - endpara:\n'{endpara}'")
-    if len(endpara) > 0:
-        endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html")
-        with open(endpath, "w") as end:
-            end.write(endpara + "\n")
+#     # print(f" - endpara:\n'{endpara}'")
+#     if len(endpara) > 0:
+#         endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html")
+#         with open(endpath, "w") as end:
+#             end.write(endpara + "\n")
 
-    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
-    logbook_entry_count = 0
-    for trippara in tripparas:
-        logbook_entry_count += 1
-        tid = set_trip_id(year, logbook_entry_count)
-        # print(f" #0 - tid: {tid}")
-        try:
-            # print(f" #1 - tid: {tid}")
-            s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
-            if not s:
-                message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
-                DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues[tid] = message
-                print(message)
-                break
-            try:
-                tripheader, triptext = s.group(1), s.group(2)
-            except:
-                message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
-                DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues[tid] = message
-                print(message)
+#     tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
+#     logbook_entry_count = 0
+#     for trippara in tripparas:
+#         logbook_entry_count += 1
+#         tid = set_trip_id(year, logbook_entry_count)
+#         # print(f" #0 - tid: {tid}")
+#         try:
+#             # print(f" #1 - tid: {tid}")
+#             s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
+#             if not s:
+#                 message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
+#                 DataIssue.objects.create(parser="logbooks", message=message)
+#                 logdataissues[tid] = message
+#                 print(message)
+#                 break
+#             try:
+#                 tripheader, triptext = s.group(1), s.group(2)
+#             except:
+#                 message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
+#                 DataIssue.objects.create(parser="logbooks", message=message)
+#                 logdataissues[tid] = message
+#                 print(message)
 
-            # mtripid = re.search(r'<a id="(.*?)"', tripheader)
-            # if not mtripid:
-            #     message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'"
-            #     DataIssue.objects.create(parser='logbooks', message=message)
-            #     logdataissues[tid]=message
-            #     print(message)
+#             # mtripid = re.search(r'<a id="(.*?)"', tripheader)
+#             # if not mtripid:
+#             #     message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'"
+#             #     DataIssue.objects.create(parser='logbooks', message=message)
+#             #     logdataissues[tid]=message
+#             #     print(message)
 
-            # tripid = mtripid and mtripid.group(1) or ""
-            # print(f" # - mtripid: {mtripid}")
-            tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
-            # print(f" #2 - tid: {tid}")
-            try:
-                tripdate, triptitle, trippeople = tripheader.split("|")
-            except:
-                message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
-                DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues[tid] = message
-                print(message)
-                try:
-                    tripdate, triptitle = tripheader.split("|")
-                    trippeople = "GUESS ANON"
-                except:
-                    message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
-                    DataIssue.objects.create(parser="logbooks", message=message)
-                    logdataissues[tid] = message
-                    print(message)
-                    break
-            # print(f" #3 - tid: {tid}")
-            ldate = ParseDate(tripdate.strip(), year)
-            # print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
-            # print(f" #4 - tid: {tid}")
+#             # tripid = mtripid and mtripid.group(1) or ""
+#             # print(f" # - mtripid: {mtripid}")
+#             tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
+#             # print(f" #2 - tid: {tid}")
+#             try:
+#                 tripdate, triptitle, trippeople = tripheader.split("|")
+#             except:
+#                 message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
+#                 DataIssue.objects.create(parser="logbooks", message=message)
+#                 logdataissues[tid] = message
+#                 print(message)
+#                 try:
+#                     tripdate, triptitle = tripheader.split("|")
+#                     trippeople = "GUESS ANON"
+#                 except:
+#                     message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
+#                     DataIssue.objects.create(parser="logbooks", message=message)
+#                     logdataissues[tid] = message
+#                     print(message)
+#                     break
+#             # print(f" #3 - tid: {tid}")
+#             ldate = ParseDate(tripdate.strip(), year)
+#             # print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
+#             # print(f" #4 - tid: {tid}")
 
-            mtu = re.search(r"<p[^>]*>(T/?U.*)", triptext)
-            if mtu:
-                tu = mtu.group(1)
-                triptext = triptext[: mtu.start(0)] + triptext[mtu.end() :]
-            else:
-                tu = ""
+#             mtu = re.search(r"<p[^>]*>(T/?U.*)", triptext)
+#             if mtu:
+#                 tu = mtu.group(1)
+#                 triptext = triptext[: mtu.start(0)] + triptext[mtu.end() :]
+#             else:
+#                 tu = ""
 
-            triptitles = triptitle.split(" - ")
-            tripcave = triptitles[0].strip()
+#             triptitles = triptitle.split(" - ")
+#             tripcave = triptitles[0].strip()
 
-            ltriptext = triptext
+#             ltriptext = triptext
 
-            mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext)
-            if mtail:
-                ltriptext = ltriptext[: mtail.start(0)]
-            ltriptext = re.sub(r"</p>", "", ltriptext)
-            ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-            ltriptext = re.sub(r"</?u>", "_", ltriptext)
-            ltriptext = re.sub(r"</?i>", "''", ltriptext)
-            ltriptext = re.sub(r"</?b>", "'''", ltriptext)
-            ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
+#             mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext)
+#             if mtail:
+#                 ltriptext = ltriptext[: mtail.start(0)]
+#             ltriptext = re.sub(r"</p>", "", ltriptext)
+#             ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+#             ltriptext = re.sub(r"</?u>", "_", ltriptext)
+#             ltriptext = re.sub(r"</?i>", "''", ltriptext)
+#             ltriptext = re.sub(r"</?b>", "'''", ltriptext)
+#             ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
 
-            if ltriptext == "":
-                message = " ! - Zero content for logbook entry!: " + tid
-                DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues[tid] = message
-                print(message)
+#             if ltriptext == "":
+#                 message = " ! - Zero content for logbook entry!: " + tid
+#                 DataIssue.objects.create(parser="logbooks", message=message)
+#                 logdataissues[tid] = message
+#                 print(message)
 
-            entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid)
-            logentries.append(entrytuple)
+#             entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid)
+#             logentries.append(entrytuple)
 
-        except:
-            message = f" ! - Skipping logentry {year} due to exception in: {tid}"
-            DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[tid] = message
-            print(message)
-            errorcount += 1
-            raise
-        if errorcount > 5:
-            message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
-            DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[tid] = message
-            print(message)
-            return
+#         except:
+#             message = f" ! - Skipping logentry {year} due to exception in: {tid}"
+#             DataIssue.objects.create(parser="logbooks", message=message)
+#             logdataissues[tid] = message
+#             print(message)
+#             errorcount += 1
+#             raise
+#         if errorcount > 5:
+#             message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
+#             DataIssue.objects.create(parser="logbooks", message=message)
+#             logdataissues[tid] = message
+#             print(message)
+#             return
 
 
 def parser_blog(year, expedition, txt, sq=""):