From 733765802e4f2b2c3d8c1bb2d0c4a86ca3c5f39d Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Thu, 26 Jan 2023 21:33:06 +0000 Subject: [PATCH] reformatted all old logbook formats --- parsers/imports.py | 2 +- parsers/logbooks.py | 282 ++++++++++++++++++++------------------------ 2 files changed, 129 insertions(+), 155 deletions(-) diff --git a/parsers/imports.py b/parsers/imports.py index c1de034..7be9ab6 100644 --- a/parsers/imports.py +++ b/parsers/imports.py @@ -40,7 +40,7 @@ def import_logbooks(): troggle.parsers.logbooks.LoadLogbooks() -def import_logbook(year=2022): +def import_logbook(year=1996): print(f"-- Importing Logbook {year}") with transaction.atomic(): troggle.parsers.logbooks.LoadLogbook(year) diff --git a/parsers/logbooks.py b/parsers/logbooks.py index 0ab902a..5ef125e 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -26,12 +26,6 @@ todo = """ - far too many uses of Django field dereferencing to get values, which is SLOW -- Logbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix. - -- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that - we keep only a modern HTML05 format. Then we can retire the old parsers and reduce the - volume of code here substantially. - - rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact. - We should ensure logbook.html is utf-8 and stop this crap: @@ -46,35 +40,16 @@ data for old logbooks? Not worth it.. """ MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200 BLOG_PARSER_SETTINGS = { - # "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html - # "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html - # "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html - # "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html + # "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html + # "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html + # "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html + # "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html } DEFAULT_LOGBOOK_FILE = "logbook.html" DEFAULT_LOGBOOK_PARSER = "parser_html" -# All years since 2002 use the default value for Logbook parser -# dont forget to update expoweb/pubs.htm to match. +# All years now (Jan.2023) use the default value for Logbook parser +# dont forget to update expoweb/pubs.htm to match. 1982 left as reminder of expected format. LOGBOOK_PARSER_SETTINGS = { - "2002": ("logbook.html", "parser_html"), - "2001": ("log.htm", "parser_html_01"), - "2000": ("log.htm", "parser_html_01"), - "1999": ("log.htm", "parser_html_01"), - "1998": ("log.htm", "parser_html_01"), - "1997": ("log.htm", "parser_html_01"), - "1996": ("log.htm", "parser_html_01"), - "1995": ("log.htm", "parser_html_01"), - "1994": ("logbook.html", "parser_html"), - "1993": ("logbook.html", "parser_html"), - "1992": ("logbook.html", "parser_html"), - "1991": ("logbook.html", "parser_html"), - "1990": ("logbook.html", "parser_html"), - "1989": ("logbook.html", "parser_html"), - "1988": ("logbook.html", "parser_html"), - "1987": ("logbook.html", "parser_html"), - "1985": ("logbook.html", "parser_html"), - "1984": ("logbook.html", "parser_html"), - "1983": ("logbook.html", "parser_html"), "1982": ("logbook.html", "parser_html"), } @@ -325,13 +300,12 @@ def ParseDate(tripdate, year): return datetime.date(1970, 1, 1) -# 2002 - now def parser_html(year, expedition, txt, seq=""): """This uses some of the more obscure capabilities of regular expressions, see https://docs.python.org/3/library/re.html You can't see it here, but a round-trip export-then-import will move - the endmatter up to the frontmatter. This makes sense when moving + the endmatter up to the frontmatter. This made sense when translating from parser_html_01 format logfiles, believe me. """ global logentries @@ -422,134 +396,134 @@ def parser_html(year, expedition, txt, seq=""): # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it -def parser_html_01(year, expedition, txt, seq=""): - global logentries - global logdataissues - errorcount = 0 +# def parser_html_01(year, expedition, txt, seq=""): + # global logentries + # global logdataissues + # errorcount = 0 - # extract front material and stash for later use when rebuilding from list of entries - headmatch = re.match(r"(?i)(?s).*]*>(.*?)]*>(.*?) 0: - frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html") - with open(frontpath, "w") as front: - front.write(headpara + "\n") + # # print(f" - headpara:\n'{headpara}'") + # if len(headpara) > 0: + # frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html") + # with open(frontpath, "w") as front: + # front.write(headpara + "\n") - # extract END material and stash for later use when rebuilding from list of entries - endmatch = re.match(r"(?i)(?s).*([\s\S]*?)(?=([\s\S]*?)(?= 0: - endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html") - with open(endpath, "w") as end: - end.write(endpara + "\n") + # # print(f" - endpara:\n'{endpara}'") + # if len(endpara) > 0: + # endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html") + # with open(endpath, "w") as end: + # end.write(endpara + "\n") - tripparas = re.findall(r"([\s\S]*?)(?=)?(.*?)(.*)$", trippara) - if not s: - message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..." - DataIssue.objects.create(parser="logbooks", message=message) - logdataissues[tid] = message - print(message) - break - try: - tripheader, triptext = s.group(1), s.group(2) - except: - message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'" - DataIssue.objects.create(parser="logbooks", message=message) - logdataissues[tid] = message - print(message) + # tripparas = re.findall(r"([\s\S]*?)(?=)?(.*?)(.*)$", trippara) + # if not s: + # message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..." + # DataIssue.objects.create(parser="logbooks", message=message) + # logdataissues[tid] = message + # print(message) + # break + # try: + # tripheader, triptext = s.group(1), s.group(2) + # except: + # message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'" + # DataIssue.objects.create(parser="logbooks", message=message) + # logdataissues[tid] = message + # print(message) - # mtripid = re.search(r' header:'{tripheader}'" - # DataIssue.objects.create(parser='logbooks', message=message) - # logdataissues[tid]=message + # # mtripid = re.search(r' header:'{tripheader}'" + # # DataIssue.objects.create(parser='logbooks', message=message) + # # logdataissues[tid]=message + # # print(message) + + # # tripid = mtripid and mtripid.group(1) or "" + # # print(f" # - mtripid: {mtripid}") + # tripheader = re.sub(r"]*>", "", tripheader) + # # print(f" #2 - tid: {tid}") + # try: + # tripdate, triptitle, trippeople = tripheader.split("|") + # except: + # message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'" + # DataIssue.objects.create(parser="logbooks", message=message) + # logdataissues[tid] = message + # print(message) + # try: + # tripdate, triptitle = tripheader.split("|") + # trippeople = "GUESS ANON" + # except: + # message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !" + # DataIssue.objects.create(parser="logbooks", message=message) + # logdataissues[tid] = message + # print(message) + # break + # # print(f" #3 - tid: {tid}") + # ldate = ParseDate(tripdate.strip(), year) + # # print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>") + # # print(f" #4 - tid: {tid}") + + # mtu = re.search(r"]*>(T/?U.*)", triptext) + # if mtu: + # tu = mtu.group(1) + # triptext = triptext[: mtu.start(0)] + triptext[mtu.end() :] + # else: + # tu = "" + + # triptitles = triptitle.split(" - ") + # tripcave = triptitles[0].strip() + + # ltriptext = triptext + + # mtail = re.search(r'(?:[^<]*|\s|/|-|&||\((?:same day|\d+)\))*$', ltriptext) + # if mtail: + # ltriptext = ltriptext[: mtail.start(0)] + # ltriptext = re.sub(r"

", "", ltriptext) + # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) + # ltriptext = re.sub(r"", "_", ltriptext) + # ltriptext = re.sub(r"", "''", ltriptext) + # ltriptext = re.sub(r"", "'''", ltriptext) + # ltriptext = re.sub(r"

", "

", ltriptext).strip() + + # if ltriptext == "": + # message = " ! - Zero content for logbook entry!: " + tid + # DataIssue.objects.create(parser="logbooks", message=message) + # logdataissues[tid] = message + # print(message) + + # entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid) + # logentries.append(entrytuple) + + # except: + # message = f" ! - Skipping logentry {year} due to exception in: {tid}" + # DataIssue.objects.create(parser="logbooks", message=message) + # logdataissues[tid] = message # print(message) - - # tripid = mtripid and mtripid.group(1) or "" - # print(f" # - mtripid: {mtripid}") - tripheader = re.sub(r"]*>", "", tripheader) - # print(f" #2 - tid: {tid}") - try: - tripdate, triptitle, trippeople = tripheader.split("|") - except: - message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'" - DataIssue.objects.create(parser="logbooks", message=message) - logdataissues[tid] = message - print(message) - try: - tripdate, triptitle = tripheader.split("|") - trippeople = "GUESS ANON" - except: - message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !" - DataIssue.objects.create(parser="logbooks", message=message) - logdataissues[tid] = message - print(message) - break - # print(f" #3 - tid: {tid}") - ldate = ParseDate(tripdate.strip(), year) - # print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>") - # print(f" #4 - tid: {tid}") - - mtu = re.search(r"]*>(T/?U.*)", triptext) - if mtu: - tu = mtu.group(1) - triptext = triptext[: mtu.start(0)] + triptext[mtu.end() :] - else: - tu = "" - - triptitles = triptitle.split(" - ") - tripcave = triptitles[0].strip() - - ltriptext = triptext - - mtail = re.search(r'(?:[^<]*|\s|/|-|&||\((?:same day|\d+)\))*$', ltriptext) - if mtail: - ltriptext = ltriptext[: mtail.start(0)] - ltriptext = re.sub(r"

", "", ltriptext) - ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) - ltriptext = re.sub(r"", "_", ltriptext) - ltriptext = re.sub(r"", "''", ltriptext) - ltriptext = re.sub(r"", "'''", ltriptext) - ltriptext = re.sub(r"

", "

", ltriptext).strip() - - if ltriptext == "": - message = " ! - Zero content for logbook entry!: " + tid - DataIssue.objects.create(parser="logbooks", message=message) - logdataissues[tid] = message - print(message) - - entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid) - logentries.append(entrytuple) - - except: - message = f" ! - Skipping logentry {year} due to exception in: {tid}" - DataIssue.objects.create(parser="logbooks", message=message) - logdataissues[tid] = message - print(message) - errorcount += 1 - raise - if errorcount > 5: - message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}" - DataIssue.objects.create(parser="logbooks", message=message) - logdataissues[tid] = message - print(message) - return + # errorcount += 1 + # raise + # if errorcount > 5: + # message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}" + # DataIssue.objects.create(parser="logbooks", message=message) + # logdataissues[tid] = message + # print(message) + # return def parser_blog(year, expedition, txt, sq=""):