reformatted all old logbook formats

2023-01-26 21:33:06 +00:00
parent 1be3a3892c
commit 733765802e
2 changed files with 129 additions and 155 deletions
--- a/parsers/imports.py
+++ b/parsers/imports.py
@@ -40,7 +40,7 @@ def import_logbooks():
        troggle.parsers.logbooks.LoadLogbooks()


-def import_logbook(year=2022):
+def import_logbook(year=1996):
    print(f"-- Importing Logbook {year}")
    with transaction.atomic():
        troggle.parsers.logbooks.LoadLogbook(year)
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -26,12 +26,6 @@ todo = """

 - far too many uses of Django field dereferencing to get values, which is SLOW

- Logbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
-
- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
-  we keep only a modern HTML05 format. Then we can retire the old parsers and reduce the
-  volume of code here substantially.
-
 - rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
  
 - We should ensure logbook.html is utf-8 and stop this crap:             
@@ -46,35 +40,16 @@ data for old logbooks? Not worth it..
 """
 MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
 BLOG_PARSER_SETTINGS = {
-    #               "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
-    #               "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
-    #               "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
-    #               "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
+    #  "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
+    #  "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
+    #  "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
+    #  "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
 }
 DEFAULT_LOGBOOK_FILE = "logbook.html"
 DEFAULT_LOGBOOK_PARSER = "parser_html"
-# All years since 2002 use the default value for Logbook parser
-# dont forget to update expoweb/pubs.htm to match.
+# All years now (Jan. 2023) use the default value for the logbook parser.
+# Don't forget to update expoweb/pubs.htm to match. 1982 is left as a reminder of the expected format.
 LOGBOOK_PARSER_SETTINGS = {
-    "2002": ("logbook.html", "parser_html"),
-    "2001": ("log.htm", "parser_html_01"),
-    "2000": ("log.htm", "parser_html_01"),
-    "1999": ("log.htm", "parser_html_01"),
-    "1998": ("log.htm", "parser_html_01"),
-    "1997": ("log.htm", "parser_html_01"),
-    "1996": ("log.htm", "parser_html_01"),
-    "1995": ("log.htm", "parser_html_01"),
-    "1994": ("logbook.html", "parser_html"),
-    "1993": ("logbook.html", "parser_html"),
-    "1992": ("logbook.html", "parser_html"),
-    "1991": ("logbook.html", "parser_html"),
-    "1990": ("logbook.html", "parser_html"),
-    "1989": ("logbook.html", "parser_html"),
-    "1988": ("logbook.html", "parser_html"),
-    "1987": ("logbook.html", "parser_html"),
-    "1985": ("logbook.html", "parser_html"),
-    "1984": ("logbook.html", "parser_html"),
-    "1983": ("logbook.html", "parser_html"),
    "1982": ("logbook.html", "parser_html"),
 }

@@ -325,13 +300,12 @@ def ParseDate(tripdate, year):
        return datetime.date(1970, 1, 1)


-# 2002 - now
 def parser_html(year, expedition, txt, seq=""):
    """This uses some of the more obscure capabilities of regular expressions,
    see https://docs.python.org/3/library/re.html

    You can't see it here, but a round-trip export-then-import will move
-    the endmatter up to the frontmatter. This makes sense when moving
+    the endmatter up to the frontmatter. This made sense when translating
    from parser_html_01 format logfiles, believe me.
    """
    global logentries
@@ -422,134 +396,134 @@ def parser_html(year, expedition, txt, seq=""):


 # main parser for 1991 - 2001.  simpler because the data has been hacked so much to fit it
-def parser_html_01(year, expedition, txt, seq=""):
-    global logentries
-    global logdataissues
-    errorcount = 0
+# def parser_html_01(year, expedition, txt, seq=""):
+    # global logentries
+    # global logdataissues
+    # errorcount = 0

-    # extract front material and stash for later use when rebuilding from list of entries
-    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
-    headpara = headmatch.groups()[0].strip()
+    # # extract front material and stash for later use when rebuilding from list of entries
+    # headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
+    # headpara = headmatch.groups()[0].strip()

-    # print(f" - headpara:\n'{headpara}'")
-    if len(headpara) > 0:
-        frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
-        with open(frontpath, "w") as front:
-            front.write(headpara + "\n")
+    # # print(f" - headpara:\n'{headpara}'")
+    # if len(headpara) > 0:
+        # frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
+        # with open(frontpath, "w") as front:
+            # front.write(headpara + "\n")

-    # extract END material and stash for later use when rebuilding from list of entries
-    endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
-    if endmatch:
-        endpara = endmatch.groups()[0].strip()
-    else:
-        print(f" ! - {year} NO endmatch")
-        endpara = ""
+    # # extract END material and stash for later use when rebuilding from list of entries
+    # endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
+    # if endmatch:
+        # endpara = endmatch.groups()[0].strip()
+    # else:
+        # print(f" ! - {year} NO endmatch")
+        # endpara = ""

-    # print(f" - endpara:\n'{endpara}'")
-    if len(endpara) > 0:
-        endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html")
-        with open(endpath, "w") as end:
-            end.write(endpara + "\n")
+    # # print(f" - endpara:\n'{endpara}'")
+    # if len(endpara) > 0:
+        # endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html")
+        # with open(endpath, "w") as end:
+            # end.write(endpara + "\n")

-    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
-    logbook_entry_count = 0
-    for trippara in tripparas:
-        logbook_entry_count += 1
-        tid = set_trip_id(year, logbook_entry_count)
-        # print(f" #0 - tid: {tid}")
-        try:
-            # print(f" #1 - tid: {tid}")
-            s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
-            if not s:
-                message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
-                DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues[tid] = message
-                print(message)
-                break
-            try:
-                tripheader, triptext = s.group(1), s.group(2)
-            except:
-                message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
-                DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues[tid] = message
-                print(message)
+    # tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
+    # logbook_entry_count = 0
+    # for trippara in tripparas:
+        # logbook_entry_count += 1
+        # tid = set_trip_id(year, logbook_entry_count)
+        # # print(f" #0 - tid: {tid}")
+        # try:
+            # # print(f" #1 - tid: {tid}")
+            # s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
+            # if not s:
+                # message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
+                # DataIssue.objects.create(parser="logbooks", message=message)
+                # logdataissues[tid] = message
+                # print(message)
+                # break
+            # try:
+                # tripheader, triptext = s.group(1), s.group(2)
+            # except:
+                # message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
+                # DataIssue.objects.create(parser="logbooks", message=message)
+                # logdataissues[tid] = message
+                # print(message)

-            # mtripid = re.search(r'<a id="(.*?)"', tripheader)
-            # if not mtripid:
-            # message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'"
-            # DataIssue.objects.create(parser='logbooks', message=message)
-            # logdataissues[tid]=message
+            # # mtripid = re.search(r'<a id="(.*?)"', tripheader)
+            # # if not mtripid:
+            # # message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'"
+            # # DataIssue.objects.create(parser='logbooks', message=message)
+            # # logdataissues[tid]=message
+            # # print(message)
+
+            # # tripid = mtripid and mtripid.group(1) or ""
+            # # print(f" # - mtripid: {mtripid}")
+            # tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
+            # # print(f" #2 - tid: {tid}")
+            # try:
+                # tripdate, triptitle, trippeople = tripheader.split("|")
+            # except:
+                # message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
+                # DataIssue.objects.create(parser="logbooks", message=message)
+                # logdataissues[tid] = message
+                # print(message)
+                # try:
+                    # tripdate, triptitle = tripheader.split("|")
+                    # trippeople = "GUESS ANON"
+                # except:
+                    # message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
+                    # DataIssue.objects.create(parser="logbooks", message=message)
+                    # logdataissues[tid] = message
+                    # print(message)
+                    # break
+            # # print(f" #3 - tid: {tid}")
+            # ldate = ParseDate(tripdate.strip(), year)
+            # # print(f" # -             tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
+            # # print(f" #4 - tid: {tid}")
+
+            # mtu = re.search(r"<p[^>]*>(T/?U.*)", triptext)
+            # if mtu:
+                # tu = mtu.group(1)
+                # triptext = triptext[: mtu.start(0)] + triptext[mtu.end() :]
+            # else:
+                # tu = ""
+
+            # triptitles = triptitle.split(" - ")
+            # tripcave = triptitles[0].strip()
+
+            # ltriptext = triptext
+
+            # mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
+            # if mtail:
+                # ltriptext = ltriptext[: mtail.start(0)]
+            # ltriptext = re.sub(r"</p>", "", ltriptext)
+            # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+            # ltriptext = re.sub(r"</?u>", "_", ltriptext)
+            # ltriptext = re.sub(r"</?i>", "''", ltriptext)
+            # ltriptext = re.sub(r"</?b>", "'''", ltriptext)
+            # ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
+
+            # if ltriptext == "":
+                # message = " ! - Zero content for logbook entry!: " + tid
+                # DataIssue.objects.create(parser="logbooks", message=message)
+                # logdataissues[tid] = message
+                # print(message)
+
+            # entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid)
+            # logentries.append(entrytuple)
+
+        # except:
+            # message = f" ! - Skipping logentry {year} due to exception in: {tid}"
+            # DataIssue.objects.create(parser="logbooks", message=message)
+            # logdataissues[tid] = message
            # print(message)
-
-            # tripid = mtripid and mtripid.group(1) or ""
-            # print(f" # - mtripid: {mtripid}")
-            tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
-            # print(f" #2 - tid: {tid}")
-            try:
-                tripdate, triptitle, trippeople = tripheader.split("|")
-            except:
-                message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
-                DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues[tid] = message
-                print(message)
-                try:
-                    tripdate, triptitle = tripheader.split("|")
-                    trippeople = "GUESS ANON"
-                except:
-                    message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
-                    DataIssue.objects.create(parser="logbooks", message=message)
-                    logdataissues[tid] = message
-                    print(message)
-                    break
-            # print(f" #3 - tid: {tid}")
-            ldate = ParseDate(tripdate.strip(), year)
-            # print(f" # -             tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
-            # print(f" #4 - tid: {tid}")
-
-            mtu = re.search(r"<p[^>]*>(T/?U.*)", triptext)
-            if mtu:
-                tu = mtu.group(1)
-                triptext = triptext[: mtu.start(0)] + triptext[mtu.end() :]
-            else:
-                tu = ""
-
-            triptitles = triptitle.split(" - ")
-            tripcave = triptitles[0].strip()
-
-            ltriptext = triptext
-
-            mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
-            if mtail:
-                ltriptext = ltriptext[: mtail.start(0)]
-            ltriptext = re.sub(r"</p>", "", ltriptext)
-            ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-            ltriptext = re.sub(r"</?u>", "_", ltriptext)
-            ltriptext = re.sub(r"</?i>", "''", ltriptext)
-            ltriptext = re.sub(r"</?b>", "'''", ltriptext)
-            ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
-
-            if ltriptext == "":
-                message = " ! - Zero content for logbook entry!: " + tid
-                DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues[tid] = message
-                print(message)
-
-            entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid)
-            logentries.append(entrytuple)
-
-        except:
-            message = f" ! - Skipping logentry {year} due to exception in: {tid}"
-            DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[tid] = message
-            print(message)
-            errorcount += 1
-            raise
-            if errorcount > 5:
-                message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
-                DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues[tid] = message
-                print(message)
-                return
+            # errorcount += 1
+            # raise
+            # if errorcount > 5:
+                # message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
+                # DataIssue.objects.create(parser="logbooks", message=message)
+                # logdataissues[tid] = message
+                # print(message)
+                # return


 def parser_blog(year, expedition, txt, sq=""):