deleted old parser code

2023-01-26 21:33:17 +00:00
parent 733765802e
commit 0c4ce6dc3c
1 changed files with 0 additions and 131 deletions
@@ -395,137 +395,6 @@ def parser_html(year, expedition, txt, seq=""):
        logentries.append(entrytuple)


-# main parser for 1991 - 2001.  simpler because the data has been hacked so much to fit it
-# def parser_html_01(year, expedition, txt, seq=""):
-    # global logentries
-    # global logdataissues
-    # errorcount = 0
-
-    # # extract front material and stash for later use when rebuilding from list of entries
-    # headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
-    # headpara = headmatch.groups()[0].strip()
-
-    # # print(f" - headpara:\n'{headpara}'")
-    # if len(headpara) > 0:
-        # frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
-        # with open(frontpath, "w") as front:
-            # front.write(headpara + "\n")
-
-    # # extract END material and stash for later use when rebuilding from list of entries
-    # endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
-    # if endmatch:
-        # endpara = endmatch.groups()[0].strip()
-    # else:
-        # print(f" ! - {year} NO endmatch")
-        # endpara = ""
-
-    # # print(f" - endpara:\n'{endpara}'")
-    # if len(endpara) > 0:
-        # endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html")
-        # with open(endpath, "w") as end:
-            # end.write(endpara + "\n")
-
-    # tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
-    # logbook_entry_count = 0
-    # for trippara in tripparas:
-        # logbook_entry_count += 1
-        # tid = set_trip_id(year, logbook_entry_count)
-        # # print(f" #0 - tid: {tid}")
-        # try:
-            # # print(f" #1 - tid: {tid}")
-            # s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
-            # if not s:
-                # message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
-                # DataIssue.objects.create(parser="logbooks", message=message)
-                # logdataissues[tid] = message
-                # print(message)
-                # break
-            # try:
-                # tripheader, triptext = s.group(1), s.group(2)
-            # except:
-                # message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
-                # DataIssue.objects.create(parser="logbooks", message=message)
-                # logdataissues[tid] = message
-                # print(message)
-
-            # # mtripid = re.search(r'<a id="(.*?)"', tripheader)
-            # # if not mtripid:
-            # # message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'"
-            # # DataIssue.objects.create(parser='logbooks', message=message)
-            # # logdataissues[tid]=message
-            # # print(message)
-
-            # # tripid = mtripid and mtripid.group(1) or ""
-            # # print(f" # - mtripid: {mtripid}")
-            # tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
-            # # print(f" #2 - tid: {tid}")
-            # try:
-                # tripdate, triptitle, trippeople = tripheader.split("|")
-            # except:
-                # message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
-                # DataIssue.objects.create(parser="logbooks", message=message)
-                # logdataissues[tid] = message
-                # print(message)
-                # try:
-                    # tripdate, triptitle = tripheader.split("|")
-                    # trippeople = "GUESS ANON"
-                # except:
-                    # message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
-                    # DataIssue.objects.create(parser="logbooks", message=message)
-                    # logdataissues[tid] = message
-                    # print(message)
-                    # break
-            # # print(f" #3 - tid: {tid}")
-            # ldate = ParseDate(tripdate.strip(), year)
-            # # print(f" # -             tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
-            # # print(f" #4 - tid: {tid}")
-
-            # mtu = re.search(r"<p[^>]*>(T/?U.*)", triptext)
-            # if mtu:
-                # tu = mtu.group(1)
-                # triptext = triptext[: mtu.start(0)] + triptext[mtu.end() :]
-            # else:
-                # tu = ""
-
-            # triptitles = triptitle.split(" - ")
-            # tripcave = triptitles[0].strip()
-
-            # ltriptext = triptext
-
-            # mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
-            # if mtail:
-                # ltriptext = ltriptext[: mtail.start(0)]
-            # ltriptext = re.sub(r"</p>", "", ltriptext)
-            # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-            # ltriptext = re.sub(r"</?u>", "_", ltriptext)
-            # ltriptext = re.sub(r"</?i>", "''", ltriptext)
-            # ltriptext = re.sub(r"</?b>", "'''", ltriptext)
-            # ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
-
-            # if ltriptext == "":
-                # message = " ! - Zero content for logbook entry!: " + tid
-                # DataIssue.objects.create(parser="logbooks", message=message)
-                # logdataissues[tid] = message
-                # print(message)
-
-            # entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid)
-            # logentries.append(entrytuple)
-
-        # except:
-            # message = f" ! - Skipping logentry {year} due to exception in: {tid}"
-            # DataIssue.objects.create(parser="logbooks", message=message)
-            # logdataissues[tid] = message
-            # print(message)
-            # errorcount += 1
-            # raise
-            # if errorcount > 5:
-                # message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
-                # DataIssue.objects.create(parser="logbooks", message=message)
-                # logdataissues[tid] = message
-                # print(message)
-                # return
-
-
 def parser_blog(year, expedition, txt, sq=""):
    """Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
    Note that the entries have dates and authors, but no titles.