deleted old parser code

This commit is contained in:
Philip Sargent 2023-01-26 21:33:17 +00:00
parent 733765802e
commit 0c4ce6dc3c

@ -395,137 +395,6 @@ def parser_html(year, expedition, txt, seq=""):
logentries.append(entrytuple) logentries.append(entrytuple)
# Main parser for 1991 - 2001. Simpler, because the data for those years has been hacked so much to fit this parser.
# def parser_html_01(year, expedition, txt, seq=""):
# global logentries
# global logdataissues
# errorcount = 0
# # extract front material and stash for later use when rebuilding from list of entries
# headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
# headpara = headmatch.groups()[0].strip()
# # print(f" - headpara:\n'{headpara}'")
# if len(headpara) > 0:
# frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
# with open(frontpath, "w") as front:
# front.write(headpara + "\n")
# # extract END material and stash for later use when rebuilding from list of entries
# endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
# if endmatch:
# endpara = endmatch.groups()[0].strip()
# else:
# print(f" ! - {year} NO endmatch")
# endpara = ""
# # print(f" - endpara:\n'{endpara}'")
# if len(endpara) > 0:
# endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html")
# with open(endpath, "w") as end:
# end.write(endpara + "\n")
# tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
# logbook_entry_count = 0
# for trippara in tripparas:
# logbook_entry_count += 1
# tid = set_trip_id(year, logbook_entry_count)
# # print(f" #0 - tid: {tid}")
# try:
# # print(f" #1 - tid: {tid}")
# s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
# if not s:
# message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
# DataIssue.objects.create(parser="logbooks", message=message)
# logdataissues[tid] = message
# print(message)
# break
# try:
# tripheader, triptext = s.group(1), s.group(2)
# except:
# message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
# DataIssue.objects.create(parser="logbooks", message=message)
# logdataissues[tid] = message
# print(message)
# # mtripid = re.search(r'<a id="(.*?)"', tripheader)
# # if not mtripid:
# # message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'"
# # DataIssue.objects.create(parser='logbooks', message=message)
# # logdataissues[tid]=message
# # print(message)
# # tripid = mtripid and mtripid.group(1) or ""
# # print(f" # - mtripid: {mtripid}")
# tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
# # print(f" #2 - tid: {tid}")
# try:
# tripdate, triptitle, trippeople = tripheader.split("|")
# except:
# message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
# DataIssue.objects.create(parser="logbooks", message=message)
# logdataissues[tid] = message
# print(message)
# try:
# tripdate, triptitle = tripheader.split("|")
# trippeople = "GUESS ANON"
# except:
# message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
# DataIssue.objects.create(parser="logbooks", message=message)
# logdataissues[tid] = message
# print(message)
# break
# # print(f" #3 - tid: {tid}")
# ldate = ParseDate(tripdate.strip(), year)
# # print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
# # print(f" #4 - tid: {tid}")
# mtu = re.search(r"<p[^>]*>(T/?U.*)", triptext)
# if mtu:
# tu = mtu.group(1)
# triptext = triptext[: mtu.start(0)] + triptext[mtu.end() :]
# else:
# tu = ""
# triptitles = triptitle.split(" - ")
# tripcave = triptitles[0].strip()
# ltriptext = triptext
# mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
# if mtail:
# ltriptext = ltriptext[: mtail.start(0)]
# ltriptext = re.sub(r"</p>", "", ltriptext)
# ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
# ltriptext = re.sub(r"</?u>", "_", ltriptext)
# ltriptext = re.sub(r"</?i>", "''", ltriptext)
# ltriptext = re.sub(r"</?b>", "'''", ltriptext)
# ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
# if ltriptext == "":
# message = " ! - Zero content for logbook entry!: " + tid
# DataIssue.objects.create(parser="logbooks", message=message)
# logdataissues[tid] = message
# print(message)
# entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid)
# logentries.append(entrytuple)
# except:
# message = f" ! - Skipping logentry {year} due to exception in: {tid}"
# DataIssue.objects.create(parser="logbooks", message=message)
# logdataissues[tid] = message
# print(message)
# errorcount += 1
# raise
# if errorcount > 5:
# message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
# DataIssue.objects.create(parser="logbooks", message=message)
# logdataissues[tid] = message
# print(message)
# return
def parser_blog(year, expedition, txt, sq=""): def parser_blog(year, expedition, txt, sq=""):
"""Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website. """Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
Note that the entries have dates and authors, but no titles. Note that the entries have dates and authors, but no titles.