From 0c4ce6dc3ce0e2acfa1049d292fe9e0046acee5d Mon Sep 17 00:00:00 2001
From: Philip Sargent <philip.sargent@gmail.com>
Date: Thu, 26 Jan 2023 21:33:17 +0000
Subject: [PATCH] deleted old parser code

---
 parsers/logbooks.py | 131 --------------------------------------------
 1 file changed, 131 deletions(-)

diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 5ef125e..c30831f 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -395,137 +395,6 @@ def parser_html(year, expedition, txt, seq=""):
         logentries.append(entrytuple)


-# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
-# def parser_html_01(year, expedition, txt, seq=""):
-    # global logentries
-    # global logdataissues
-    # errorcount = 0
-
-    # # extract front material and stash for later use when rebuilding from list of entries
-    # headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
-    # headpara = headmatch.groups()[0].strip()
-
-    # # print(f" - headpara:\n'{headpara}'")
-    # if len(headpara) > 0:
-        # frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
-        # with open(frontpath, "w") as front:
-            # front.write(headpara + "\n")
-
-    # # extract END material and stash for later use when rebuilding from list of entries
-    # endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
-    # if endmatch:
-        # endpara = endmatch.groups()[0].strip()
-    # else:
-        # print(f" ! - {year} NO endmatch")
-        # endpara = ""
-
-    # # print(f" - endpara:\n'{endpara}'")
-    # if len(endpara) > 0:
-        # endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html")
-        # with open(endpath, "w") as end:
-            # end.write(endpara + "\n")
-
-    # tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
-    # logbook_entry_count = 0
-    # for trippara in tripparas:
-        # logbook_entry_count += 1
-        # tid = set_trip_id(year, logbook_entry_count)
-        # # print(f" #0 - tid: {tid}")
-        # try:
-            # # print(f" #1 - tid: {tid}")
-            # s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
-            # if not s:
-                # message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
-                # DataIssue.objects.create(parser="logbooks", message=message)
-                # logdataissues[tid] = message
-                # print(message)
-                # break
-            # try:
-                # tripheader, triptext = s.group(1), s.group(2)
-            # except:
-                # message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
-                # DataIssue.objects.create(parser="logbooks", message=message)
-                # logdataissues[tid] = message
-                # print(message)
-
-            # # mtripid = re.search(r'<a id="(.*?)"', tripheader)
-            # # if not mtripid:
-                # # message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'"
-                # # DataIssue.objects.create(parser='logbooks', message=message)
-                # # logdataissues[tid]=message
-                # # print(message)
-
-            # # tripid = mtripid and mtripid.group(1) or ""
-            # # print(f" # - mtripid: {mtripid}")
-            # tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
-            # # print(f" #2 - tid: {tid}")
-            # try:
-                # tripdate, triptitle, trippeople = tripheader.split("|")
-            # except:
-                # message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
-                # DataIssue.objects.create(parser="logbooks", message=message)
-                # logdataissues[tid] = message
-                # print(message)
-                # try:
-                    # tripdate, triptitle = tripheader.split("|")
-                    # trippeople = "GUESS ANON"
-                # except:
-                    # message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
-                    # DataIssue.objects.create(parser="logbooks", message=message)
-                    # logdataissues[tid] = message
-                    # print(message)
-                    # break
-            # # print(f" #3 - tid: {tid}")
-            # ldate = ParseDate(tripdate.strip(), year)
-            # # print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
-            # # print(f" #4 - tid: {tid}")
-
-            # mtu = re.search(r"<p[^>]*>(T/?U.*)", triptext)
-            # if mtu:
-                # tu = mtu.group(1)
-                # triptext = triptext[: mtu.start(0)] + triptext[mtu.end() :]
-            # else:
-                # tu = ""
-
-            # triptitles = triptitle.split(" - ")
-            # tripcave = triptitles[0].strip()
-
-            # ltriptext = triptext
-
-            # mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext)
-            # if mtail:
-                # ltriptext = ltriptext[: mtail.start(0)]
-            # ltriptext = re.sub(r"</p>", "", ltriptext)
-            # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-            # ltriptext = re.sub(r"</?u>", "_", ltriptext)
-            # ltriptext = re.sub(r"</?i>", "''", ltriptext)
-            # ltriptext = re.sub(r"</?b>", "'''", ltriptext)
-            # ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
-
-            # if ltriptext == "":
-                # message = " ! - Zero content for logbook entry!: " + tid
-                # DataIssue.objects.create(parser="logbooks", message=message)
-                # logdataissues[tid] = message
-                # print(message)
-
-            # entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid)
-            # logentries.append(entrytuple)
-
-        # except:
-            # message = f" ! - Skipping logentry {year} due to exception in: {tid}"
-            # DataIssue.objects.create(parser="logbooks", message=message)
-            # logdataissues[tid] = message
-            # print(message)
-            # errorcount += 1
-            # raise
-        # if errorcount > 5:
-            # message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
-            # DataIssue.objects.create(parser="logbooks", message=message)
-            # logdataissues[tid] = message
-            # print(message)
-            # return
-
-
 def parser_blog(year, expedition, txt, sq=""):
     """Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
     Note that the entries have dates and authors, but no titles.