forked from expo/troggle
deleted old parser code
This commit is contained in:
parent
733765802e
commit
0c4ce6dc3c
@ -395,137 +395,6 @@ def parser_html(year, expedition, txt, seq=""):
|
||||
logentries.append(entrytuple)
|
||||
|
||||
|
||||
# Main parser for 1991-2001. Simpler because the data has already been hacked extensively to fit it.
|
||||
# def parser_html_01(year, expedition, txt, seq=""):
|
||||
# global logentries
|
||||
# global logdataissues
|
||||
# errorcount = 0
|
||||
|
||||
# # extract front material and stash for later use when rebuilding from list of entries
|
||||
# headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
|
||||
# headpara = headmatch.groups()[0].strip()
|
||||
|
||||
# # print(f" - headpara:\n'{headpara}'")
|
||||
# if len(headpara) > 0:
|
||||
# frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
|
||||
# with open(frontpath, "w") as front:
|
||||
# front.write(headpara + "\n")
|
||||
|
||||
# # extract END material and stash for later use when rebuilding from list of entries
|
||||
# endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
|
||||
# if endmatch:
|
||||
# endpara = endmatch.groups()[0].strip()
|
||||
# else:
|
||||
# print(f" ! - {year} NO endmatch")
|
||||
# endpara = ""
|
||||
|
||||
# # print(f" - endpara:\n'{endpara}'")
|
||||
# if len(endpara) > 0:
|
||||
# endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html")
|
||||
# with open(endpath, "w") as end:
|
||||
# end.write(endpara + "\n")
|
||||
|
||||
# tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
|
||||
# logbook_entry_count = 0
|
||||
# for trippara in tripparas:
|
||||
# logbook_entry_count += 1
|
||||
# tid = set_trip_id(year, logbook_entry_count)
|
||||
# # print(f" #0 - tid: {tid}")
|
||||
# try:
|
||||
# # print(f" #1 - tid: {tid}")
|
||||
# s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
|
||||
# if not s:
|
||||
# message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
|
||||
# DataIssue.objects.create(parser="logbooks", message=message)
|
||||
# logdataissues[tid] = message
|
||||
# print(message)
|
||||
# break
|
||||
# try:
|
||||
# tripheader, triptext = s.group(1), s.group(2)
|
||||
# except:
|
||||
# message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
|
||||
# DataIssue.objects.create(parser="logbooks", message=message)
|
||||
# logdataissues[tid] = message
|
||||
# print(message)
|
||||
|
||||
# # mtripid = re.search(r'<a id="(.*?)"', tripheader)
|
||||
# # if not mtripid:
|
||||
# # message = f" ! - A tag id not found. Never mind. Not needed. trip:<{tid}> header:'{tripheader}'"
|
||||
# # DataIssue.objects.create(parser='logbooks', message=message)
|
||||
# # logdataissues[tid]=message
|
||||
# # print(message)
|
||||
|
||||
# # tripid = mtripid and mtripid.group(1) or ""
|
||||
# # print(f" # - mtripid: {mtripid}")
|
||||
# tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
|
||||
# # print(f" #2 - tid: {tid}")
|
||||
# try:
|
||||
# tripdate, triptitle, trippeople = tripheader.split("|")
|
||||
# except:
|
||||
# message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
|
||||
# DataIssue.objects.create(parser="logbooks", message=message)
|
||||
# logdataissues[tid] = message
|
||||
# print(message)
|
||||
# try:
|
||||
# tripdate, triptitle = tripheader.split("|")
|
||||
# trippeople = "GUESS ANON"
|
||||
# except:
|
||||
# message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
|
||||
# DataIssue.objects.create(parser="logbooks", message=message)
|
||||
# logdataissues[tid] = message
|
||||
# print(message)
|
||||
# break
|
||||
# # print(f" #3 - tid: {tid}")
|
||||
# ldate = ParseDate(tripdate.strip(), year)
|
||||
# # print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
|
||||
# # print(f" #4 - tid: {tid}")
|
||||
|
||||
# mtu = re.search(r"<p[^>]*>(T/?U.*)", triptext)
|
||||
# if mtu:
|
||||
# tu = mtu.group(1)
|
||||
# triptext = triptext[: mtu.start(0)] + triptext[mtu.end() :]
|
||||
# else:
|
||||
# tu = ""
|
||||
|
||||
# triptitles = triptitle.split(" - ")
|
||||
# tripcave = triptitles[0].strip()
|
||||
|
||||
# ltriptext = triptext
|
||||
|
||||
# mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext)
|
||||
# if mtail:
|
||||
# ltriptext = ltriptext[: mtail.start(0)]
|
||||
# ltriptext = re.sub(r"</p>", "", ltriptext)
|
||||
# ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
|
||||
# ltriptext = re.sub(r"</?u>", "_", ltriptext)
|
||||
# ltriptext = re.sub(r"</?i>", "''", ltriptext)
|
||||
# ltriptext = re.sub(r"</?b>", "'''", ltriptext)
|
||||
# ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
|
||||
|
||||
# if ltriptext == "":
|
||||
# message = " ! - Zero content for logbook entry!: " + tid
|
||||
# DataIssue.objects.create(parser="logbooks", message=message)
|
||||
# logdataissues[tid] = message
|
||||
# print(message)
|
||||
|
||||
# entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid)
|
||||
# logentries.append(entrytuple)
|
||||
|
||||
# except:
|
||||
# message = f" ! - Skipping logentry {year} due to exception in: {tid}"
|
||||
# DataIssue.objects.create(parser="logbooks", message=message)
|
||||
# logdataissues[tid] = message
|
||||
# print(message)
|
||||
# errorcount += 1
|
||||
# raise
|
||||
# if errorcount > 5:
|
||||
# message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
|
||||
# DataIssue.objects.create(parser="logbooks", message=message)
|
||||
# logdataissues[tid] = message
|
||||
# print(message)
|
||||
# return
|
||||
|
||||
|
||||
def parser_blog(year, expedition, txt, sq=""):
|
||||
"""Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
|
||||
Note that the entries have dates and authors, but no titles.
|
||||
|
Loading…
Reference in New Issue
Block a user