From 517da57a0cecf3a8c50cea72c6df3690c506e08b Mon Sep 17 00:00:00 2001
From: Philip Sargent
Date: Tue, 20 Dec 2022 23:48:56 +0000
Subject: [PATCH] All broken logbooks now parsing correctly.

---
 parsers/imports.py  |   2 +-
 parsers/logbooks.py | 138 +-------------------------------------------
 2 files changed, 4 insertions(+), 136 deletions(-)

diff --git a/parsers/imports.py b/parsers/imports.py
index ffe6851..4b531bc 100644
--- a/parsers/imports.py
+++ b/parsers/imports.py
@@ -41,7 +41,7 @@ def import_logbooks():
     with transaction.atomic():
         troggle.parsers.logbooks.LoadLogbooks()
 
-def import_logbook(year=1988):
+def import_logbook(year=1989):
     print(f"-- Importing Logbook {year}")
     with transaction.atomic():
         troggle.parsers.logbooks.LoadLogbook(year)
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index db0099c..5a1a2f4 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -74,9 +74,7 @@ LOGBOOK_PARSER_SETTINGS = {
     "1992": ("log.htm", "parser_html_01"),
     "1991": ("log.htm", "parser_html_01"),
     "1990": ("log.htm", "parser_html_01"),
-    "1989": ("log.htm", "parser_html_01"), #crashes MySQL
-    #"1988": ("log.htm", "parser_02"), #crashes MySQL
-    #"1987": ("log.htm", "parser_02"), #crashes MySQL
+    "1989": ("logbook.html", "parser_html"),
     "1988": ("logbook.html", "parser_html"),
     "1987": ("logbook.html", "parser_html"),
     "1985": ("logbook.html", "parser_html"),
@@ -89,7 +87,7 @@ entries = { "2022": 86, "2019": 56, "2018": 100, "2017": 76, "2016": 83, "2015":
     "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53, "2008": 49,
     "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, "2001": 48,
     "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
-    "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 61,"1987": 34,
+    "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 63,"1988": 61,"1987": 34,
     "1985": 24, "1984": 32, "1983": 52, "1982": 42,}
 
 # Logbooks log.htm exist for 87, 88, 89 but have no full-working parser, or need hand-editing.
@@ -369,7 +367,6 @@ def parser_html(year, expedition, txt, seq=""):
         logentries.append(entrytuple)
 
 # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
-# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
 def parser_html_01(year, expedition, txt, seq=""):
     global logentries
     global logdataissues
@@ -478,135 +475,6 @@ def parser_html_01(year, expedition, txt, seq=""):
             print(message)
             return
 
-# variant parser for 1987
-def parser_02(year, expedition, txt, seq=""):
-    '''This uses some of the more obscure capabilities of regular expressions,
-    see https://docs.python.org/3/library/re.html
-    '''
-    global logentries
-    global logdataissues
-    errorcount = 0
-
-    # extract front material and stash for later use when rebuilding from list of entries
-    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
-    headpara = headmatch.groups()[0].strip()
-
-    # print(f" - headpara:\n'{headpara}'")
-    if(len(headpara)>0):
-        frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
-        with open(frontpath,"w") as front:
-            front.write(headpara+"\n")
-
-    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
-    logbook_entry_count = 0
-    for trippara in tripparas:
-        logbook_entry_count += 1
-        tid = set_trip_id(year, logbook_entry_count)
-
-        try:
-            # s = re.match(r"(?i)(?s)\s*(<p>)?(.*?)</?p>(.*)$", trippara)
-            s = re.match(r"(?i)(?s)\s*(<p>)?.*?<a\s+id=[^>]*>([\s\S]*?)</a>(.*)$", trippara)
-            if not s:
-                message = f" ! - Skipping logentry {tid} failure to parse header: " + tid + trippara[:300] + "..."
-                DataIssue.objects.create(parser='logbooks', message=message)
-                logdataissues[tid]=message
-                print(message)
-                break
-            #print(s.group(2))
-            #print(s.group(3)[:80])
-            try:
-                tripheader, triptext = s.group(2), s.group(3)
-            except:
-                message = f" ! - Fail to set tripheader, triptext. trip:<{tid}> s:'{s}'"
-                DataIssue.objects.create(parser='logbooks', message=message)
-                logdataissues[tid]=message
-                print(message)
-
-            print(f"  {tid} {tripheader}")
-            if not tripheader:
-                continue
-
-            # mtripid = re.search(r'<a id="(.*?)"', tripheader)
-            # message = f" ! - mtripid not found. trip:<{tid}> header:'{tripheader}'"
-            # DataIssue.objects.create(parser='logbooks', message=message)
-            # logdataissues[tid]=message
-            # print(message)
-
-            # tripid = mtripid and mtripid.group(1) or ""
-            # print(f" # - mtripid: {mtripid}")
-            tripheader = re.sub(r"</?\w+[^>]*>", "", tripheader)
-            # print(f" #2 - tid: {tid}")
-            try:
-                tripdate, triptitle, trippeople = tripheader.split("|")
-            except:
-                message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}>\n   '{tripheader.split('|')}'"
-                DataIssue.objects.create(parser='logbooks', message=message)
-                logdataissues[tid]=message
-                print(message)
-                try:
-                    tripdate, triptitle = tripheader.split("|")
-                    trippeople = "GUESS ANON"
-                except:
-                    message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
-                    DataIssue.objects.create(parser='logbooks', message=message)
-                    logdataissues[tid]=message
-                    print(message)
-                    break
-            # print(f" #3 - tid: {tid}")
-            triptitle = triptitle.strip()
-            ldate = ParseDate(tripdate.strip(), year)
-            #print(f" # - tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
-            # print(f" #4 - tid: {tid}")
-
-            mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
-            if mtu:
-                tu = mtu.group(1)
-                triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
-            else:
-                tu = ""
-
-            triptitles = triptitle.split(" - ")
-            tripcave = triptitles[0].strip()
-
-            ltriptext = triptext
-
-            mtail = re.search(r'(?:[^<]*|\s|/|-|&|<br>|\((?:same day|\d+)\))*$', ltriptext)
-            if mtail:
-                ltriptext = ltriptext[:mtail.start(0)]
-            ltriptext = re.sub(r"<p></p>", "", ltriptext)
-            ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-            ltriptext = re.sub(r"</?u>", "_", ltriptext)
-            ltriptext = re.sub(r"</?i>", "''", ltriptext)
-            ltriptext = re.sub(r"</?b>", "'''", ltriptext)
-            ltriptext = re.sub(r"</p><p>", "<br /><br />", ltriptext).strip()
-
-            if ltriptext == "":
-                message = " ! - Zero content for logbook entry!: " + tid
-                DataIssue.objects.create(parser='logbooks', message=message)
-                logdataissues[tid]=message
-                print(message)
-
-
-            entrytuple = (ldate, tripcave, triptitle, ltriptext,
-                          trippeople, expedition, tu, tid)
-            logentries.append(entrytuple)
-
-        except:
-            message = f" ! - Skipping logentry {year} due to exception in: {tid}"
-            DataIssue.objects.create(parser='logbooks', message=message)
-            logdataissues[tid]=message
-            print(message)
-            errorcount += 1
-            raise
-        if errorcount >5 :
-            message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
-            DataIssue.objects.create(parser='logbooks', message=message)
-            logdataissues[tid]=message
-            print(message)
-            return
-
 def parser_blog(year, expedition, txt, sq=""):
     '''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
     Note that the entries have dates and authors, but no titles.
@@ -847,7 +715,7 @@ def LoadLogbooks():
 
     noexpo = ["1986", "2020", "2021",] #no expo
     lostlogbook = ["1976", "1977", "1978", "1979", "1980", "1981"]
-    sqlfail = ["1989"] # breaks mysql with db constraint fail - debug locally first]
+    sqlfail = [""] # breaks mysql with db constraint fail - all now fixed.]
     nologbook = noexpo + lostlogbook + sqlfail
 
     nlbe={}