diff --git a/parsers/imports.py b/parsers/imports.py index e445a2a..2f318b8 100644 --- a/parsers/imports.py +++ b/parsers/imports.py @@ -41,7 +41,7 @@ def import_logbooks(): with transaction.atomic(): troggle.parsers.logbooks.LoadLogbooks() -def import_logbook(year=1982): +def import_logbook(year=1987): print(f"-- Importing Logbook {year}") with transaction.atomic(): troggle.parsers.logbooks.LoadLogbook(year) diff --git a/parsers/logbooks.py b/parsers/logbooks.py index e3d9d06..cd95304 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -76,7 +76,8 @@ LOGBOOK_PARSER_SETTINGS = { "1990": ("log.htm", "parser_html_01"), "1989": ("log.htm", "parser_html_01"), #crashes MySQL "1988": ("log.htm", "parser_html_01"), #crashes MySQL - "1987": ("log.htm", "parser_html_01"), #crashes MySQL + #"1987": ("log.htm", "parser_02"), #crashes MySQL + "1987": ("logbook.html", "parser_html"), "1985": ("logbook.html", "parser_html"), "1984": ("logbook.html", "parser_html"), "1983": ("logbook.html", "parser_html"), @@ -87,7 +88,7 @@ entries = { "2022": 86, "2019": 56, "2018": 100, "2017": 76, "2016": 83, "2015": "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53, "2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42, - "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1, + "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 34, "1985": 24, "1984": 32, "1983": 52, "1982": 42,} # Logbooks log.htm exist for 87, 88, 89 but have no full-working parser, or need hand-editing. @@ -366,27 +367,13 @@ def parser_html(year, expedition, txt, seq=""): trippeople, expedition, tu, tripid1) logentries.append(entrytuple) - # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place def parser_html_01(year, expedition, txt, seq=""): - '''This uses some of the more obscure capabilities of regular expressions, - see https://docs.python.org/3/library/re.html - ''' global logentries global logdataissues errorcount = 0 - - # extract front material and stash for later use when rebuilding from list of entries - headmatch = re.match(r"(?i)(?s).*
]*>(.*?)]*>(T/?U.*)', triptext) + if mtu: + tu = mtu.group(1) + triptext = triptext[:mtu.start(0)] + triptext[mtu.end():] + else: + tu = "" + + triptitles = triptitle.split(" - ") + tripcave = triptitles[0].strip() + + ltriptext = triptext + + mtail = re.search(r'(?:[^<]*|\s|/|-|&|?p>|\((?:same day|\d+)\))*$', ltriptext) + if mtail: + ltriptext = ltriptext[:mtail.start(0)] + ltriptext = re.sub(r"
", "", ltriptext) + ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) + ltriptext = re.sub(r"?u>", "_", ltriptext) + ltriptext = re.sub(r"?i>", "''", ltriptext) + ltriptext = re.sub(r"?b>", "'''", ltriptext) + ltriptext = re.sub(r"", "
", ltriptext).strip()
+
+ if ltriptext == "":
+ message = " ! - Zero content for logbook entry!: " + tid
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+
+
+ entrytuple = (ldate, tripcave, triptitle, ltriptext,
+ trippeople, expedition, tu, tid)
+ logentries.append(entrytuple)
+
+ except:
+ message = f" ! - Skipping logentry {year} due to exception in: {tid}"
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+ errorcount += 1
+ raise
+ if errorcount >5 :
+ message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+ return
+
def parser_blog(year, expedition, txt, sq=""):
'''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
Note that the entries have dates and authors, but no titles.