diff --git a/parsers/imports.py b/parsers/imports.py index ffe6851..4b531bc 100644 --- a/parsers/imports.py +++ b/parsers/imports.py @@ -41,7 +41,7 @@ def import_logbooks(): with transaction.atomic(): troggle.parsers.logbooks.LoadLogbooks() -def import_logbook(year=1988): +def import_logbook(year=1989): print(f"-- Importing Logbook {year}") with transaction.atomic(): troggle.parsers.logbooks.LoadLogbook(year) diff --git a/parsers/logbooks.py b/parsers/logbooks.py index db0099c..5a1a2f4 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -74,9 +74,7 @@ LOGBOOK_PARSER_SETTINGS = { "1992": ("log.htm", "parser_html_01"), "1991": ("log.htm", "parser_html_01"), "1990": ("log.htm", "parser_html_01"), - "1989": ("log.htm", "parser_html_01"), #crashes MySQL - #"1988": ("log.htm", "parser_02"), #crashes MySQL - #"1987": ("log.htm", "parser_02"), #crashes MySQL + "1989": ("logbook.html", "parser_html"), "1988": ("logbook.html", "parser_html"), "1987": ("logbook.html", "parser_html"), "1985": ("logbook.html", "parser_html"), @@ -89,7 +87,7 @@ entries = { "2022": 86, "2019": 56, "2018": 100, "2017": 76, "2016": 83, "2015": "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53, "2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42, - "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 61,"1987": 34, + "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 63,"1988": 61,"1987": 34, "1985": 24, "1984": 32, "1983": 52, "1982": 42,} # Logbooks log.htm exist for 87, 88, 89 but have no full-working parser, or need hand-editing. @@ -369,7 +367,6 @@ def parser_html(year, expedition, txt, seq=""): logentries.append(entrytuple) # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it -# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. 
place def parser_html_01(year, expedition, txt, seq=""): global logentries global logdataissues @@ -478,135 +475,6 @@ def parser_html_01(year, expedition, txt, seq=""): print(message) return -# variant parser for 1987 -def parser_02(year, expedition, txt, seq=""): - '''This uses some of the more obscure capabilities of regular expressions, - see https://docs.python.org/3/library/re.html - ''' - global logentries - global logdataissues - errorcount = 0 - - # extract front material and stash for later use when rebuilding from list of entries - headmatch = re.match(r"(?i)(?s).*
]*>(.*?)]*>(T/?U.*)', triptext) - if mtu: - tu = mtu.group(1) - triptext = triptext[:mtu.start(0)] + triptext[mtu.end():] - else: - tu = "" - - triptitles = triptitle.split(" - ") - tripcave = triptitles[0].strip() - - ltriptext = triptext - - mtail = re.search(r'(?:[^<]*|\s|/|-|&|?p>|\((?:same day|\d+)\))*$', ltriptext) - if mtail: - ltriptext = ltriptext[:mtail.start(0)] - ltriptext = re.sub(r"
", "", ltriptext) - ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) - ltriptext = re.sub(r"?u>", "_", ltriptext) - ltriptext = re.sub(r"?i>", "''", ltriptext) - ltriptext = re.sub(r"?b>", "'''", ltriptext) - ltriptext = re.sub(r"", "
", ltriptext).strip()
-
- if ltriptext == "":
- message = " ! - Zero content for logbook entry!: " + tid
- DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues[tid]=message
- print(message)
-
-
- entrytuple = (ldate, tripcave, triptitle, ltriptext,
- trippeople, expedition, tu, tid)
- logentries.append(entrytuple)
-
- except:
- message = f" ! - Skipping logentry {year} due to exception in: {tid}"
- DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues[tid]=message
- print(message)
- errorcount += 1
- raise
- if errorcount >5 :
- message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
- DataIssue.objects.create(parser='logbooks', message=message)
- logdataissues[tid]=message
- print(message)
- return
-
def parser_blog(year, expedition, txt, sq=""):
'''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
Note that the entries have dates and authors, but no titles.
@@ -847,7 +715,7 @@ def LoadLogbooks():
noexpo = ["1986", "2020", "2021",] #no expo
lostlogbook = ["1976", "1977", "1978", "1979", "1980", "1981"]
- sqlfail = ["1989"] # breaks mysql with db constraint fail - debug locally first]
+ sqlfail = [""] # breaks mysql with db constraint fail - all now fixed.
nologbook = noexpo + lostlogbook + sqlfail
nlbe={}