From 259f85742aa0ffabe300329ca0e671ecaa80ef79 Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Mon, 21 Nov 2022 16:47:25 +0000 Subject: [PATCH] moved parser settings --- parsers/logbooks.py | 72 ++++++++++++++++++++++----------------------- settings.py | 37 ----------------------- 2 files changed, 36 insertions(+), 73 deletions(-) diff --git a/parsers/logbooks.py b/parsers/logbooks.py index a1df040..d79a989 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -46,41 +46,41 @@ todo=''' ''' MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200 -DEFAULT_LOGBOOK_PARSER = "Parseloghtmltxt" +DEFAULT_LOGBOOK_PARSER = "parser_html" DEFAULT_LOGBOOK_FILE = "logbook.html" # All years since 2010 use the default value for Logbook parser # but several don't work, and are skipped by the parsing code, e.g. 1983 LOGBOOK_PARSER_SETTINGS = { - "2010": ("logbook.html", "Parseloghtmltxt"), - "2009": ("2009logbook.txt", "Parselogwikitxt"), - "2008": ("2008logbook.txt", "Parselogwikitxt"), - "2007": ("logbook.html", "Parseloghtmltxt"), - "2006": ("logbook.html", "Parseloghtmltxt"), -# "2006": ("logbook/logbook_06.txt", "Parselogwikitxt"), - "2006": ("logbook.html", "Parseloghtmltxt"), - "2005": ("logbook.html", "Parseloghtmltxt"), - "2004": ("logbook.html", "Parseloghtmltxt"), - "2003": ("logbook.html", "Parseloghtml03"), - "2002": ("logbook.html", "Parseloghtmltxt"), - "2001": ("log.htm", "Parseloghtml01"), - "2000": ("log.htm", "Parseloghtml01"), - "1999": ("log.htm", "Parseloghtml01"), - "1998": ("log.htm", "Parseloghtml01"), - "1997": ("log.htm", "Parseloghtml01"), - "1996": ("log.htm", "Parseloghtml01"), - "1995": ("log.htm", "Parseloghtml01"), - "1994": ("log.htm", "Parseloghtml01"), - "1993": ("log.htm", "Parseloghtml01"), - "1992": ("log.htm", "Parseloghtml01"), - "1991": ("log.htm", "Parseloghtml01"), - "1990": ("log.htm", "Parseloghtml01"), - "1989": ("log.htm", "Parseloghtml01"), #crashes MySQL - "1988": ("log.htm", "Parseloghtml01"), #crashes MySQL - "1987": ("log.htm", "Parseloghtml01"), #crashes MySQL - "1985": ("log.htm", "Parseloghtml01"), - "1984": ("log.htm", "Parseloghtml01"), - "1983": ("log.htm", "Parseloghtml01"), - "1982": ("log.htm", "Parseloghtml01"), + "2010": ("logbook.html", "parser_html"), + "2009": ("2009logbook.txt", "parser_wiki"), + "2008": ("2008logbook.txt", "parser_wiki"), + "2007": ("logbook.html", "parser_html"), + "2006": ("logbook.html", "parser_html"), +# "2006": ("logbook/logbook_06.txt", "parser_wiki"), + "2006": ("logbook.html", "parser_html"), + "2005": ("logbook.html", "parser_html"), + "2004": ("logbook.html", "parser_html"), + "2003": ("logbook.html", "parser_html_03"), + "2002": ("logbook.html", "parser_html"), + "2001": ("log.htm", "parser_html_01"), + "2000": ("log.htm", "parser_html_01"), + "1999": ("log.htm", "parser_html_01"), + "1998": ("log.htm", "parser_html_01"), + "1997": ("log.htm", "parser_html_01"), + "1996": ("log.htm", "parser_html_01"), + "1995": ("log.htm", "parser_html_01"), + "1994": ("log.htm", "parser_html_01"), + "1993": ("log.htm", "parser_html_01"), + "1992": ("log.htm", "parser_html_01"), + "1991": ("log.htm", "parser_html_01"), + "1990": ("log.htm", "parser_html_01"), + "1989": ("log.htm", "parser_html_01"), #crashes MySQL + "1988": ("log.htm", "parser_html_01"), #crashes MySQL + "1987": ("log.htm", "parser_html_01"), #crashes MySQL + "1985": ("log.htm", "parser_html_01"), + "1984": ("log.htm", "parser_html_01"), + "1983": ("log.htm", "parser_html_01"), + "1982": ("log.htm", "parser_html_01"), } entries = { "2022": 64, "2019": 44, "2018": 74, "2017": 60, "2016": 81, "2015": 79, @@ -248,7 +248,7 @@ def ParseDate(tripdate, year): return datetime.date(1970, 1, 1) # (2006 - not any more), 2008 - 2009 -def Parselogwikitxt(year, expedition, txt): +def parser_wiki(year, expedition, txt): global logentries global logdataissues @@ -290,7 +290,7 @@ def Parselogwikitxt(year, expedition, txt): # 2002, 2004, 2005, 2007, 2010 - now # 2006 wiki text is incomplete, but the html all there. So using this parser now. -def Parseloghtmltxt(year, expedition, txt): +def parser_html(year, expedition, txt): global logentries global logdataissues @@ -349,7 +349,7 @@ def Parseloghtmltxt(year, expedition, txt): # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place -def Parseloghtml01(year, expedition, txt): +def parser_html_01(year, expedition, txt): global logentries global logdataissues errorcount = 0 @@ -457,7 +457,7 @@ def Parseloghtml01(year, expedition, txt): return # parser for 2003 -def Parseloghtml03(year, expedition, txt): +def parser_html_03(year, expedition, txt): global logentries global logdataissues @@ -469,7 +469,7 @@ def Parseloghtml03(year, expedition, txt): s = re.match(r"(?s)\s*

(.*?)

(.*)$", trippara) if not ( s ) : - message = " ! - Skipping logentry {year} on failure to parse Parseloghtml03: {} {} {}...".format(tid,s,trippara[:300]) + message = " ! - Skipping logentry {year} on failure to parse parser_html_03: {} {} {}...".format(tid,s,trippara[:300]) DataIssue.objects.create(parser='logbooks', message=message) logdataissues[tid]=message print(message) diff --git a/settings.py b/settings.py index 051543b..540a6b2 100644 --- a/settings.py +++ b/settings.py @@ -69,43 +69,6 @@ FIX_PERMISSIONS = [] # top-level survex file basename (without .svx) SURVEX_TOPNAME = "1623-and-1626-no-schoenberg-hs" -MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200 -DEFAULT_LOGBOOK_PARSER = "Parseloghtmltxt" -DEFAULT_LOGBOOK_FILE = "logbook.html" -# All years since 2010 use the default value for Logbook parser -# but several don't work, and are skipped by the parsing code, e.g. 1983 -LOGBOOK_PARSER_SETTINGS = { - "2010": ("logbook.html", "Parseloghtmltxt"), - "2009": ("2009logbook.txt", "Parselogwikitxt"), - "2008": ("2008logbook.txt", "Parselogwikitxt"), - "2007": ("logbook.html", "Parseloghtmltxt"), - "2006": ("logbook.html", "Parseloghtmltxt"), -# "2006": ("logbook/logbook_06.txt", "Parselogwikitxt"), - "2006": ("logbook.html", "Parseloghtmltxt"), - "2005": ("logbook.html", "Parseloghtmltxt"), - "2004": ("logbook.html", "Parseloghtmltxt"), - "2003": ("logbook.html", "Parseloghtml03"), - "2002": ("logbook.html", "Parseloghtmltxt"), - "2001": ("log.htm", "Parseloghtml01"), - "2000": ("log.htm", "Parseloghtml01"), - "1999": ("log.htm", "Parseloghtml01"), - "1998": ("log.htm", "Parseloghtml01"), - "1997": ("log.htm", "Parseloghtml01"), - "1996": ("log.htm", "Parseloghtml01"), - "1995": ("log.htm", "Parseloghtml01"), - "1994": ("log.htm", "Parseloghtml01"), - "1993": ("log.htm", "Parseloghtml01"), - "1992": ("log.htm", "Parseloghtml01"), - "1991": ("log.htm", "Parseloghtml01"), - "1990": ("log.htm", "Parseloghtml01"), - "1989": ("log.htm", "Parseloghtml01"), #crashes MySQL - "1988": ("log.htm", "Parseloghtml01"), #crashes MySQL - "1987": ("log.htm", "Parseloghtml01"), #crashes MySQL - "1985": ("log.htm", "Parseloghtml01"), - "1984": ("log.htm", "Parseloghtml01"), - "1983": ("log.htm", "Parseloghtml01"), - "1982": ("log.htm", "Parseloghtml01"), - } # Caves for which survex files exist, but are not otherwise registered # replaced (?) by expoweb/cave_data/pendingcaves.txt