diff --git a/databaseReset.py b/databaseReset.py
index db7a781..2d0af58 100644
--- a/databaseReset.py
+++ b/databaseReset.py
@@ -238,15 +238,16 @@ class JobQueue():
         for runfunction in self.queue:
             start = time.time()
            memstart = get_process_memory()
+            jobname, jobparser = runfunction
             #--------------------
-            runfunction[1]()     # invokes function passed in the second item in the tuple
+            jobparser()     # invokes the parser function, the second item in the tuple
             #--------------------
             memend = get_process_memory()
             duration = time.time()-start
             #print(" - MEMORY start:{:.3f} MB end:{:.3f} MB change={:.3f} MB".format(memstart,memend, ))
-            print("\n*- Ended \"", runfunction[0], f"\" {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
-            self.results[runfunction[0]].pop()  # the null item
-            self.results[runfunction[0]].append(duration)
+            print("\n*- Ended \"", jobname, f"\" {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
+            self.results[jobname].pop()  # the null item
+            self.results[jobname].append(duration)
 
         jobend = time.time()
diff --git a/parsers/imports.py b/parsers/imports.py
index 9cc945c..3723ce3 100644
--- a/parsers/imports.py
+++ b/parsers/imports.py
@@ -41,10 +41,10 @@ def import_logbooks():
     with transaction.atomic():
         troggle.parsers.logbooks.LoadLogbooks()
 
-def import_logbook(year=2003):
+def import_logbook(year=2019):
     print(f"-- Importing Logbook {year}")
     with transaction.atomic():
-        troggle.parsers.logbooks.LoadLogbook(year)
+        troggle.parsers.logbooks.LoadLogbook(year, format="blog")
 
 def import_QMs():
     print("-- Importing old QMs for 161, 204, 234 from CSV files")
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 0bbc23d..7e2870b 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -56,6 +56,7 @@ DEFAULT_LOGBOOK_FILE = "logbook.html"
 # All years since 2010 use the default value for Logbook parser
 # but several don't work, and are skipped by the parsing code, e.g. 1983
 LOGBOOK_PARSER_SETTINGS = {
+    "2019": ("logbook.html", "parser_html"),
     "2010": ("logbook.html", "parser_html"),
     "2009": ("2009logbook.txt", "wiki_parser"),
     "2008": ("2008logbook.txt", "wiki_parser"),
@@ -171,7 +172,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
         return
 
     if not author:
-        message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
+        message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
         DataIssue.objects.create(parser='logbooks', message=message)
         logdataissues["title"]=message
         print(message)
@@ -471,60 +472,68 @@ def parser_html_01(year, expedition, txt):
             print(message)
             return
 
-# parser for 2003. Retired after conversion of the logbook.html
-# KEEP THIS COMMENTED-OUT example until after we have doen the same thing with the html_01 parser
-# def parser_html_03(year, expedition, txt):
-    # global logentries
-    # global logdataissues
+def parser_blog(year, expedition, txt):
+    '''Parses the format of web pages collected as "Save As HTML" from the UK Caving blog website.
+    Note that the entries have dates and authors, but no titles.
+    '''
+    global logentries
+    global logdataissues
+    errorcount = 0
 
-    # tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
-        # ltriptext = re.sub(r"</p>", "", triptext)
-        # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-        # ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
-        # #ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
-
-
-        # entrytuple = (ldate, location, tripname, ltriptext,
-            # trippeople, expedition, tu, tid)
-        # logentries.append(entrytuple)
+    tripheads = re.findall(r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt)
+    if not ( tripheads ) :
+        message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
+        print(message)
+
+    tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\s*>([\s\S]*?)(?=</article)", txt)
+    if not ( tripparas ) :
+        message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
+        print(message)
+
+    if (len(tripheads) != len(tripparas)):
+        print(f"{len(tripheads)} != {len(tripparas)}")
+
+    location = "Plateau"
+    tu = 0
+    logbook_entry_count = 0
+    for i in range(0, len(tripparas)):
+        trippara = tripparas[i]
+        triphead = tripheads[i]
+        logbook_entry_count += 1
+        tid = set_trip_id(year, logbook_entry_count) + "_blog"
+
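+        # each triphead is the metadata wrapper for one blog post; the author
+        # and timestamp are picked out of its attributes by the regexes below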
+        # data-author="tcacrossley"
+        match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
+        if not ( match_author ) :
+            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
+            DataIssue.objects.create(parser='logbooks', message=message)
+            logdataissues[tid]=message
+            print(message)
+            break # abandons the rest of the file, not just this entry
+        trippeople = match_author.group(1)
+        # print(f" - tid: {tid} {trippeople}")
+        # datetime="2019-07-11T13:16:18+0100"
+        match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
+        if not ( match_datetime ) :
+            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
+            DataIssue.objects.create(parser='logbooks', message=message)
+            logdataissues[tid]=message
+            print(message)
+            break # abandons the rest of the file, not just this entry
+        datestamp = match_datetime.group(1)
+
+        # datetime.fromisoformat() rejects the "+0100" offset style before python 3.11,
+        # so parse the timestamp explicitly
+        tripdate = datetime.strptime(datestamp, "%Y-%m-%dT%H:%M:%S%z")
+        print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
+
+        tripname = f"UK Caving Blog post {logbook_entry_count}" # must be unique for a given date
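+        # field order matches the other parsers' entrytuples, which are
+        # consumed by EnterLogIntoDbase(date, place, title, text, trippeople, expedition, ...)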
+        entrytuple = (tripdate, location, tripname, trippara,
+                        trippeople, expedition, tu, tid)
+        logentries.append(entrytuple)
+
 def LoadLogbookForExpedition(expedition):
     """ Parses all logbook entries for one expedition
     """
@@ -566,9 +575,12 @@ def LoadLogbookForExpedition(expedition):
     cleanerrors(year)
 
     if year in yearlinks:
-        logbookpath = Path(expologbase) / year / yearlinks[year][0]
-        expedition.logbookfile = yearlinks[year][0]
-        parsefunc = yearlinks[year][1]
+        yearfile, yearparser = yearlinks[year]
+        logbookpath = Path(expologbase) / year / yearfile
+        expedition.logbookfile = yearfile
+        parsefunc = yearparser
+        print(f" - Logbook file {yearfile} using parser {yearparser}")
+
     else:
         logbookpath = Path(expologbase) / year / DEFAULT_LOGBOOK_FILE
         expedition.logbookfile = DEFAULT_LOGBOOK_FILE
@@ -617,14 +629,20 @@ def LoadLogbookForExpedition(expedition):
     return len(logentries)
 
-def LoadLogbook(year):
+def LoadLogbook(year, format="cucc"):
+    global LOGBOOK_PARSER_SETTINGS
+
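+    # format="blog" diverts this year's import to a scraped UK Caving blog page;
+    # the default format="cucc" leaves LOGBOOK_PARSER_SETTINGS untouched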
     nlbe={}
     TROG['pagecache']['expedition'][year] = None # clear cache
     expo = Expedition.objects.get(year=year)
-    nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
-
+    if (format=="blog"):
+        LOGBOOK_PARSER_SETTINGS[str(year)] = ("ukcavingblog.html", "parser_blog")
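+        # this override is picked up when LoadLogbookForExpedition() runs below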
+        # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
+
+    nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
+
 def LoadLogbooks():
     """ This is the master function for parsing all logbooks into the Troggle database.
     This should be rewritten to use coroutines to load all logbooks from disc in parallel,