From cb50528e2d6bc7a215a8b55b46a9859aae7f4f83 Mon Sep 17 00:00:00 2001
From: Philip Sargent
Date: Wed, 14 Dec 2022 23:46:14 +0000
Subject: [PATCH] exptl parse UK Caving blog
---
databaseReset.py | 9 +--
parsers/imports.py | 4 +-
parsers/logbooks.py | 130 +++++++++++++++++++++++++-------------------
3 files changed, 81 insertions(+), 62 deletions(-)
diff --git a/databaseReset.py b/databaseReset.py
index db7a781..2d0af58 100644
--- a/databaseReset.py
+++ b/databaseReset.py
@@ -238,15 +238,16 @@ class JobQueue():
for runfunction in self.queue:
start = time.time()
memstart = get_process_memory()
+ jobname, jobparser = runfunction
#--------------------
- runfunction[1]() # invokes function passed in the second item in the tuple
+ jobparser() # invokes function passed in the second item in the tuple
#--------------------
memend = get_process_memory()
duration = time.time()-start
#print(" - MEMORY start:{:.3f} MB end:{:.3f} MB change={:.3f} MB".format(memstart,memend, ))
- print("\n*- Ended \"", runfunction[0], f"\" {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
- self.results[runfunction[0]].pop() # the null item
- self.results[runfunction[0]].append(duration)
+ print("\n*- Ended \"", jobname, f"\" {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
+ self.results[jobname].pop() # the null item
+ self.results[jobname].append(duration)
jobend = time.time()
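[Note: the change above only names the two halves of each queue entry instead of indexing runfunction[0]/runfunction[1]. A minimal sketch of the pattern, with hypothetical job functions standing in for the real parsers, assuming each queue entry is a (name, callable) tuple as in the patched loop:]

    import time

    queue = [("caves", lambda: None), ("logbooks", lambda: None)]  # hypothetical (name, callable) jobs
    for jobname, jobparser in queue:   # tuple unpacking replaces runfunction[0]/runfunction[1]
        start = time.time()
        jobparser()                    # invoke the parser
        print(f'*- Ended "{jobname}" {time.time() - start:.1f} seconds')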
diff --git a/parsers/imports.py b/parsers/imports.py
index 9cc945c..3723ce3 100644
--- a/parsers/imports.py
+++ b/parsers/imports.py
@@ -41,10 +41,10 @@ def import_logbooks():
with transaction.atomic():
troggle.parsers.logbooks.LoadLogbooks()
-def import_logbook(year=2003):
+def import_logbook(year=2019):
print(f"-- Importing Logbook {year}")
with transaction.atomic():
- troggle.parsers.logbooks.LoadLogbook(year)
+ troggle.parsers.logbooks.LoadLogbook(year, format="blog")
def import_QMs():
print("-- Importing old QMs for 161, 204, 234 from CSV files")
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 0bbc23d..7e2870b 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -56,6 +56,7 @@ DEFAULT_LOGBOOK_FILE = "logbook.html"
# All years since 2010 use the default value for Logbook parser
# but several don't work, and are skipped by the parsing code, e.g. 1983
LOGBOOK_PARSER_SETTINGS = {
+ "2019": ("logbook.html", "parser_html"),
"2010": ("logbook.html", "parser_html"),
"2009": ("2009logbook.txt", "wiki_parser"),
"2008": ("2008logbook.txt", "wiki_parser"),
@@ -171,7 +172,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
return
if not author:
- message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
+ message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["title"]=message
print(message)
@@ -471,60 +472,68 @@ def parser_html_01(year, expedition, txt):
print(message)
return
-# parser for 2003. Retired after conversion of the logbook.html
-# KEEP THIS COMMENTED-OUT example until after we have done the same thing with the html_01 parser
-# def parser_html_03(year, expedition, txt):
- # global logentries
- # global logdataissues
+def parser_blog(year, expedition, txt):
+ '''Parses the format of web pages collected as 'Save As HTML' from the UK Caving blog website.
+ Note that the entries have dates and authors, but no titles.
+ '''
+ global logentries
+ global logdataissues
+ errorcount = 0
- # tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
+ tripheads = re.findall(r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt)
+ tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\s*>([\s\S]*?)(?=</article)", txt)
- # s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
- # if not ( s ) :
- # message = " ! - Skipping logentry {year} on failure to parse parser_html_03: {} {} {}...".format(tid,s,trippara[:300])
- # DataIssue.objects.create(parser='logbooks', message=message)
- # logdataissues[tid]=message
- # print(message)
- # break
+ if (len(tripheads) != len(tripparas)):
+ print(f" ! - Mismatch: {len(tripheads)} article headers != {len(tripparas)} article bodies")
+
+ location = "Plateau"
+ tripname = "UK Caving Blog post"
+ tu = 0
+ logbook_entry_count = 0
+ for i in range(0, len(tripparas)):
+ trippara = tripparas[i]
+ triphead = tripheads[i]
+ logbook_entry_count += 1
+ tid = set_trip_id(year,logbook_entry_count) +"_blog"
+ # print(f" - tid: {tid}")
- # tripheader, triptext = s.group(1), s.group(2)
- # tripheader = re.sub(r" ", " ", tripheader)
- # tripheader = re.sub(r"\s+", " ", tripheader).strip()
- # sheader = tripheader.split(" -- ")
- # tu = ""
- # if re.match("T/U|Time underwater", sheader[-1]):
- # tu = sheader.pop() # not a number in 2003 usually
- # # print(f" - {logbook_entry_count} '{tu}' ")
- # if len(sheader) != 3:
- # print(" ! Header not three pieces for parser_html_03() ", sheader)
- # tripdate, triptitle, trippeople = sheader
- # ldate = ParseDate(tripdate.strip(), year)
- # # print(f" - {logbook_entry_count} '{ldate}' from '{tripdate.strip()}' ")
- # # print(f" - {logbook_entry_count} '{trippeople}' ")
- # titlelist = triptitle.split(" , ")
- # if len(titlelist) >= 2:
- # location, *namelist = titlelist # list unpacking operator
- # tripname = ", ".join(namelist) # concatenate strings
- # # print(f" - {logbook_entry_count} {location} '{tripname}'")
- # else:
- # location = "UNKNOWN"
-
- # ltriptext = triptext + "<br /><br />\n\n" + tu
- # ltriptext = re.sub(r"</p>", "", ltriptext)
- # #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
- # ltriptext = re.sub(r"<p>", "<br /><br />\n\n", ltriptext).strip()
- # #ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
-
-
- # entrytuple = (ldate, location, tripname, ltriptext,
- # trippeople, expedition, tu, tid)
- # logentries.append(entrytuple)
+ # data-author="tcacrossley"
+ match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
+ if not ( match_author ) :
+ message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+ break
+ trippeople = match_author.group(1)
+ # print(f" - tid: {tid} {trippeople}")
+ # datetime="2019-07-11T13:16:18+0100"
+ match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
+ if not ( match_datetime ) :
+ message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
+ DataIssue.objects.create(parser='logbooks', message=message)
+ logdataissues[tid]=message
+ print(message)
+ break
+ datestamp = match_datetime.group(1)
+
+ tripdate = datetime.fromisoformat(datestamp)
+ print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
+
+ tripname = f"UK Caving Blog post {logbook_entry_count}" # must be unique for a given date
+ entrytuple = (tripdate, location, tripname, trippara,
+ trippeople, expedition, tu, tid)
+ logentries.append(entrytuple)
+
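[Note: a self-contained sketch of the header parsing above, run on a fabricated blog-header fragment (the attribute layout is inferred from the regexes, not copied from a real saved page). One caveat worth recording: datetime.fromisoformat() only accepts a colon-separated offset such as "+01:00" before Python 3.11, so a raw "+0100" stamp like the one in the comment above raises ValueError on older interpreters:]

    import re
    from datetime import datetime

    triphead = ('<div data-author="tcacrossley" data-content="post-123" '
                'datetime="2019-07-11T13:16:18+01:00" data-time="1562847378">')

    match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
    match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
    print(match_author.group(1))                            # -> tcacrossley
    print(datetime.fromisoformat(match_datetime.group(1)))  # -> 2019-07-11 13:16:18+01:00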
def LoadLogbookForExpedition(expedition):
""" Parses all logbook entries for one expedition
"""
@@ -566,9 +575,12 @@ def LoadLogbookForExpedition(expedition):
cleanerrors(year)
if year in yearlinks:
- logbookpath = Path(expologbase) / year / yearlinks[year][0]
- expedition.logbookfile = yearlinks[year][0]
- parsefunc = yearlinks[year][1]
+ yearfile, yearparser = yearlinks[year]
+ logbookpath = Path(expologbase) / year / yearfile
+ expedition.logbookfile = yearfile
+ parsefunc = yearparser
+ print(f" - Logbook file {yearfile} using parser {yearparser}")
+
else:
logbookpath = Path(expologbase) / year / DEFAULT_LOGBOOK_FILE
expedition.logbookfile = DEFAULT_LOGBOOK_FILE
@@ -617,14 +629,20 @@ def LoadLogbookForExpedition(expedition):
return len(logentries)
-def LoadLogbook(year):
+def LoadLogbook(year, format="cucc"):
+ global LOGBOOK_PARSER_SETTINGS
+
nlbe={}
TROG['pagecache']['expedition'][year] = None # clear cache
expo = Expedition.objects.get(year=year)
- nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
-
+ if (format=="blog"):
+ LOGBOOK_PARSER_SETTINGS[str(year)] = ("ukcavingblog.html", "parser_blog")
+ # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
+
+ nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
+
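[Note: a hedged usage sketch of the new format switch. LoadLogbook(year, format="blog") first rebinds that year's entry in LOGBOOK_PARSER_SETTINGS to ("ukcavingblog.html", "parser_blog"), so the per-expedition loader then picks up the blog file and parser instead of the defaults:]

    from troggle.parsers.logbooks import LoadLogbook

    LoadLogbook(2019)                  # default "cucc" format: whatever the settings table says
    LoadLogbook(2019, format="blog")   # ukcavingblog.html parsed with parser_blog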
def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database.
This should be rewritten to use coroutines to load all logbooks from disc in parallel,