exptl parse UK Caving blog

Philip Sargent 2022-12-14 23:46:14 +00:00
parent 6dd8e5a75c
commit cb50528e2d
3 changed files with 81 additions and 62 deletions

View File

@@ -238,15 +238,16 @@ class JobQueue():
for runfunction in self.queue:
start = time.time()
memstart = get_process_memory()
jobname, jobparser = runfunction
#--------------------
runfunction[1]() # invokes function passed in the second item in the tuple
jobparser() # invokes function passed in the second item in the tuple
#--------------------
memend = get_process_memory()
duration = time.time()-start
#print(" - MEMORY start:{:.3f} MB end:{:.3f} MB change={:.3f} MB".format(memstart,memend, ))
print("\n*- Ended \"", runfunction[0], f"\" {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
self.results[runfunction[0]].pop() # the null item
self.results[runfunction[0]].append(duration)
print("\n*- Ended \"", jobname, f"\" {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
self.results[jobname].pop() # the null item
self.results[jobname].append(duration)
jobend = time.time()
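The hunk above unpacks each queued (jobname, jobparser) tuple once instead of indexing runfunction[0] and runfunction[1]. A minimal sketch of the same pattern, with placeholder job functions that are assumptions for illustration only and not part of the real JobQueue:

import time

def load_caves():      # placeholder job body, assumption for illustration
    time.sleep(0.1)

def load_logbooks():   # placeholder job body, assumption for illustration
    time.sleep(0.2)

queue = [("caves", load_caves), ("logbooks", load_logbooks)]
results = {jobname: [] for jobname, _ in queue}

for runfunction in queue:
    start = time.time()
    jobname, jobparser = runfunction   # unpack once, replacing runfunction[0] / runfunction[1]
    jobparser()                        # invoke the parser function held in the tuple
    duration = time.time() - start
    print(f'*- Ended "{jobname}" {duration:.1f} seconds')
    results[jobname].append(duration)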

View File

@@ -41,10 +41,10 @@ def import_logbooks():
with transaction.atomic():
troggle.parsers.logbooks.LoadLogbooks()
def import_logbook(year=2003):
def import_logbook(year=2019):
print(f"-- Importing Logbook {year}")
with transaction.atomic():
troggle.parsers.logbooks.LoadLogbook(year)
troggle.parsers.logbooks.LoadLogbook(year, format="blog")
def import_QMs():
print("-- Importing old QMs for 161, 204, 234 from CSV files")

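The wrapper above now passes format="blog" so that LoadLogbook swaps in the blog export file and parser for that year. A minimal sketch of that override, using simplified stand-ins for LOGBOOK_PARSER_SETTINGS and LoadLogbook (the real versions live in troggle.parsers.logbooks and appear later in this diff):

LOGBOOK_PARSER_SETTINGS = {
    "2019": ("logbook.html", "parser_html"),   # default file and parser for the year
}

def LoadLogbook(year, format="cucc"):
    # simplified stand-in: only shows how the blog format replaces the year's settings
    if format == "blog":
        LOGBOOK_PARSER_SETTINGS[str(year)] = ("ukcavingblog.html", "parser_blog")
    return LOGBOOK_PARSER_SETTINGS[str(year)]

print(LoadLogbook(2019, format="blog"))   # ('ukcavingblog.html', 'parser_blog')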
View File

@@ -56,6 +56,7 @@ DEFAULT_LOGBOOK_FILE = "logbook.html"
# All years since 2010 use the default value for Logbook parser
# but several don't work, and are skipped by the parsing code, e.g. 1983
LOGBOOK_PARSER_SETTINGS = {
"2019": ("logbook.html", "parser_html"),
"2010": ("logbook.html", "parser_html"),
"2009": ("2009logbook.txt", "wiki_parser"),
"2008": ("2008logbook.txt", "wiki_parser"),
@@ -171,7 +172,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
return
if not author:
message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["title"]=message
print(message)
@@ -471,58 +472,66 @@ def parser_html_01(year, expedition, txt):
print(message)
return
# parser for 2003. Retired after conversion of the logbook.html
# KEEP THIS COMMENTED-OUT example until after we have done the same thing with the html_01 parser
# def parser_html_03(year, expedition, txt):
# global logentries
# global logdataissues
def parser_blog(year, expedition, txt):
'''Parses the format of web pages collected as "Save As HTML" from the UK Caving blog website.
Note that the entries have dates and authors, but no titles.
'''
global logentries
global logdataissues
errorcount = 0
# tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
# logbook_entry_count = 0
# for trippara in tripparas:
# logbook_entry_count += 1
# tid = set_trip_id(year,logbook_entry_count) # default trip id, before we read the date
tripheads = re.findall(r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt)
if not ( tripheads ) :
message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
print(message)
# s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
# if not ( s ) :
# message = " ! - Skipping logentry {year} on failure to parse parser_html_03: {} {} {}...".format(tid,s,trippara[:300])
# DataIssue.objects.create(parser='logbooks', message=message)
# logdataissues[tid]=message
# print(message)
# break
tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
if not ( tripparas ) :
message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
print(message)
# tripheader, triptext = s.group(1), s.group(2)
# tripheader = re.sub(r"&nbsp;", " ", tripheader)
# tripheader = re.sub(r"\s+", " ", tripheader).strip()
# sheader = tripheader.split(" -- ")
# tu = ""
# if re.match("T/U|Time underwater", sheader[-1]):
# tu = sheader.pop() # not a number in 2003 usually
# # print(f" - {logbook_entry_count} '{tu}' ")
# if len(sheader) != 3:
# print(" ! Header not three pieces for parser_html_03() ", sheader)
# tripdate, triptitle, trippeople = sheader
# ldate = ParseDate(tripdate.strip(), year)
# # print(f" - {logbook_entry_count} '{ldate}' from '{tripdate.strip()}' ")
# # print(f" - {logbook_entry_count} '{trippeople}' ")
# titlelist = triptitle.split(" , ")
# if len(titlelist) >= 2:
# location, *namelist = titlelist # list unpacking operator
# tripname = ", ".join(namelist) # concatenate strings
# # print(f" - {logbook_entry_count} {location} '{tripname}'")
# else:
# location = "UNKNOWN"
if (len(tripheads) !=len(tripparas)):
print(f"{len(tripheads)} != {len(tripparas)}")
# ltriptext = triptext + "<br /><br />\n\n" + tu
# ltriptext = re.sub(r"</p>", "", ltriptext)
# #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
# ltriptext = re.sub(r"<p>", "<br /><br />\n\n", ltriptext).strip()
# #ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
location = "Plateau"
tripname = "UK Caving Blog post"
tu = 0
logbook_entry_count = 0
for i in range(0, len(tripparas)):
trippara = tripparas[i]
triphead = tripheads[i]
logbook_entry_count += 1
tid = set_trip_id(year,logbook_entry_count) +"_blog"
# print(f" - tid: {tid}")
# data-author="tcacrossley"
match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
if not ( match_author ) :
message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
break
trippeople = match_author.group(1)
# print(f" - tid: {tid} {trippeople}")
# datetime="2019-07-11T13:16:18+0100"
match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
if not ( match_datetime ) :
message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
break
datestamp = match_datetime.group(1)
# entrytuple = (ldate, location, tripname, ltriptext,
# trippeople, expedition, tu, tid)
# logentries.append(entrytuple)
tripdate = datetime.fromisoformat(datestamp)
print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
tripname = f"UK Caving Blog post {logbook_entry_count}" # must be unique for a given date
entrytuple = (tripdate, location, tripname, trippara,
trippeople, expedition, tu, tid)
logentries.append(entrytuple)
def LoadLogbookForExpedition(expedition):
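A minimal, self-contained sketch of the header extraction parser_blog performs, built from the attribute examples quoted in the comments above (data-author="tcacrossley", datetime="2019-07-11T13:16:18+0100"); the HTML fragment, the simplified regexes and the UTC-offset fix-up for pre-3.11 datetime.fromisoformat() are illustrative assumptions, not the committed code:

import re
from datetime import datetime

triphead = ('<article class="message message--post js-post js-inlineModContainer " '
            'data-author="tcacrossley" data-content="post-1"> '
            '<time datetime="2019-07-11T13:16:18+0100" data-time="1562847378">')

match_author = re.search(r'data-author="([^"]*)"', triphead)    # simplified pattern
match_datetime = re.search(r'datetime="([^"]*)"', triphead)     # simplified pattern
if match_author and match_datetime:
    trippeople = match_author.group(1)
    datestamp = match_datetime.group(1)
    # fromisoformat() before Python 3.11 needs a colon in the UTC offset, so insert one
    tripdate = datetime.fromisoformat(datestamp[:-2] + ":" + datestamp[-2:])
    print(trippeople, tripdate)   # tcacrossley 2019-07-11 13:16:18+01:00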
@@ -566,9 +575,12 @@ def LoadLogbookForExpedition(expedition):
cleanerrors(year)
if year in yearlinks:
logbookpath = Path(expologbase) / year / yearlinks[year][0]
expedition.logbookfile = yearlinks[year][0]
parsefunc = yearlinks[year][1]
yearfile, yearparser = yearlinks[year]
logbookpath = Path(expologbase) / year / yearfile
expedition.logbookfile = yearfile
parsefunc = yearparser
print(f" - Logbook file {yearfile} using parser {yearparser}")
else:
logbookpath = Path(expologbase) / year / DEFAULT_LOGBOOK_FILE
expedition.logbookfile = DEFAULT_LOGBOOK_FILE
@@ -617,13 +629,19 @@ def LoadLogbookForExpedition(expedition):
return len(logentries)
def LoadLogbook(year):
def LoadLogbook(year, format="cucc"):
global LOGBOOK_PARSER_SETTINGS
nlbe={}
TROG['pagecache']['expedition'][year] = None # clear cache
expo = Expedition.objects.get(year=year)
nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
if (format=="blog"):
LOGBOOK_PARSER_SETTINGS[str(year)] = ("ukcavingblog.html", "parser_blog")
# print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database.