mirror of https://expo.survex.com/repositories/troggle/.git synced 2025-04-03 09:21:48 +01:00

exptl parse UK Caving blog

This commit is contained in:
Philip Sargent 2022-12-14 23:46:14 +00:00
parent 6dd8e5a75c
commit cb50528e2d
3 changed files with 81 additions and 62 deletions

View File

@@ -238,15 +238,16 @@ class JobQueue():
         for runfunction in self.queue:
             start = time.time()
             memstart = get_process_memory()
+            jobname, jobparser = runfunction
             #--------------------
-            runfunction[1]() # invokes function passed in the second item in the tuple
+            jobparser() # invokes function passed in the second item in the tuple
             #--------------------
             memend = get_process_memory()
             duration = time.time()-start
             #print(" - MEMORY start:{:.3f} MB end:{:.3f} MB change={:.3f} MB".format(memstart,memend, ))
-            print("\n*- Ended \"", runfunction[0], f"\" {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
-            self.results[runfunction[0]].pop() # the null item
-            self.results[runfunction[0]].append(duration)
+            print("\n*- Ended \"", jobname, f"\" {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
+            self.results[jobname].pop() # the null item
+            self.results[jobname].append(duration)
         jobend = time.time()
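The tuple unpacking replaces repeated runfunction[0]/runfunction[1] indexing with readable names. A minimal, runnable sketch of the same timing-and-memory bookkeeping pattern, with a stubbed memory helper (all names below are illustrative, not troggle's real API):

import time

def get_process_memory():
    return 0.0  # stub: troggle's real helper reports resident memory in MB

queue = [("demo job", lambda: sum(range(10**6)))]
results = {"demo job": [None]}  # troggle pre-seeds each job's result list with a null item

for runfunction in queue:
    start = time.time()
    memstart = get_process_memory()
    jobname, jobparser = runfunction  # unpack once instead of indexing [0] and [1]
    jobparser()  # invoke the callable stored in the tuple
    memend = get_process_memory()
    duration = time.time() - start
    print(f'*- Ended "{jobname}" {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)')
    results[jobname].pop()  # drop the pre-seeded null item
    results[jobname].append(duration)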

View File

@@ -41,10 +41,10 @@ def import_logbooks():
     with transaction.atomic():
         troggle.parsers.logbooks.LoadLogbooks()

-def import_logbook(year=2003):
+def import_logbook(year=2019):
     print(f"-- Importing Logbook {year}")
     with transaction.atomic():
-        troggle.parsers.logbooks.LoadLogbook(year)
+        troggle.parsers.logbooks.LoadLogbook(year, format="blog")

 def import_QMs():
     print("-- Importing old QMs for 161, 204, 234 from CSV files")

View File

@@ -56,6 +56,7 @@ DEFAULT_LOGBOOK_FILE = "logbook.html"
 # All years since 2010 use the default value for Logbook parser
 # but several don't work, and are skipped by the parsing code, e.g. 1983
 LOGBOOK_PARSER_SETTINGS = {
+    "2019": ("logbook.html", "parser_html"),
     "2010": ("logbook.html", "parser_html"),
     "2009": ("2009logbook.txt", "wiki_parser"),
     "2008": ("2008logbook.txt", "wiki_parser"),
@@ -171,7 +172,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
         return
     if not author:
-        message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
+        message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
         DataIssue.objects.create(parser='logbooks', message=message)
         logdataissues["title"]=message
         print(message)
@@ -471,60 +472,68 @@ def parser_html_01(year, expedition, txt):
             print(message)
             return

-# parser for 2003. Retired after conversion of the logbook.html
-# KEEP THIS COMMENTED-OUT example until after we have done the same thing with the html_01 parser
-# def parser_html_03(year, expedition, txt):
-    # global logentries
-    # global logdataissues
-
-    # tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
-    # logbook_entry_count = 0
-    # for trippara in tripparas:
-        # logbook_entry_count += 1
-        # tid = set_trip_id(year,logbook_entry_count) # default trip id, before we read the date
-
-        # s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
-        # if not ( s ) :
-            # message = " ! - Skipping logentry {year} on failure to parse parser_html_03: {} {} {}...".format(tid,s,trippara[:300])
-            # DataIssue.objects.create(parser='logbooks', message=message)
-            # logdataissues[tid]=message
-            # print(message)
-            # break
-
-        # tripheader, triptext = s.group(1), s.group(2)
-        # tripheader = re.sub(r"&nbsp;", " ", tripheader)
-        # tripheader = re.sub(r"\s+", " ", tripheader).strip()
-        # sheader = tripheader.split(" -- ")
-        # tu = ""
-        # if re.match("T/U|Time underwater", sheader[-1]):
-            # tu = sheader.pop() # not a number in 2003 usually
-        # # print(f" - {logbook_entry_count} '{tu}' ")
-        # if len(sheader) != 3:
-            # print(" ! Header not three pieces for parser_html_03() ", sheader)
-        # tripdate, triptitle, trippeople = sheader
-        # ldate = ParseDate(tripdate.strip(), year)
-        # # print(f" - {logbook_entry_count} '{ldate}' from '{tripdate.strip()}' ")
-        # # print(f" - {logbook_entry_count} '{trippeople}' ")
-        # titlelist = triptitle.split(" , ")
-        # if len(titlelist) >= 2:
-            # location, *namelist = titlelist # list unpacking operator
-            # tripname = ", ".join(namelist) # concatenate strings
-            # # print(f" - {logbook_entry_count} {location} '{tripname}'")
-        # else:
-            # location = "UNKNOWN"
-
-        # ltriptext = triptext + "<br /><br />\n\n" + tu
-        # ltriptext = re.sub(r"</p>", "", ltriptext)
-        # #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-        # ltriptext = re.sub(r"<p>", "<br /><br />\n\n", ltriptext).strip()
-        # #ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
-
-        # entrytuple = (ldate, location, tripname, ltriptext,
-            # trippeople, expedition, tu, tid)
-        # logentries.append(entrytuple)
+def parser_blog(year, expedition, txt):
+    '''Parses the format of web pages collected as "Save As HTML" from the UK Caving blog website.
+    Note that the entries have dates and authors, but no titles.
+    '''
+    global logentries
+    global logdataissues
+    errorcount = 0
+
+    tripheads = re.findall(r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt)
+    if not ( tripheads ) :
+        message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
+        print(message)
+
+    tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
+    if not ( tripparas ) :
+        message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
+        print(message)
+
+    if (len(tripheads) !=len(tripparas)):
+        print(f"{len(tripheads)} != {len(tripparas)}")
+
+    location = "Plateau"
+    tripname = "UK Caving Blog post"
+    tu = 0
+    logbook_entry_count = 0
+    for i in range(0, len(tripparas)):
+        trippara = tripparas[i]
+        triphead = tripheads[i]
+        logbook_entry_count += 1
+        tid = set_trip_id(year,logbook_entry_count) +"_blog"
+        # print(f" - tid: {tid}")
+
+        # data-author="tcacrossley"
+        match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
+        if not ( match_author ) :
+            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
+            DataIssue.objects.create(parser='logbooks', message=message)
+            logdataissues[tid]=message
+            print(message)
+            break
+        trippeople = match_author.group(1)
+        # print(f" - tid: {tid} {trippeople}")
+        # datetime="2019-07-11T13:16:18+0100"
+        match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
+        if not ( match_datetime ) :
+            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
+            DataIssue.objects.create(parser='logbooks', message=message)
+            logdataissues[tid]=message
+            print(message)
+            break
+        datestamp = match_datetime.group(1)
+
+        tripdate = datetime.fromisoformat(datestamp)
+        print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
+
+        tripname = f"UK Caving Blog post {logbook_entry_count}" # must be unique for a given date
+
+        entrytuple = (tripdate, location, tripname, trippara,
+            trippeople, expedition, tu, tid)
+        logentries.append(entrytuple)

 def LoadLogbookForExpedition(expedition):
     """ Parses all logbook entries for one expedition
     """
@@ -566,9 +575,12 @@ def LoadLogbookForExpedition(expedition):
     cleanerrors(year)

     if year in yearlinks:
-        logbookpath = Path(expologbase) / year / yearlinks[year][0]
-        expedition.logbookfile = yearlinks[year][0]
-        parsefunc = yearlinks[year][1]
+        yearfile, yearparser = yearlinks[year]
+        logbookpath = Path(expologbase) / year / yearfile
+        expedition.logbookfile = yearfile
+        parsefunc = yearparser
+        print(f" - Logbook file {yearfile} using parser {yearparser}")
     else:
         logbookpath = Path(expologbase) / year / DEFAULT_LOGBOOK_FILE
         expedition.logbookfile = DEFAULT_LOGBOOK_FILE
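A side note on the unpacked form: the whole if/else could further collapse into one dict lookup with a default pair. A possible follow-up refactor, not part of this commit (DEFAULT_LOGBOOK_PARSER is an assumed constant; only DEFAULT_LOGBOOK_FILE appears in the diff):

DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"  # assumed name, for illustration
yearlinks = {"2009": ("2009logbook.txt", "wiki_parser")}

year = "2010"  # not in yearlinks, so the defaults apply
yearfile, yearparser = yearlinks.get(year, (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER))
print(f" - Logbook file {yearfile} using parser {yearparser}")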
@@ -617,14 +629,20 @@ def LoadLogbookForExpedition(expedition):
     return len(logentries)

-def LoadLogbook(year):
+def LoadLogbook(year, format="cucc"):
+    global LOGBOOK_PARSER_SETTINGS

     nlbe={}
     TROG['pagecache']['expedition'][year] = None # clear cache
     expo = Expedition.objects.get(year=year)
-    nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
+    if (format=="blog"):
+        LOGBOOK_PARSER_SETTINGS[str(year)] = ("ukcavingblog.html", "parser_blog")
+        # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
+    nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo

 def LoadLogbooks():
     """ This is the master function for parsing all logbooks into the Troggle database.
     This should be rewritten to use coroutines to load all logbooks from disc in parallel,
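The format="blog" branch works by mutating the module-level settings table for that one year before delegating to the normal per-expedition loader. A minimal sketch of that override flow with stand-in objects (the Expedition lookup and file reading are simulated):

LOGBOOK_PARSER_SETTINGS = {"2019": ("logbook.html", "parser_html")}

def LoadLogbookForExpedition(expo):
    # Stand-in: the real function reads the file and dispatches to the named parser.
    yearfile, yearparser = LOGBOOK_PARSER_SETTINGS[str(expo)]
    print(f"loading {yearfile} with {yearparser} for {expo}")
    return 0

def LoadLogbook(year, format="cucc"):
    if format == "blog":
        # Point this one year at the saved blog HTML and the blog parser.
        LOGBOOK_PARSER_SETTINGS[str(year)] = ("ukcavingblog.html", "parser_blog")
    return LoadLogbookForExpedition(year)

LoadLogbook(2019, format="blog")  # loads ukcavingblog.html via parser_blog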