mirror of
https://expo.survex.com/repositories/troggle/.git
synced 2025-04-03 09:21:48 +01:00
exptl parse UK Caving blog
This commit is contained in:
parent
6dd8e5a75c
commit
cb50528e2d
@ -238,15 +238,16 @@ class JobQueue():
|
|||||||
for runfunction in self.queue:
|
for runfunction in self.queue:
|
||||||
start = time.time()
|
start = time.time()
|
||||||
memstart = get_process_memory()
|
memstart = get_process_memory()
|
||||||
|
jobname, jobparser = runfunction
|
||||||
#--------------------
|
#--------------------
|
||||||
runfunction[1]() # invokes function passed in the second item in the tuple
|
jobparser() # invokes function passed in the second item in the tuple
|
||||||
#--------------------
|
#--------------------
|
||||||
memend = get_process_memory()
|
memend = get_process_memory()
|
||||||
duration = time.time()-start
|
duration = time.time()-start
|
||||||
#print(" - MEMORY start:{:.3f} MB end:{:.3f} MB change={:.3f} MB".format(memstart,memend, ))
|
#print(" - MEMORY start:{:.3f} MB end:{:.3f} MB change={:.3f} MB".format(memstart,memend, ))
|
||||||
print("\n*- Ended \"", runfunction[0], f"\" {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
|
print("\n*- Ended \"", jobname, f"\" {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
|
||||||
self.results[runfunction[0]].pop() # the null item
|
self.results[jobname].pop() # the null item
|
||||||
self.results[runfunction[0]].append(duration)
|
self.results[jobname].append(duration)
|
||||||
|
|
||||||
|
|
||||||
jobend = time.time()
|
jobend = time.time()
|
||||||
|
@ -41,10 +41,10 @@ def import_logbooks():
|
|||||||
with transaction.atomic():
|
with transaction.atomic():
|
||||||
troggle.parsers.logbooks.LoadLogbooks()
|
troggle.parsers.logbooks.LoadLogbooks()
|
||||||
|
|
||||||
def import_logbook(year=2003):
|
def import_logbook(year=2019):
|
||||||
print(f"-- Importing Logbook {year}")
|
print(f"-- Importing Logbook {year}")
|
||||||
with transaction.atomic():
|
with transaction.atomic():
|
||||||
troggle.parsers.logbooks.LoadLogbook(year)
|
troggle.parsers.logbooks.LoadLogbook(year, format="blog")
|
||||||
|
|
||||||
def import_QMs():
|
def import_QMs():
|
||||||
print("-- Importing old QMs for 161, 204, 234 from CSV files")
|
print("-- Importing old QMs for 161, 204, 234 from CSV files")
|
||||||
|
@ -56,6 +56,7 @@ DEFAULT_LOGBOOK_FILE = "logbook.html"
|
|||||||
# All years since 2010 use the default value for Logbook parser
|
# All years since 2010 use the default value for Logbook parser
|
||||||
# but several don't work, and are skipped by the parsing code, e.g. 1983
|
# but several don't work, and are skipped by the parsing code, e.g. 1983
|
||||||
LOGBOOK_PARSER_SETTINGS = {
|
LOGBOOK_PARSER_SETTINGS = {
|
||||||
|
"2019": ("logbook.html", "parser_html"),
|
||||||
"2010": ("logbook.html", "parser_html"),
|
"2010": ("logbook.html", "parser_html"),
|
||||||
"2009": ("2009logbook.txt", "wiki_parser"),
|
"2009": ("2009logbook.txt", "wiki_parser"),
|
||||||
"2008": ("2008logbook.txt", "wiki_parser"),
|
"2008": ("2008logbook.txt", "wiki_parser"),
|
||||||
@ -171,7 +172,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
|
|||||||
return
|
return
|
||||||
|
|
||||||
if not author:
|
if not author:
|
||||||
message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
|
message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
|
||||||
DataIssue.objects.create(parser='logbooks', message=message)
|
DataIssue.objects.create(parser='logbooks', message=message)
|
||||||
logdataissues["title"]=message
|
logdataissues["title"]=message
|
||||||
print(message)
|
print(message)
|
||||||
@ -471,60 +472,68 @@ def parser_html_01(year, expedition, txt):
|
|||||||
print(message)
|
print(message)
|
||||||
return
|
return
|
||||||
|
|
||||||
# parser for 2003. Retired after conversion of the logbook.html
|
def parser_blog(year, expedition, txt):
|
||||||
# KEEP THIS COMMENTED-OUT example until after we have done the same thing with the html_01 parser
|
'''Parses the format of web pages collected as 'Save As HTML' from the UK Caving blog website.
|
||||||
# def parser_html_03(year, expedition, txt):
|
Note that the entries have dates and authors, but no titles.
|
||||||
# global logentries
|
'''
|
||||||
# global logdataissues
|
global logentries
|
||||||
|
global logdataissues
|
||||||
|
errorcount = 0
|
||||||
|
|
||||||
# tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
|
tripheads = re.findall(r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt)
|
||||||
# logbook_entry_count = 0
|
if not ( tripheads ) :
|
||||||
# for trippara in tripparas:
|
message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
|
||||||
# logbook_entry_count += 1
|
print(message)
|
||||||
# tid = set_trip_id(year,logbook_entry_count) # default trip id, before we read the date
|
|
||||||
|
tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
|
||||||
|
if not ( tripparas ) :
|
||||||
|
message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
|
||||||
|
print(message)
|
||||||
|
|
||||||
# s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
|
if (len(tripheads) !=len(tripparas)):
|
||||||
# if not ( s ) :
|
print(f"{len(tripheads)} != {len(tripparas)}")
|
||||||
# message = " ! - Skipping logentry {year} on failure to parse parser_html_03: {} {} {}...".format(tid,s,trippara[:300])
|
|
||||||
# DataIssue.objects.create(parser='logbooks', message=message)
|
location = "Plateau"
|
||||||
# logdataissues[tid]=message
|
tripname = "UK Caving Blog post"
|
||||||
# print(message)
|
tu = 0
|
||||||
# break
|
logbook_entry_count = 0
|
||||||
|
for i in range(0, len(tripparas)):
|
||||||
|
trippara = tripparas[i]
|
||||||
|
triphead = tripheads[i]
|
||||||
|
logbook_entry_count += 1
|
||||||
|
tid = set_trip_id(year,logbook_entry_count) +"_blog"
|
||||||
|
# print(f" - tid: {tid}")
|
||||||
|
|
||||||
# tripheader, triptext = s.group(1), s.group(2)
|
# data-author="tcacrossley"
|
||||||
# tripheader = re.sub(r" ", " ", tripheader)
|
match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
|
||||||
# tripheader = re.sub(r"\s+", " ", tripheader).strip()
|
if not ( match_author ) :
|
||||||
# sheader = tripheader.split(" -- ")
|
message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
|
||||||
# tu = ""
|
DataIssue.objects.create(parser='logbooks', message=message)
|
||||||
# if re.match("T/U|Time underwater", sheader[-1]):
|
logdataissues[tid]=message
|
||||||
# tu = sheader.pop() # not a number in 2003 usually
|
print(message)
|
||||||
# # print(f" - {logbook_entry_count} '{tu}' ")
|
break
|
||||||
# if len(sheader) != 3:
|
trippeople = match_author.group(1)
|
||||||
# print(" ! Header not three pieces for parser_html_03() ", sheader)
|
# print(f" - tid: {tid} {trippeople}")
|
||||||
# tripdate, triptitle, trippeople = sheader
|
# datetime="2019-07-11T13:16:18+0100"
|
||||||
# ldate = ParseDate(tripdate.strip(), year)
|
match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
|
||||||
# # print(f" - {logbook_entry_count} '{ldate}' from '{tripdate.strip()}' ")
|
if not ( match_datetime ) :
|
||||||
# # print(f" - {logbook_entry_count} '{trippeople}' ")
|
message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
|
||||||
# titlelist = triptitle.split(" , ")
|
DataIssue.objects.create(parser='logbooks', message=message)
|
||||||
# if len(titlelist) >= 2:
|
logdataissues[tid]=message
|
||||||
# location, *namelist = titlelist # list unpacking operator
|
print(message)
|
||||||
# tripname = ", ".join(namelist) # concatenate strings
|
break
|
||||||
# # print(f" - {logbook_entry_count} {location} '{tripname}'")
|
datestamp = match_datetime.group(1)
|
||||||
# else:
|
|
||||||
# location = "UNKNOWN"
|
tripdate = datetime.fromisoformat(datestamp)
|
||||||
|
print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
|
||||||
# ltriptext = triptext + "<br /><br />\n\n" + tu
|
|
||||||
# ltriptext = re.sub(r"</p>", "", ltriptext)
|
tripname = f"UK Caving Blog post {logbook_entry_count}" # must be unique for a given date
|
||||||
# #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
|
|
||||||
# ltriptext = re.sub(r"<p>", "<br /><br />\n\n", ltriptext).strip()
|
|
||||||
# #ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
|
|
||||||
|
|
||||||
|
|
||||||
# entrytuple = (ldate, location, tripname, ltriptext,
|
|
||||||
# trippeople, expedition, tu, tid)
|
|
||||||
# logentries.append(entrytuple)
|
|
||||||
|
|
||||||
|
entrytuple = (tripdate, location, tripname, trippara,
|
||||||
|
trippeople, expedition, tu, tid)
|
||||||
|
logentries.append(entrytuple)
|
||||||
|
|
||||||
|
|
||||||
def LoadLogbookForExpedition(expedition):
|
def LoadLogbookForExpedition(expedition):
|
||||||
""" Parses all logbook entries for one expedition
|
""" Parses all logbook entries for one expedition
|
||||||
"""
|
"""
|
||||||
@ -566,9 +575,12 @@ def LoadLogbookForExpedition(expedition):
|
|||||||
cleanerrors(year)
|
cleanerrors(year)
|
||||||
|
|
||||||
if year in yearlinks:
|
if year in yearlinks:
|
||||||
logbookpath = Path(expologbase) / year / yearlinks[year][0]
|
yearfile, yearparser = yearlinks[year]
|
||||||
expedition.logbookfile = yearlinks[year][0]
|
logbookpath = Path(expologbase) / year / yearfile
|
||||||
parsefunc = yearlinks[year][1]
|
expedition.logbookfile = yearfile
|
||||||
|
parsefunc = yearparser
|
||||||
|
print(f" - Logbook file {yearfile} using parser {yearparser}")
|
||||||
|
|
||||||
else:
|
else:
|
||||||
logbookpath = Path(expologbase) / year / DEFAULT_LOGBOOK_FILE
|
logbookpath = Path(expologbase) / year / DEFAULT_LOGBOOK_FILE
|
||||||
expedition.logbookfile = DEFAULT_LOGBOOK_FILE
|
expedition.logbookfile = DEFAULT_LOGBOOK_FILE
|
||||||
@ -617,14 +629,20 @@ def LoadLogbookForExpedition(expedition):
|
|||||||
|
|
||||||
return len(logentries)
|
return len(logentries)
|
||||||
|
|
||||||
def LoadLogbook(year):
|
def LoadLogbook(year, format="cucc"):
|
||||||
|
global LOGBOOK_PARSER_SETTINGS
|
||||||
|
|
||||||
nlbe={}
|
nlbe={}
|
||||||
TROG['pagecache']['expedition'][year] = None # clear cache
|
TROG['pagecache']['expedition'][year] = None # clear cache
|
||||||
|
|
||||||
expo = Expedition.objects.get(year=year)
|
expo = Expedition.objects.get(year=year)
|
||||||
nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
|
|
||||||
|
|
||||||
|
|
||||||
|
if (format=="blog"):
|
||||||
|
LOGBOOK_PARSER_SETTINGS[str(year)] = ("ukcavingblog.html", "parser_blog")
|
||||||
|
# print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
|
||||||
|
|
||||||
|
nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
|
||||||
|
|
||||||
def LoadLogbooks():
|
def LoadLogbooks():
|
||||||
""" This is the master function for parsing all logbooks into the Troggle database.
|
""" This is the master function for parsing all logbooks into the Troggle database.
|
||||||
This should be rewritten to use coroutines to load all logbooks from disc in parallel,
|
This should be rewritten to use coroutines to load all logbooks from disc in parallel,
|
||||||
|
Loading…
x
Reference in New Issue
Block a user