Fixing wiki-parsing for 2009 logbook

This commit is contained in:
2022-12-18 19:33:56 +00:00
parent 73b710d53f
commit d1b94763b4
5 changed files with 74 additions and 65 deletions

View File

@@ -52,7 +52,7 @@ data for old logbooks. New design needed, with a mechanism for flagging fixtures
'''
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
BLOG_PARSER_SETTINGS = {
# "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
"2018": ("ukcavingblog.html", "parser_blog"),
"2019": ("ukcavingblog.html", "parser_blog"),
"2022": ("ukcavingblog.html", "parser_blog"),
@@ -60,12 +60,13 @@ BLOG_PARSER_SETTINGS = {
DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"
# All years since 2010 use the default value for Logbook parser
# but several don't work, and are skipped by the parsing code, e.g. 1983
LOGBOOK_PARSER_SETTINGS = {
"2019": ("logbook.html", "parser_html"),
"2010": ("logbook.html", "parser_html"),
"2009": ("2009logbook.txt", "wiki_parser"),
"2008": ("2008logbook.txt", "wiki_parser"),
# "2009": ("2009logbook.txt", "wiki_parser"), # converted to html
# "2008": ("2008logbook.txt", "wiki_parser"), # converted to html
"2009": ("logbook.html", "parser_html"),
"2008": ("logbook.html", "parser_html"),
"2007": ("logbook.html", "parser_html"),
"2006": ("logbook.html", "parser_html"),
# "2006": ("logbook/logbook_06.txt", "wiki_parser"), # converted to html
@@ -96,15 +97,15 @@ LOGBOOK_PARSER_SETTINGS = {
}
entries = { "2022": 64, "2019": 56, "2018": 75, "2017": 76, "2016": 81, "2015": 79,
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53,
"2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
"1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
"1985": 24, "1984": 32, "1983": 52, "1982": 42,}
# log.htm logbooks exist for 1983, 84, 85, 87, 88, 89 but either have no fully working parser or need hand-editing.
logentries = [] # the entire logbook for one year is a single object: a list of entries
noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
noncaveplaces = [ "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
'base camp', 'basecamp', 'top camp', 'topcamp' ]
logdataissues = TROG['issues']['logdataissues']
trips ={}
@@ -170,11 +171,30 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
""" saves a logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. Why? Because we are deprecating expeditionday!
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite,
but we are saving the same thing too many times.
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite, but we are saving the same thing too many times.
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
lookupAttribs={'date':date, 'title':title}
"""
# Nasty hack, must tidy this up..
if logtime_underground:
try:
logtime_underground = float(logtime_underground)
except:
# print(f"logtime_underground = {logtime_underground}")
tu_match = re.match(r"(T/U:\s*)?(\d+[.]?\d*).*", logtime_underground)
if tu_match:
# print(f"logtime_underground = {tu_match.group(2)}")
logtime_underground = float(tu_match.group(2))
else:
logtime_underground = 0
else:
logtime_underground = 0
try:
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
# print(f" - {author} - {logtime_underground}")
except:
message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
DataIssue.objects.create(parser='logbooks', message=message)
@@ -223,11 +243,13 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
# slug = tid + "_" + slugify(title)[:10].replace('-','_')
else:
slug = str(randint(1000,9999)) + "_" + slugify(title)[:10].replace('-','_')
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug}
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition,
'time_underground':logtime_underground, 'cave_slug':str(cave), 'slug': slug}
# This creates the lbo instance of LogbookEntry
lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
# for PersonTrip time_underground is float (decimal hours)
for tripperson, time_underground in trippersons:
# print(f" - {tid} '{tripperson}' author:{tripperson == author}")
lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
@@ -300,24 +322,29 @@ def wiki_parser(year, expedition, txt, seq=""):
else:
tripsplace = tripsplace[1]
#print(f"! LOGBOOK {year} {logbook_entry_count:2} {len(triptext):4} '{tripsplace}'")
tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
#tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
tul = re.findall(r"T/U:?\s*(\d+[.]?\d*)\s*(hr|hrs|hours)?.*", triptext)
if tul:
tu = tul[0][0]
else:
tu = ""
print(f"! LOGBOOK {year} {logbook_entry_count:2} {len(triptext):4} T/U:{tu} '{tripcave} - {tripsplace}' ")
ldate = ParseDate(tripdate.strip(), year)
tripid =""
entrytuple = (ldate, tripcave, tripsplace, triptext,
tripid = set_trip_id(year,logbook_entry_count)
ltriptext = re.sub(r"\n", "<br /><br />\n", triptext)
ltriptext = ltriptext.replace("<br /><br />\n<br /><br />\n","<br /><br />\n")
triptitle = f'{tripcave} - {tripsplace}'
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, tripid)
logentries.append(entrytuple)
# 2002, 2004, 2005, 2007, 2010 - now
# 2006 wiki text is incomplete, but the html is all there. So using this parser now.
# 2002, 2004 - now
def parser_html(year, expedition, txt, seq=""):
global logentries
global logdataissues
@@ -382,7 +409,7 @@ def parser_html(year, expedition, txt, seq=""):
else:
tripcave = "UNKNOWN"
ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
#ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
entrytuple = (ldate, tripcave, triptitle, ltriptext,
@@ -665,15 +692,17 @@ def LoadLogbookForExpedition(expedition, clean=True):
print(f' - {year} parsing with {parsefunc} - {lb}')
parser(year, expedition, txt, sq) # this launches the right parser for this year
# --------------------
dupl = {}
for entrytuple in logentries:
# date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
try:
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
except ValueError: # cope with removal of entry_type but still in cache files. Remove in Dec. 2022.
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
print(f' - Exception entry_type "{entry_type}" {tripid1}')
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
check = (date, triptitle)
if check in dupl:
dupl[check] += 1
triptitle = f"{triptitle} #{dupl[check]}"
print(f' - {triptitle}')
else:
dupl[check] = 1
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground,
tripid1)
if len(logentries) == expect:
@@ -684,19 +713,16 @@ def LoadLogbookForExpedition(expedition, clean=True):
return len(logentries)
# def LoadLogbook(year, format="cucc"):
# global LOGBOOK_PARSER_SETTINGS
def LoadLogbook(year):
'''One-off load of the logbook for a single year, for testing purposes
'''
global LOGBOOK_PARSER_SETTINGS
# nlbe={}
# TROG['pagecache']['expedition'][year] = None # clear cache
nlbe={}
TROG['pagecache']['expedition'][year] = None # clear cache
# expo = Expedition.objects.get(year=year)
# if (format=="blog"):
# LOGBOOK_PARSER_SETTINGS[str(year)] = BLOG_PARSER_SETTINGS[str(year)]
# # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
# nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
expo = Expedition.objects.get(year=year)
nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database.
@@ -721,8 +747,6 @@ def LoadLogbooks():
sqlfail = ["1987", "1988", "1989"] # breaks mysql with db constraint fail - debug locally first
nologbook = noexpo + lostlogbook + sqlfail
# blogs = ["2019"]
nlbe={}
expd ={}
loglist = []