parse several UK caving blogs per year - working

This commit is contained in:
Philip Sargent 2022-12-16 19:57:56 +00:00
parent 5e9fd7fd77
commit f80e4efed8
4 changed files with 129 additions and 78 deletions

View File

@ -27,11 +27,9 @@ Also has code to download a logbook in a choice of formats (why?!)
''' '''
todo = ''' todo = '''
- Check that the logbookdownloader works by testing with a round trip.
- Use it to convert all older logbooks into the 2005-variant of HTML then we can - Use logbookdownloader to convert all older logbooks into the 2005-variant of HTML then we can
get rid of the parsers for older formats. There are no images stored in the database, get rid of the parsers for older formats.
so this is only a tool for a first pass, to be followed by extensive hand-editing!
When we have done all the old logbooks, delete this function and the two templates. When we have done all the old logbooks, delete this function and the two templates.
@ -152,7 +150,7 @@ def exportlogbook(request,year=None,extension=None):
for the current year. Formats available are HTML2005 (others old & broken or not written yet) for the current year. Formats available are HTML2005 (others old & broken or not written yet)
There are no images stored in the database, so this is only a tool for a first pass, to be followed by There are no images stored in the database, so this is only a tool for a first pass, to be followed by
hand-editing. However links to images work int he HTML text of a logbook entry hand-editing. However links to images work in the HTML text of a logbook entry
NEED TO ADD IN THE MATERIAL WHICH IS NOT IN ANY LBE ! e.g. front matter. NEED TO ADD IN THE MATERIAL WHICH IS NOT IN ANY LBE ! e.g. front matter.
@ -184,6 +182,18 @@ def exportlogbook(request,year=None,extension=None):
t=loader.get_template(template) t=loader.get_template(template)
logbookfile = (t.render({'logbook_entries':logbook_entries})) logbookfile = (t.render({'logbook_entries':logbook_entries}))
frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
if frontpath.is_file():
try:
with open(frontpath,"r") as front:
frontmatter = front.read()
except:
print(" ! Very Bad Error opening " + frontpath)
logbookfile = re.sub(r"<body>", "<body>\n"+frontmatter , logbookfile)
else:
logbookfile = re.sub(r"<body>", f"<body>\n<h1>Expo {year}</h1>\n", logbookfile)
dir = Path(settings.EXPOWEB) / "years" / year dir = Path(settings.EXPOWEB) / "years" / year
filepath = Path(dir, filename) filepath = Path(dir, filename)
with(open(filepath, 'w')) as lb: with(open(filepath, 'w')) as lb:

View File

@ -43,8 +43,9 @@ def import_logbooks():
def import_logbook(year=2019): def import_logbook(year=2019):
print(f"-- Importing Logbook {year}") print(f"-- Importing Logbook {year}")
with transaction.atomic(): print(f"-- - commented out")
troggle.parsers.logbooks.LoadLogbook(year, format="blog") # with transaction.atomic():
# troggle.parsers.logbooks.LoadLogbook(year, format="cucc")
def import_QMs(): def import_QMs():
print("-- Importing old QMs for 161, 204, 234 from CSV files") print("-- Importing old QMs for 161, 204, 234 from CSV files")

View File

@ -51,8 +51,13 @@ data for old logbooks. New design needed, with a mechanism for flagging fixtures
''' '''
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200 MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
DEFAULT_LOGBOOK_PARSER = "parser_html" BLOG_PARSER_SETTINGS = {
"2017": ("ukcavingblog.html", "parser_blog"),
"2019": ("ukcavingblog.html", "parser_blog"),
"2022": ("ukcavingblog.html", "parser_blog"),
}
DEFAULT_LOGBOOK_FILE = "logbook.html" DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"
# All years since 2010 use the default value for Logbook parser # All years since 2010 use the default value for Logbook parser
# but several don't work, and are skipped by the parsing code, e.g. 1983 # but several don't work, and are skipped by the parsing code, e.g. 1983
LOGBOOK_PARSER_SETTINGS = { LOGBOOK_PARSER_SETTINGS = {
@ -89,11 +94,11 @@ LOGBOOK_PARSER_SETTINGS = {
"1982": ("log.htm", "parser_html_01"), "1982": ("log.htm", "parser_html_01"),
} }
entries = { "2022": 64, "2019": 56, "2018": 74, "2017": 60, "2016": 81, "2015": 79, entries = { "2022": 64, "2019": 56, "2018": 75, "2017": 61, "2016": 81, "2015": 79,
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52, "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41, "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
"1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1, "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
"1985": 24, "1984": 32, "1983": 52, "1982": 42,} "1985": 24, "1984": 32, "1983": 52, "1982": 42,}
# Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing. # Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
@ -258,7 +263,7 @@ def ParseDate(tripdate, year):
return datetime.date(1970, 1, 1) return datetime.date(1970, 1, 1)
# (2006 - not any more), 2008 - 2009 # (2006 - not any more), 2008 - 2009
def wiki_parser(year, expedition, txt): def wiki_parser(year, expedition, txt, seq=""):
global logentries global logentries
global logdataissues global logdataissues
@ -300,10 +305,20 @@ def wiki_parser(year, expedition, txt):
# 2002, 2004, 2005, 2007, 2010 - now # 2002, 2004, 2005, 2007, 2010 - now
# 2006 wiki text is incomplete, but the html all there. So using this parser now. # 2006 wiki text is incomplete, but the html all there. So using this parser now.
def parser_html(year, expedition, txt): def parser_html(year, expedition, txt, seq=""):
global logentries global logentries
global logdataissues global logdataissues
# extract front material and stash for later use when rebuilding from list of entries
headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
headpara = headmatch.groups()[0].strip()
# print(f" - headpara:\n'{headpara}'")
if(len(headpara)>0):
frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
with open(frontpath,"w") as front:
front.write(headpara+"\n")
tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt) tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
logbook_entry_count = 0 logbook_entry_count = 0
for trippara in tripparas: for trippara in tripparas:
@ -323,7 +338,7 @@ def parser_html(year, expedition, txt):
if s: if s:
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups() tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
else: # allow title and people to be swapped in order else: # allow title and people to be swapped in order
msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:40]}'..." msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:50]}'..."
print(msg) print(msg)
DataIssue.objects.create(parser='logbooks', message=msg) DataIssue.objects.create(parser='logbooks', message=msg)
logdataissues[tid]=msg logdataissues[tid]=msg
@ -340,11 +355,11 @@ def parser_html(year, expedition, txt):
if s2: if s2:
tripid, tripid1, tripdate, triptitle, trippeople, triptext, tu = s2.groups() tripid, tripid1, tripdate, triptitle, trippeople, triptext, tu = s2.groups()
else: else:
if not re.search(r"Rigging Guide", trippara): # if not re.search(r"Rigging Guide", trippara):
msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:40]}'..." msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:50]}'..."
print(msg) print(msg)
DataIssue.objects.create(parser='logbooks', message=msg) DataIssue.objects.create(parser='logbooks', message=msg)
logdataissues[tid]=msg logdataissues[tid]=msg
continue continue
ldate = ParseDate(tripdate.strip(), year) ldate = ParseDate(tripdate.strip(), year)
@ -364,11 +379,21 @@ def parser_html(year, expedition, txt):
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
def parser_html_01(year, expedition, txt): def parser_html_01(year, expedition, txt, seq=""):
global logentries global logentries
global logdataissues global logdataissues
errorcount = 0 errorcount = 0
# extract front material and stash for later use when rebuilding from list of entries
headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
headpara = headmatch.groups()[0].strip()
# print(f" - headpara:\n'{headpara}'")
if(len(headpara)>0):
frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
with open(frontpath,"w") as front:
front.write(headpara+"\n")
tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt) tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
logbook_entry_count = 0 logbook_entry_count = 0
for trippara in tripparas: for trippara in tripparas:
@ -472,8 +497,8 @@ def parser_html_01(year, expedition, txt):
print(message) print(message)
return return
def parser_blog(year, expedition, txt): def parser_blog(year, expedition, txt, sq=""):
'''Parses the format of web pages collected as 'Save As HTML" fromt eh UK Caving blog website. '''Parses the format of web pages collected as "Save As HTML" from the UK Caving blog website.
Note that the entries have dates and authors, but no titles. Note that the entries have dates and authors, but no titles.
''' '''
global logentries global logentries
@ -494,14 +519,13 @@ def parser_blog(year, expedition, txt):
print(f"{len(tripheads)} != {len(tripparas)}") print(f"{len(tripheads)} != {len(tripparas)}")
location = "Plateau" location = "Plateau"
tripname = "UK Caving Blog post"
tu = 0 tu = 0
logbook_entry_count = 0 logbook_entry_count = 0
for i in range(0, len(tripparas)): for i in range(0, len(tripparas)):
trippara = tripparas[i] trippara = tripparas[i]
triphead = tripheads[i] triphead = tripheads[i]
logbook_entry_count += 1 logbook_entry_count += 1
tid = set_trip_id(year,logbook_entry_count) +"_blog" tid = set_trip_id(year,logbook_entry_count) +"_blog" + sq
# print(f" - tid: {tid}") # print(f" - tid: {tid}")
# data-author="tcacrossley" # data-author="tcacrossley"
@ -514,7 +538,7 @@ def parser_blog(year, expedition, txt):
break break
trippeople = match_author.group(1) trippeople = match_author.group(1)
# print(f" - tid: {tid} {trippeople}") # print(f" - tid: {tid} {trippeople}")
# datetime="2019-07-11T13:16:18+0100" # datetime="2019-07-11T13:16:18+0100"
match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead) match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
if not ( match_datetime ) : if not ( match_datetime ) :
message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..." message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
@ -527,19 +551,25 @@ def parser_blog(year, expedition, txt):
try: try:
tripdate = datetime.fromisoformat(datestamp) tripdate = datetime.fromisoformat(datestamp)
except: except:
print(datestamp[0:9]) message = f" ! - FROMISOFORMAT fail logentry {year}:{logbook_entry_count} {tid} '{datestamp}'"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
# fallback, ignore the timestamp bits:
tripdate = datetime.fromisoformat(datestamp[0:10]) tripdate = datetime.fromisoformat(datestamp[0:10])
# print(f" - tid: {tid} '{trippeople}' '{tripdate}'") print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
tripname = f"UK Caving Blog post {logbook_entry_count}" # must be unique for a given date tripname = f"UK Caving Blog{sq} post {logbook_entry_count}" # must be unique for a given date
tripcontent = trippara + f"\n\nBlog Author: {trippeople}"
entrytuple = (tripdate, location, tripname, trippara, entrytuple = (tripdate, location, tripname, tripcontent,
trippeople, expedition, tu, tid) trippeople, expedition, tu, tid)
logentries.append(entrytuple) logentries.append(entrytuple)
def LoadLogbookForExpedition(expedition, clean=True): def LoadLogbookForExpedition(expedition, clean=True):
""" Parses all logbook entries for one expedition """ Parses all logbook entries for one expedition
if clean==True then it deletes all entries for this year first.
""" """
global logentries global logentries
# absolutely horrid. REFACTOR THIS (all my fault..) # absolutely horrid. REFACTOR THIS (all my fault..)
@ -580,13 +610,13 @@ def LoadLogbookForExpedition(expedition, clean=True):
if year in yearlinks: if year in yearlinks:
yearfile, yearparser = yearlinks[year] yearfile, yearparser = yearlinks[year]
logbookpath = Path(expologbase) / year / yearfile logbookpath = Path(yearfile)
expedition.logbookfile = yearfile expedition.logbookfile = yearfile
parsefunc = yearparser parsefunc = yearparser
# print(f" - Logbook file {yearfile} using parser {yearparser}") # print(f" - Logbook file {yearfile} using parser {yearparser}")
else: else:
logbookpath = Path(expologbase) / year / DEFAULT_LOGBOOK_FILE logbookpath = Path(DEFAULT_LOGBOOK_FILE)
expedition.logbookfile = DEFAULT_LOGBOOK_FILE expedition.logbookfile = DEFAULT_LOGBOOK_FILE
parsefunc = DEFAULT_LOGBOOK_PARSER parsefunc = DEFAULT_LOGBOOK_PARSER
@ -597,34 +627,39 @@ def LoadLogbookForExpedition(expedition, clean=True):
for lbe in lbes: for lbe in lbes:
lbe.delete() lbe.delete()
try: for sq in ["", "2", "3", "4"]: # cope with blog saved as many separate files
file_in = open(logbookpath,'rb') lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
txt = file_in.read().decode("utf-8") if not (lb.is_file()):
file_in.close() # print(f" ! End of blog. Next blog file in sequence not there:{lb}")
logbook_parseable = True break
except (IOError):
logbook_parseable = False
print(" ! Couldn't open logbook as UTF-8 " + logbookpath)
except:
logbook_parseable = False
print(" ! Very Bad Error opening " + logbookpath)
if logbook_parseable:
parser = globals()[parsefunc]
print(f' - {year} parsing with {parsefunc}')
parser(year, expedition, txt) # this launches the right parser for this year
i=0
for entrytuple in logentries:
# date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
try: try:
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple with open(lb,'rb') as file_in:
except ValueError: # cope with removal of entry_type but still in cache files. Remove in Dec. 2022. txt = file_in.read().decode("utf-8")
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple logbook_parseable = True
print(f' - Exception entry_type "{entry_type}" {tripid1}') except (IOError):
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0, logbook_parseable = False
tripid1) print(f" ! Couldn't open logbook as UTF-8 {lb}")
i +=1 except:
logbook_parseable = False
print(f" ! Very Bad Error opening {lb}")
if logbook_parseable:
# --------------------
parser = globals()[parsefunc]
print(f' - {year} parsing with {parsefunc} - {lb}')
parser(year, expedition, txt, sq) # this launches the right parser for this year
# --------------------
for entrytuple in logentries:
# date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
try:
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
except ValueError: # cope with removal of entry_type but still in cache files. Remove in Dec. 2022.
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
print(f' - Exception entry_type "{entry_type}" {tripid1}')
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
tripid1)
if len(logentries) == expect: if len(logentries) == expect:
# print(f"OK {year} {len(logentries):5d} is {expect}\n") # print(f"OK {year} {len(logentries):5d} is {expect}\n")
@ -634,19 +669,19 @@ def LoadLogbookForExpedition(expedition, clean=True):
return len(logentries) return len(logentries)
def LoadLogbook(year, format="cucc"): # def LoadLogbook(year, format="cucc"):
global LOGBOOK_PARSER_SETTINGS # global LOGBOOK_PARSER_SETTINGS
nlbe={} # nlbe={}
TROG['pagecache']['expedition'][year] = None # clear cache # TROG['pagecache']['expedition'][year] = None # clear cache
expo = Expedition.objects.get(year=year) # expo = Expedition.objects.get(year=year)
if (format=="blog"): # if (format=="blog"):
LOGBOOK_PARSER_SETTINGS[str(year)] = ("ukcavingblog.html", "parser_blog") # LOGBOOK_PARSER_SETTINGS[str(year)] = BLOG_PARSER_SETTINGS[str(year)]
# print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}") # # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo # nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
def LoadLogbooks(): def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database. """ This is the master function for parsing all logbooks into the Troggle database.
@ -671,7 +706,7 @@ def LoadLogbooks():
sqlfail = ["1987", "1988", "1989"] # breaks mysql with db constraint fail - debug locally first] sqlfail = ["1987", "1988", "1989"] # breaks mysql with db constraint fail - debug locally first]
nologbook = noexpo + lostlogbook + sqlfail nologbook = noexpo + lostlogbook + sqlfail
blogs = ["2019"] # blogs = ["2019"]
nlbe={} nlbe={}
expd ={} expd ={}
@ -694,17 +729,21 @@ def LoadLogbooks():
else: else:
print(" - No Logbook yet for: " + year) # catch case when preparing for next expo print(" - No Logbook yet for: " + year) # catch case when preparing for next expo
if year in blogs: if year in BLOG_PARSER_SETTINGS:
bloglist.append(expo) bloglist.append(expo)
for ex in loglist: for ex in loglist:
nlbe[ex] = LoadLogbookForExpedition(ex) # this actually loads the logbook for one expo nlbe[ex] = LoadLogbookForExpedition(ex) # this loads the logbook for one expo
for b in bloglist: for b in bloglist:
orig = LOGBOOK_PARSER_SETTINGS[str(b)] if str(b) in LOGBOOK_PARSER_SETTINGS:
LOGBOOK_PARSER_SETTINGS[str(b)] = ("ukcavingblog.html", "parser_blog") orig = LOGBOOK_PARSER_SETTINGS[str(b)]
nlbe[b] = LoadLogbookForExpedition(b, clean=False) # this actually loads the logbook for one expo else:
orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
print(f" - BLOG: {b}")
nlbe[b] = LoadLogbookForExpedition(b, clean=False) # this loads the blog logbook for one expo
LOGBOOK_PARSER_SETTINGS[str(b)] = orig LOGBOOK_PARSER_SETTINGS[str(b)] = orig
# tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock # tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock

View File

@ -4,14 +4,15 @@
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" /> <meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
<title>{{logbook_entries.0.expedition}} Expo Logbook</title> <title>{{logbook_entries.0.expedition}} Expo Logbook</title>
<link rel="stylesheet" href="../../css/main2.css" /> <link rel="stylesheet" href="../../css/main2.css" />
<style>figure {font-weight: bold; font-size: small; font-family: sans-serif;font-variant-caps: small-caps;}</style>
</head> </head>
<!-- Exported by troggle in this format after having been imported using a different format and a different parser. <!-- Exported by troggle in this format after having been imported using a different format and a different
This is because we are steadily converting old formats to a new common format so that we do not need to maintain half parser. This is because we are steadily converting old formats to a new common format so that we do not need to
a dozen parser functions. maintain half a dozen parser functions.
Exported on {% now 'Y-m-d D' %} using control panel webpage and exportlogbook() in troggle/code/views/other.py Exported on {% now 'Y-m-d D' %} using control panel webpage and exportlogbook() in troggle/code/views/other.py
--> -->
<body> <body>
<h1>Expo {{logbook_entries.0.expedition}}</h1>
{%for logbook_entry in logbook_entries%} {%for logbook_entry in logbook_entries%}
<hr /> <hr />