forked from expo/troggle
parse several UK caving blogs per year - working
This commit is contained in:
parent
5e9fd7fd77
commit
f80e4efed8
@ -27,11 +27,9 @@ Also has code to download a logbook in a choice of formats (why?!)
|
||||
'''
|
||||
|
||||
todo = '''
|
||||
- Check that the logbookdownloader works by testing with a round trip.
|
||||
|
||||
- Use it to convert all older logbooks into the 2005-variant of HTML then we can
|
||||
get rid of the parsers for older formats. There are no images stored in the database,
|
||||
so this is only a tool for a first pass, to be followed by extensive hand-editing!
|
||||
- Use logbookdownloader to convert all older logbooks into the 2005-variant of HTML then we can
|
||||
get rid of the parsers for older formats.
|
||||
When we have done all the old logbooks, delete this function and the two templates.
|
||||
|
||||
|
||||
@ -152,7 +150,7 @@ def exportlogbook(request,year=None,extension=None):
|
||||
for the current year. Formats available are HTML2005 (others old & broken or not written yet)
|
||||
|
||||
There are no images stored in the database, so this is only a tool for a first pass, to be followed by
|
||||
hand-editing. However links to images work int he HTML text of a logbook entry
|
||||
hand-editing. However links to images work in the HTML text of a logbook entry
|
||||
|
||||
NEED TO ADD IN THE MATERIAL WHICH IS NOT IN ANY LBE ! e.g. front matter.
|
||||
|
||||
@ -184,6 +182,18 @@ def exportlogbook(request,year=None,extension=None):
|
||||
t=loader.get_template(template)
|
||||
logbookfile = (t.render({'logbook_entries':logbook_entries}))
|
||||
|
||||
frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
|
||||
if frontpath.is_file():
|
||||
try:
|
||||
with open(frontpath,"r") as front:
|
||||
frontmatter = front.read()
|
||||
except:
|
||||
print(" ! Very Bad Error opening " + frontpath)
|
||||
logbookfile = re.sub(r"<body>", "<body>\n"+frontmatter , logbookfile)
|
||||
else:
|
||||
logbookfile = re.sub(r"<body>", f"<body>\n<h1>Expo {year}</h1>\n", logbookfile)
|
||||
|
||||
|
||||
dir = Path(settings.EXPOWEB) / "years" / year
|
||||
filepath = Path(dir, filename)
|
||||
with(open(filepath, 'w')) as lb:
|
||||
|
@ -43,8 +43,9 @@ def import_logbooks():
|
||||
|
||||
def import_logbook(year=2019):
|
||||
print(f"-- Importing Logbook {year}")
|
||||
with transaction.atomic():
|
||||
troggle.parsers.logbooks.LoadLogbook(year, format="blog")
|
||||
print(f"-- - commented out")
|
||||
# with transaction.atomic():
|
||||
# troggle.parsers.logbooks.LoadLogbook(year, format="cucc")
|
||||
|
||||
def import_QMs():
|
||||
print("-- Importing old QMs for 161, 204, 234 from CSV files")
|
||||
|
@ -51,8 +51,13 @@ data for old logbooks. New design needed, with a mechanism for flagging fixtures
|
||||
|
||||
'''
|
||||
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
|
||||
DEFAULT_LOGBOOK_PARSER = "parser_html"
|
||||
BLOG_PARSER_SETTINGS = {
|
||||
"2017": ("ukcavingblog.html", "parser_blog"),
|
||||
"2019": ("ukcavingblog.html", "parser_blog"),
|
||||
"2022": ("ukcavingblog.html", "parser_blog"),
|
||||
}
|
||||
DEFAULT_LOGBOOK_FILE = "logbook.html"
|
||||
DEFAULT_LOGBOOK_PARSER = "parser_html"
|
||||
# All years since 2010 use the default value for Logbook parser
|
||||
# but several don't work, and are skipped by the parsing code, e.g. 1983
|
||||
LOGBOOK_PARSER_SETTINGS = {
|
||||
@ -89,11 +94,11 @@ LOGBOOK_PARSER_SETTINGS = {
|
||||
"1982": ("log.htm", "parser_html_01"),
|
||||
}
|
||||
|
||||
entries = { "2022": 64, "2019": 56, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
|
||||
entries = { "2022": 64, "2019": 56, "2018": 75, "2017": 61, "2016": 81, "2015": 79,
|
||||
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
|
||||
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
|
||||
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
|
||||
"1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
|
||||
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
|
||||
"1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
|
||||
"1985": 24, "1984": 32, "1983": 52, "1982": 42,}
|
||||
# Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
|
||||
|
||||
@ -258,7 +263,7 @@ def ParseDate(tripdate, year):
|
||||
return datetime.date(1970, 1, 1)
|
||||
|
||||
# (2006 - not any more), 2008 - 2009
|
||||
def wiki_parser(year, expedition, txt):
|
||||
def wiki_parser(year, expedition, txt, seq=""):
|
||||
global logentries
|
||||
global logdataissues
|
||||
|
||||
@ -300,10 +305,20 @@ def wiki_parser(year, expedition, txt):
|
||||
|
||||
# 2002, 2004, 2005, 2007, 2010 - now
|
||||
# 2006 wiki text is incomplete, but the html is all there. So using this parser now.
|
||||
def parser_html(year, expedition, txt):
|
||||
def parser_html(year, expedition, txt, seq=""):
|
||||
global logentries
|
||||
global logdataissues
|
||||
|
||||
# extract front material and stash for later use when rebuilding from list of entries
|
||||
headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
|
||||
headpara = headmatch.groups()[0].strip()
|
||||
|
||||
# print(f" - headpara:\n'{headpara}'")
|
||||
if(len(headpara)>0):
|
||||
frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
|
||||
with open(frontpath,"w") as front:
|
||||
front.write(headpara+"\n")
|
||||
|
||||
tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
|
||||
logbook_entry_count = 0
|
||||
for trippara in tripparas:
|
||||
@ -323,7 +338,7 @@ def parser_html(year, expedition, txt):
|
||||
if s:
|
||||
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
|
||||
else: # allow title and people to be swapped in order
|
||||
msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:40]}'..."
|
||||
msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:50]}'..."
|
||||
print(msg)
|
||||
DataIssue.objects.create(parser='logbooks', message=msg)
|
||||
logdataissues[tid]=msg
|
||||
@ -340,11 +355,11 @@ def parser_html(year, expedition, txt):
|
||||
if s2:
|
||||
tripid, tripid1, tripdate, triptitle, trippeople, triptext, tu = s2.groups()
|
||||
else:
|
||||
if not re.search(r"Rigging Guide", trippara):
|
||||
msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:40]}'..."
|
||||
print(msg)
|
||||
DataIssue.objects.create(parser='logbooks', message=msg)
|
||||
logdataissues[tid]=msg
|
||||
# if not re.search(r"Rigging Guide", trippara):
|
||||
msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:50]}'..."
|
||||
print(msg)
|
||||
DataIssue.objects.create(parser='logbooks', message=msg)
|
||||
logdataissues[tid]=msg
|
||||
continue
|
||||
|
||||
ldate = ParseDate(tripdate.strip(), year)
|
||||
@ -364,11 +379,21 @@ def parser_html(year, expedition, txt):
|
||||
|
||||
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
|
||||
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
|
||||
def parser_html_01(year, expedition, txt):
|
||||
def parser_html_01(year, expedition, txt, seq=""):
|
||||
global logentries
|
||||
global logdataissues
|
||||
errorcount = 0
|
||||
|
||||
# extract front material and stash for later use when rebuilding from list of entries
|
||||
headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
|
||||
headpara = headmatch.groups()[0].strip()
|
||||
|
||||
# print(f" - headpara:\n'{headpara}'")
|
||||
if(len(headpara)>0):
|
||||
frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
|
||||
with open(frontpath,"w") as front:
|
||||
front.write(headpara+"\n")
|
||||
|
||||
tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
|
||||
logbook_entry_count = 0
|
||||
for trippara in tripparas:
|
||||
@ -472,8 +497,8 @@ def parser_html_01(year, expedition, txt):
|
||||
print(message)
|
||||
return
|
||||
|
||||
def parser_blog(year, expedition, txt):
|
||||
'''Parses the format of web pages collected as 'Save As HTML" fromt eh UK Caving blog website.
|
||||
def parser_blog(year, expedition, txt, sq=""):
|
||||
'''Parses the format of web pages collected as "Save As HTML" from the UK Caving blog website.
|
||||
Note that the entries have dates and authors, but no titles.
|
||||
'''
|
||||
global logentries
|
||||
@ -494,14 +519,13 @@ def parser_blog(year, expedition, txt):
|
||||
print(f"{len(tripheads)} != {len(tripparas)}")
|
||||
|
||||
location = "Plateau"
|
||||
tripname = "UK Caving Blog post"
|
||||
tu = 0
|
||||
logbook_entry_count = 0
|
||||
for i in range(0, len(tripparas)):
|
||||
trippara = tripparas[i]
|
||||
triphead = tripheads[i]
|
||||
logbook_entry_count += 1
|
||||
tid = set_trip_id(year,logbook_entry_count) +"_blog"
|
||||
tid = set_trip_id(year,logbook_entry_count) +"_blog" + sq
|
||||
# print(f" - tid: {tid}")
|
||||
|
||||
# data-author="tcacrossley"
|
||||
@ -514,7 +538,7 @@ def parser_blog(year, expedition, txt):
|
||||
break
|
||||
trippeople = match_author.group(1)
|
||||
# print(f" - tid: {tid} {trippeople}")
|
||||
# datetime="2019-07-11T13:16:18+0100"
|
||||
# datetime="2019-07-11T13:16:18+0100"
|
||||
match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
|
||||
if not ( match_datetime ) :
|
||||
message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
|
||||
@ -527,19 +551,25 @@ def parser_blog(year, expedition, txt):
|
||||
try:
|
||||
tripdate = datetime.fromisoformat(datestamp)
|
||||
except:
|
||||
print(datestamp[0:9])
|
||||
message = f" ! - FROMISOFORMAT fail logentry {year}:{logbook_entry_count} {tid} '{datestamp}'"
|
||||
DataIssue.objects.create(parser='logbooks', message=message)
|
||||
logdataissues[tid]=message
|
||||
print(message)
|
||||
# fallback, ignore the timestamp bits:
|
||||
tripdate = datetime.fromisoformat(datestamp[0:10])
|
||||
# print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
|
||||
print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
|
||||
|
||||
tripname = f"UK Caving Blog post {logbook_entry_count}" # must be unique for a given date
|
||||
tripname = f"UK Caving Blog{sq} post {logbook_entry_count}" # must be unique for a given date
|
||||
tripcontent = trippara + f"\n\nBlog Author: {trippeople}"
|
||||
|
||||
entrytuple = (tripdate, location, tripname, trippara,
|
||||
entrytuple = (tripdate, location, tripname, tripcontent,
|
||||
trippeople, expedition, tu, tid)
|
||||
logentries.append(entrytuple)
|
||||
|
||||
|
||||
def LoadLogbookForExpedition(expedition, clean=True):
|
||||
""" Parses all logbook entries for one expedition
|
||||
if clean==True then it deletes all entries for this year first.
|
||||
"""
|
||||
global logentries
|
||||
# absolutely horrid. REFACTOR THIS (all my fault..)
|
||||
@ -580,13 +610,13 @@ def LoadLogbookForExpedition(expedition, clean=True):
|
||||
|
||||
if year in yearlinks:
|
||||
yearfile, yearparser = yearlinks[year]
|
||||
logbookpath = Path(expologbase) / year / yearfile
|
||||
logbookpath = Path(yearfile)
|
||||
expedition.logbookfile = yearfile
|
||||
parsefunc = yearparser
|
||||
# print(f" - Logbook file {yearfile} using parser {yearparser}")
|
||||
|
||||
else:
|
||||
logbookpath = Path(expologbase) / year / DEFAULT_LOGBOOK_FILE
|
||||
logbookpath = Path(DEFAULT_LOGBOOK_FILE)
|
||||
expedition.logbookfile = DEFAULT_LOGBOOK_FILE
|
||||
parsefunc = DEFAULT_LOGBOOK_PARSER
|
||||
|
||||
@ -597,34 +627,39 @@ def LoadLogbookForExpedition(expedition, clean=True):
|
||||
for lbe in lbes:
|
||||
lbe.delete()
|
||||
|
||||
try:
|
||||
file_in = open(logbookpath,'rb')
|
||||
txt = file_in.read().decode("utf-8")
|
||||
file_in.close()
|
||||
logbook_parseable = True
|
||||
except (IOError):
|
||||
logbook_parseable = False
|
||||
print(" ! Couldn't open logbook as UTF-8 " + logbookpath)
|
||||
except:
|
||||
logbook_parseable = False
|
||||
print(" ! Very Bad Error opening " + logbookpath)
|
||||
|
||||
if logbook_parseable:
|
||||
parser = globals()[parsefunc]
|
||||
print(f' - {year} parsing with {parsefunc}')
|
||||
parser(year, expedition, txt) # this launches the right parser for this year
|
||||
|
||||
i=0
|
||||
for entrytuple in logentries:
|
||||
# date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||
for sq in ["", "2", "3", "4"]: # cope with blog saved as many separate files
|
||||
lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
|
||||
if not (lb.is_file()):
|
||||
# print(f" ! End of blog. Next blog file in sequence not there:{lb}")
|
||||
break
|
||||
try:
|
||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||
except ValueError: # cope with removal of entry_type but still in cache files. Remove in Dec. 2022.
|
||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
|
||||
print(f' - Exception entry_type "{entry_type}" {tripid1}')
|
||||
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
|
||||
tripid1)
|
||||
i +=1
|
||||
with open(lb,'rb') as file_in:
|
||||
txt = file_in.read().decode("utf-8")
|
||||
logbook_parseable = True
|
||||
except (IOError):
|
||||
logbook_parseable = False
|
||||
print(f" ! Couldn't open logbook as UTF-8 {lb}")
|
||||
except:
|
||||
logbook_parseable = False
|
||||
print(f" ! Very Bad Error opening {lb}")
|
||||
|
||||
if logbook_parseable:
|
||||
|
||||
# --------------------
|
||||
parser = globals()[parsefunc]
|
||||
print(f' - {year} parsing with {parsefunc} - {lb}')
|
||||
parser(year, expedition, txt, sq) # this launches the right parser for this year
|
||||
# --------------------
|
||||
|
||||
for entrytuple in logentries:
|
||||
# date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||
try:
|
||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||
except ValueError: # cope with removal of entry_type but still in cache files. Remove in Dec. 2022.
|
||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
|
||||
print(f' - Exception entry_type "{entry_type}" {tripid1}')
|
||||
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
|
||||
tripid1)
|
||||
|
||||
if len(logentries) == expect:
|
||||
# print(f"OK {year} {len(logentries):5d} is {expect}\n")
|
||||
@ -634,19 +669,19 @@ def LoadLogbookForExpedition(expedition, clean=True):
|
||||
|
||||
return len(logentries)
|
||||
|
||||
def LoadLogbook(year, format="cucc"):
|
||||
global LOGBOOK_PARSER_SETTINGS
|
||||
# def LoadLogbook(year, format="cucc"):
|
||||
# global LOGBOOK_PARSER_SETTINGS
|
||||
|
||||
nlbe={}
|
||||
TROG['pagecache']['expedition'][year] = None # clear cache
|
||||
# nlbe={}
|
||||
# TROG['pagecache']['expedition'][year] = None # clear cache
|
||||
|
||||
expo = Expedition.objects.get(year=year)
|
||||
# expo = Expedition.objects.get(year=year)
|
||||
|
||||
if (format=="blog"):
|
||||
LOGBOOK_PARSER_SETTINGS[str(year)] = ("ukcavingblog.html", "parser_blog")
|
||||
# print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
|
||||
# if (format=="blog"):
|
||||
# LOGBOOK_PARSER_SETTINGS[str(year)] = BLOG_PARSER_SETTINGS[str(year)]
|
||||
# # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
|
||||
|
||||
nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
|
||||
# nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
|
||||
|
||||
def LoadLogbooks():
|
||||
""" This is the master function for parsing all logbooks into the Troggle database.
|
||||
@ -671,7 +706,7 @@ def LoadLogbooks():
|
||||
sqlfail = ["1987", "1988", "1989"] # breaks mysql with db constraint fail - debug locally first
|
||||
nologbook = noexpo + lostlogbook + sqlfail
|
||||
|
||||
blogs = ["2019"]
|
||||
# blogs = ["2019"]
|
||||
|
||||
nlbe={}
|
||||
expd ={}
|
||||
@ -694,17 +729,21 @@ def LoadLogbooks():
|
||||
else:
|
||||
print(" - No Logbook yet for: " + year) # catch case when preparing for next expo
|
||||
|
||||
if year in blogs:
|
||||
if year in BLOG_PARSER_SETTINGS:
|
||||
bloglist.append(expo)
|
||||
|
||||
|
||||
for ex in loglist:
|
||||
nlbe[ex] = LoadLogbookForExpedition(ex) # this actually loads the logbook for one expo
|
||||
nlbe[ex] = LoadLogbookForExpedition(ex) # this loads the logbook for one expo
|
||||
|
||||
for b in bloglist:
|
||||
orig = LOGBOOK_PARSER_SETTINGS[str(b)]
|
||||
LOGBOOK_PARSER_SETTINGS[str(b)] = ("ukcavingblog.html", "parser_blog")
|
||||
nlbe[b] = LoadLogbookForExpedition(b, clean=False) # this actually loads the logbook for one expo
|
||||
if str(b) in LOGBOOK_PARSER_SETTINGS:
|
||||
orig = LOGBOOK_PARSER_SETTINGS[str(b)]
|
||||
else:
|
||||
orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
|
||||
LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
|
||||
print(f" - BLOG: {b}")
|
||||
nlbe[b] = LoadLogbookForExpedition(b, clean=False) # this loads the blog logbook for one expo
|
||||
LOGBOOK_PARSER_SETTINGS[str(b)] = orig
|
||||
|
||||
# tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
|
||||
|
@ -4,14 +4,15 @@
|
||||
<meta http-equiv="Content-Type" content="text/html; charset=utf-8" />
|
||||
<title>{{logbook_entries.0.expedition}} Expo Logbook</title>
|
||||
<link rel="stylesheet" href="../../css/main2.css" />
|
||||
<style>figure {font-weight: bold; font-size: small; font-family: sans-serif;font-variant-caps: small-caps;}</style>
|
||||
</head>
|
||||
<!-- Exported by troggle in this format after having been imported using a different format and a different parser.
|
||||
This is because we are steadily converting old formats to a new common format so that we do not need to maintain half
|
||||
a dozen parser functions.
|
||||
<!-- Exported by troggle in this format after having been imported using a different format and a different
|
||||
parser. This is because we are steadily converting old formats to a new common format so that we do not need to
|
||||
maintain half a dozen parser functions.
|
||||
|
||||
Exported on {% now 'Y-m-d D' %} using control panel webpage and exportlogbook() in troggle/code/views/other.py
|
||||
-->
|
||||
<body>
|
||||
<h1>Expo {{logbook_entries.0.expedition}}</h1>
|
||||
{%for logbook_entry in logbook_entries%}
|
||||
<hr />
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user