forked from expo/troggle
866 lines
35 KiB
Python
866 lines
35 KiB
Python
import csv
|
|
import os
|
|
import re
|
|
import time
|
|
from datetime import date, datetime
|
|
from pathlib import Path
|
|
from random import randint
|
|
|
|
from django.conf import settings
|
|
from django.template.defaultfilters import slugify
|
|
from django.utils.timezone import get_current_timezone, make_aware
|
|
|
|
from parsers.people import GetPersonExpeditionNameLookup
|
|
from troggle.core.models.caves import Cave, GetCaveLookup, LogbookEntry, PersonTrip
|
|
from troggle.core.models.troggle import DataIssue, Expedition
|
|
from troggle.core.utils import TROG, save_carefully
|
|
|
|
"""
|
|
Parses and imports logbooks in all their wonderful confusion
|
|
See detailed explanation of the complete process:
|
|
https://expo.survex.com/handbook/computing/logbooks-parsing.html
|
|
"""
|
|
todo = """
|
|
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
|
|
|
|
- remove the TROG things since we need the database for multiuser access? Or not?
|
|
|
|
- profile the code to find bad repetitive things, of which there are many.
|
|
|
|
- far too many uses of Django field dereferencing to get values, which is SLOW
|
|
|
|
- Logbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
|
|
|
|
- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
|
|
we keep only a modern HTML05 format. Then we can retire the old parsers and reduce the
|
|
volume of code here substantially.
|
|
|
|
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
|
|
|
|
- We should ensure logbook.html is utf-8 and stop this crap:
|
|
file_in = open(logbookfile,'rb')
|
|
txt = file_in.read().decode("latin1")
|
|
|
|
- this is a slow and uncertain function: cave = getCaveByReference(caveRef)
|
|
|
|
- use Fixtures https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-loaddata to cache
|
|
data for old logbooks? Not worth it..
|
|
|
|
"""
|
|
# Title length limit; not referenced by any code visible in this module.
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200

# year -> (blog filename, parser function name). Empty now that every blog
# has been merged into that year's logbook file.
BLOG_PARSER_SETTINGS = {
    # "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
    # "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
    # "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
    # "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
}

# Used for any year that has no entry in LOGBOOK_PARSER_SETTINGS.
DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"

# All years since 2002 use the default value for Logbook parser
# dont forget to update expoweb/pubs.htm to match.
# year -> (logbook filename, name of the parser function in this module)
LOGBOOK_PARSER_SETTINGS = {
    "2002": ("logbook.html", "parser_html"),
    **dict.fromkeys(
        ("2001", "2000", "1999", "1998", "1997", "1996", "1995"),
        ("log.htm", "parser_html_01"),
    ),
    **dict.fromkeys(
        ("1994", "1993", "1992", "1991", "1990", "1989", "1988", "1987", "1985", "1984", "1983", "1982"),
        ("logbook.html", "parser_html"),
    ),
}
|
|
|
|
# Expected number of logbook entries per expedition year. The loader compares
# the parsed count against this and prints a mismatch warning when they differ.
entries = {
    # 2010s and later
    "2022": 89,
    "2019": 55, "2018": 95, "2017": 74, "2016": 86, "2015": 80,
    "2014": 65, "2013": 52, "2012": 75, "2011": 71, "2010": 22,
    # 2000s
    "2009": 53, "2008": 49, "2007": 113, "2006": 60, "2005": 55,
    "2004": 76, "2003": 42, "2002": 31, "2001": 49, "2000": 54,
    # 1990s
    "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
    "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87,
    # 1980s
    "1989": 63, "1988": 61, "1987": 34,
    "1985": 24, "1984": 32, "1983": 52, "1982": 42,
}
|
|
|
|
# the entire logbook for one year is a single object: a list of entries
# (the parsers append one tuple per entry)
logentries = []

# Place names that are not caves. NOTE(review): EnterLogIntoDbase compares a
# lowercased place against this list - confirm the mixed-case items are intended.
noncaveplaces = [
    "Journey",
    "Loser Plateau",
    "UNKNOWN",
    "plateau",
    "base camp",
    "basecamp",
    "top camp",
    "topcamp",
]

# Alias of the shared TROG issues dictionary: trip id (or other key) -> message
logdataissues = TROG["issues"]["logdataissues"]

trips = {}
|
|
|
|
#
|
|
# the logbook loading section
|
|
#
|
|
def set_trip_id(year, seq):
    """Build the id of entry number ``seq`` in ``year``, e.g. ``"2019_s03"``.

    ``seq`` must be an integer; it is zero-padded to at least two digits.
    """
    return f"{year}_s{seq:02d}"
|
|
|
|
|
|
# A whole name wrapped in <u>...</u> (any letter case); group 1 is the bare name.
rx_tripperson = re.compile(r"<u>(.*?)</u>$", re.IGNORECASE)

# The shortest run starting at "(" or "[" and ending at ")" or "]".
rx_round_bracket = re.compile(r"[(\[].*?[)\]]")
|
|
|
|
|
|
# Nicknames and forum handles found in logbooks, mapped to the name used for lookup.
# these aliases should be moved to people.py GetPersonExpeditionNameLookup(expedition)
_TRIPPERSON_ALIASES = {
    "Wiggy": "Phil Wigglesworth",
    "Animal": "Mike Richardson",
    "MikeTA": "Mike Richardson",
    "CavingPig": "Elaine Oliver",
    "nobrotson": "Rob Watson",
    "Tinywoman": "Nadia",
    "tcacrossley": "Tom Crossley",
    "Samouse1": "Todd Rye",
}


def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
    """Split a logbook "people" string into the people on the trip.

    Names are separated by ",", "+", "&amp;", a bare "&" or " and ". A name
    wrapped in <u>...</u> marks the author of the entry. Names starting with
    "*" are ignored, and anything in round or square brackets is dropped.

    Args:
        trippeople: the raw people string from the logbook entry.
        expedition: passed to GetPersonExpeditionNameLookup(); its ``year``
            is used in the warning message.
        logtime_underground: value paired with every person in the result.
        tid: trip id, used only for reporting unmatched names.

    Returns:
        ``(res, author)`` where ``res`` is a list of
        ``(personyear, logtime_underground)`` tuples. ``personyear`` is None
        for a name that could not be matched (a DataIssue is recorded).
        If nobody was underlined, the last person listed is the author.
        If the string yields no names at all, returns ``("", 0)``.
    """
    res = []
    author = None
    # Loop-invariant, so look it up once rather than once per name.
    name_lookup = GetPersonExpeditionNameLookup(expedition)

    # "&amp;" has to be tried before a bare "&". The lookahead stops other
    # HTML entities (e.g. "&eacute;") inside a name from being split.
    for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
        tripperson = tripperson.strip()
        mul = rx_tripperson.match(tripperson)
        if mul:
            tripperson = mul.group(1).strip()
        if not tripperson or tripperson[0] == "*":
            continue

        tripperson = rx_round_bracket.sub("", tripperson).strip()
        tripperson = _TRIPPERSON_ALIASES.get(tripperson, tripperson)

        personyear = name_lookup.get(tripperson.lower())
        if not personyear:
            message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this expedition year."
            print(message)
            DataIssue.objects.create(parser="logbooks", message=message)
            logdataissues[tid] = message
        # Unmatched names are still recorded, with personyear None.
        res.append((personyear, logtime_underground))
        if mul:
            author = personyear

    if not author:
        if not res:
            # Kept as-is for callers: an empty iterable and a falsy author.
            return "", 0
        author = res[-1][0]  # nobody underlined: fall back to the last person listed

    return res, author
|
|
|
|
|
|
def _parse_time_underground(raw):
    """Convert a logbook time-underground value to a number of hours.

    Accepts a number, a numeric string, or text such as "T/U: 3.5 hrs".
    Returns 0 for empty, unparseable or non-string values.
    """
    if not raw:
        return 0
    try:
        return float(raw)
    except (TypeError, ValueError):
        pass
    if not isinstance(raw, str):
        return 0
    tu_match = re.match(r"(T/U:\s*)?(\d+[.]?\d*).*", raw)
    return float(tu_match.group(2)) if tu_match else 0


def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
    """saves a logbook entry and related persontrips

    Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !

    troggle.log shows that we are creating lots of duplicates, which is no problem with SQL
    as they just overwrite, but we are saving the same thing too many times.

    Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title,
    because lookupAttribs={'date':date, 'title':title}

    Args:
        date: date of the entry; its first four characters as a string are the year.
        place: place name; looked up (lowercased) in GetCaveLookup() unless it is a non-cave place.
        title: entry title, used with ``date`` as the lookup key for the LogbookEntry.
        text: HTML body; relative image ``src`` paths are rewritten to /years/<year>/.
        trippeople: raw people string, parsed by GetTripPersons().
        expedition: the Expedition the entry belongs to.
        logtime_underground: number or text such as "T/U: 3 hrs".
        tid: trip id used as the slug; a random slug is generated when None.

    Raises:
        Whatever GetTripPersons() raises, after recording a DataIssue.
    """
    logtime_underground = _parse_time_underground(logtime_underground)

    # Key for the in-memory issue list. This used to be the literal "title",
    # so each issue overwrote the last one and was never cleaned up per year.
    issue_key = tid if tid is not None else title

    try:
        trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
    except Exception:
        message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
        DataIssue.objects.create(parser="logbooks", message=message)
        logdataissues[issue_key] = message
        print(message)
        raise

    if not author:
        # Only a warning: the entry is still saved.
        message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
        DataIssue.objects.create(parser="logbooks", message=message)
        logdataissues[issue_key] = message
        print(message)

    # This needs attention. The slug field is derived from 'title'
    # both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
    lplace = place.lower()
    cave = None
    # noncaveplaces has mixed-case items, so compare case-insensitively.
    if lplace not in {p.lower() for p in noncaveplaces}:
        cave = GetCaveLookup().get(lplace)

    y = str(date)[:4]

    # Point relative image links at this year's folder ...
    text = text.replace(' src="', f' src="/years/{y}/')
    text = text.replace(" src='", f" src='/years/{y}/")
    # ... then undo the doubling for links that already had the prefix.
    text = text.replace(f' src="/years/{y}//years/{y}/', f' src="/years/{y}/')
    text = text.replace(f" src='/years/{y}//years/{y}/", f" src='/years/{y}/")

    text = text.replace("\t", "")
    text = text.replace("\n\n\n", "\n\n")

    # Result unused (expeditionday is being deprecated). The call is kept in
    # case it has side effects - NOTE(review): confirm and remove if it has none.
    expedition.get_expedition_day(date)

    if tid is not None:
        slug = tid
    else:
        slug = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")

    lookupAttribs = {"date": date, "title": title}
    nonLookupAttribs = {
        "place": place,
        "text": text,
        "expedition": expedition,
        "time_underground": logtime_underground,
        # 'cave' is converted to a string here (the text "None" when no cave was found).
        # it is a db query which we should try to avoid - rewrite this
        "cave_slug": str(cave),
        "slug": slug,
    }

    # This creates the lbo instance of LogbookEntry
    lbo, _ = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)

    # for PersonTrip time_underground is float (decimal hours)
    for tripperson, time_underground in trippersons:
        lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo}
        nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
        # this creates the PersonTrip instance.
        save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
|
|
|
|
|
|
def ParseDate(tripdate, year):
    """Interprets dates in the expo logbooks and returns a correct datetime.date object

    Two formats are recognised at the start of ``tripdate``:
    ``YYYY-MM-DD`` and ``D/M/YY`` or ``D/M/YYYY`` (day first).

    Args:
        tripdate: the date text from the logbook entry.
        year: the expedition year, e.g. "2019". The parsed date must be in
            this year (ISO form) or this century (slash form).

    Returns:
        The parsed ``date``, or ``date(1970, 1, 1)`` when the text cannot be
        parsed or disagrees with ``year``. A DataIssue is recorded in that case.
    """
    dummydate = date(1970, 1, 1)
    year = str(year)

    def report(message):
        DataIssue.objects.create(parser="logbooks", message=message)
        logdataissues["tripdate"] = message

    try:
        mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
        # Month may be one or two digits, with or without a leading zero.
        mdategoof = re.match(r"(\d\d?)/(\d\d?)/(20|19)?(\d\d)", tripdate)

        if mdatestandard:
            if mdatestandard.group(1) != year:
                report(f" ! - Bad date (year) in logbook: {tripdate} - {year}")
                return dummydate
            return date(int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3)))

        if mdategoof:
            century = mdategoof.group(3)
            if century and century != year[:2]:
                report(" ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + century)
                return dummydate
            # The century comes from the expedition year, not from the text.
            yadd = int(year[:2]) * 100
            return date(int(mdategoof.group(4)) + yadd, int(mdategoof.group(2)), int(mdategoof.group(1)))

        # Report the expedition year (not the 1970 placeholder) so the issue
        # can be found when this year's issues are cleaned up.
        report(f" ! - Bad date in logbook: {tripdate} - {year}")
        return dummydate

    except (TypeError, ValueError):
        # e.g. month 13, day 32, or a non-string tripdate
        report(f" ! - Failed to parse date in logbook: {tripdate} - {year}")
        return dummydate
|
|
|
|
|
|
# 2002 - now (the default parser); LOGBOOK_PARSER_SETTINGS also assigns it to 1982 - 1994
|
|
def _html_entry_regex(first, second):
    """Compile the entry pattern, expecting the ``first`` div before the ``second`` div.

    Groups: anchor id, tripdate id, date, first div text, second div text,
    body text, time underground.
    """
    return re.compile(
        rf"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
            \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
            \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
            \s*<div\s+class="{first}">\s*(.*?)</div>
            \s*<div\s+class="{second}">\s*(.*?)</div>
            ([\s\S]*?)
            \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
            \s*$
            """
    )


_RX_HTML_ENTRY_PEOPLE_FIRST = _html_entry_regex("trippeople", "triptitle")
_RX_HTML_ENTRY_TITLE_FIRST = _html_entry_regex("triptitle", "trippeople")


def parser_html(year, expedition, txt, seq=""):
    """This uses some of the more obscure capabilities of regular expressions,
    see https://docs.python.org/3/library/re.html

    You can't see it here, but a round-trip export-then-import will move
    the endmatter up to the frontmatter. This makes sense when moving
    from parser_html_01 format logfiles, believe me.

    Entries are the chunks of ``txt`` between consecutive ``<hr />`` tags.
    Each parsed entry is appended to the module-level ``logentries`` as
    ``(date, cave, title, text, people, expedition, time_underground, tripdate_id)``.
    Text before the first <hr and after the last <hr /> is written to
    frontmatter.html / endmatter.html in the year's folder under settings.EXPOWEB.

    Args:
        year: expedition year as a string.
        expedition: stored unchanged in each entry tuple.
        txt: the whole logbook file as text.
        seq: unused; present so all parsers share one signature.
    """
    global logentries
    global logdataissues

    # extract front material and stash for later use when rebuilding from list of entries
    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
    if headmatch:
        headpara = headmatch.group(1).strip()
    else:
        # No <body>...<hr in the file: nothing to stash, but do not crash.
        print(f" ! - {year} NO headmatch")
        headpara = ""

    if headpara:
        frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
        # The logbook is read as UTF-8, so write the extract the same way.
        with open(frontpath, "w", encoding="utf-8") as front:
            front.write(headpara + "\n")

    # extract END material and stash for later use when rebuilding from list of entries
    endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
    if endmatch:
        endpara = endmatch.group(1).strip()
    else:
        print(f" ! - {year} NO endmatch")
        endpara = ""

    if endpara:
        endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html")
        with open(endpath, "w", encoding="utf-8") as end:
            end.write(endpara + "\n")

    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for logbook_entry_count, trippara in enumerate(tripparas, start=1):
        tid = set_trip_id(year, logbook_entry_count)

        s = _RX_HTML_ENTRY_PEOPLE_FIRST.match(trippara)
        if s:
            tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
        else:  # allow title and people to be swapped in order
            # The first-pass failure is recorded even if the second pass succeeds.
            msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:50]}'..."
            print(msg)
            DataIssue.objects.create(parser="logbooks", message=msg)
            logdataissues[tid] = msg

            s2 = _RX_HTML_ENTRY_TITLE_FIRST.match(trippara)
            if not s2:
                msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:50]}'..."
                print(msg)
                DataIssue.objects.create(parser="logbooks", message=msg)
                logdataissues[tid] = msg
                continue
            tripid, tripid1, tripdate, triptitle, trippeople, triptext, tu = s2.groups()

        ldate = ParseDate(tripdate.strip(), year)

        # "Cave - what we did": the part before the first " - " is the cave.
        triptitles = triptitle.split(" - ")
        tripcave = triptitles[0] if len(triptitles) >= 2 else "UNKNOWN"

        ltriptext = triptext.replace("</p>", "")
        ltriptext = ltriptext.replace("<p>", "<br /><br />").strip()

        triptitle = triptitle.strip()
        # The id from the tripdate div (may be None) is stored, not the computed tid.
        entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tripid1)
        logentries.append(entrytuple)
|
|
|
|
|
|
# parser for 1995 - 2001 (see LOGBOOK_PARSER_SETTINGS). simpler because the data has been hacked so much to fit it
|
|
def parser_html_01(year, expedition, txt, seq=""):
    """Parse the older "date | title | people" logbook format.

    Entries are the chunks of ``txt`` between ``<hr>`` tags. The first
    paragraph of each is a header of two or three "|"-separated fields.
    Each parsed entry is appended to the module-level ``logentries`` as
    ``(date, cave, title, text, people, expedition, time_underground, tid)``.
    Text before the first <hr and after the last <hr /> is written to
    frontmatter.html / endmatter.html in the year's folder under settings.EXPOWEB.

    Args:
        year: expedition year as a string.
        expedition: stored unchanged in each entry tuple.
        txt: the whole logbook file as text.
        seq: unused; present so all parsers share one signature.

    Raises:
        Any exception raised while handling an entry, after recording a DataIssue.
    """
    global logentries
    global logdataissues

    # extract front material and stash for later use when rebuilding from list of entries
    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
    if headmatch:
        headpara = headmatch.group(1).strip()
    else:
        print(f" ! - {year} NO headmatch")
        headpara = ""

    if headpara:
        frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
        with open(frontpath, "w") as front:
            front.write(headpara + "\n")

    # extract END material and stash for later use when rebuilding from list of entries
    endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
    if endmatch:
        endpara = endmatch.group(1).strip()
    else:
        print(f" ! - {year} NO endmatch")
        endpara = ""

    if endpara:
        endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html")
        with open(endpath, "w") as end:
            end.write(endpara + "\n")

    def report(tid, message):
        DataIssue.objects.create(parser="logbooks", message=message)
        logdataissues[tid] = message
        print(message)

    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    for logbook_entry_count, trippara in enumerate(tripparas, start=1):
        tid = set_trip_id(year, logbook_entry_count)
        try:
            s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
            if not s:
                report(tid, f" ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "...")
                # NOTE(review): this abandons all remaining entries, although
                # the message says only this one is skipped.
                break
            tripheader, triptext = s.group(1), s.group(2)

            # Drop <a>, <b> and <span> tags from the header line.
            tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)

            fields = tripheader.split("|")
            if len(fields) == 3:
                tripdate, triptitle, trippeople = fields
            else:
                report(tid, f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{fields}'")
                if len(fields) != 2:
                    report(
                        tid,
                        f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{fields}' CRASHES MySQL !",
                    )
                    break
                tripdate, triptitle = fields
                trippeople = "GUESS ANON"

            ldate = ParseDate(tripdate.strip(), year)

            # Lift a "T/U ..." paragraph out of the text as the time underground.
            mtu = re.search(r"<p[^>]*>(T/?U.*)", triptext)
            if mtu:
                tu = mtu.group(1)
                triptext = triptext[: mtu.start(0)] + triptext[mtu.end() :]
            else:
                tu = ""

            tripcave = triptitle.split(" - ")[0].strip()

            ltriptext = triptext

            # Trim trailing links, separators and "(same day)" / "(n)" markers.
            mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext)
            if mtail:
                ltriptext = ltriptext[: mtail.start(0)]
            ltriptext = re.sub(r"</p>", "", ltriptext)
            ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
            ltriptext = re.sub(r"</?u>", "_", ltriptext)
            ltriptext = re.sub(r"</?i>", "''", ltriptext)
            ltriptext = re.sub(r"</?b>", "'''", ltriptext)
            ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()

            if ltriptext == "":
                # Recorded as an issue, but the empty entry is still stored.
                report(tid, " ! - Zero content for logbook entry!: " + tid)

            entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tid)
            logentries.append(entrytuple)

        except Exception:
            # Record which entry failed, then let the error propagate. The old
            # "too many errors" counter came after this raise and never ran.
            report(tid, f" ! - Skipping logentry {year} due to exception in: {tid}")
            raise
|
|
|
|
|
|
def parser_blog(year, expedition, txt, sq=""):
    """Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
    Note that the entries have dates and authors, but no titles.
    See detailed explanation of the complete process:
    https://expo.survex.com/handbook/computing/logbooks-parsing.html
    https://expo.survex.com/handbook/computing/log-blog-parsing.html

    This uses some of the more obscure capabilities of regular expressions,
    see https://docs.python.org/3/library/re.html

    BLOG entries have this structure:
    <article ... data-author="Tinywoman" data-content="post-298780" id="js-post-298780">
        <article class="message-body js-selectToQuote">
        </article>
    </article>
    So the content is nested inside the header. Attachments (images) come after the content.

    Each parsed post is appended to the module-level ``logentries`` as
    ``(datetime, "Unknown", title, content, author, expedition, 0, tid)``.

    Args:
        year: expedition year as a string.
        expedition: stored unchanged in each entry tuple.
        txt: the whole saved blog page as text.
        sq: file sequence suffix ("", "2", ...), added to trip ids and titles.
    """
    global logentries
    global logdataissues

    tripheads = re.findall(
        r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
    )
    if not tripheads:
        message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
        print(message)

    # (?= is a non-consuming match, see https://docs.python.org/3/library/re.html
    tripparas = re.findall(
        r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(</article[^>]*>)([\s\S]*?)(?=</article)", txt
    )
    if not tripparas:
        message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
        print(message)

    if len(tripheads) != len(tripparas):
        print(f"{len(tripheads)} != {len(tripparas)}")
        print(f"{len(tripheads)} - {len(tripparas)}")

    def report(tid, message):
        DataIssue.objects.create(parser="logbooks", message=message)
        logdataissues[tid] = message
        print(message)

    tu = 0
    # zip() stops at the shorter list, so a count mismatch (reported above)
    # no longer raises IndexError on tripheads.
    for logbook_entry_count, (triphead, tripstuff) in enumerate(zip(tripheads, tripparas), start=1):
        # tripstuff = (content, closing </article> tag, attachments)
        attach = tripstuff[2]
        # note use on non-greedy *? regex idiom here
        attach = re.sub(r"<div class=\"file-content\">[\s\S]*?(?=</li>)", "", attach)
        attach = re.sub(r"<footer[\s\S]*(</footer>)", "", attach)
        tripcontent = tripstuff[0] + attach

        tid = set_trip_id(year, logbook_entry_count) + "_blog" + sq

        # data-author="tcacrossley"
        match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
        if not match_author:
            report(
                tid,
                f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid}  {triphead[:400]}...",
            )
            # NOTE(review): abandons the remaining posts, not only this one.
            break
        trippeople = match_author.group(1)

        # datetime="2019-07-11T13:16:18+0100"
        match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
        if not match_datetime:
            report(
                tid,
                f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid}  {triphead[:400]}...",
            )
            break
        datestamp = match_datetime.group(1)

        try:
            tripdate = datetime.fromisoformat(datestamp)
        except ValueError:
            report(tid, f" ! - FROMISOFORMAT fail logentry {year}:{logbook_entry_count} {tid} '{datestamp}'")
            # fallback, ignore the timestamp bits:
            tripdate = datetime.fromisoformat(datestamp[0:10])

        # tripname must have the location then a hyphen at the beginning as it is ignored by export function
        location = "Unknown"
        tripname = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"  # must be unique for a given date

        # Strip fixed image sizes, surplus blank lines and <hr> separators.
        tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
        tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
        tripcontent = re.sub(r"width: \d+px", "", tripcontent)
        tripcontent = re.sub(r"\n\n+", "\n\n", tripcontent)
        tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
        tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent

        entrytuple = (tripdate, location, tripname, tripcontent, trippeople, expedition, tu, tid)
        logentries.append(entrytuple)
|
|
|
|
|
|
def LoadLogbookForExpedition(expedition, clean=True):
    """Parses all logbook entries for one expedition

    if clean==True then it deletes all entries for this year first.

    The file and parser come from LOGBOOK_PARSER_SETTINGS, or the defaults
    when the year is not listed. Files "<stem>", "<stem>2", "<stem>3",
    "<stem>4" are tried in order, stopping at the first one that is missing.

    Args:
        expedition: the Expedition to load; ``expedition.year`` is a string.
        clean: when true, delete this year's DataIssues, in-memory issues
            and LogbookEntry rows before parsing.

    Returns:
        The number of entries parsed (the length of ``logentries``).
    """
    # absolutely horrid. REFACTOR THIS (all my fault..)
    global logentries

    logentries = []
    year = expedition.year
    # None when no expected count is recorded for this year (the old
    # entries[year] raised KeyError for such years).
    expect = entries.get(year)
    expologbase = os.path.join(settings.EXPOWEB, "years")

    def cleanerrors(year):
        """Delete the stored and in-memory issues that belong to ``year``."""
        for di in DataIssue.objects.filter(parser="logbooks"):
            if year in di.message:
                di.delete()
        # Keys are normally trip ids such as "2019_s03", but None is possible
        # when an entry had no id, hence str().
        stale = [key for key in logdataissues if str(key).startswith(year)]
        for key in stale:
            del logdataissues[key]

    if clean:
        cleanerrors(year)

    yearfile, parsefunc = LOGBOOK_PARSER_SETTINGS.get(year, (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER))
    logbookpath = Path(yearfile)
    expedition.logbookfile = yearfile
    expedition.save()

    if clean:
        for lbe in LogbookEntry.objects.filter(expedition=expedition):
            lbe.delete()

    for sq in ["", "2", "3", "4"]:  # cope with blog saved as many separate files
        lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
        if not lb.is_file():
            # End of sequence: the next file is not there.
            break
        try:
            txt = lb.read_bytes().decode("utf-8")
        except (OSError, UnicodeDecodeError):
            # UnicodeDecodeError is not an OSError, so it must be named here
            # for this message to be shown for badly encoded files.
            print(f" ! Couldn't open logbook as UTF-8 {lb}")
            continue
        except Exception:
            print(f" ! Very Bad Error opening {lb}")
            continue

        # --------------------
        parser = globals()[parsefunc]
        print(f" - {year} parsing with {parsefunc} - {lb}")
        parser(year, expedition, txt, sq)  # this launches the right parser for this year
        # --------------------

        # NOTE(review): logentries accumulates across files, so entries from
        # earlier files in the sequence are saved again here - confirm intended.
        dupl = {}
        for entrytuple in logentries:
            ldate, tripcave, triptitle, text, trippeople, entry_expedition, logtime_underground, tripid1 = entrytuple
            # Make titles unique per date by numbering the repeats.
            check = (ldate, triptitle)
            if check in dupl:
                dupl[check] += 1
                triptitle = f"{triptitle} #{dupl[check]}"
                print(f" - {triptitle} -- {ldate}")
            else:
                dupl[check] = 1
            EnterLogIntoDbase(
                ldate, tripcave, triptitle, text, trippeople, entry_expedition, logtime_underground, tripid1
            )

    if expect is not None and len(logentries) != expect:
        print(f"Mismatch in number of log entries: {year} {len(logentries):5d} is not {expect}\n")

    return len(logentries)
|
|
|
|
|
|
def LoadLogbook(year):
    """One off logbook for testing purposes

    Loads the logbook for ``year`` and then, if the year is listed in
    BLOG_PARSER_SETTINGS, its blog as well (without cleaning first).

    Args:
        year: the expedition year; it is replaced by ``expo.year`` after lookup.
    """
    TROG["pagecache"]["expedition"][year] = None  # clear cache

    expo = Expedition.objects.get(year=year)
    year = expo.year  # some type funny
    LoadLogbookForExpedition(expo)  # this actually loads the logbook for one expo

    if year not in BLOG_PARSER_SETTINGS:
        print(
            f"Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
        )
        return

    print("BLOG parsing")
    # Swap in the blog parser for this year, then put the original setting
    # back so a later load of the same year does not use the blog parser.
    missing = object()
    orig = LOGBOOK_PARSER_SETTINGS.get(year, missing)
    LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year]
    try:
        LoadLogbookForExpedition(expo, clean=False)  # this loads the blog logbook for one expo
    finally:
        if orig is missing:
            del LOGBOOK_PARSER_SETTINGS[year]
        else:
            LOGBOOK_PARSER_SETTINGS[year] = orig
|
|
|
|
|
|
def LoadLogbooks():
    """This is the master function for parsing all logbooks into the Troggle database.
    This should be rewritten to use coroutines to load all logbooks from disc in parallel,
    but must be serialised to write to database as sqlite is single-user.

    Clears all "logbooks" DataIssues, loads the logbook of every expedition
    that has an expected entry count in ``entries``, then loads the blog for
    each year listed in BLOG_PARSER_SETTINGS, and prints the total.
    """
    # Clear in place: rebinding the name would detach this module from the
    # dictionary shared through TROG["issues"]["logdataissues"].
    logdataissues.clear()
    DataIssue.objects.filter(parser="logbooks").delete()

    expos = Expedition.objects.all()
    if len(expos) <= 1:
        message = " ! - No expeditions found. Load 'people' first"
        DataIssue.objects.create(parser="logbooks", message=message)
        logdataissues["sqlfail 0000"] = message
        print(message)
        return

    noexpo = [
        "1986",
        "2020",
        "2021",
    ]  # no expo
    lostlogbook = ["1976", "1977", "1978", "1979", "1980", "1981"]
    sqlfail = [""]  # breaks mysql with db constraint fail - all now fixed.
    nologbook = noexpo + lostlogbook + sqlfail

    nlbe = {}
    loglist = []
    bloglist = []

    for expo in expos:  # pointless as we explicitly know the years in this code.
        year = expo.year
        TROG["pagecache"]["expedition"][year] = None  # clear cache
        if year in sqlfail:
            print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
            message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
            DataIssue.objects.create(parser="logbooks", message=message)
            logdataissues[f"sqlfail {year}"] = message
            print(message)

        if year not in nologbook:
            if year in entries:
                loglist.append(expo)
            else:
                print(" - No Logbook yet for: " + year)  # catch case when preparing for next expo

            if year in BLOG_PARSER_SETTINGS:
                bloglist.append(expo)

    for ex in loglist:
        nlbe[ex] = LoadLogbookForExpedition(ex)  # this loads the logbook for one expo

    for b in bloglist:
        # Keyed by year: bloglist was built from expo.year, and the loader
        # looks the parser up by expedition.year.
        year = b.year
        orig = LOGBOOK_PARSER_SETTINGS.get(year, (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER))
        LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year]
        print(f" - BLOG: {b}")
        try:
            # Add to the logbook count for this expo rather than replacing it.
            nlbe[b] = nlbe.get(b, 0) + LoadLogbookForExpedition(b, clean=False)
        finally:
            LOGBOOK_PARSER_SETTINGS[year] = orig

    # tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
    yt = sum(nlbe.values())
    print(f"total {yt:,} log entries parsed in all expeditions")
|
|
|
|
|
|
# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
|
|
# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
|
|
# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
|
|
# reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
|
|
# personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
|
|
# nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
|
|
# TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
|
|
# locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
|
|
# caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
|