import os
import re
import sys
import time
from datetime import date, datetime
from pathlib import Path
from random import randint
from django.conf import settings
from django.template.defaultfilters import slugify
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
from troggle.core.models.caves import GetCaveLookup
from troggle.core.models.logbooks import LogbookEntry, PersonLogEntry
from troggle.core.models.troggle import DataIssue, Expedition
from troggle.core.utils import get_process_memory
"""
Parses and imports logbooks in all their wonderful confusion
See detailed explanation of the complete process:
https://expo.survex.com/handbook/computing/logbooks-parsing.html
"""
todo = """
- Most of the time is during the database writing (6s out of 8s).
- this is a slow and uncertain function too: cave = getCaveByReference(caveRef)
- pre-compile all the heavily used regular expressions !
- profile the code to find bad repetitive things, of which there are many.
- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted
- replace explicit 1970 date with a constant EPOCH
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
- We should ensure logbook.html is utf-8 and stop this crap:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
- use Fixtures https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-loaddata to cache
data for old logbooks? Not worth it..
"""
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
BLOG_PARSER_SETTINGS = { # no default, must be explicit
# "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
}
DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"
# All years now (Jan. 2023) use the default value for the logbook parser;
# don't forget to update expoweb/pubs.htm to match. 1982 is left as a reminder of the expected format.
LOGBOOK_PARSER_SETTINGS = {
"1982": ("logbook.html", "parser_html"),
}
ENTRIES = {
"2022": 90,
"2019": 55,
"2018": 95,
"2017": 74,
"2016": 86,
"2015": 80,
"2014": 67,
"2013": 52,
"2012": 76,
"2011": 71,
"2010": 22,
"2009": 53,
"2008": 49,
"2007": 113,
"2006": 60,
"2005": 55,
"2004": 76,
"2003": 42,
"2002": 31,
"2001": 49,
"2000": 54,
"1999": 79,
"1998": 43,
"1997": 53,
"1996": 95,
"1995": 42,
"1994": 32,
"1993": 41,
"1992": 62,
"1991": 39,
"1990": 87,
"1989": 63,
"1988": 61,
"1987": 34,
"1985": 24,
"1984": 32,
"1983": 52,
"1982": 42,
}
logentries = [] # the entire logbook for one year is a single object: a list of entries
noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
def set_trip_id(year, seq):
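    """Builds the default trip id for the seq-th entry of a year, e.g.
    set_trip_id("1996", 3) -> "1996_s03"."""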
tid = f"{year}_s{seq:02d}"
return tid
rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$")
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
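    """Splits a trip-people string such as "Fred, Jim + Bob and Kate" into a list of
    (PersonExpedition, logtime_underground) tuples. The entry author is whoever is
    wrapped in <u>..</u> underline tags; if nobody is underlined, the last person
    found is used. Returns ("", 0) when no people can be parsed at all, otherwise
    (list_of_tuples, author).
    """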
res = []
author = None
# print(f'# {tid}')
# print(f" - {tid} '{trippeople}' ")
    for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
tripperson = tripperson.strip()
        # mul = re.match(r"(?i)<u>(.*?)</u>$", tripperson)
mul = rx_tripperson.match(tripperson)
if mul:
tripperson = mul.group(1).strip()
if tripperson and tripperson[0] != "*":
tripperson = re.sub(rx_round_bracket, "", tripperson).strip()
# Whacky aliases all handled in GetPersonExpeditionNameLookup()
personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
if not personyear:
message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this expedition year."
print(message)
DataIssue.objects.create(parser="logbooks", message=message)
res.append((personyear, logtime_underground))
if mul:
author = personyear
if not author:
if not res:
return "", 0
author = res[-1][0] # the previous valid person and a time of 0 hours
# print(f" - {tid} [{author.person}] '{res[0][0].person}'...")
return res, author
def tidy_time_underground(logtime_underground):
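    """Coerces the logbook T/U (time underground) field to a float number of hours:
    "T/U: 3.5 hrs" -> 3.5, "2" -> 2.0, None or unparseable text -> 0."""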
# Nasty hack, must tidy this up..
if logtime_underground:
try:
logtime_underground = float(logtime_underground)
        except ValueError:
# print(f"logtime_underground = {logtime_underground}")
tu_match = re.match(r"(T/U:\s*)?(\d+[.]?\d*).*", logtime_underground)
if tu_match:
# print(f"logtime_underground = {tu_match.group(2)}")
logtime_underground = float(tu_match.group(2))
else:
logtime_underground = 0
else:
logtime_underground = 0
return logtime_underground
def tidy_trip_persons(trippeople, expedition, logtime_underground, tid):
try:
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
# print(f" - {author} - {logtime_underground}")
    except:
        message = f" ! - {expedition.year} Skipping logentry: {tid} - GetTripPersons FAIL"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)
        # raise
        return [], None
    if not author:
        message = f" ! - {expedition.year} Warning: no expo member author for logentry '{tid}'"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)
return trippersons, author
def tidy_trip_cave(place):
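    """Looks up the (lowercased) place name in the cave alias table from GetCaveLookup();
    returns None for known non-cave places such as "base camp" or "travel"."""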
    # GetCaveLookup() needs to work better. None of this data is *used* though?
    # 'tripcave' is converted to a string by doing this, which renders as the cave slug.
lplace = place.lower()
cave = None
if lplace not in noncaveplaces:
cave = GetCaveLookup().get(lplace)
return cave
def tidy_trip_image_urls(text, date):
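    """Rewrites relative image src attributes to point at the year's directory, e.g.
    for a 1996 entry ' src="i/pic.jpg"' becomes ' src="/years/1996/i/pic.jpg"'.
    The second pair of replacements undoes any accidental double-prefixing."""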
y = str(date)[:4]
text = text.replace(' src="', f' src="/years/{y}/')
text = text.replace(" src='", f" src='/years/{y}/")
text = text.replace(f' src="/years/{y}//years/{y}/', f' src="/years/{y}/')
text = text.replace(f" src='/years/{y}//years/{y}/", f" src='/years/{y}/")
text = text.replace("\t", "")
text = text.replace("\n\n\n", "\n\n")
return text
def tidy_tid(tid, title):
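    """Returns tid unchanged if it is set; otherwise invents one from the title,
    e.g. "Kaninchenhöhle rigging" gives something like "4821_kaninchenh"
    (a random four-digit prefix plus the first ten slug characters)."""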
if tid is not None:
return tid
# print(f"! {title=} ")
tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
return tid
def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid):
"""saves a single logbook entry and related personlogentry items
"""
nonLookupAttribs = {
"place": place,
"text": text,
"expedition": expedition,
"time_underground": logtime_underground,
"cave_slug": str(tripcave),
"slug": tid,
}
lookupAttribs = {"date": date, "title": title}
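    # Both dicts are passed straight to create(); the lookup/non-lookup split mirrors
    # Django's get_or_create() convention but no lookup is actually performed here.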
lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
for tripperson, time_underground in trippersons:
lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo}
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
        PersonLogEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
def parser_date(tripdate, year):
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
dummydate = date(1970, 1, 1) # replace with _EPOCH
month = 1
day = 1
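    # Accepts ISO dates like "1996-07-11" (the year must match the expedition year)
    # and goofy forms like "11/7/96" or "11/07/1996", so e.g.
    # parser_date("11/7/96", "1996") gives date(1996, 7, 11). Anything else is
    # recorded as a DataIssue and the 1970 epoch date is returned.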
# message = f" ! - Trying to parse date in logbook: {tripdate} - {year}"
# print(message)
try:
mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
if mdatestandard:
if not (mdatestandard.group(1) == year):
message = f" ! - Bad date (year) in logbook: {tripdate} - {year}"
DataIssue.objects.create(parser="logbooks", message=message)
return dummydate
else:
year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
elif mdategoof:
if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]):
message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
DataIssue.objects.create(parser="logbooks", message=message)
return dummydate
else:
yadd = int(year[:2]) * 100
day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
else:
year = 1970 # replace with _EPOCH
message = f" ! - Bad date in logbook: {tripdate} - {year}"
DataIssue.objects.create(parser="logbooks", message=message)
return date(year, month, day)
except:
message = f" ! - Failed to parse date in logbook: {tripdate} - {year}"
DataIssue.objects.create(parser="logbooks", message=message)
        return dummydate  # replace with _EPOCH
def parser_html(year, expedition, txt, seq=""):
"""This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
You can't see it here, but a round-trip export-then-import will move
the endmatter up to the frontmatter. This made sense when translating
from parser_html_01 format logfiles, believe me.
"""
logentries = []
dupl = {}
# extract front material and stash for later use when rebuilding from list of entries
    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
    headpara = headmatch.groups()[0].strip()
    if len(headpara) > 0:
frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
with open(frontpath, "w") as front:
front.write(headpara + "\n")
# extract END material and stash for later use when rebuilding from list of entries
    endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
    endpara = endmatch.groups()[0].strip()
    if len(endpara) > 0:
endpath = Path(settings.EXPOWEB, "years", year, "endmatter.html")
with open(endpath, "w") as end:
end.write(endpara + "\n")
    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    logbook_entry_count = 0
    for trippara in tripparas:
        logbook_entry_count += 1
        tid = set_trip_id(year, logbook_entry_count)  # default trip id, before we read the date

        s = re.match(
            r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
            \s*(?:<a\s+id="(.*?)"\s*aria-hidden="true"\s*/>\s*)?
            \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
            \s*<div\s+class="trippeople">\s*(.*?)</div>
            \s*<div\s+class="triptitle">\s*(.*?)</div>
            ([\s\S]*?)
            \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
            \s*$
            """,
            trippara,
        )
if s:
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
else: # allow title and people to be swapped in order
msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:50]}'..."
print(msg)
DataIssue.objects.create(parser="logbooks", message=msg)
            s2 = re.match(
                r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
                \s*(?:<a\s+id="(.*?)"\s*aria-hidden="true"\s*/>\s*)?
                \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
                \s*<div\s+class="triptitle">\s*(.*?)</div>
                \s*<div\s+class="trippeople">\s*(.*?)</div>
                ([\s\S]*?)
                \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
                \s*$
                """,
                trippara,
            )
if s2:
tripid, tripid1, tripdate, triptitle, trippeople, triptext, tu = s2.groups()
else:
# if not re.search(r"Rigging Guide", trippara):
msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:50]}'..."
print(msg)
DataIssue.objects.create(parser="logbooks", message=msg)
continue
ldate = parser_date(tripdate.strip(), year)
triptitles = triptitle.split(" - ")
if len(triptitles) >= 2:
place = triptitles[0]
else:
place = "UNKNOWN"
tripcontent = re.sub(r"", "", triptext)
tripcontent = re.sub(r"
", "
", tripcontent).strip()
triptitle = triptitle.strip()
# triptitle must be unique for a given date. We fix this here.
check = (ldate, triptitle)
if check in dupl:
dupl[check] += 1
triptitle = f"{triptitle} #{dupl[check]}"
print(f" - {triptitle} -- {ldate}")
else:
dupl[check] = 1
tu = tidy_time_underground(tu)
trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
tripcave = tidy_trip_cave(place)
tripcontent = tidy_trip_image_urls(tripcontent, ldate)
tid = tidy_tid(tid, triptitle)
entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
logentries.append(entrytuple)
return logentries
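# Each entrytuple produced by a parser maps one-to-one onto the parameters of
# store_entry_into_database(date, place, tripcave, title, text, trippersons,
# author, expedition, logtime_underground, tid).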
def parser_blog(year, expedition, txt, sq=""):
"""Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
Note that the entries have dates and authors, but no titles.
See detailed explanation of the complete process:
https://expo.survex.com/handbook/computing/logbooks-parsing.html
https://expo.survex.com/handbook/computing/log-blog-parsing.html
This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
    BLOG entries have this structure:
        <article class="message message--post ..." data-author="...">
            <header ...>
                <article class="message-body js-selectToQuote"> (the trip report text) </article>
            </header>
            (attachments come here)
        </article>
    So the content is nested inside the header. Attachments (images) come after the content.
It's a bugger, but it's out of our control.
"""
logentries = []
    tripheads = re.findall(
        r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=<article)", txt
    )
    tripparas = re.findall(
        r"<article class=\"message-body js-selectToQuote\"\>([\s\S]*?)(</article[^>]*>)([\s\S]*?)(?=<article)", txt
    )