2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2024-11-25 08:41:51 +00:00
troggle/parsers/logbooks.py

753 lines
30 KiB
Python
Raw Normal View History

2011-07-11 02:10:22 +01:00
import os
2020-05-28 02:20:50 +01:00
import re
2023-01-27 23:21:07 +00:00
import sys
2023-08-31 16:55:20 +01:00
import string
2023-01-27 23:21:07 +00:00
import time
2023-01-19 18:33:04 +00:00
from datetime import date, datetime
from pathlib import Path
2023-01-19 18:33:04 +00:00
from random import randint
2011-07-11 02:10:22 +01:00
2020-05-28 02:20:50 +01:00
from django.conf import settings
from django.template.defaultfilters import slugify
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos, known_foreigner
2023-01-29 16:47:46 +00:00
from troggle.core.models.caves import GetCaveLookup
2023-01-30 16:18:19 +00:00
from troggle.core.models.logbooks import LogbookEntry, PersonLogEntry
2021-04-13 00:43:57 +01:00
from troggle.core.models.troggle import DataIssue, Expedition
2023-09-01 18:31:19 +01:00
from troggle.core.utils import get_process_memory, alphabet_suffix, unique_slug
2011-07-11 02:10:22 +01:00
2023-09-07 19:47:02 +01:00
EPOCH = settings.EPOCH
2023-01-19 21:18:42 +00:00
"""
2021-04-13 01:37:42 +01:00
Parses and imports logbooks in all their wonderful confusion
See detailed explanation of the complete process:
https://expo.survex.com/handbook/computing/logbooks-parsing.html
2023-01-19 21:18:42 +00:00
"""
todo = """
- check cross-references in other logbooks and other HTML fragments
2023-08-26 16:39:29 +01:00
e.g. cave descriptions
2023-01-28 13:14:54 +00:00
- Most of the time is during the database writing (6s out of 8s).
2023-01-28 10:47:25 +00:00
2022-08-30 15:58:49 +01:00
- profile the code to find bad repetitive things, of which there are many.
2023-01-27 17:24:31 +00:00
- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted
2023-02-02 21:50:40 +00:00
- rewrite to use generators rather than storing everything intermediate in lists - to
reduce memory impact [low priority]
- We should ensure logbook.html is utf-8 and stop this crap:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
2023-01-19 21:18:42 +00:00
"""
# Titles longer than this are assumed to be a parsing error, not a real title.
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200

# Blog parsing must be configured explicitly per year; there is no default.
BLOG_PARSER_SETTINGS = {  # no default, must be explicit
    # "2023": ("ukcavingblog.html", "parser_blog"),  # now folded in to logbooks.html
    # "2022": ("ukcavingblog.html", "parser_blog"),  # now folded in to logbooks.html
    # "2019": ("ukcavingblog.html", "parser_blog"),  # now folded in to logbooks.html
    # "2018": ("ukcavingblog.html", "parser_blog"),  # now folded in to logbooks.html
    # "2017": ("ukcavingblog.html", "parser_blog"),  # now folded in to logbooks.html
}
DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"
# All years now (Jan.2023) use the default value for Logbook parser
# dont forget to update expoweb/pubs.htm to match. 1982 left as reminder of expected format.
LOGBOOK_PARSER_SETTINGS = {
    "1982": ("logbook.html", "parser_html"),
}
LOGBOOKS_DIR = "years"  # subfolder of settings.EXPOWEB

# Expected number of logbook entries per year, used as a sanity check after parsing
# (see parse_logbook_for_expedition). Update when a year's logbook gains entries.
ENTRIES = {
    "2023": 84,
    "2022": 94,
    "2019": 55,
    "2018": 95,
    "2017": 74,
    "2016": 87,
    "2015": 80,
    "2014": 67,
    "2013": 52,
    "2012": 76,
    "2011": 71,
    "2010": 22,
    "2009": 53,
    "2008": 49,
    "2007": 113,
    "2006": 60,
    "2005": 55,
    "2004": 76,
    "2003": 42,
    "2002": 31,
    "2001": 49,
    "2000": 54,
    "1999": 79,
    "1998": 43,
    "1997": 53,
    "1996": 95,
    "1995": 42,
    "1994": 32,
    "1993": 41,
    "1992": 62,
    "1991": 39,
    "1990": 87,
    "1989": 63,
    "1988": 61,
    "1987": 34,
    "1985": 24,
    "1984": 32,
    "1983": 52,
    "1982": 42,
    "1979": 30,
    "1978": 38,
}
# What about 1970s ! Yes, 80 and 81 are missing, so are 1976 and 1977.

logentries = []  # the entire logbook for one year is a single object: a list of entries

# Trip 'places' which are not caves, so no cave lookup is attempted for them.
noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
# Per-date count of trips already seen, used by reset_trip_id() to pick the letter suffix.
tripsdate = {}
2023-09-01 18:31:19 +01:00
2011-07-11 02:10:22 +01:00
2023-08-31 16:55:20 +01:00
def set_trip_seq_id(year, seq):
    """Build a provisional trip id from the year and a sequence number,
    e.g. '2023_s07'. Used before the trip date has been parsed; once the
    date is known the id is replaced by reset_trip_id().
    """
    return f"{year}_s{seq:02d}"
2023-08-31 16:55:20 +01:00
def reset_trip_id(date):
    """Return the standard-form trip id (the lbe slug) for a date:
    <date><letter>, e.g. '2003-07-30b'.

    The letter suffix counts repeat trips on the same date, tracked in the
    module-level tripsdate dict. This gets re-computed every time the logbook
    is imported, but the ids are persistent as the entries are ordered on
    this field.
    """
    seen_count = tripsdate.get(date, 0) + 1  # .get returns zero if date not seen yet
    tripsdate[date] = seen_count
    return f"{date}{alphabet_suffix(seen_count)}"
2023-09-05 12:35:56 +01:00
# <u>Name</u> marks the entry's author; brackets hold asides to strip from names.
rx_tripauthor = re.compile(r"(?i)<u>(.*?)</u>$")
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")


def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
    """Parse a logbook 'trip people' string into known expo members and guests.

    Splits trippeople on commas, '+', '&', '&amp;' and ' and ', resolves each
    name against GetPersonExpeditionNameLookup() for this expedition, and
    returns a 3-tuple:
      res    - list of (personexpedition, nickname_used, logtime_underground)
      author - the <u>-marked person, else the last resolved person (BODGE)
      guests - names not matched to expo members (incl. '*'-prefixed names)
    Returns ("", 0, "") when nobody at all could be resolved.
    """
    res = []
    author = None
    guests = []
    # print(f'# {tid}')
    # print(f" - {tid} '{trippeople}' ")
    for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
        tripperson = tripperson.strip()
        # author_u = re.match(r"(?i)<u>(.*?)</u>$", tripperson)
        author_u = rx_tripauthor.match(tripperson)
        if author_u:
            tripperson = author_u.group(1).strip()
        if tripperson:
            if tripperson[0] == "*":  # a name prefix of "*" is special: an explicit guest
                guests.append(tripperson)
                # print(f" ! - {expedition.year} * GUEST : {tripperson}")
            else:
                # drop bracketed asides before lookup
                tripperson = re.sub(rx_round_bracket, "", tripperson).strip()
                # Whacky aliases all resolved in GetPersonExpeditionNameLookup()
                nickname_used = tripperson
                try:
                    personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
                    if not personyear:
                        # unmatched name: treat as guest, complain unless a known foreigner
                        guests.append(nickname_used)
                        if known_foreigner(nickname_used):
                            message = f" ! - {expedition.year} Known foreigner: '{nickname_used}' in entry {tid=}"
                            # print(message)
                        else:
                            message = f" ! - {expedition.year} No name match for: '{nickname_used}' in entry {tid=} for this year."
                            print(message)
                            DataIssue.objects.create(parser="logbooks", message=message)
                    else:
                        res.append((personyear, nickname_used, logtime_underground))
                except:
                    # This should not happen. We do not raise exceptions in that function
                    message = f" ! - {expedition.year} EXCEPTION: '{tripperson}' ({nickname_used}) in entry {tid=} for this year."
                    print(message)
                    DataIssue.objects.create(parser="logbooks", message=message)
                    raise
            if author_u:
                # NOTE(review): personyear is only bound in the non-'*' branch above;
                # a <u>-marked '*' guest would raise NameError here — confirm data never does that.
                author = personyear

    if not author:
        if not res:
            return "", 0, ""  # nobody resolved at all; caller logs the problem
        author = res[-1][0]  # the last valid person and a time of 0 hours. BODGE. This gets written into the archive file as Truth.
    return res, author, guests
2011-07-11 02:10:22 +01:00
2023-01-28 10:47:25 +00:00
def tidy_time_underground(logtime_underground):
    """Normalise a time-underground value to a number of hours.

    Accepts a number, a numeric string ("3.5"), or a decorated string such as
    "T/U: 4 hrs"; anything empty or unparseable becomes 0.
    Returns a float (or the int 0 for the unparseable/empty cases, as before).
    """
    # Nasty hack, must tidy this up..
    if not logtime_underground:
        return 0
    try:
        return float(logtime_underground)
    except (ValueError, TypeError):
        # was a bare except; narrowed to the conversion failures float() can raise.
        # str() guards re.match against non-string input (e.g. a list), which
        # previously escaped as an uncaught TypeError.
        tu_match = re.match(r"(T/U:\s*)?(\d+[.]?\d*).*", str(logtime_underground))
        if tu_match:
            # print(f"logtime_underground = {tu_match.group(2)}")
            return float(tu_match.group(2))
        return 0
2022-12-18 19:33:56 +00:00
2023-02-02 15:40:50 +00:00
def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
    """Resolve the trip-people string via GetTripPersons, logging failures.

    Returns (trippersons, author, guests) where trippersons is a list of
    (personexpedition, nickname_used, logtime_underground) tuples.
    Re-raises any exception from GetTripPersons after recording a DataIssue.
    """
    try:
        trippersons, author, guests = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
        # trippersons is a list of tuples (personyear, nickname_used, logtime_underground)
    except Exception:
        message = f" ! - {expedition.year} Logentry: {title} - GetTripPersons FAIL to recognise nickname"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)
        raise
        # BUG FIX: removed unreachable 'return "", ""' that followed the raise —
        # dead code, and the wrong arity (2 values) for this function's contract.

    if not author:
        message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)

    return trippersons, author, guests
def tidy_trip_cave(place):
    """Look up the Cave object for a trip 'place' string, or None.

    GetCaveLookup() needs to work better. None of this data is *used* though?
    'tripcave' is converted to a string by the caller, which renders as the cave slug.
    """
    key = place.lower()
    if key in noncaveplaces:
        return None
    return GetCaveLookup().get(key)
def tidy_trip_image_urls(text, date):
    """Rewrite relative image 'src' attributes to absolute /years/<year>/ paths.

    The year is the first four characters of str(date). URLs that were already
    absolute get double-prefixed by the first pass, so a second pass undoes
    that. Also strips tabs and collapses triple blank lines.
    """
    year_prefix = str(date)[:4]
    substitutions = [
        (' src="', f' src="/years/{year_prefix}/'),
        (" src='", f" src='/years/{year_prefix}/"),
        # undo the double-prefix on urls that were already absolute
        (f' src="/years/{year_prefix}//years/{year_prefix}/', f' src="/years/{year_prefix}/'),
        (f" src='/years/{year_prefix}//years/{year_prefix}/", f" src='/years/{year_prefix}/"),
        ("\t", ""),
        ("\n\n\n", "\n\n"),
    ]
    for old, new in substitutions:
        text = text.replace(old, new)
    return text
2023-01-28 13:14:54 +00:00
def tidy_tid(tid, title):
    """Return tid unchanged when already set; otherwise synthesise one from a
    random 4-digit number plus a slugified prefix of the title.
    """
    if tid is None:
        # print(f"! {title=} ")
        tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
    return tid
2023-09-02 15:49:37 +01:00
def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, guests, expedition, logtime_underground, tid):
    """Saves a single logbook entry and related personlogentry items.

    We could do a bulk update to save all the entries, but then we would need to do a query on
    each one to get the primary key to assign to the PersonLogEntries. So overall probably not much
    faster ?
    """
    other_people = ", ".join(guests)  # join list members separated by comma
    # if guests:
    #     print(f"  {date} - {guests}")
    otherAttribs = {
        "place": place,
        "other_people": other_people,  # *Ol's Mum, foreigners..
        "text": text,
        "expedition": expedition,
        "time_underground": logtime_underground,
        "cave_slug": str(tripcave),
    }
    coUniqueAttribs = {"slug": tid, "date": date, "title": title}
    if LogbookEntry.objects.filter(slug=tid).exists():
        # oops. Our code should already have ensured this is unique.
        # BUG FIX: this branch previously referenced the undefined names 'tripdate'
        # and 'slug' (NameError at runtime) and never applied the de-duplicated
        # slug to the entry being created.
        message = f" ! - DUPLICATE SLUG for logbook entry {date} - {tid}"
        DataIssue.objects.create(parser="logbooks", message=message)
        tid = tid + "_" + unique_slug(text, 2)
        coUniqueAttribs["slug"] = tid
    lbo = LogbookEntry.objects.create(**otherAttribs, **coUniqueAttribs)

    pt_list = []
    for tripperson, nickname_used, time_underground in trippersons:
        coUniqueAttribs = {"personexpedition": tripperson, "nickname_used": nickname_used, "logbook_entry": lbo}  # lbo is primary key
        otherAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
        pt_list.append(PersonLogEntry(**otherAttribs, **coUniqueAttribs))
    PersonLogEntry.objects.bulk_create(pt_list)
2023-01-27 23:21:07 +00:00
def parser_date(tripdate, year):
    """Interprets dates in the expo logbooks and returns a correct datetime.date object
    Does NOT actually check that it is a truly valid date..

    Accepts ISO-style 'yyyy-mm-dd' or goofy 'd/m/yy[yy]' forms; on any failure
    a DataIssue is recorded and the sentinel EPOCH date is returned.
    """
    dummydate = EPOCH  # sentinel returned when the date cannot be trusted
    month = 1
    day = 1
    # message = f" ! - Trying to parse date in logbook: {tripdate} - {year}"
    # print(message)
    try:
        mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)  # yyyy-mm-dd
        mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)  # d/m/yy or d/m/[20|19]yy
        if mdatestandard:
            if not (mdatestandard.group(1) == year):
                # the year in the date must match the expedition year
                message = f" ! - Bad date (year) in logbook: {tripdate} - {year}"
                DataIssue.objects.create(parser="logbooks", message=message)
                return dummydate
            else:
                year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
        elif mdategoof:
            # group(3) is the optional century ('20' or '19'); when present it must match the expo year's century
            if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]):
                message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
                DataIssue.objects.create(parser="logbooks", message=message)
                return dummydate
            else:
                yadd = int(year[:2]) * 100  # expand the two-digit year using the expo year's century
                day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
        else:
            # neither format matched: record the problem, fall through to EPOCH's year with Jan 1st
            year = EPOCH.year
            message = f" ! - Bad date in logbook: {tripdate} - {year}"
            DataIssue.objects.create(parser="logbooks", message=message)
        return date(year, month, day)
    except:
        # e.g. date() raising on an out-of-range day/month
        message = f" ! - Failed to parse date in logbook: {tripdate} - {year}"
        DataIssue.objects.create(parser="logbooks", message=message)
        return EPOCH
2023-01-19 21:18:42 +00:00
def parser_html(year, expedition, txt, seq=""):
    r"""Parses the default HTML logbook format into a list of entry tuples.

    This uses some of the more obscure capabilities of regular expressions,
    see https://docs.python.org/3/library/re.html

    e.g.
    * is greedy
    *? is non-greedy
    (?x) flag means VERBOSE
    (?: ) non-capturing parentheses
    \s whitespace
    \S NOT whitespace

    You can't see it here, but a round-trip export-then-import
    for a new year logbook will move
    the endmatter up to the frontmatter. This made sense when translating
    from parser_html_01 format logfiles, believe me.
    """
    logentries = []
    dupl = {}  # counts (date, title) collisions so repeat titles can be made unique

    # extract front material and stash for later use when rebuilding from list of entries
    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
    headpara = headmatch.groups()[0].strip()

    if len(headpara) > 0:
        frontpath = Path(settings.EXPOWEB, LOGBOOKS_DIR, year, "frontmatter.html")
        # BUG FIX: was 'if not frontpath.is_file:' — a bound method object is always
        # truthy, so the condition was always False and frontmatter was never written.
        if not frontpath.is_file():
            # dont attempt to rewrite it. So this will only run once, for new logbook. Buggy otherwise.
            with open(frontpath, "w") as front:
                front.write(headpara + "\n")

    # extract END material and stash for later use when rebuilding from list of entries
    endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
    endpara = endmatch.groups()[0].strip()

    if len(endpara) > 0:
        print(f"\n - {year} endpara:\n'{endpara}'")
        endpath = Path(settings.EXPOWEB, LOGBOOKS_DIR, year, "endmatter.html")
        with open(endpath, "w") as end:
            end.write(endpara + "\n")

    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    logbook_entry_count = 0
    for trippara in tripparas:
        logbook_entry_count += 1
        tid = set_trip_seq_id(year, logbook_entry_count)
        # print(f' - new seq tid:{tid} lbe count: {logbook_entry_count}')

        s = re.match(
            r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
            \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
            \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
            \s*<div\s+class="trippeople">\s*(.*?)</div>
            \s*<div\s+class="triptitle">\s*(.*?)</div>
            ([\s\S]*?)
            \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
            \s*(?:<div\s+class="editentry"\s*.*?</div>)?
            \s*$
            """,
            trippara,
        )
        if s:
            tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
            # print(f"#{logbook_entry_count}  {tu} {len(triptext)} ")
        else:
            # if not re.search(r"Rigging Guide", trippara):
            msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:75]}'..."
            print(msg)
            DataIssue.objects.create(parser="logbooks", message=msg)
            continue

        ldate = parser_date(tripdate.strip(), year)
        # Now we have a date, we can reset tripid
        tid = reset_trip_id(ldate)
        triptitles = triptitle.split(" - ")
        if len(triptitles) >= 2:
            place = triptitles[0]
        else:
            place = "Unknown"
        # tripcontent = re.sub(r"</p>", "", triptext)
        # tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()
        tripcontent = triptext.strip()

        triptitle = triptitle.strip()
        # triptitle must be unique for a given date. [Why?!] We fix this here.
        check = (ldate, triptitle)
        if check in dupl:
            dupl[check] += 1
            triptitle = f"{triptitle} #{dupl[check]}"
            print(f" - {triptitle} -- {ldate}")
        else:
            dupl[check] = 1

        tu = tidy_time_underground(tu)
        trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid)
        tripcave = tidy_trip_cave(place)
        tripcontent = tidy_trip_image_urls(tripcontent, ldate)
        tid = tidy_tid(tid, triptitle)

        entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid)
        logentries.append(entrytuple)
    return logentries
2020-06-08 21:33:32 +01:00
2023-01-19 21:18:42 +00:00
def parser_blog(year, expedition, txt, sq=""):
    """Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
    Note that the entries have dates and authors, but no titles.
    See detailed explanation of the complete process:
    https://expo.survex.com/handbook/computing/logbooks-parsing.html
    https://expo.survex.com/handbook/computing/log-blog-parsing.html

    This uses some of the more obscure capabilities of regular expressions,
    see https://docs.python.org/3/library/re.html

    BLOG entries have this structure:
    <article ... data-author="Tinywoman" data-content="post-298780" id="js-post-298780">
    <article class="message-body js-selectToQuote">
    </article>
    </article>
    So the content is nested inside the header. Attachments (images) come after the content.
    It's a bugger, but it's out of our control.

    Returns a list of entrytuples in the same shape as parser_html produces.
    """
    logentries = []

    tripheads = re.findall(
        # note use of non-greedy capturing (?: regex idiom here
        r"<article class=\"message message--post js-post js-inlineModContainer\s*(?:is-unread)*\s*\"\s*([\s\S]*?)(?=</article)", txt
    )
    if not (tripheads):
        message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
        print(message)

    # (?= is a non-consuming match, see https://docs.python.org/3/library/re.html
    tripparas = re.findall(
        r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(</article[^>]*>)([\s\S]*?)(?=</article)", txt
    )
    if not (tripparas):
        message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
        print(message)

    # headers and bodies are paired by index below, so a mismatch means mispairing
    if len(tripheads) != len(tripparas):
        print(f"{len(tripheads)} != {len(tripparas)} not the same number of headers {len(tripheads)} as paras {len(tripparas)} !")
        # print(f"{len(tripheads)} - {len(tripparas)}")

    #location = "Plateau" # best guess, fix manually later
    tu = 0  # no logged time underground in a blog entry
    logbook_entry_count = 0
    for i in range(0, len(tripparas)):
        tripstuff = tripparas[i]  # (body, closing </article...>, trailing attachments)
        attach = tripstuff[2]
        # note use on non-greedy *? regex idiom here
        attach = re.sub(r"<div class=\"file-content\">[\s\S]*?(?=</li>)", "", attach)
        attach = re.sub(r"<footer[\s\S]*(</footer>)", "", attach)
        tripcontent = tripstuff[0] + attach
        # print(f"{i} - {len(tripstuff)} - {tripstuff[1]}")
        triphead = tripheads[i]
        logbook_entry_count += 1
        # sq distinguishes multiple saved blog files for the same year
        tid = set_trip_seq_id(year, logbook_entry_count) + "_blog" + sq
        # print(f" - tid: {tid}")

        # data-author="tcacrossley"
        match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
        if not (match_author):
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            break
        trippeople = match_author.group(1)
        # print(f" - tid: {tid} {trippeople}")
        # datetime="2019-07-11T13:16:18+0100"
        match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
        if not (match_datetime):
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            break
        datestamp = match_datetime.group(1)

        try:
            tripdate = datetime.fromisoformat(datestamp)
        except:
            message = f" ! - FROMISOFORMAT fail logentry {year}:{logbook_entry_count} {tid} '{datestamp}'"
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            # fallback, ignore the timestamp bits:
            tripdate = datetime.fromisoformat(datestamp[0:10])
        # print(f" - tid: {tid} '{trippeople}' '{tripdate}'")

        # triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'.
        place = "Unknown"
        # triptitle must be unique for a given date. We can enforce this here.
        triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"
        # strip fixed image sizing so the content reflows in our page layout
        tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
        tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
        tripcontent = re.sub(r"width: \d+px", "", tripcontent)
        tripcontent = re.sub(r"\n\n+", "\n\n", tripcontent)
        tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
        tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
        logtime_underground = 0
        trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid)
        # print(f" - author: {author}")
        tripcave = tidy_trip_cave(place)
        tripcontent = tidy_trip_image_urls(tripcontent, year)
        tid = tidy_tid(tid, triptitle)

        entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid)
        logentries.append(entrytuple)
    return logentries
def clean_all_logbooks():
    """Delete every LogbookEntry and all logbook-parser DataIssues, ready for a full reimport."""
    DataIssue.objects.filter(parser="logbooks").delete()
    LogbookEntry.objects.all().delete()
2023-01-27 17:24:31 +00:00
def clean_logbook_for_expedition(expedition):
    """Only used when loading a single logbook. Deletes database LogBookEntries and
    DataIssues for this expedition year.
    """
    global tripsdate
    tripsdate = {}  # reset the per-date trip counters used by reset_trip_id()

    # was assigned to an unused local 'lbes'; the queryset delete is the point
    LogbookEntry.objects.filter(expedition=expedition).delete()
    dataissues = DataIssue.objects.filter(parser="logbooks")
    ph = expedition.year  # loop-invariant, hoisted out of the loop
    for di in dataissues:
        if re.search(ph, di.message) is not None:  # SLOW just to delete issues for one year
            # print(f' - CLEANING dataissue {di.message}')
            di.delete()
def parse_logbook_for_expedition(expedition, blog=False):
    """Parses all logbook entries for one expedition.

    Selects the logbook file and parser function for the year (or the blog
    settings when blog=True), reads the file(s), runs the parser, and returns
    the list of entrytuples. Also sets expedition.logbookfile (not saved here).
    """
    # removed 'global ENTRIES': ENTRIES is only read, so the declaration was a no-op
    logentries = []
    logbook_parseable = False
    expologbase = Path(settings.EXPOWEB, LOGBOOKS_DIR)

    year = expedition.year
    expect = ENTRIES[year]  # expected number of entries, for the sanity check below
    # print(" - Logbook for: " + year)

    if year in LOGBOOK_PARSER_SETTINGS:
        yearfile, parsefunc = LOGBOOK_PARSER_SETTINGS[year]
        expedition.logbookfile = yearfile  # don't change this if a blog
    else:
        yearfile = DEFAULT_LOGBOOK_FILE
        expedition.logbookfile = DEFAULT_LOGBOOK_FILE  # don't change this if a blog
        parsefunc = DEFAULT_LOGBOOK_PARSER

    if blog:
        if year not in BLOG_PARSER_SETTINGS:
            # typo fix in the message: was "buut"
            message = f" ! - Expecting blog parser but none specified for {year}"
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
        else:
            yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]
            print(f" - BLOG file {yearfile} using parser {parsefunc}")

    logbookpath = Path(yearfile)
    # print(f" - Logbook file {yearfile} using parser {parsefunc}")
    # expedition.save()
    for sq in ["", "2", "3", "4"]:  # cope with blog saved as many separate files
        lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
        if not (lb.is_file()):
            # print(f" ! End of blog. Next blog file in sequence not there:{lb}")
            break
        try:
            with open(lb, "rb") as file_in:
                txt = file_in.read().decode("utf-8")
            logbook_parseable = True
        except IOError:
            logbook_parseable = False
            print(f" ! Couldn't open logbook as UTF-8 {lb}")
        except:
            # e.g. UnicodeDecodeError, which is not an IOError
            logbook_parseable = False
            print(f" ! Very Bad Error opening {lb}")
        if logbook_parseable:
            # --------------------
            parser = globals()[parsefunc]
            # print(f" - {year} parsing with {parsefunc} - {lb}")
            print(" .", end="")
            # NOTE(review): logentries is overwritten, not extended, on each file of a
            # multi-file blog — confirm whether earlier files' entries should accumulate.
            logentries = parser(year, expedition, txt, sq)  # this launches the right parser
            # --------------------

    if len(logentries) == expect:
        # print(f"OK  {year} {len(logentries):5d} is {expect}\n")
        pass
    else:
        print(f"Mismatch in number of log entries: {year} {len(logentries):5d} is not {expect}\n")

    return logentries
2023-01-19 21:18:42 +00:00
2022-12-18 19:33:56 +00:00
def LoadLogbook(year):
    """One off logbook for testing purposes, and also reloadable on '/expedition/2023?reload'
    This is inside an atomic transaction

    Deletes the year's existing entries, re-parses the logbook (and blog, if
    configured for this year), then stores every parsed entry in the database.
    """
    expo = Expedition.objects.get(year=year)
    year = expo.year  # some type funny
    clean_logbook_for_expedition(expo)
    logentries = []
    logentries = parse_logbook_for_expedition(expo)  # this actually loads the logbook for one expo
    print(f" - Loaded logbook. {len(logentries)} entries." )
    if year in BLOG_PARSER_SETTINGS:
        print(f" - Loading blog.." )
        logentries += parse_logbook_for_expedition(expo, blog=True)  # this loads the blog logbook
    else:
        print(
            f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
        )
    for entrytuple in logentries:
        date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
        if expo == expedition:  # unneeded check, we zeroed it before filling it
            # print(f"  -- {triptitle}")
            store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid)
        else:
            print(f" ! unexpected log entry labelled as '{expedition}' {tid}" )
    expo.save()  # to save logbook name property
2023-01-27 23:21:07 +00:00
2011-07-11 02:10:22 +01:00
def LoadLogbooks():
    """This is the master function for parsing all logbooks into the Troggle database.
    This should be rewritten to use coroutines to load all logbooks from disc in parallel,
    but must be serialised to write to database as sqlite is single-user.

    This is inside an atomic transaction. Maybe it shouldn't be..
    """
    global ENTRIES
    global logentries
    allentries = []  # accumulated entrytuples across every expedition
    mem1 = get_process_memory()
    print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
    start = time.time()

    clean_all_logbooks()
    expos = Expedition.objects.all()
    if len(expos) <= 1:
        # expeditions have not been loaded yet; try loading 'people' first, then re-check
        message = " ! - No expeditions found. Attempting to 'people' first"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)
        load_people_expos()
        expos = Expedition.objects.all()
        if len(expos) <= 1:
            message = " ! - No expeditions found, even after attempting to load 'people'. Abort."
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            return

    noexpo = [
        "1986",
        "2020",
        "2021",
    ]  # no expo
    lostlogbook = ["1976", "1977", "1980", "1981"]
    sqlfail = [""]  # breaks mysql with db constraint fail - all now fixed.]
    nologbook = noexpo + lostlogbook + sqlfail

    nlbe = {}
    loglist = []  # expeditions with a logbook to parse
    bloglist = []  # expeditions with a UK Caving blog to parse as well

    for expo in expos:
        year = expo.year
        if year in sqlfail:
            print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
            message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)

        if year not in nologbook:
            if year in ENTRIES:
                loglist.append(expo)
            else:
                print(" - No Logbook yet for: " + year)  # catch case when preparing for next expo
        if year in BLOG_PARSER_SETTINGS:
            bloglist.append(expo)

    for ex in loglist:
        logentries = parse_logbook_for_expedition(ex)  # this loads the logbook for one expo
        allentries += logentries

    for b in bloglist:
        print(f" - BLOG: {b}")
        logentries = parse_logbook_for_expedition(b, blog=True)  # loads the blog logbook for one expo
        allentries += logentries

    print(f"\n - {len(allentries):,} log entries parsed in all expeditions")
    mem = get_process_memory()
    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
    duration = time.time() - start
    print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
    print(f"\n - Now store all those logbook entries in the database.")

    # Now we serially store the parsed data in the database, updating 3 types of object:
    # - Expedition (the 'logbook.html' value)
    # - LogBookEntry (text, who when etc.)
    # - PersonLogEntry (who was on that specific trip mentioned in the logbook entry)
    for entrytuple in allentries:
        date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
        store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid)

    for expo in expos:
        expo.save()  # to save logbook name property
    mem = get_process_memory()
    print(f" - {len(allentries):,} log entries saved into database")
    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
    duration = time.time() - start
    print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
# reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
# personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
# nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
# TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
# locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
# caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)