mirror of
https://expo.survex.com/repositories/troggle/.git
synced 2025-12-17 04:27:02 +00:00
850 lines
34 KiB
Python
850 lines
34 KiB
Python
import os
|
|
import re
|
|
import string
|
|
import sys
|
|
import time
|
|
from dataclasses import dataclass
|
|
from datetime import date, datetime
|
|
from pathlib import Path
|
|
from random import randint
|
|
|
|
from django.conf import settings
|
|
from django.template.defaultfilters import slugify
|
|
|
|
from parsers.people import GetPersonExpeditionNameLookup, known_foreigner, load_people_expos
|
|
from typing import Any, List, Tuple
|
|
from troggle.core.models.caves import GetCaveLookup
|
|
from troggle.core.models.logbooks import LogbookEntry, PersonLogEntry
|
|
from troggle.core.models.troggle import DataIssue, Expedition
|
|
from troggle.core.utils import alphabet_suffix, get_process_memory, unique_slug
|
|
|
|
EPOCH = settings.EPOCH
|
|
"""
|
|
Parses and imports logbooks in all their wonderful confusion
|
|
See detailed explanation of the complete process:
|
|
https://expo.survex.com/handbook/computing/logbooks-parsing.html
|
|
"""
|
|
todo = """
|
|
- check cross-references to specific logbook entries in other logbooks and other HTML fragments
|
|
e.g. cave descriptions
|
|
|
|
- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted in the DataIssue bug output
|
|
|
|
- rewrite to use generators rather than storing everything intermediate in lists - to
|
|
reduce memory impact [very low priority]
|
|
|
|
"""
|
|
|
|
@dataclass
class LogbookEntryData:
    """In-memory representation of one parsed logbook entry.

    Accumulated by the parsers (parser_html / parser_blog) and written to the
    database in bulk by bulk_store_entries().
    """

    # All dataclass fields have a type annotation, by definition.
    # Fields with no type annotation are not dataclass fields; they're class attributes.
    tripdate: date  # date of the trip itself
    place: str  # free-text location from the title, e.g. a cave name or "plateau"
    tripcave: Any  # presumably a Cave object (or None) resolved via GetCaveLookup() — project type
    triptitle: str  # entry title, made unique per date by the parser
    text: str  # HTML body of the entry
    trippersons: List[Tuple[Any, str, float]]  # (PersonExpedition, nickname as written, time underground)
    author: Any  # PersonExpedition of the entry's author, or falsy if none matched
    guests: List[str]  # names on the trip who are not matched expo members
    expedition: Any  # the Expedition object this entry belongs to
    tu: float  # time underground, not actually used anywhere
    tid: str  # trip identifier (slug), e.g. '2003-07-30b'
|
|
|
|
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200

# Years whose UK Caving blog pages are parsed in addition to the logbook file.
BLOG_PARSER_SETTINGS = {  # no default, must be explicit
    # "2023": ("ukcavingblog.html", "parser_blog"),  # now folded in to logbooks.html
    # "2022": ("ukcavingblog.html", "parser_blog"),  # now folded in to logbooks.html
    # "2019": ("ukcavingblog.html", "parser_blog"),  # now folded in to logbooks.html
    # "2018": ("ukcavingblog.html", "parser_blog"),  # now folded in to logbooks.html
    # "2017": ("ukcavingblog.html", "parser_blog"),  # now folded in to logbooks.html
}
DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"
# All years now (Jan.2023) use the default value for Logbook parser
# dont forget to update expoweb/pubs.htm to match. 1982 left here as reminder of expected format:
LOGBOOK_PARSER_SETTINGS = {
    "1982": ("logbook.html", "parser_html"),
}
LOGBOOKS_DIR = "years"  # subfolder of settings.EXPOWEB

# Expected number of logbook entries per year; after parsing, a mismatch is
# printed so that silent parser regressions get noticed.
ENTRIES = {
    "2025": 114,
    "2024": 127,
    "2023": 131,
    "2022": 93,
    "2019": 55,
    "2018": 98,
    "2017": 74,
    "2016": 87,
    "2015": 80,
    "2014": 68,
    "2013": 52,
    "2012": 76,
    "2011": 71,
    "2010": 22,
    "2009": 53,
    "2008": 49,
    "2007": 113,
    "2006": 60,
    "2005": 55,
    "2004": 76,
    "2003": 42,
    "2002": 31,
    "2001": 49,
    "2000": 54,
    "1999": 79,
    "1998": 43,
    "1997": 53,
    "1996": 95,
    "1995": 42,
    "1994": 32,
    "1993": 41,
    "1992": 62,
    "1991": 39,
    "1990": 87,
    "1989": 63,
    "1988": 61,
    "1987": 34,
    "1985": 24,
    "1984": 32,
    "1983": 52,
    "1982": 42,
    "1979": 30,
    "1978": 38,
}
# Future years default to an expected count of 0 until a real count is recorded.
for y in range(2025, 2050):
    y_str = str(y)
    if y_str not in ENTRIES:
        ENTRIES[y_str] = 0
# What about 1970s ! Yes, 80 and 81 are missing, so are 1976 and 1977.

logentries = []  # the entire logbook for one year is a single object: a list of entries
# Lowercased 'place' values that must not be looked up as caves.
noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
tripsdate = {}  # date -> number of trips seen so far that day; drives the 'a'/'b'/'c' slug suffix
|
|
|
|
|
|
def set_trip_seq_id(year, seq):
    """Provisional trip id used before the entry's date has been parsed:
    the year plus a zero-padded sequence number, e.g. '2023_s07'.
    """
    return f"{year}_s{seq:02d}"
|
|
|
|
def reset_trip_id(date):
    """Once the trip date is known, build the standard logbook-entry slug:
    the ISO date plus a letter distinguishing same-day trips, i.e. '2003-07-30b'.

    This gets re-computed every time the logbook is imported, but in practice
    the ids are persistent because the entries are ordered on this field.
    """
    seen_before = tripsdate.get(date, 0)  # zero if this is the first trip on that date
    count = seen_before + 1
    tripsdate[date] = count
    suffix = alphabet_suffix(count)

    # print(seen_before, count, f"{date}{suffix}")
    return f"{date}{suffix}"
|
|
|
|
# Matches an underlined (<u>...</u>) name at the end of a people list: that person is the trip author.
rx_tripauthor = re.compile(r"(?i)<u>(.*?)</u>$")
# Matches one (...) or [...] bracketed aside, non-greedily, so it can be stripped from names.
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
|
|
|
|
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
    """Parse the 'people' line of a logbook entry into resolved expo members.

    trippeople          - raw string of names, variously delimited
    expedition          - Expedition object (project type) used for name lookup
    logtime_underground - hours underground, copied onto every person tuple
    tid                 - trip id, used only in diagnostic messages

    Returns (res, author, guests):
      res    - list of (PersonExpedition, nickname_used, logtime_underground)
      guests - names that did not match an expo member ('*'-prefixed names are
               always treated as guests)
      author - the <u>underlined</u> person; if nobody was underlined, the last
               matched person is used as a bodge.
    If nothing at all matched, returns ("", 0, "") — callers must cope.
    """
    res = []
    author = None
    guests = []
    # print(f'# {tid}')
    # print(f" -  {tid}  '{trippeople}' ")

    """
    re.split(r",|\+|&|&(?!\w+;)| and ", trippeople)

    , : The comma character
    \+ : The plus sign (+); escaped to treat as a literal character
    & : The literal string "&" (HTML-encoded ampersand)
    &(?!\w+;) : An ampersand (&) not followed by one or more word characters (\w+) and a semicolon (;)
        : Uses negative lookahead assertion (?!...) to ensure it's not part of an HTML entity like " "
    and : The literal string " and " (with spaces before and after)

    This will split the 'trippeople' string at any of these delimiters.
    """
    for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople):
        tripperson = tripperson.strip()
        # author_u = re.match(r"(?i)<u>(.*?)</u>$", tripperson)
        author_u = rx_tripauthor.match(tripperson)
        if author_u:
            # strip the <u>..</u> markup; remember this one is the author
            tripperson = author_u.group(1).strip()
        if tripperson:
            if tripperson[0] == "*":  # a name prefix of "*" is special: an explicit guest
                guests.append(tripperson)
                # print(f" ! - {expedition.year} * GUEST : {tripperson}")
            else:
                # drop bracketed asides, e.g. "(joke name)", before lookup
                tripperson = re.sub(rx_round_bracket, "", tripperson).strip()

                # Whacky aliases all resolved in GetPersonExpeditionNameLookup()
                nickname_used = tripperson
                try:
                    personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
                    if not personyear:
                        # unmatched name: record as guest; only complain if not a known foreigner
                        guests.append(nickname_used)
                        if known_foreigner(nickname_used):
                            message = f" ! - {expedition.year} Known foreigner: '{nickname_used}' in entry {tid=}"
                            # print(message)
                        else:
                            message = f" ! - {expedition.year} No name match for: '{nickname_used}' in entry {tid=} for this year."
                            print(message)
                            DataIssue.objects.create(parser="logbooks", message=message)
                    else:
                        res.append((personyear, nickname_used, logtime_underground))
                except:
                    # This should not happen. We do not raise exceptions in that function
                    message = f" ! - {expedition.year} EXCEPTION: '{tripperson}' ({nickname_used}) in entry {tid=} for this year."
                    print(message)
                    DataIssue.objects.create(parser="logbooks", message=message)
                    raise
                # NOTE(review): if the underlined name failed to match,
                # personyear is None here and author stays None — confirm
                # that this fall-through to the res[-1] bodge is intended.
                if author_u:
                    author = personyear

    if not author:
        if not res:
            return "", 0, ""
        author = res[-1][0]  # the last valid person and a time of 0 hours. BODGE. This gets written into the archive file as Truth.
    return res, author, guests
|
|
|
|
def tidy_time_underground(logtime_underground):
    """Coerce a 'time underground' value to a float number of hours.

    Accepts a number, a numeric string ("3.5"), or free text with a leading
    numeric token optionally prefixed "T/U:" (e.g. "T/U: 2 hrs" -> 2.0).
    Anything falsy or unparseable becomes 0.
    """
    # Nasty hack, must tidy this up..
    if not logtime_underground:
        return 0
    try:
        return float(logtime_underground)
    except (ValueError, TypeError):  # narrowed from a bare 'except:'
        # Not a plain number: strip an optional "T/U:" prefix and take the
        # first numeric token at the start of the string.
        # print(f"logtime_underground = {logtime_underground}")
        tu_match = re.match(r"(T/U:\s*)?(\d+[.]?\d*).*", logtime_underground)
        if tu_match:
            # print(f"logtime_underground = {tu_match.group(2)}")
            return float(tu_match.group(2))
        return 0
|
|
|
|
def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
    """Resolve the people named on a trip, recording a DataIssue on failure.

    Returns (trippersons, author, guests) straight from GetTripPersons();
    trippersons is a list of (PersonExpedition, nickname_used, time_underground).
    Any exception from GetTripPersons is logged as a DataIssue and re-raised.
    """
    try:
        trippersons, author, guests = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
        # trippersons is a list of tuples (personyear, nickname_used, logtime_underground)
    except Exception:  # narrowed from a bare 'except:'
        message = f" ! - {expedition.year} Logentry: {title} - GetTripPersons FAIL to recognise nickname"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)
        # BUGFIX: the original had an unreachable 'return "", ""' after this
        # raise (dead code, and a 2-tuple where callers expect 3 values);
        # it has been removed.
        raise

    if not author:
        # Not fatal: GetTripPersons may legitimately return no author.
        message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)

    return trippersons, author, guests
|
|
|
|
def tidy_trip_cave(place):
    """Look up the Cave object for a place name, or None for known
    non-cave places (travel, plateau, base camp, ...).

    GetCaveLookup() needs to work better. None of this data is *used* though?
    'tripcave' is converted to a string doing this, which renders as the cave slug.
    """
    key = place.lower()
    if key in noncaveplaces:
        return None
    return GetCaveLookup().get(key)
|
|
|
|
def tidy_trip_image_urls(text, date):
    """Rewrite relative image 'src' attributes to absolute /years/<year>/ paths.

    Handles both quote styles, collapses accidental double-prefixing,
    normalises expofiles links to site-relative /expofiles/ URLs, strips tabs
    and squeezes triple newlines.  'date' may be a date object or an ISO
    date string; only its first four characters (the year) are used.
    """
    y = str(date)[:4]

    # Ordered (old, new) pairs — order matters: the year prefix is added
    # first, then the over-eager cases are corrected.
    substitutions = (
        (' src="', f' src="/years/{y}/'),
        (" src='", f" src='/years/{y}/"),
        # undo double prefixing where the src was already year-qualified
        (f' src="/years/{y}//years/{y}/', f' src="/years/{y}/'),
        (f" src='/years/{y}//years/{y}/", f" src='/years/{y}/"),
        # expofiles links become site-relative
        ('http://expo.survex.com/expofiles', '/expofiles'),
        (f' src="/years/{y}//expofiles/', ' src="/expofiles/'),
        (f" src='/years/{y}//expofiles/", " src='/expofiles/"),
        # whitespace tidy-up
        ("\t", ""),
        ("\n\n\n", "\n\n"),
    )
    for old, new in substitutions:
        text = text.replace(old, new)

    return text
|
|
|
|
def tidy_tid(tid, title, date):
    """Validate a trip id (slug) against its date, generating one if missing.

    tid   - proposed id, expected to start with the ISO date string; may be None
    title - entry title, used only when a fallback id must be generated
    date  - ISO date string (yyyy-mm-dd) the id should start with

    A date mismatch is recorded as a DataIssue but the id is kept anyway.
    """
    # BUGFIX: check for None *before* calling tid.startswith() — the original
    # dereferenced tid first, so the generated-id fallback below could only
    # ever be reached via an AttributeError.
    if tid is not None:
        if not tid.startswith(date):
            message = f" ! - Logentry id does not have the same date {date=} {tid=} "
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
        return tid

    # No id supplied: make a random-but-readable one from the title.
    # print(f"! {title=} ")
    tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
    return tid
|
|
|
|
def bulk_store_entries(entries):
    """
    Bulk saves logbook entries and related personlogentry items.
    This reduces the number of database operations significantly.
    This replaces >2,000 calls to store_entry_into_database()

    entries - iterable of LogbookEntryData produced by the parsers
    """
    # 1. Prepare LogbookEntry objects
    logbook_objs = []
    slug_to_entrydata = {}  # final slug -> LogbookEntryData; doubles as duplicate detector
    for entry in entries:
        other_people = ", ".join(entry.guests)

        # Ensure slug is unique, otherwise add suffix.
        # NOTE(review): uniqueness is only checked within this batch, not
        # against slugs already in the database — confirm callers always
        # clean the relevant expeditions first.
        slug = entry.tid
        orig_slug = slug
        i = 2
        while slug in slug_to_entrydata:
            # found duplicate
            slug = f"{orig_slug}_{i}"
            # BUGFIX: the original built this message with
            # '" - " + entry.tripdate', concatenating str with a date object,
            # which raised TypeError whenever a duplicate was found.
            message = f" ! - DUPLICATE SLUG for logbook entry {entry.tripdate} - {entry.tid}"
            DataIssue.objects.create(parser="logbooks", message=message)
            # slug = slug + "_" + unique_slug(entry.text,2)
            i += 1
        slug_to_entrydata[slug] = entry

        logbook_objs.append(LogbookEntry(
            place=entry.place,
            other_people=other_people,  # Ol's mum, foreigners
            text=entry.text,
            expedition=entry.expedition,
            time_underground=entry.tu,
            cave=entry.tripcave,
            slug=slug,
            date=entry.tripdate,
            title=entry.triptitle,
        ))

    # 2. Bulk create LogbookEntry objects
    LogbookEntry.objects.bulk_create(logbook_objs)

    # 3. Fetch created LogbookEntry objects by slug for FK assignment
    created_entries = {lbe.slug: lbe for lbe in LogbookEntry.objects.filter(slug__in=slug_to_entrydata.keys())}

    # 4. Prepare PersonLogEntry objects
    personlog_objs = []
    for slug, entry in slug_to_entrydata.items():
        lbo = created_entries[slug]
        for tripperson, nickname_used, time_underground in entry.trippersons:
            personlog_objs.append(PersonLogEntry(
                personexpedition=tripperson,
                nickname_used=nickname_used,
                logbook_entry=lbo,
                time_underground=time_underground,
                is_logbook_entry_author=(tripperson == entry.author),
            ))

    # 5. Bulk create PersonLogEntry objects
    PersonLogEntry.objects.bulk_create(personlog_objs)
|
|
|
|
|
|
def parser_date(tripdate, year):
    """Interprets dates in the expo logbooks and returns a correct datetime.date object
    Does NOT actually check that it is a truly valid date..

    tripdate : string, either ISO "yyyy-mm-dd" or the goofy "d/m/[yy]yy" form
    year : string, the expedition year, used to sanity-check the parsed date

    Returns EPOCH (the settings fallback date) on any parse failure, after
    recording a DataIssue.

    Nasty bug if it returns just the year which leads to a logbook id '/2023' instead of '/2023-07-16b'
    """
    dummydate = EPOCH
    month = 1
    day = 1
    # message = f" ! - Trying to parse date in logbook: {tripdate} - {year}"
    # print(message)
    try:
        mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
        mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
        if mdatestandard:
            if not (mdatestandard.group(1) == year):
                # the entry's year disagrees with the expedition year
                message = f" ! - Bad date (year) in logbook: {tripdate} - {year}"
                DataIssue.objects.create(parser="logbooks", message=message)
                return dummydate
            else:
                year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
        elif mdategoof:
            # a century prefix, if present, must match the expedition year's century
            if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]):
                message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
                DataIssue.objects.create(parser="logbooks", message=message)
                return dummydate
            else:
                yadd = int(year[:2]) * 100  # century taken from the expedition year
                day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
        else:
            year = EPOCH.year
            message = f" ! - Bad date in logbook: {tripdate} - {year}"
            DataIssue.objects.create(parser="logbooks", message=message)

        return date(year, month, day)
    except (TypeError, ValueError):
        # Narrowed from a bare 'except:' — covers re.match on a non-string,
        # int() on garbage, and date() on an impossible day/month combination.
        message = f" ! - Failed to parse date in logbook: {tripdate} - {year}"
        DataIssue.objects.create(parser="logbooks", message=message)
        return EPOCH
|
|
|
|
|
|
def parser_html(year, expedition, txt, seq=""):
    r"""Parse one year's logbook.html into a list of LogbookEntryData.

    year       - expedition year as a string
    expedition - Expedition object
    txt        - full decoded text of the logbook HTML file
    seq        - unused here; present for signature parity with parser_blog

    This uses some of the more obscure capabilities of regular expressions,
    see https://docs.python.org/3/library/re.html

    e.g.
    * is greedy
    *? is non-greedy

    (?x) flag means VERBOSE

    (?: ) non-capturing parentheses

    \s whitespace
    \S NOT whitespace

    You can't see it here, but a round-trip export-then-import
    for a new year logbook will move
    the endmatter up to the frontmatter. This made sense when translating
    from parser_html_01 format logfiles, believe me.

    Now that all the old logbooks have been converted, the endmatter relocation is now disabled.
    """
    logentries = []
    dupl = {}  # (date, title) -> count, to disambiguate same-day same-title entries

    # extract front material and stash for later use when rebuilding from list of entries
    if headmatch := re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt):  # WALRUS OPERATOR
        headpara = headmatch.groups()[0].strip()
    else:
        headpara = ""

    if len(headpara) > 0:
        frontpath = Path(settings.EXPOWEB, LOGBOOKS_DIR, year, "frontmatter.html")
        # NOTE(review): 'frontpath.is_file' without parentheses is a bound
        # method and always truthy, so 'not frontpath.is_file' is always
        # False and frontmatter.html is never written — confirm whether
        # '.is_file()' was intended or whether disabling this was deliberate.
        if not frontpath.is_file:
            # dont attempt to rewrite it. So this will only run once, for new logbook. Buggy otherwise.
            with open(frontpath, "w") as front:
                front.write(headpara + "\n")

    # extract END material and stash for later use when rebuilding from list of entries
    if endmatch := re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt):  # WALRUS OPERATOR
        endpara = endmatch.groups()[0].strip()
    else:
        endpara = ""

    if len(endpara) > 0:
        print(f"\n - {year} endpara:\n'{endpara}'\n\n NOT creating endmatter.html from 2025 onwards")
        # endpath = Path(settings.EXPOWEB, LOGBOOKS_DIR, year, "endmatter.html")
        # with open(endpath, "w") as end:
        #     end.write(endpara + "\n")

    # Each <hr /> delimits one trip entry.
    if tripparas := re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt):  # WALRUS OPERATOR
        pass
    else:
        return None

    logbook_entry_count = 0
    for trippara in tripparas:
        logbook_entry_count += 1
        # provisional sequence-number id until the date has been parsed
        tid = set_trip_seq_id(year, logbook_entry_count)
        # print(f' - new seq tid:{tid} lbe count: {logbook_entry_count}')

        # Capture: anchor id, div id, date, people, title, body text, time underground.
        s = re.match(
            r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
            \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
            \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
            \s*<div\s+class="trippeople">\s*(.*?)</div>
            \s*<div\s+class="triptitle">\s*(.*?)</div>
            ([\s\S]*?)
            \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
            \s*(?:<div\s+class="editentry"\s*.*?</div>)?
            \s*$
            """,
            trippara,
        )
        if s:
            tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
            # print(f"#{logbook_entry_count}   {tu}  {len(triptext)} ")
        else:
            # if not re.search(r"Rigging Guide", trippara):
            msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:75]}'..."
            print(msg)
            DataIssue.objects.create(parser="logbooks", message=msg)
            continue

        ldate = parser_date(tripdate.strip(), year)
        if len(str(ldate)) < 10:
            # parser_date returned something too short to be yyyy-mm-dd.
            # NOTE(review): 'year' is a string here, so date(year, 10, 1)
            # would raise TypeError if this branch ever runs — confirm.
            ldate = date(year, 10, 1)  # 1st October
            msg = f" !- Logbook. Bad parsed date '{tripdate}' setting to '{ldate}'"
            print(msg)
            DataIssue.objects.create(parser="logbooks", message=msg)
            continue
        lgdate = f"{ldate}"[:10]

        # Now we have a date, we can reset tripid
        tid = reset_trip_id(ldate)
        # Heuristic: 'place' is the first word before " - " in the title.
        triptitles = triptitle.split(" - ")
        if len(triptitles) >= 2:
            place = triptitles[0].split()[0]
        else:
            p = triptitle.split()
            if len(p) >= 2:
                place = p[0]
            else:
                place = triptitle
        # tripcontent = re.sub(r"</p>", "", triptext)
        # tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()
        tripcontent = triptext.strip()

        triptitle = triptitle.strip()
        # triptitle must be unique for a given date. [Why?!] We fix this here.
        check = (ldate, triptitle)
        if check in dupl:
            dupl[check] += 1
            triptitle = f"{triptitle} #{dupl[check]}"
            print(f" - {triptitle} -- {ldate}")
        else:
            dupl[check] = 1

        tu = tidy_time_underground(tu)
        trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid)
        tripcave = tidy_trip_cave(place)
        tripcontent = tidy_trip_image_urls(tripcontent, ldate)
        tid = tidy_tid(tid, triptitle, lgdate)

        entry = LogbookEntryData(ldate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid)
        logentries.append(entry)
    return logentries
|
|
|
|
|
|
def parser_blog(year, expedition, txt, sq=""):
    """Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
    Note that the entries have dates and authors, but no titles.
    See detailed explanation of the complete process:
    https://expo.survex.com/handbook/computing/logbooks-parsing.html
    https://expo.survex.com/handbook/computing/log-blog-parsing.html

    This uses some of the more obscure capabilities of regular expressions,
    see https://docs.python.org/3/library/re.html

    BLOG entries have this structure:
    <article ... data-author="Tinywoman" data-content="post-298780" id="js-post-298780">
        <article class="message-body js-selectToQuote">
        </article>
    </article>
    So the content is nested inside the header. Attachments (images) come after the content.
    It's a bugger, but it's out of our control.

    sq - suffix distinguishing multiple saved blog files for one year ("", "2", ...)
    Returns a list of LogbookEntryData.
    """
    logentries = []

    # One match per post header (the outer <article>, which carries
    # data-author and the datetime attributes).
    tripheads = re.findall(
        # note use of non-greedy capturing (?: regex idiom here
        r"<article class=\"message message--post js-post js-inlineModContainer\s*(?:is-unread)*\s*\"\s*([\s\S]*?)(?=</article)", txt
    )
    if not (tripheads):
        message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
        print(message)

    # (?= is a non-consuming match, see https://docs.python.org/3/library/re.html
    # One match per post body (the inner <article>), plus trailing attachments.
    tripparas = re.findall(
        r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(</article[^>]*>)([\s\S]*?)(?=</article)", txt
    )
    if not (tripparas):
        message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
        print(message)

    if len(tripheads) != len(tripparas):
        # Headers and bodies are paired by index below, so a mismatch means
        # some entries will be mis-attributed or dropped.
        print(f"{len(tripheads)} != {len(tripparas)} not the same number of headers {len(tripheads)} as paras {len(tripparas)} !")
        # print(f"{len(tripheads)} - {len(tripparas)}")

    #location = "Plateau" # best guess, fix manually later
    tu = 0  # no logged time underground in a blog entry
    logbook_entry_count = 0
    for i in range(0, len(tripparas)):
        tripstuff = tripparas[i]
        attach = tripstuff[2]
        # note use on non-greedy *? regex idiom here
        attach = re.sub(r"<div class=\"file-content\">[\s\S]*?(?=</li>)", "", attach)
        attach = re.sub(r"<footer[\s\S]*(</footer>)", "", attach)
        tripcontent = tripstuff[0] + attach
        # print(f"{i} - {len(tripstuff)} - {tripstuff[1]}")
        triphead = tripheads[i]
        logbook_entry_count += 1
        tid = set_trip_seq_id(year, logbook_entry_count) + "_blog" + sq
        # print(f" - tid: {tid}")

        # data-author="tcacrossley"
        match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
        if not (match_author):
            # NOTE(review): 'break' abandons all remaining posts in this file,
            # not just this one — confirm that is intended rather than 'continue'.
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            break
        trippeople = match_author.group(1)
        # print(f" - tid: {tid} {trippeople}")
        # datetime="2019-07-11T13:16:18+0100"
        match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
        if not (match_datetime):
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            break
        datestamp = match_datetime.group(1)

        try:
            tripdate = datetime.fromisoformat(datestamp)
        except:
            message = f" ! - FROMISOFORMAT fail logentry {year}:{logbook_entry_count} {tid} '{datestamp}'"
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            # fallback, ignore the timestamp bits:
            tripdate = datetime.fromisoformat(datestamp[0:10])
        # print(f" - tid: {tid} '{trippeople}' '{tripdate}'")

        # triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'.
        place = "Unknown"
        # triptitle must be unique for a given date. We can enforce this here.
        triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"
        # Strip fixed image sizing so the site CSS controls layout.
        tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
        tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
        tripcontent = re.sub(r"width: \d+px", "", tripcontent)
        tripcontent = re.sub(r"\n\n+", "\n\n", tripcontent)
        tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
        tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent

        logtime_underground = 0
        trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid)
        # print(f" - author: {author}")
        tripcave = tidy_trip_cave(place)
        tripcontent = tidy_trip_image_urls(tripcontent, year)
        tid = tidy_tid(tid, triptitle, datestamp)

        entry = LogbookEntryData(tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid)
        logentries.append(entry)
    return logentries
|
|
|
|
def clean_all_logbooks():
    """Delete every logbook-parser DataIssue and every LogbookEntry from the
    database, prior to a full re-import.
    """
    for queryset in (DataIssue.objects.filter(parser="logbooks"), LogbookEntry.objects.all()):
        queryset.delete()
|
|
|
|
def clean_logbook_for_expedition(expedition):
    """Only used when loading a single logbook. Deletes database LogBookEntries and
    DataIssues for this expedition year.
    """
    global tripsdate
    tripsdate = {}  # restart the per-date trip counters used for slug suffixes

    LogbookEntry.objects.filter(expedition=expedition).delete()
    year_pattern = expedition.year
    # SLOW: scans every logbook DataIssue just to delete one year's worth
    for issue in DataIssue.objects.filter(parser="logbooks"):
        if re.search(year_pattern, issue.message):
            # print(f' - CLEANING dataissue {issue.message}')
            issue.delete()
|
|
|
|
def parse_logbook_for_expedition(expedition, blog=False):
    """Parses all logbook entries for one expedition.

    expedition - Expedition object; its .logbookfile attribute is set here
    blog       - when True, parse the saved UK Caving blog file(s) instead

    Returns the list of LogbookEntryData parsed (empty on failure), and
    prints a warning if the count disagrees with the expected ENTRIES value.
    """
    global ENTRIES
    logentries = []

    logbook_parseable = False
    expologbase = Path(settings.EXPOWEB, LOGBOOKS_DIR)

    year = expedition.year
    expect = ENTRIES[year]  # expected number of entries for this year
    # print(" - Logbook for: " + year)

    # Pick the file name and parser function for this year (default for all
    # years now except the explicit exceptions).
    if year in LOGBOOK_PARSER_SETTINGS:
        yearfile, parsefunc = LOGBOOK_PARSER_SETTINGS[year]
        expedition.logbookfile = yearfile  # don't change this if a blog
    else:
        yearfile = DEFAULT_LOGBOOK_FILE
        expedition.logbookfile = DEFAULT_LOGBOOK_FILE  # don't change this if a blog
        parsefunc = DEFAULT_LOGBOOK_PARSER

    # NOTE(review): logbookpath is derived from yearfile *before* the blog
    # branch below can override yearfile, so a blog-specific filename from
    # BLOG_PARSER_SETTINGS appears not to affect which file is opened —
    # confirm this ordering is intended.
    logbookpath = Path(yearfile)

    if blog:
        if year not in BLOG_PARSER_SETTINGS:
            message = f" ! - Expecting blog parser buut none specified for {year}"
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
        else:
            yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]
            print(f" - BLOG file {yearfile} using parser {parsefunc}")
    else:
        lb = Path(expologbase, year, logbookpath.stem + logbookpath.suffix)
        if not (lb.is_file()):
            message = f" ! Logbook file does not exist (yet): '{lb}'"
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)

    for sq in ["", "2", "3", "4"]:  # cope with blog saved as many separate files
        lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
        if not (lb.is_file()):
            break  # file names are sequential, so stop at the first gap
        try:
            with open(lb, "rb") as file_in:
                txt = file_in.read().decode("utf-8")
            logbook_parseable = True
        except (IOError):
            logbook_parseable = False
            print(f" ! Couldn't open logbook as UTF-8 {lb}")
        except:
            logbook_parseable = False
            print(f" ! Very Bad Error opening {lb}")

        if logbook_parseable:
            # --------------------
            # Dispatch on the parser's *name*: looks the function up in this
            # module's globals, so parsefunc must be "parser_html" or "parser_blog".
            parser = globals()[parsefunc]
            # print(f" - {year} parsing with {parsefunc} - {lb}")
            print(" .", end="")
            # NOTE(review): this assignment replaces (not extends) logentries
            # on each pass, so only the last file's entries survive when a
            # blog is split across several files — confirm whether '+='
            # was intended.
            logentries = parser(year, expedition, txt, sq)  # this launches the right parser
            # --------------------

    if logentries:
        if len(logentries) == expect:
            # print(f"OK  {year} {len(logentries):5d} is {expect}\n")
            pass
        else:
            print(f"Mismatch in number of log entries: {year} {len(logentries):5d} is not {expect}\n")

    return logentries
|
|
|
|
def _collect_logbook_entries_for_expos(expos, nologbook, ENTRIES, BLOG_PARSER_SETTINGS):
    """Helper to collect all logbook and blog entries for a list of expeditions."""
    gathered = []
    to_parse = []
    blog_expos = []

    # Decide which expeditions get a logbook parse and which also get a blog parse.
    for expo in expos:
        year = expo.year
        if year in nologbook:
            continue  # years with no expo or a lost logbook
        if year not in ENTRIES:
            print(" - No Logbook entries count yet for: " + year)
        to_parse.append(expo)
        if year in BLOG_PARSER_SETTINGS:
            blog_expos.append(expo)

    for expo in to_parse:
        gathered += parse_logbook_for_expedition(expo)

    for expo in blog_expos:
        print(f" - BLOG: {expo}")
        gathered += parse_logbook_for_expedition(expo, blog=True)

    return gathered
|
|
|
|
def LoadLogbook(year):
    """One off logbook for testing purposes, and also reloadable on '/expedition/2023?reload'
    This is inside an atomic transaction.

    year - expedition year; normalised via the Expedition object below.
    Clears the year's existing entries, re-parses the logbook (and blog,
    if configured), and bulk-stores the result.
    """
    expo = Expedition.objects.get(year=year)
    year = expo.year  # some type funny
    clean_logbook_for_expedition(expo)
    logentries = []

    if logentries := parse_logbook_for_expedition(expo):  # this actually loads the logbook for one expo # WALRUS
        print(f" - Loaded logbook. {len(logentries)} entries." )
        if year in BLOG_PARSER_SETTINGS:
            print(f" - Loading blog.." )
            logentries += parse_logbook_for_expedition(expo, blog=True)  # this loads the blog logbook
        else:
            print(
                f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
            )
        # Bulk store all entries at once
        bulk_store_entries(logentries)
        # Superseded per-entry storage loop, kept for reference:
        #for entry in logentries:
            #date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
            #if expo == entry.expedition: # unneeded check, we zeroed it before filling it
                # print(f" -- {triptitle}")
                #store_entry_into_database(entry)
            #else:
                #print(f" ! unexpected log entry labelled as '{entry.expedition}' {entry.tid}" )
    expo.save()  # to save logbook name property
|
|
|
|
def LoadLogbooks():
    """This is the master function for parsing all logbooks into the Troggle database.
    This should be rewritten to use coroutines to load all logbooks from disc in parallel,
    but must be serialised to write to database as sqlite is single-user.

    This is inside an atomic transaction. Maybe it shouldn't be..
    """
    global ENTRIES
    global logentries
    allentries = []
    mem1 = get_process_memory()
    print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
    start = time.time()

    clean_all_logbooks()
    expos = Expedition.objects.all()
    # Expeditions are created by the 'people' parser; bootstrap it if absent.
    if len(expos) <= 1:
        message = " ! - No expeditions found. Attempting to 'people' first"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)
        load_people_expos()  # by loading the folk list
        expos = Expedition.objects.all()
        if len(expos) <= 1:
            message = " ! - No expeditions found, even after attempting to load 'people'. Abort."
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            return

    # Years for which there is nothing to parse:
    noexpo = [
        "1986",
        "2020",
        "2021",
    ]  # no expo
    lostlogbook = ["1976", "1977", "1980", "1981"]
    sqlfail = [""]  # breaks mysql with db constraint fail - all now fixed.]
    nologbook = noexpo + lostlogbook + sqlfail

    allentries = _collect_logbook_entries_for_expos(expos, nologbook, ENTRIES, BLOG_PARSER_SETTINGS)

    print(f"\n - {len(allentries):,} log entries parsed in all expeditions")
    mem = get_process_memory()
    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
    duration = time.time() - start
    print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
    print(f"\n - Now store all those logbook entries in the database.")

    # Now we serially store the parsed data in the database, updating 3 types of object:
    # - Expedition (the 'logbook.html' value)
    # - LogBookEntry (text, who when etc.)
    # - PersonLogEntry (who was on that specific trip mentione din the logbook entry)
    bulk_store_entries(allentries)

    for expo in expos:
        expo.save()  # to save logbook name property
    mem = get_process_memory()
    print(f" - {len(allentries):,} log entries saved into database")
    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
    duration = time.time() - start
    print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
|
|
# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
|
|
# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
|
|
# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
|
|
# reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
|
|
# personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
|
|
# nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
|
|
# TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
|
|
# locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
|
|
# caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
|