mirror of
https://expo.survex.com/repositories/troggle/.git
synced 2024-11-22 07:11:52 +00:00
783 lines
32 KiB
Python
783 lines
32 KiB
Python
import os
|
|
import re
|
|
import sys
|
|
import string
|
|
import time
|
|
|
|
from datetime import date, datetime
|
|
from pathlib import Path
|
|
from random import randint
|
|
|
|
from django.conf import settings
|
|
from django.template.defaultfilters import slugify
|
|
|
|
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos, known_foreigner
|
|
from troggle.core.models.caves import GetCaveLookup
|
|
from troggle.core.models.logbooks import LogbookEntry, PersonLogEntry
|
|
from troggle.core.models.troggle import DataIssue, Expedition
|
|
from troggle.core.utils import get_process_memory, alphabet_suffix, unique_slug
|
|
|
|
# Sentinel date taken from the Django settings; parser_date() returns it when
# a logbook date cannot be interpreted.
EPOCH = settings.EPOCH
# NOTE: this string literal comes after the imports, so Python evaluates it as
# a bare expression statement; it does not become the module's __doc__.
"""
Parses and imports logbooks in all their wonderful confusion
See detailed explanation of the complete process:
https://expo.survex.com/handbook/computing/logbooks-parsing.html
"""
# Developer to-do notes, kept as a module-level string.
todo = """
- check cross-references in other logbooks and other HTML frahments
e.g. cave descriptions

- Most of the time is during the database writing (6s out of 8s).

- profile the code to find bad repetitive things, of which there are many.

- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted

- rewrite to use generators rather than storing everything intermediate in lists - to
reduce memory impact [low priority]

- We should ensure logbook.html is utf-8 and stop this crap:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")

"""
|
|
# Not referenced anywhere else in the part of this file shown here.
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
# Maps an expedition year (string) to (blog file name, parser function name).
# parse_logbook_for_expedition(blog=True), LoadLogbook() and LoadLogbooks()
# consult it; it is currently empty because every entry is commented out.
BLOG_PARSER_SETTINGS = {  # no default, must be explicit
    # "2023": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
    # "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
    # "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
    # "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
    # "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
}
# File name and parser function name used for any year that has no entry in
# LOGBOOK_PARSER_SETTINGS. The parser name is looked up with globals().
DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"
# All years now (Jan. 2023) use the default value for the logbook parser.
# Don't forget to update expoweb/pubs.htm to match. 1982 is left as a reminder of the expected format.
LOGBOOK_PARSER_SETTINGS = {
    "1982": ("logbook.html", "parser_html"),
}
LOGBOOKS_DIR = "years"  # subfolder of settings.EXPOWEB
|
|
|
|
# Expected number of logbook entries for each expedition year (keys are year
# strings). parse_logbook_for_expedition() compares the number of entries it
# parsed against this figure and prints a warning when they differ.
ENTRIES = {
    "2024": 87,
    "2023": 131,
    "2022": 94,
    "2019": 55,
    "2018": 98,
    "2017": 74,
    "2016": 87,
    "2015": 80,
    "2014": 67,
    "2013": 52,
    "2012": 76,
    "2011": 71,
    "2010": 22,
    "2009": 53,
    "2008": 49,
    "2007": 113,
    "2006": 60,
    "2005": 55,
    "2004": 76,
    "2003": 42,
    "2002": 31,
    "2001": 49,
    "2000": 54,
    "1999": 79,
    "1998": 43,
    "1997": 53,
    "1996": 95,
    "1995": 42,
    "1994": 32,
    "1993": 41,
    "1992": 62,
    "1991": 39,
    "1990": 87,
    "1989": 63,
    "1988": 61,
    "1987": 34,
    "1985": 24,
    "1984": 32,
    "1983": 52,
    "1982": 42,
    "1979": 30,
    "1978": 38,
}
# Years absent above: 1976, 1977, 1980 and 1981 (listed as lost logbooks in
# LoadLogbooks()) and 1986, 2020 and 2021 (listed there as years with no expo).

# The entire logbook for one year is a single object: a list of entries.
# LoadLogbooks() rebinds this global after each parse.
logentries = []
# Place names that are not caves; consulted by tidy_trip_cave().
# Note that the entries are in mixed capitalisation.
noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
# Maps a date to the number of entry ids already issued for that date; see
# reset_trip_id(). Reset by clean_logbook_for_expedition().
tripsdate = {}
|
|
|
|
|
|
def set_trip_seq_id(year, seq):
    """Build a provisional entry id from the year and a running sequence number.

    Used before the trip date has been parsed. The sequence number is
    zero-padded to at least two digits, e.g. ("2023", 5) gives "2023_s05".
    """
    return "{}_s{:02d}".format(year, seq)
|
|
|
|
def reset_trip_id(date):
    """Return the standard entry id (the logbook entry slug) for the next entry on a date.

    The id has the form <date><suffix>, e.g. '2003-07-30b', where the suffix
    comes from alphabet_suffix() applied to the running count of entries seen
    for that date. The count lives in the module-level ``tripsdate`` dict and
    is incremented on every call, so ids depend on the order of import; they
    are regenerated each time the logbook is imported.
    """
    count = tripsdate.get(date, 0) + 1  # first entry on a date gets count 1
    tripsdate[date] = count
    return f"{date}{alphabet_suffix(count)}"
|
|
|
|
# Matches a name wrapped in <u>...</u> (case-insensitive, anchored at the end of
# the string); GetTripPersons() treats the captured name as the entry's author.
rx_tripauthor = re.compile(r"(?i)<u>(.*?)</u>$")
# Matches a bracketed aside such as "(...)" or "[...]"; GetTripPersons()
# removes these from a name before looking it up.
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
|
|
|
|
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
    """Split the 'trippeople' text of a logbook entry into people, author and guests.

    Args:
        trippeople: raw text listing names separated by ",", "+", "&amp;",
            a bare "&" or " and ". A name wrapped in <u>...</u> marks the
            author; a name starting with "*" is recorded as a guest.
        expedition: passed to GetPersonExpeditionNameLookup(); its ``year``
            appears in diagnostic messages.
        logtime_underground: copied unchanged into every returned person tuple.
        tid: entry id, used only in diagnostic messages.

    Returns:
        ``(res, author, guests)`` where ``res`` is a list of
        ``(personyear, nickname_used, logtime_underground)`` tuples, ``author``
        is the underlined person (or, failing that, the last matched person)
        and ``guests`` is a list of unmatched or "*"-prefixed names.
        When there is no author and nobody matched, returns ``("", 0, "")``.

    Raises:
        Re-raises any exception from the name lookup after recording a DataIssue.
    """
    res = []
    author = None
    guests = []
    # print(f'# {tid}')
    # print(f" - {tid} '{trippeople}' ")

    # "&amp;" and a bare "&" are separators, but an "&" that starts an HTML
    # entity (e.g. "&eacute;") must not split a name. The previous pattern had
    # a bare "&" alternative first, which made the look-ahead unreachable.
    for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
        tripperson = tripperson.strip()
        author_u = rx_tripauthor.match(tripperson)
        if author_u:
            tripperson = author_u.group(1).strip()
        if not tripperson:
            continue

        if tripperson[0] == "*":  # a name prefix of "*" is special
            guests.append(tripperson)
            # print(f" ! - {expedition.year} * GUEST : {tripperson}")
            continue

        tripperson = rx_round_bracket.sub("", tripperson).strip()

        # Whacky aliases all resolved in GetPersonExpeditionNameLookup()
        nickname_used = tripperson
        try:
            personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
            if not personyear:
                guests.append(nickname_used)
                if known_foreigner(nickname_used):
                    message = f" ! - {expedition.year} Known foreigner: '{nickname_used}' in entry {tid=}"
                    # print(message)
                else:
                    message = f" ! - {expedition.year} No name match for: '{nickname_used}' in entry {tid=} for this year."
                    print(message)
                    DataIssue.objects.create(parser="logbooks", message=message)
            else:
                res.append((personyear, nickname_used, logtime_underground))
        except Exception:
            # This should not happen. We do not raise exceptions in that function
            message = f" ! - {expedition.year} EXCEPTION: '{tripperson}' ({nickname_used}) in entry {tid=} for this year."
            print(message)
            DataIssue.objects.create(parser="logbooks", message=message)
            raise
        if author_u:
            author = personyear

    if not author:
        if not res:
            # NOTE(review): any guests collected above are discarded here - confirm this is intended.
            return "", 0, ""
        author = res[-1][0]  # the last valid person and a time of 0 hours. BODGE. This gets written into the archive file as Truth.
    return res, author, guests
|
|
|
|
def tidy_time_underground(logtime_underground):
    """Coerce a 'time underground' value to a number of hours.

    Args:
        logtime_underground: a number, a numeric string, or text that starts
            with a number optionally prefixed by "T/U:" (e.g. "T/U: 3.5 hrs").

    Returns:
        The value as a float, or 0 when the input is empty/None or does not
        start with a recognisable number.
    """
    if not logtime_underground:
        return 0

    try:
        return float(logtime_underground)
    except (TypeError, ValueError):
        # Not a plain number: fall through and look for a leading number in the text.
        pass

    tu_match = re.match(r"(T/U:\s*)?(\d+[.]?\d*).*", str(logtime_underground))
    if tu_match:
        return float(tu_match.group(2))
    return 0
|
|
|
|
def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
    """Resolve the people on a trip, recording a DataIssue when that fails or no author is found.

    Thin wrapper round GetTripPersons().

    Returns:
        ``(trippersons, author, guests)`` exactly as returned by GetTripPersons();
        trippersons is a list of tuples (personyear, nickname_used, logtime_underground).

    Raises:
        Re-raises any exception from GetTripPersons() after recording a DataIssue.
    """
    try:
        trippersons, author, guests = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
    except Exception:
        message = f" ! - {expedition.year} Logentry: {title} - GetTripPersons FAIL to recognise nickname"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)
        raise  # the old `return "", ""` after this line was unreachable and has been removed

    if not author:
        message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)

    return trippersons, author, guests
|
|
|
|
def tidy_trip_cave(place):
    """Look up the cave for a place name, or return None for known non-cave places.

    The comparison with ``noncaveplaces`` ignores case. Previously the
    lower-cased place was compared with the list as written, so mixed-case
    entries such as "Journey" or "UNKNOWN" could never match.

    Returns:
        Whatever GetCaveLookup().get() returns for the lower-cased place, or
        None when the place is listed in ``noncaveplaces``.
    """
    # GetCaveLookup() need to work better. None of this data is *used* though?
    # 'tripcave' is converted to a string doing this, which renders as the cave slug.
    lplace = place.lower()
    if lplace in {name.lower() for name in noncaveplaces}:
        return None
    return GetCaveLookup().get(lplace)
|
|
|
|
def tidy_trip_image_urls(text, date):
    """Rewrite relative ``src`` attributes in entry HTML to point under /years/<year>/.

    The year is the first four characters of ``str(date)``. Every ` src="` and
    ` src='` is prefixed with /years/<year>/; prefixes that this doubles up, and
    /expofiles/ paths, are then restored. Finally tabs are removed and each run
    of three newlines is replaced by two (single pass).
    """
    year_prefix = f"/years/{str(date)[:4]}/"
    # (text expected after the src attribute, text to put there instead), applied in order.
    rewrites = (
        ("", year_prefix),
        (year_prefix + year_prefix, year_prefix),
        (year_prefix + "/expofiles/", "/expofiles/"),
    )
    for old_tail, new_tail in rewrites:
        for quote in ('"', "'"):
            attr = f" src={quote}"
            text = text.replace(attr + old_tail, attr + new_tail)

    return text.replace("\t", "").replace("\n\n\n", "\n\n")
|
|
|
|
def tidy_tid(tid, title, date):
    """Return the entry id, inventing one from the title when none was supplied.

    Args:
        tid: the proposed entry id, or None.
        title: entry title, used only to build a fallback id.
        date: string the id is expected to start with.

    Returns:
        ``tid`` unchanged when it is not None (a DataIssue is recorded and
        printed if it does not start with ``date``); otherwise a new id made of
        a random 4-digit number, "_" and the first 10 characters of the
        slugified title with "-" replaced by "_".
    """
    # Handle the missing id first: previously tid.startswith() ran before the
    # None check, so a None tid raised AttributeError and the fallback below
    # could never be reached.
    if tid is None:
        # print(f"! {title=} ")
        return str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")

    if not tid.startswith(date):
        message = f" ! - Logentry id does not have the same date {date=} {tid=} "
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)

    return tid
|
|
|
|
def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, guests, expedition, logtime_underground, tid):
    """Save a single logbook entry and its related PersonLogEntry rows.

    We could do a bulk update to save all the entries, but then we would need to do a query on
    each one to get the primary key to assign to the PersonLogEntries. So overall probably not much
    faster ?

    Args:
        date, place, title, text, expedition: stored on the LogbookEntry as given.
        tripcave: stored as the entry's ``cave``.
        trippersons: iterable of (personexpedition, nickname_used, time_underground) tuples.
        author: the personexpedition flagged as the entry's author.
        guests: iterable of names, joined with ", " into ``other_people``.
        logtime_underground: stored as the entry's ``time_underground``.
        tid: the slug for the entry. If a LogbookEntry with this slug already
            exists, a DataIssue is recorded and a suffix from unique_slug(text, 2)
            is appended so the new entry gets a different slug.
    """
    other_people = ", ".join(guests)  # join list members separated by comma
    # if guests:
    #     print(f" {date} - {guests}")

    if LogbookEntry.objects.filter(slug=tid).exists():
        # oops. Our code should already have ensured this is unique.
        # (This branch used the undefined names `tripdate` and `slug`, so it
        # raised NameError, and the new slug was never applied to the entry.)
        message = f" ! - DUPLICATE SLUG for logbook entry {date} - {tid}"
        DataIssue.objects.create(parser="logbooks", message=message)
        tid = tid + "_" + unique_slug(text, 2)

    otherAttribs = {
        "place": place,
        "other_people": other_people,  # *Ol's Mum, foreigners..
        "text": text,
        "expedition": expedition,
        "time_underground": logtime_underground,
        "cave": tripcave,
    }
    coUniqueAttribs = {"slug": tid, "date": date, "title": title}

    lbo = LogbookEntry.objects.create(**otherAttribs, **coUniqueAttribs)

    pt_list = [
        PersonLogEntry(
            personexpedition=tripperson,
            nickname_used=nickname_used,
            logbook_entry=lbo,
            time_underground=time_underground,
            is_logbook_entry_author=(tripperson == author),
        )
        for tripperson, nickname_used, time_underground in trippersons
    ]
    PersonLogEntry.objects.bulk_create(pt_list)
|
|
|
|
def parser_date(tripdate, year):
    """Interpret a date written in an expo logbook and return a datetime.date.

    Two formats are recognised: ISO "YYYY-MM-DD" and "D/M/YY" or "D/M/YYYY"
    (day first; one or two digits for day and month).

    Args:
        tripdate: the date text (string).
        year: the expedition year (string), e.g. "2023".

    Returns:
        The parsed date. EPOCH is returned, and a DataIssue recorded, when the
        year or century in the text disagrees with ``year`` or when parsing
        fails (including an impossible date such as month 13). When the text
        matches neither format, 1 January of EPOCH's year is returned.

    Nasty bug if it returns just the year which leads to a logbook id '/2023' instead of '/2023-07-16b'
    """
    month = 1
    day = 1
    # message = f" ! - Trying to parse date in logbook: {tripdate} - {year}"
    # print(message)
    try:
        mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
        # The month group used to be `0?(\d)`, which cannot match months 10-12.
        mdategoof = re.match(r"(\d\d?)/(\d\d?)/(20|19)?(\d\d)", tripdate)
        if mdatestandard:
            if mdatestandard.group(1) != year:
                message = f" ! - Bad date (year) in logbook: {tripdate} - {year}"
                DataIssue.objects.create(parser="logbooks", message=message)
                return EPOCH
            year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
        elif mdategoof:
            century = mdategoof.group(3)
            if century and century != year[:2]:
                message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
                DataIssue.objects.create(parser="logbooks", message=message)
                return EPOCH
            # Two-digit year: take the century from the expedition year.
            yadd = int(year[:2]) * 100
            day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
        else:
            year = EPOCH.year
            message = f" ! - Bad date in logbook: {tripdate} - {year}"
            DataIssue.objects.create(parser="logbooks", message=message)

        return date(year, month, day)
    except Exception:
        message = f" ! - Failed to parse date in logbook: {tripdate} - {year}"
        DataIssue.objects.create(parser="logbooks", message=message)
        return EPOCH
|
|
|
|
|
|
def parser_html(year, expedition, txt, seq=""):
    r"""Parse the text of one logbook.html file into a list of entry tuples.

    Text between <body> and the first <hr is written to frontmatter.html in the
    year's folder if that file does not exist yet; text between the last
    <hr /> and </body is written to endmatter.html. Each span between an
    <hr /> and the next <hr is then parsed as one entry.

    Args:
        year: expedition year (string); also the folder name under LOGBOOKS_DIR.
        expedition: passed through to tidy_trip_persons() and into each tuple.
        txt: the whole file as text.
        seq: accepted for a uniform parser signature; not used here.

    Returns:
        A list of tuples (date, place, tripcave, title, text, trippersons,
        author, guests, expedition, time_underground, tid). Entries that cannot
        be parsed are skipped and recorded as DataIssues.

    This uses some of the more obscure capabilities of regular expressions,
    see https://docs.python.org/3/library/re.html

    e.g.
    * is greedy
    *? is non-greedy

    (?x) flag means VERBOSE

    (?: ) non-capturing parentheses

    \s whitespace
    \S NOT whitespace

    You can't see it here, but a round-trip export-then-import
    for a new year logbook will move
    the endmatter up to the frontmatter. This made sense when translating
    from parser_html_01 format logfiles, believe me.
    """
    logentries = []
    dupl = {}

    # extract front material and stash for later use when rebuilding from list of entries
    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
    # A file without <body>...<hr used to crash here with AttributeError on None.
    headpara = headmatch.group(1).strip() if headmatch else ""

    if headpara:
        frontpath = Path(settings.EXPOWEB, LOGBOOKS_DIR, year, "frontmatter.html")
        # is_file must be *called*: the bare method object is always truthy, so
        # `not frontpath.is_file` was always False and nothing was ever written.
        if not frontpath.is_file():
            # dont attempt to rewrite it. So this will only run once, for new logbook. Buggy otherwise.
            with open(frontpath, "w", encoding="utf-8") as front:
                front.write(headpara + "\n")

    # extract END material and stash for later use when rebuilding from list of entries
    endmatch = re.match(r"(?i)(?s).*<hr\s*/>([\s\S]*?)(?=</body)", txt)
    endpara = endmatch.group(1).strip() if endmatch else ""

    if endpara:
        print(f"\n - {year} endpara:\n'{endpara}'")
        endpath = Path(settings.EXPOWEB, LOGBOOKS_DIR, year, "endmatter.html")
        with open(endpath, "w", encoding="utf-8") as end:
            end.write(endpara + "\n")

    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for logbook_entry_count, trippara in enumerate(tripparas, start=1):
        tid = set_trip_seq_id(year, logbook_entry_count)
        # print(f' - new seq tid:{tid} lbe count: {logbook_entry_count}')

        s = re.match(
            r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
                \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
                \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
                \s*<div\s+class="trippeople">\s*(.*?)</div>
                \s*<div\s+class="triptitle">\s*(.*?)</div>
                ([\s\S]*?)
                \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
                \s*(?:<div\s+class="editentry"\s*.*?</div>)?
                \s*$
                """,
            trippara,
        )
        if not s:
            # if not re.search(r"Rigging Guide", trippara):
            msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:75]}'..."
            print(msg)
            DataIssue.objects.create(parser="logbooks", message=msg)
            continue

        tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
        # print(f"#{logbook_entry_count} {tu} {len(triptext)} ")

        ldate = parser_date(tripdate.strip(), year)
        if len(str(ldate)) < 10:
            # year is a string; date() needs an int (this raised TypeError before).
            ldate = date(int(year), 10, 1)  # 1st October
            msg = f" !- Logbook. Bad parsed date '{tripdate}' setting to '{ldate}'"
            print(msg)
            DataIssue.objects.create(parser="logbooks", message=msg)
            continue
        lgdate = f"{ldate}"[:10]

        # Now we have a date, we can reset tripid
        tid = reset_trip_id(ldate)
        triptitles = triptitle.split(" - ")
        if len(triptitles) >= 2:
            place = triptitles[0]
        else:
            place = "Unknown"
        # tripcontent = re.sub(r"</p>", "", triptext)
        # tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()
        tripcontent = triptext.strip()

        triptitle = triptitle.strip()
        # triptitle must be unique for a given date. [Why?!] We fix this here.
        check = (ldate, triptitle)
        if check in dupl:
            dupl[check] += 1
            triptitle = f"{triptitle} #{dupl[check]}"
            print(f" - {triptitle} -- {ldate}")
        else:
            dupl[check] = 1

        tu = tidy_time_underground(tu)
        trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid)
        tripcave = tidy_trip_cave(place)
        tripcontent = tidy_trip_image_urls(tripcontent, ldate)
        tid = tidy_tid(tid, triptitle, lgdate)

        entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid)
        logentries.append(entrytuple)
    return logentries
|
|
|
|
|
|
def parser_blog(year, expedition, txt, sq=""):
    """Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.

    Note that the entries have dates and authors, but no titles.
    See detailed explanation of the complete process:
    https://expo.survex.com/handbook/computing/logbooks-parsing.html
    https://expo.survex.com/handbook/computing/log-blog-parsing.html

    This uses some of the more obscure capabilities of regular expressions,
    see https://docs.python.org/3/library/re.html

    BLOG entries have this structure:
    <article ... data-author="Tinywoman" data-content="post-298780" id="js-post-298780">
        <article class="message-body js-selectToQuote">
        </article>
    </article>
    So the content is nested inside the header. Attachments (images) come after the content.
    It's a bugger, but it's out of our control.

    Args:
        year: expedition year (string).
        expedition: passed through to tidy_trip_persons() and into each tuple.
        txt: the whole saved page as text.
        sq: file sequence suffix ("", "2", ...), included in ids and titles.

    Returns:
        A list of tuples (date, place, tripcave, title, text, trippersons,
        author, guests, expedition, time_underground, tid). Headers and bodies
        are paired in order; if their counts differ only the pairs that exist
        are processed (previously fewer headers than bodies raised IndexError).
        Parsing stops at the first entry whose author or datetime cannot be found.
    """
    logentries = []

    tripheads = re.findall(
        # note use of non-greedy capturing (?: regex idiom here
        r"<article class=\"message message--post js-post js-inlineModContainer\s*(?:is-unread)*\s*\"\s*([\s\S]*?)(?=</article)", txt
    )
    if not tripheads:
        message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
        print(message)

    # (?= is a non-consuming match, see https://docs.python.org/3/library/re.html
    tripparas = re.findall(
        r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(</article[^>]*>)([\s\S]*?)(?=</article)", txt
    )
    if not tripparas:
        message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
        print(message)

    if len(tripheads) != len(tripparas):
        print(f"{len(tripheads)} != {len(tripparas)} not the same number of headers {len(tripheads)} as paras {len(tripparas)} !")
    # print(f"{len(tripheads)} - {len(tripparas)}")

    # location = "Plateau" # best guess, fix manually later
    tu = 0  # no logged time underground in a blog entry
    for logbook_entry_count, (triphead, tripstuff) in enumerate(zip(tripheads, tripparas), start=1):
        attach = tripstuff[2]
        # note use on non-greedy *? regex idiom here
        attach = re.sub(r"<div class=\"file-content\">[\s\S]*?(?=</li>)", "", attach)
        attach = re.sub(r"<footer[\s\S]*(</footer>)", "", attach)
        tripcontent = tripstuff[0] + attach
        # print(f"{i} - {len(tripstuff)} - {tripstuff[1]}")
        tid = set_trip_seq_id(year, logbook_entry_count) + "_blog" + sq
        # print(f" - tid: {tid}")

        # data-author="tcacrossley"
        match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
        if not match_author:
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            # NOTE(review): the message says "Skipping" but this abandons all remaining entries - confirm intent.
            break
        trippeople = match_author.group(1)
        # print(f" - tid: {tid} {trippeople}")
        # datetime="2019-07-11T13:16:18+0100"
        match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
        if not match_datetime:
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            break
        datestamp = match_datetime.group(1)

        try:
            tripdate = datetime.fromisoformat(datestamp)
        except ValueError:
            message = f" ! - FROMISOFORMAT fail logentry {year}:{logbook_entry_count} {tid} '{datestamp}'"
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            # fallback, ignore the timestamp bits:
            tripdate = datetime.fromisoformat(datestamp[0:10])
        # print(f" - tid: {tid} '{trippeople}' '{tripdate}'")

        # triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'.
        place = "Unknown"
        # triptitle must be unique for a given date. We can enforce this here.
        triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"
        tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
        tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
        tripcontent = re.sub(r"width: \d+px", "", tripcontent)
        tripcontent = re.sub(r"\n\n+", "\n\n", tripcontent)
        tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
        tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent

        logtime_underground = 0
        trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid)
        # print(f" - author: {author}")
        tripcave = tidy_trip_cave(place)
        tripcontent = tidy_trip_image_urls(tripcontent, year)
        tid = tidy_tid(tid, triptitle, datestamp)

        entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid)
        logentries.append(entrytuple)
    return logentries
|
|
|
|
def clean_all_logbooks():
    """Delete every logbook DataIssue and every LogbookEntry, and reset the per-date id counter."""
    global tripsdate
    # Reset the counter together with the rows, as clean_logbook_for_expedition()
    # does; otherwise a second full import in the same process would continue
    # the date suffixes from where the previous import stopped.
    tripsdate = {}

    DataIssue.objects.filter(parser="logbooks").delete()
    LogbookEntry.objects.all().delete()
|
|
|
|
def clean_logbook_for_expedition(expedition):
    """Only used when loading a single logbook. Deletes database LogBookEntries and
    DataIssues for this expedition year, and resets the per-date id counter.

    A DataIssue belongs to the year if its parser is "logbooks" and its message
    contains ``expedition.year``.
    """
    global tripsdate
    tripsdate = {}

    LogbookEntry.objects.filter(expedition=expedition).delete()
    # One filtered delete in the database instead of fetching every logbook
    # DataIssue and testing each message in Python. The year is a plain run of
    # digits, so a substring match selects the same rows the regex search did.
    DataIssue.objects.filter(parser="logbooks", message__contains=expedition.year).delete()
|
|
|
|
def parse_logbook_for_expedition(expedition, blog=False):
    """Parse all logbook (or blog) entries for one expedition.

    The file name and parser come from LOGBOOK_PARSER_SETTINGS, falling back to
    the defaults; with ``blog=True`` they come from BLOG_PARSER_SETTINGS.
    Files named <stem><sq><suffix> for sq in "", "2", "3", "4" are read in
    turn until one is missing, and the entries from each are accumulated.

    Args:
        expedition: Expedition object; its ``year`` selects the folder and its
            ``logbookfile`` attribute is set to the logbook file name.
        blog: parse the blog file for this year instead of the logbook.

    Returns:
        The list of entry tuples produced by the parser(s); empty if nothing
        could be read, or if ``blog`` is set but the year has no blog settings.
    """
    logentries = []
    expologbase = Path(settings.EXPOWEB, LOGBOOKS_DIR)

    year = expedition.year
    # LoadLogbooks() deliberately queues years that have no expected count yet,
    # so a missing key must not raise KeyError.
    expect = ENTRIES.get(year, 0)
    # print(" - Logbook for: " + year)

    if year in LOGBOOK_PARSER_SETTINGS:
        yearfile, parsefunc = LOGBOOK_PARSER_SETTINGS[year]
    else:
        yearfile, parsefunc = DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER
    expedition.logbookfile = yearfile  # don't change this if a blog

    if blog:
        if year not in BLOG_PARSER_SETTINGS:
            message = f" ! - Expecting blog parser buut none specified for {year}"
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            # Nothing to parse; previously this fell through and re-parsed the ordinary logbook.
            return logentries
        yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]
        print(f" - BLOG file {yearfile} using parser {parsefunc}")

    # Computed after the blog selection so that the blog file is the one read
    # (it used to be computed before, so the logbook file was always used).
    logbookpath = Path(yearfile)

    if not blog:
        lb = Path(expologbase, year, logbookpath.name)
        if not lb.is_file():
            message = f" ! Logbook file does not exist (yet): '{lb}'"
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)

    for sq in ["", "2", "3", "4"]:  # cope with blog saved as many separate files
        lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
        if not lb.is_file():
            break
        try:
            with open(lb, "rb") as file_in:
                txt = file_in.read().decode("utf-8")
        except (OSError, UnicodeDecodeError):
            # UnicodeDecodeError is not an OSError, so a non-UTF-8 file used to
            # be reported by the generic handler below instead of this one.
            print(f" ! Couldn't open logbook as UTF-8 {lb}")
            continue
        except Exception:
            print(f" ! Very Bad Error opening {lb}")
            continue

        # --------------------
        parser = globals()[parsefunc]
        # print(f" - {year} parsing with {parsefunc} - {lb}")
        print(" .", end="")
        # Accumulate: assigning here kept only the entries of the last file read.
        logentries += parser(year, expedition, txt, sq)  # this launches the right parser
        # --------------------

    if len(logentries) != expect:
        print(f"Mismatch in number of log entries: {year} {len(logentries):5d} is not {expect}\n")
    # else:
    #     print(f"OK {year} {len(logentries):5d} is {expect}\n")

    return logentries
|
|
|
|
|
|
def LoadLogbook(year):
    """Reload the logbook for a single expedition year.

    One off logbook for testing purposes, and also reloadable on '/expedition/2023?reload'
    This is inside an atomic transaction.

    Clears the existing entries and issues for the year, parses the logbook
    (plus the blog, if the year is in BLOG_PARSER_SETTINGS), stores every entry
    and finally saves the Expedition so its logbook file name is kept.
    """
    expo = Expedition.objects.get(year=year)
    year = expo.year  # some type funny
    clean_logbook_for_expedition(expo)

    entries = parse_logbook_for_expedition(expo)  # this actually loads the logbook for one expo
    print(f" - Loaded logbook. {len(entries)} entries." )
    if year not in BLOG_PARSER_SETTINGS:
        print(
            f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
        )
    else:
        print(f" - Loading blog.." )
        entries += parse_logbook_for_expedition(expo, blog=True)  # this loads the blog logbook

    for entry in entries:
        when, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entry
        if not (expo == expedition):  # should not happen, we zeroed it before filling it
            print(f" ! unexpected log entry labelled as '{expedition}' {tid}" )
            continue
        # print(f" -- {triptitle}")
        store_entry_into_database(when, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid)

    expo.save()  # to save logbook name property
|
|
|
|
def LoadLogbooks():
    """Master function: parse every expedition's logbook into the Troggle database.

    This should be rewritten to use coroutines to load all logbooks from disc in parallel,
    but must be serialised to write to database as sqlite is single-user.

    This is inside an atomic transaction. Maybe it shouldn't be..

    Steps: clear all logbook data, make sure expeditions exist (loading the
    people list if not), parse each eligible year's logbook and blog into
    memory, then store all entries and save every Expedition. Memory use and
    elapsed time are reported on stderr after parsing and after storing.
    """
    global logentries

    allentries = []
    mem1 = get_process_memory()
    print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
    start = time.time()

    clean_all_logbooks()
    expos = Expedition.objects.all()
    if len(expos) <= 1:
        message = " ! - No expeditions found. Attempting to 'people' first"
        DataIssue.objects.create(parser="logbooks", message=message)
        print(message)
        load_people_expos()  # by loading the folk list
        expos = Expedition.objects.all()
        if len(expos) <= 1:
            message = " ! - No expeditions found, even after attempting to load 'people'. Abort."
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            return

    noexpo = ["1986", "2020", "2021"]  # no expo
    lostlogbook = ["1976", "1977", "1980", "1981"]
    sqlfail = [""]  # breaks mysql with db constraint fail - all now fixed.]
    nologbook = noexpo + lostlogbook + sqlfail

    loglist = []
    bloglist = []

    for expo in expos:
        year = expo.year
        if year in sqlfail:
            print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
            message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)

        if year not in nologbook:
            if year not in ENTRIES:
                print(" - No Logbook entries count yet for: " + year)  # catch case when preparing for next expo
            loglist.append(expo)

        if year in BLOG_PARSER_SETTINGS:
            bloglist.append(expo)

    for logbook_expo in loglist:
        logentries = parse_logbook_for_expedition(logbook_expo)  # this loads the logbook for one expo
        allentries += logentries

    for blog_expo in bloglist:
        print(f" - BLOG: {blog_expo}")
        logentries = parse_logbook_for_expedition(blog_expo, blog=True)  # loads the blog logbook for one expo
        allentries += logentries

    print(f"\n - {len(allentries):,} log entries parsed in all expeditions")
    mem = get_process_memory()
    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
    duration = time.time() - start
    print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
    print("\n - Now store all those logbook entries in the database.")

    # Now we serially store the parsed data in the database, updating 3 types of object:
    # - Expedition (the 'logbook.html' value)
    # - LogBookEntry (text, who when etc.)
    # - PersonLogEntry (who was on that specific trip mentioned in the logbook entry)
    for entry in allentries:
        when, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entry
        store_entry_into_database(when, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid)

    for expo in expos:
        expo.save()  # to save logbook name property

    mem = get_process_memory()
    print(f" - {len(allentries):,} log entries saved into database")
    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
    duration = time.time() - start
    print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
|
|
# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
|
|
# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
|
|
# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
|
|
# reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
|
|
# personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
|
|
# nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
|
|
# TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
|
|
# locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
|
|
# caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
|