
refactored logbooks parser

Philip Sargent 2023-01-27 23:21:07 +00:00
parent 6565b3f9c4
commit e01bd39609
4 changed files with 130 additions and 91 deletions

View File

@@ -549,6 +549,10 @@ class PersonTrip(TroggleModel):
     logbook_entry = models.ForeignKey(LogbookEntry,on_delete=models.CASCADE)
     is_logbook_entry_author = models.BooleanField(default=False)

+    class Meta:
+        ordering = ('-personexpedition',)
+        # order_with_respect_to = 'personexpedition'
+
     def persontrip_next(self):
         futurePTs = PersonTrip.objects.filter(personexpedition = self.personexpedition, logbook_entry__date__gt = self.logbook_entry.date).order_by('logbook_entry__date').all()
         if len(futurePTs) > 0:
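The added `Meta.ordering` gives PersonTrip querysets a default sort, descending by `personexpedition` (the leading minus means descending). A minimal sketch of how Django's `Meta.ordering` behaves, using generic names rather than troggle's models:

    from django.db import models

    class Trip(models.Model):
        start = models.DateField()

        class Meta:
            ordering = ("-start",)  # newest first by default

    # Trip.objects.all() and Trip.objects.filter(...) now return rows
    # sorted by -start without needing an explicit .order_by() call.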

View File

@@ -18,7 +18,7 @@ from troggle.core.models.caves import LogbookEntry, PersonTrip
 from troggle.core.models.survex import SurvexBlock, Wallet
 from troggle.core.models.troggle import Expedition, Person, PersonExpedition
 from troggle.core.utils import TROG
-from troggle.parsers.logbooks import parse_logbook_for_expedition
+from troggle.parsers.imports import import_logbook
 from troggle.parsers.people import GetPersonExpeditionNameLookup

 from .auth import login_required_if_public
@@ -69,7 +69,7 @@ def expedition(request, expeditionname):
                 #print(f'! - delete entry: "{entry}"')
                 entry.delete()
             entries = this_expedition.logbookentry_set.all()
-            LoadLogbookForExpedition(this_expedition)
+            import_logbook(year=this_expedition.year)
             logged_in = True
         else:
             logged_in = False

View File

@@ -40,7 +40,7 @@ def import_logbooks():
         troggle.parsers.logbooks.LoadLogbooks()

-def import_logbook(year=2022):
+def import_logbook(year=2016):
     print(f"-- Importing Logbook {year}")
     with transaction.atomic():
         troggle.parsers.logbooks.LoadLogbook(year)
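`import_logbook` runs the load inside `transaction.atomic()`, so a failure part-way through parsing rolls back every row written for that year. The general shape of the pattern, with `load_one_year` as a hypothetical stand-in for `troggle.parsers.logbooks.LoadLogbook`:

    from django.db import transaction

    def reimport_year(year, load_one_year):
        print(f"-- Importing Logbook {year}")
        with transaction.atomic():
            # any exception raised here aborts the whole block:
            # nothing written inside it survives, so a broken logbook
            # file cannot leave half a year's entries in the database
            load_one_year(year)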

View File

@@ -1,5 +1,8 @@
 import os
 import re
+import sys
+import time
 from datetime import date, datetime
 from pathlib import Path
 from random import randint
@@ -7,10 +10,10 @@ from random import randint
 from django.conf import settings
 from django.template.defaultfilters import slugify

-from parsers.people import GetPersonExpeditionNameLookup
+from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
 from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
 from troggle.core.models.troggle import DataIssue, Expedition
-from troggle.core.utils import save_carefully
+from troggle.core.utils import save_carefully, get_process_memory

 """
 Parses and imports logbooks in all their wonderful confusion
@@ -20,6 +23,8 @@ Parses and imports logbooks in all their wonderful confusion
 todo = """
 - refactor everything with some urgency, esp. parse_logbook_for_expedition()

+- pre-compile all the heavily used regular expressions !
+
 - break out the code that hits the database from that which parses the logbook
   so that the file-reading and parsing can be parallelized, while writing to the
   database remains serialized (sqlite is single-user).
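The new todo item about pre-compiling regular expressions is the standard `re.compile` optimisation this module already applies to `rx_tripperson` and `rx_round_bracket`: compile once at import time, then reuse the pattern object on every call. A sketch, with a made-up pattern:

    import re

    # compiled once when the module is imported
    rx_date = re.compile(r"(\d\d\d\d)-(\d\d)-(\d\d)")

    def find_dates(text):
        # reuses the compiled pattern; calling re.findall(pattern_string, text)
        # in a hot loop pays a pattern-cache lookup on every call instead
        return rx_date.findall(text)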
@@ -43,7 +48,7 @@ data for old logbooks? Not worth it..
 """

 MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
-BLOG_PARSER_SETTINGS = {
+BLOG_PARSER_SETTINGS = {  # no default, must be explicit
     # "2022": ("ukcavingblog.html", "parser_blog"),  # now folded in to logbooks.html
     # "2019": ("ukcavingblog.html", "parser_blog"),  # now folded in to logbooks.html
     # "2018": ("ukcavingblog.html", "parser_blog"),  # now folded in to logbooks.html
@@ -57,7 +62,7 @@ LOGBOOK_PARSER_SETTINGS = {
     "1982": ("logbook.html", "parser_html"),
 }

-entries = {
+ENTRIES = {
     "2022": 90,
     "2019": 55,
     "2018": 95,
@@ -99,18 +104,15 @@ entries = {
 }

 logentries = []  # the entire logbook for one year is a single object: a list of entries
-noncaveplaces = ["Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
-trips = {}
+noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]

 def set_trip_id(year, seq):
     tid = f"{year}_s{seq:02d}"
     return tid

 rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$")
 rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")

 def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
     res = []
     author = None
@@ -161,7 +163,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
     return res, author

-def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
+def store_entry_into_database(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
     """saves a single logbook entry and related persontrips
     Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
@@ -254,9 +256,11 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
         nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
         # this creates the PersonTrip instance.
         save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)

-def ParseDate(tripdate, year):
+def parser_date(tripdate, year):
     """Interprets dates in the expo logbooks and returns a correct datetime.date object"""
     dummydate = date(1970, 1, 1)  # replace with _EPOCH
     month = 1
@@ -302,6 +306,7 @@ def parser_html(year, expedition, txt, seq=""):
     from parser_html_01 format logfiles, believe me.
     """
     global logentries
+    dupl = {}

     # extract front material and stash for later use when rebuilding from list of entries
     headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
@@ -370,7 +375,7 @@ def parser_html(year, expedition, txt, seq=""):
             DataIssue.objects.create(parser="logbooks", message=msg)
             continue

-        ldate = ParseDate(tripdate.strip(), year)
+        ldate = parser_date(tripdate.strip(), year)
         triptitles = triptitle.split(" - ")
         if len(triptitles) >= 2:
             tripcave = triptitles[0]
@@ -381,6 +386,15 @@ def parser_html(year, expedition, txt, seq=""):
         ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
         triptitle = triptitle.strip()

+        # triptitle must be unique for a given date. We fix this here.
+        check = (ldate, triptitle)
+        if check in dupl:
+            dupl[check] += 1
+            triptitle = f"{triptitle} #{dupl[check]}"
+            print(f" - {triptitle} -- {date}")
+        else:
+            dupl[check] = 1
+
         entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tripid1)
         logentries.append(entrytuple)
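The added `dupl` dictionary makes trip titles unique per date by appending a counter to repeats. The same logic in isolation, with made-up data:

    dupl = {}

    def uniquify(ldate, title):
        """First occurrence keeps its title; later ones become 'title #2', 'title #3', ..."""
        check = (ldate, title)
        if check in dupl:
            dupl[check] += 1
            return f"{title} #{dupl[check]}"
        dupl[check] = 1
        return title

    assert uniquify("2022-07-14", "Rigging trip") == "Rigging trip"
    assert uniquify("2022-07-14", "Rigging trip") == "Rigging trip #2"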
@@ -401,6 +415,7 @@ def parser_blog(year, expedition, txt, sq=""):
         </article>
     </article>
     So the content is nested inside the header. Attachments (images) come after the content.
+    It's a bugger, but it's out of our control.
     """
     global logentries
@@ -467,9 +482,10 @@ def parser_blog(year, expedition, txt, sq=""):
         tripdate = datetime.fromisoformat(datestamp[0:10])
         # print(f" - tid: {tid} '{trippeople}' '{tripdate}'")

-        # tripname must have the location then a hyphen at the beginning as it is ignored by export function
+        # triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'.
         location = "Unknown"
-        tripname = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"  # must be unique for a given date
+        # triptitle must be unique for a given date. We can enforce this here.
+        triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"
         tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
         tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
         tripcontent = re.sub(r"width: \d+px", "", tripcontent)
@@ -477,7 +493,7 @@ def parser_blog(year, expedition, txt, sq=""):
         tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
         tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent

-        entrytuple = (tripdate, location, tripname, tripcontent, trippeople, expedition, tu, tid)
+        entrytuple = (tripdate, location, triptitle, tripcontent, trippeople, expedition, tu, tid)
         logentries.append(entrytuple)

 def clean_all_logbooks():
@@ -485,53 +501,51 @@ def clean_all_logbooks():
     LogbookEntry.objects.all().delete()

 def clean_logbook_for_expedition(expedition):
-    def cleanerrors(year):
-        dataissues = DataIssue.objects.filter(parser="logbooks")
-        for di in dataissues:
-            ph = year
-            if re.search(ph, di.message) is not None:  # SLOW just to delete issues for one year
-                # print(f' - CLEANING dataissue {di.message}')
-                di.delete()
-
-    year = expedition.year
-    cleanerrors(year)
-
-    lbes = LogbookEntry.objects.filter(expedition=expedition)  # must be a quicker way
-    for lbe in lbes:
-        lbe.delete()
+    """Only used when loading a single logbook. Deletes database LogBookEntries and
+    DataIssues for this expedition year.
+    """
+    lbes = LogbookEntry.objects.filter(expedition=expedition).delete()
+    dataissues = DataIssue.objects.filter(parser="logbooks")
+    for di in dataissues:
+        ph = expedition.year
+        if re.search(ph, di.message) is not None:  # SLOW just to delete issues for one year
+            # print(f' - CLEANING dataissue {di.message}')
+            di.delete()

-def parse_logbook_for_expedition(expedition):
+def parse_logbook_for_expedition(expedition, blog=False):
     """Parses all logbook entries for one expedition
-    if clean==True then it deletes all entries for this year first.
     """
     global logentries
-    # absolutely horrid. REFACTOR THIS (all my fault..)
-    global entries
+    global ENTRIES

+    logentries = []
     logbook_parseable = False
-    yearlinks = LOGBOOK_PARSER_SETTINGS
-    expologbase = os.path.join(settings.EXPOWEB, "years")
-    logentries = []
+    expologbase = Path(settings.EXPOWEB, "years")

     year = expedition.year
-    expect = entries[year]
+    expect = ENTRIES[year]
     # print(" - Logbook for: " + year)

-    if year in yearlinks:
-        yearfile, yearparser = yearlinks[year]
-        logbookpath = Path(yearfile)
-        expedition.logbookfile = yearfile
-        parsefunc = yearparser
-        # print(f" - Logbook file {yearfile} using parser {yearparser}")
+    if year in LOGBOOK_PARSER_SETTINGS:
+        yearfile, parsefunc = LOGBOOK_PARSER_SETTINGS[year]
+        expedition.logbookfile = yearfile  # don't change this if a blog
     else:
-        logbookpath = Path(DEFAULT_LOGBOOK_FILE)
-        expedition.logbookfile = DEFAULT_LOGBOOK_FILE
+        yearfile = DEFAULT_LOGBOOK_FILE
+        expedition.logbookfile = DEFAULT_LOGBOOK_FILE  # don't change this if a blog
         parsefunc = DEFAULT_LOGBOOK_PARSER

-    expedition.save()
+    if blog:
+        print(f" - BLOG file {yearfile} using parser {parsefunc}")
+        if year not in BLOG_PARSER_SETTINGS:
+            message = f" ! - Expecting blog parser buut none specified for {year}"
+            DataIssue.objects.create(parser="logbooks", message=message)
+            print(message)
+        else:
+            yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]
+
+    logbookpath = Path(yearfile)
+    # print(f" - Logbook file {yearfile} using parser {parsefunc}")
+    # expedition.save()

     for sq in ["", "2", "3", "4"]:  # cope with blog saved as many separate files
         lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
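The refactored function now reads its (file, parser) pair straight from `LOGBOOK_PARSER_SETTINGS`, falls back to the defaults, and only consults `BLOG_PARSER_SETTINGS` when `blog=True`, instead of the old trick of temporarily rewriting `LOGBOOK_PARSER_SETTINGS`. The dispatch-table idea, sketched with a stub parser (how troggle resolves the function name to a callable is not shown in this hunk, so `globals()` here is only one plausible mechanism):

    from pathlib import Path

    def parser_html(year, expedition, txt, seq=""):
        print(f"stub: would parse {year}")  # stands in for the real parser

    DEFAULT_LOGBOOK_FILE = "logbook.html"
    DEFAULT_LOGBOOK_PARSER = "parser_html"
    LOGBOOK_PARSER_SETTINGS = {"1982": ("logbook.html", "parser_html")}  # per-year overrides

    def pick_parser(year):
        yearfile, parsefunc = LOGBOOK_PARSER_SETTINGS.get(
            year, (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
        )
        # resolve the parser function from its name
        return Path(yearfile), globals()[parsefunc]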
@@ -556,17 +570,10 @@ def parse_logbook_for_expedition(expedition):
             print(f" - {year} parsing with {parsefunc} - {lb}")
             parser(year, expedition, txt, sq)  # this launches the right parser for this year
     # --------------------
-    dupl = {}
-    for entrytuple in logentries:
-        date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
-        check = (date, triptitle)
-        if check in dupl:
-            dupl[check] += 1
-            triptitle = f"{triptitle} #{dupl[check]}"
-            print(f" - {triptitle} -- {date}")
-        else:
-            dupl[check] = 1
-        EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+    # move database storage into separate step
+    # for entrytuple in logentries:
+    #     date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+    #     store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)

     if len(logentries) == expect:
         # print(f"OK {year} {len(logentries):5d} is {expect}\n")
@@ -578,39 +585,57 @@ def parse_logbook_for_expedition(expedition):

 def LoadLogbook(year):
-    """One off logbook for testing purposes"""
-    global LOGBOOK_PARSER_SETTINGS
+    """One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
+    This is inside an atomic transaction"""
+    global logentries

     nlbe = {}
     expo = Expedition.objects.get(year=year)
     year = expo.year  # some type funny
     clean_logbook_for_expedition(expo)
+    logentries = []
     nlbe[expo] = parse_logbook_for_expedition(expo)  # this actually loads the logbook for one expo
     if year in BLOG_PARSER_SETTINGS:
-        print("BLOG parsing")
-        LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year]
-        nlbe[expo] = parse_logbook_for_expedition(expo)  # this loads the blog logbook for one expo
+        nlbe[expo] = parse_logbook_for_expedition(expo, blog=True)  # this loads the blog logbook
     else:
         print(
-            f"Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
+            f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
         )
+
+    for entrytuple in logentries:
+        date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+        if expo == expedition:
+            # print(f"   - {triptitle}")
+            store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+    expedition.save()  # to save logbook name property

 def LoadLogbooks():
     """This is the master function for parsing all logbooks into the Troggle database.
     This should be rewritten to use coroutines to load all logbooks from disc in parallel,
     but must be serialised to write to database as sqlite is single-user.
+    This is inside an atomic transaction. Maybe it shouldn't be..
     """
-    global entries
+    global ENTRIES
+    global logentries
+
+    allentries = []
+    mem1 = get_process_memory()
+    print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
+    start = time.time()

     clean_all_logbooks()
     expos = Expedition.objects.all()
     if len(expos) <= 1:
-        message = " ! - No expeditions found. Load 'people' first"
+        message = " ! - No expeditions found. Attempting to 'people' first"
         DataIssue.objects.create(parser="logbooks", message=message)
         print(message)
-        return
+        load_people_expos()
+        expos = Expedition.objects.all()
+        if len(expos) <= 1:
+            message = " ! - No expeditions found, even after attempting to load 'people'. Abort."
+            DataIssue.objects.create(parser="logbooks", message=message)
+            print(message)
+            return

     noexpo = [
         "1986",
@@ -625,7 +650,7 @@ def LoadLogbooks():
     loglist = []
     bloglist = []

-    for expo in expos:  # pointless as we explicitly know the years in this code.
+    for expo in expos:
         year = expo.year
         if year in sqlfail:
             print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
@@ -634,7 +659,7 @@ def LoadLogbooks():
             print(message)

         if year not in nologbook:
-            if year in entries:
+            if year in ENTRIES:
                 loglist.append(expo)
             else:
                 print(" - No Logbook yet for: " + year)  # catch case when preparing for next expo
@@ -644,28 +669,38 @@ def LoadLogbooks():
     for ex in loglist:
         nlbe[ex] = parse_logbook_for_expedition(ex)  # this loads the logbook for one expo
+        allentries += logentries

     for b in bloglist:
-        if str(b) in LOGBOOK_PARSER_SETTINGS:
-            orig = LOGBOOK_PARSER_SETTINGS[str(b)]
-        else:
-            orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
-        LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
         print(f" - BLOG: {b}")
-        nlbe[b] = parse_logbook_for_expedition(b)  # no clean. loads the blog logbook for one expo
-        LOGBOOK_PARSER_SETTINGS[str(b)] = orig
+        nlbe[b] += parse_logbook_for_expedition(b, blog=True)  # loads the blog logbook for one expo
+        allentries += logentries

+    # tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
+    # yt = 0
+    # for r in map(parse_logbook_for_expedition, loglist):
+    #     yt = r

     yt = 0
-    for e in nlbe:
-        yt += nlbe[e]
+    for exp in nlbe:
+        yt += nlbe[exp]
     print(f"total {yt:,} log entries parsed in all expeditions")
+    print(f"total {len(allentries):,} log entries in complete dict")
+
+    mem = get_process_memory()
+    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
+    duration = time.time() - start
+    print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
+
+    # Now we serially store the parsed data in the database, updating 3 types of object:
+    # - Expedition (the 'logbook.html' value)
+    # - LogBookEntry (text, who when etc.)
+    # - PersonTrip (who was on that specific trip mentione din the logbook entry)
+    for entrytuple in allentries:
+        date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+        store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+
+    for expo in expos:
+        expedition.save()  # to save logbook name property
+
+    mem = get_process_memory()
+    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
+    duration = time.time() - start
+    print(f" - TIME: {duration:7.2f} s", file=sys.stderr)

 # dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
 # expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
 # titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
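`get_process_memory` is imported from `troggle.core.utils` and its body is not part of this diff; a plausible stdlib-only implementation of such a helper, written to match how the MEM/TIME prints above use it (an assumption about its behaviour, not troggle's actual code):

    import resource
    import sys
    import time

    def get_process_memory():
        # ru_maxrss is reported in kilobytes on Linux (bytes on macOS),
        # so this returns peak resident memory in MB on a Linux host
        return resource.getrusage(resource.RUSAGE_SELF).ru_maxrss / 1024

    start = time.time()
    # ... parse and store ...
    print(f" - MEM: {get_process_memory():7.2f} MB in use", file=sys.stderr)
    print(f" - TIME: {time.time() - start:7.2f} s", file=sys.stderr)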