refactored logbooks parser

This commit is contained in:
Philip Sargent 2023-01-27 23:21:07 +00:00
parent 6565b3f9c4
commit e01bd39609
4 changed files with 130 additions and 91 deletions

View File

@ -549,6 +549,10 @@ class PersonTrip(TroggleModel):
logbook_entry = models.ForeignKey(LogbookEntry,on_delete=models.CASCADE)
is_logbook_entry_author = models.BooleanField(default=False)
class Meta:
ordering = ('-personexpedition',)
#order_with_respect_to = 'personexpedition'
def persontrip_next(self):
futurePTs = PersonTrip.objects.filter(personexpedition = self.personexpedition, logbook_entry__date__gt = self.logbook_entry.date).order_by('logbook_entry__date').all()
if len(futurePTs) > 0:

View File

@ -18,7 +18,7 @@ from troggle.core.models.caves import LogbookEntry, PersonTrip
from troggle.core.models.survex import SurvexBlock, Wallet
from troggle.core.models.troggle import Expedition, Person, PersonExpedition
from troggle.core.utils import TROG
from troggle.parsers.logbooks import parse_logbook_for_expedition
from troggle.parsers.imports import import_logbook
from troggle.parsers.people import GetPersonExpeditionNameLookup
from .auth import login_required_if_public
@ -69,7 +69,7 @@ def expedition(request, expeditionname):
#print(f'! - delete entry: "{entry}"')
entry.delete()
entries = this_expedition.logbookentry_set.all()
LoadLogbookForExpedition(this_expedition)
import_logbook(year=this_expedition.year)
logged_in = True
else:
logged_in = False

View File

@ -40,7 +40,7 @@ def import_logbooks():
troggle.parsers.logbooks.LoadLogbooks()
def import_logbook(year=2022):
def import_logbook(year=2016):
print(f"-- Importing Logbook {year}")
with transaction.atomic():
troggle.parsers.logbooks.LoadLogbook(year)

View File

@ -1,5 +1,8 @@
import os
import re
import sys
import time
from datetime import date, datetime
from pathlib import Path
from random import randint
@ -7,10 +10,10 @@ from random import randint
from django.conf import settings
from django.template.defaultfilters import slugify
from parsers.people import GetPersonExpeditionNameLookup
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
from troggle.core.models.troggle import DataIssue, Expedition
from troggle.core.utils import save_carefully
from troggle.core.utils import save_carefully, get_process_memory
"""
Parses and imports logbooks in all their wonderful confusion
@ -20,6 +23,8 @@ Parses and imports logbooks in all their wonderful confusion
todo = """
- refactor everything with some urgency, esp. parse_logbook_for_expedition()
- pre-compile all the heavily used regular expressions !
- break out the code that hits the database from that which parses the logbook
so that the file-reading and parsing can be parallelized, while writing to the
database remains serialized (sqlite is single-user).
@ -43,7 +48,7 @@ data for old logbooks? Not worth it..
"""
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
BLOG_PARSER_SETTINGS = {
BLOG_PARSER_SETTINGS = { # no default, must be explicit
# "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
@ -57,7 +62,7 @@ LOGBOOK_PARSER_SETTINGS = {
"1982": ("logbook.html", "parser_html"),
}
entries = {
ENTRIES = {
"2022": 90,
"2019": 55,
"2018": 95,
@ -99,18 +104,15 @@ entries = {
}
logentries = [] # the entire logbook for one year is a single object: a list of entries
noncaveplaces = ["Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
trips = {}
noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
def set_trip_id(year, seq):
    """Build the identifier string for one logbook trip.

    The result is the year, then "_s", then the sequence number zero-padded
    to at least two digits, e.g. set_trip_id("2022", 3) gives "2022_s03".
    """
    return "{}_s{:02d}".format(year, seq)
rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$")
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
res = []
author = None
@ -161,7 +163,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
return res, author
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
def store_entry_into_database(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
"""saves a single logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
@ -254,9 +256,11 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
# this creates the PersonTrip instance.
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
def ParseDate(tripdate, year):
def parser_date(tripdate, year):
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
dummydate = date(1970, 1, 1) # replace with _EPOCH
month = 1
@ -302,6 +306,7 @@ def parser_html(year, expedition, txt, seq=""):
from parser_html_01 format logfiles, believe me.
"""
global logentries
dupl = {}
# extract front material and stash for later use when rebuilding from list of entries
headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
@ -370,7 +375,7 @@ def parser_html(year, expedition, txt, seq=""):
DataIssue.objects.create(parser="logbooks", message=msg)
continue
ldate = ParseDate(tripdate.strip(), year)
ldate = parser_date(tripdate.strip(), year)
triptitles = triptitle.split(" - ")
if len(triptitles) >= 2:
tripcave = triptitles[0]
@ -381,6 +386,15 @@ def parser_html(year, expedition, txt, seq=""):
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
triptitle = triptitle.strip()
# triptitle must be unique for a given date. We fix this here.
check = (ldate, triptitle)
if check in dupl:
dupl[check] += 1
triptitle = f"{triptitle} #{dupl[check]}"
print(f" - {triptitle} -- {date}")
else:
dupl[check] = 1
entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tripid1)
logentries.append(entrytuple)
@ -401,6 +415,7 @@ def parser_blog(year, expedition, txt, sq=""):
</article>
</article>
So the content is nested inside the header. Attachments (images) come after the content.
It's a bugger, but it's out of our control.
"""
global logentries
@ -467,9 +482,10 @@ def parser_blog(year, expedition, txt, sq=""):
tripdate = datetime.fromisoformat(datestamp[0:10])
# print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
# tripname must have the location then a hyphen at the beginning as it is ignored by export function
# triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'.
location = "Unknown"
tripname = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}" # must be unique for a given date
# triptitle must be unique for a given date. We can enforce this here.
triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"
tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
tripcontent = re.sub(r"width: \d+px", "", tripcontent)
@ -477,7 +493,7 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
entrytuple = (tripdate, location, tripname, tripcontent, trippeople, expedition, tu, tid)
entrytuple = (tripdate, location, triptitle, tripcontent, trippeople, expedition, tu, tid)
logentries.append(entrytuple)
def clean_all_logbooks():
@ -485,53 +501,51 @@ def clean_all_logbooks():
LogbookEntry.objects.all().delete()
def clean_logbook_for_expedition(expedition):
def cleanerrors(year):
dataissues = DataIssue.objects.filter(parser="logbooks")
for di in dataissues:
ph = year
if re.search(ph, di.message) is not None: # SLOW just to delete issues for one year
# print(f' - CLEANING dataissue {di.message}')
di.delete()
"""Only used when loading a single logbook. Deletes database LogBookEntries and
DataIssues for this expedition year.
"""
lbes = LogbookEntry.objects.filter(expedition=expedition).delete()
dataissues = DataIssue.objects.filter(parser="logbooks")
for di in dataissues:
ph = expedition.year
if re.search(ph, di.message) is not None: # SLOW just to delete issues for one year
# print(f' - CLEANING dataissue {di.message}')
di.delete()
year = expedition.year
cleanerrors(year)
lbes = LogbookEntry.objects.filter(expedition=expedition) # must be a quicker way
for lbe in lbes:
lbe.delete()
def parse_logbook_for_expedition(expedition):
def parse_logbook_for_expedition(expedition, blog=False):
"""Parses all logbook entries for one expedition
if clean==True then it deletes all entries for this year first.
"""
global logentries
# absolutely horrid. REFACTOR THIS (all my fault..)
global entries
global ENTRIES
logentries = []
logbook_parseable = False
yearlinks = LOGBOOK_PARSER_SETTINGS
expologbase = os.path.join(settings.EXPOWEB, "years")
logentries = []
expologbase = Path(settings.EXPOWEB, "years")
year = expedition.year
expect = entries[year]
expect = ENTRIES[year]
# print(" - Logbook for: " + year)
if year in yearlinks:
yearfile, yearparser = yearlinks[year]
logbookpath = Path(yearfile)
expedition.logbookfile = yearfile
parsefunc = yearparser
# print(f" - Logbook file {yearfile} using parser {yearparser}")
if year in LOGBOOK_PARSER_SETTINGS:
yearfile, parsefunc = LOGBOOK_PARSER_SETTINGS[year]
expedition.logbookfile = yearfile # don't change this if a blog
else:
logbookpath = Path(DEFAULT_LOGBOOK_FILE)
expedition.logbookfile = DEFAULT_LOGBOOK_FILE
yearfile = DEFAULT_LOGBOOK_FILE
expedition.logbookfile = DEFAULT_LOGBOOK_FILE # don't change this if a blog
parsefunc = DEFAULT_LOGBOOK_PARSER
if blog:
print(f" - BLOG file {yearfile} using parser {parsefunc}")
if year not in BLOG_PARSER_SETTINGS:
message = f" ! - Expecting blog parser buut none specified for {year}"
DataIssue.objects.create(parser="logbooks", message=message)
print(message)
else:
yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]
expedition.save()
logbookpath = Path(yearfile)
# print(f" - Logbook file {yearfile} using parser {parsefunc}")
# expedition.save()
for sq in ["", "2", "3", "4"]: # cope with blog saved as many separate files
lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
@ -556,17 +570,10 @@ def parse_logbook_for_expedition(expedition):
print(f" - {year} parsing with {parsefunc} - {lb}")
parser(year, expedition, txt, sq) # this launches the right parser for this year
# --------------------
dupl = {}
for entrytuple in logentries:
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
check = (date, triptitle)
if check in dupl:
dupl[check] += 1
triptitle = f"{triptitle} #{dupl[check]}"
print(f" - {triptitle} -- {date}")
else:
dupl[check] = 1
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
# move database storage into separate step
# for entrytuple in logentries:
# date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
# store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
if len(logentries) == expect:
# print(f"OK {year} {len(logentries):5d} is {expect}\n")
@ -578,39 +585,57 @@ def parse_logbook_for_expedition(expedition):
def LoadLogbook(year):
"""One off logbook for testing purposes"""
global LOGBOOK_PARSER_SETTINGS
"""One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
This is inside an atomic transaction"""
global logentries
nlbe = {}
expo = Expedition.objects.get(year=year)
year = expo.year # some type funny
clean_logbook_for_expedition(expo)
logentries = []
nlbe[expo] = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
if year in BLOG_PARSER_SETTINGS:
print("BLOG parsing")
LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year]
nlbe[expo] = parse_logbook_for_expedition(expo) # this loads the blog logbook for one expo
nlbe[expo] = parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
else:
print(
f"Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
)
for entrytuple in logentries:
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
if expo == expedition:
#print(f" - {triptitle}")
store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
expedition.save() # to save logbook name property
def LoadLogbooks():
"""This is the master function for parsing all logbooks into the Troggle database.
This should be rewritten to use coroutines to load all logbooks from disc in parallel,
but must be serialised to write to database as sqlite is single-user.
This is inside an atomic transaction. Maybe it shouldn't be..
"""
global entries
global ENTRIES
global logentries
allentries = []
mem1 = get_process_memory()
print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
start = time.time()
clean_all_logbooks()
expos = Expedition.objects.all()
if len(expos) <= 1:
message = " ! - No expeditions found. Load 'people' first"
message = " ! - No expeditions found. Attempting to 'people' first"
DataIssue.objects.create(parser="logbooks", message=message)
print(message)
return
load_people_expos()
expos = Expedition.objects.all()
if len(expos) <= 1:
message = " ! - No expeditions found, even after attempting to load 'people'. Abort."
DataIssue.objects.create(parser="logbooks", message=message)
print(message)
return
noexpo = [
"1986",
@ -625,7 +650,7 @@ def LoadLogbooks():
loglist = []
bloglist = []
for expo in expos: # pointless as we explicitly know the years in this code.
for expo in expos:
year = expo.year
if year in sqlfail:
print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
@ -634,7 +659,7 @@ def LoadLogbooks():
print(message)
if year not in nologbook:
if year in entries:
if year in ENTRIES:
loglist.append(expo)
else:
print(" - No Logbook yet for: " + year) # catch case when preparing for next expo
@ -644,28 +669,38 @@ def LoadLogbooks():
for ex in loglist:
nlbe[ex] = parse_logbook_for_expedition(ex) # this loads the logbook for one expo
allentries += logentries
for b in bloglist:
if str(b) in LOGBOOK_PARSER_SETTINGS:
orig = LOGBOOK_PARSER_SETTINGS[str(b)]
else:
orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
print(f" - BLOG: {b}")
nlbe[b] = parse_logbook_for_expedition(b) # no clean. loads the blog logbook for one expo
LOGBOOK_PARSER_SETTINGS[str(b)] = orig
# tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
# yt = 0
# for r in map(parse_logbook_for_expedition, loglist):
# yt = r
nlbe[b] += parse_logbook_for_expedition(b, blog=True) # loads the blog logbook for one expo
allentries += logentries
yt = 0
for e in nlbe:
yt += nlbe[e]
for exp in nlbe:
yt += nlbe[exp]
print(f"total {yt:,} log entries parsed in all expeditions")
print(f"total {len(allentries):,} log entries in complete dict")
mem = get_process_memory()
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
duration = time.time() - start
print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
# Now we serially store the parsed data in the database, updating 3 types of object:
# - Expedition (the 'logbook.html' value)
# - LogbookEntry (text, who, when etc.)
# - PersonTrip (who was on that specific trip mentioned in the logbook entry)
for entrytuple in allentries:
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
for expo in expos:
expedition.save() # to save logbook name property
mem = get_process_memory()
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
duration = time.time() - start
print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)