From e01bd39609ba14232544125cb78ecd3c2ba99ea7 Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Fri, 27 Jan 2023 23:21:07 +0000 Subject: [PATCH] refactored logbooks parser --- core/models/caves.py | 4 + core/views/logbooks.py | 4 +- parsers/imports.py | 2 +- parsers/logbooks.py | 211 ++++++++++++++++++++++++----------------- 4 files changed, 130 insertions(+), 91 deletions(-) diff --git a/core/models/caves.py b/core/models/caves.py index 70a13ae..0b3aa1d 100644 --- a/core/models/caves.py +++ b/core/models/caves.py @@ -549,6 +549,10 @@ class PersonTrip(TroggleModel): logbook_entry = models.ForeignKey(LogbookEntry,on_delete=models.CASCADE) is_logbook_entry_author = models.BooleanField(default=False) + class Meta: + ordering = ('-personexpedition',) + #order_with_respect_to = 'personexpedition' + def persontrip_next(self): futurePTs = PersonTrip.objects.filter(personexpedition = self.personexpedition, logbook_entry__date__gt = self.logbook_entry.date).order_by('logbook_entry__date').all() if len(futurePTs) > 0: diff --git a/core/views/logbooks.py b/core/views/logbooks.py index 415c7f8..9748aed 100644 --- a/core/views/logbooks.py +++ b/core/views/logbooks.py @@ -18,7 +18,7 @@ from troggle.core.models.caves import LogbookEntry, PersonTrip from troggle.core.models.survex import SurvexBlock, Wallet from troggle.core.models.troggle import Expedition, Person, PersonExpedition from troggle.core.utils import TROG -from troggle.parsers.logbooks import parse_logbook_for_expedition +from troggle.parsers.imports import import_logbook from troggle.parsers.people import GetPersonExpeditionNameLookup from .auth import login_required_if_public @@ -69,7 +69,7 @@ def expedition(request, expeditionname): #print(f'! - delete entry: "{entry}"') entry.delete() entries = this_expedition.logbookentry_set.all() - LoadLogbookForExpedition(this_expedition) + import_logbook(year=this_expedition.year) logged_in = True else: logged_in = False diff --git a/parsers/imports.py b/parsers/imports.py index c1de034..050d5ea 100644 --- a/parsers/imports.py +++ b/parsers/imports.py @@ -40,7 +40,7 @@ def import_logbooks(): troggle.parsers.logbooks.LoadLogbooks() -def import_logbook(year=2022): +def import_logbook(year=2016): print(f"-- Importing Logbook {year}") with transaction.atomic(): troggle.parsers.logbooks.LoadLogbook(year) diff --git a/parsers/logbooks.py b/parsers/logbooks.py index 86016af..a65774d 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -1,5 +1,8 @@ import os import re +import sys +import time + from datetime import date, datetime from pathlib import Path from random import randint @@ -7,10 +10,10 @@ from random import randint from django.conf import settings from django.template.defaultfilters import slugify -from parsers.people import GetPersonExpeditionNameLookup +from parsers.people import GetPersonExpeditionNameLookup, load_people_expos from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip from troggle.core.models.troggle import DataIssue, Expedition -from troggle.core.utils import save_carefully +from troggle.core.utils import save_carefully, get_process_memory """ Parses and imports logbooks in all their wonderful confusion @@ -20,6 +23,8 @@ Parses and imports logbooks in all their wonderful confusion todo = """ - refactor everything with some urgency, esp. parse_logbook_for_expedition() +- pre-compile all the heavily used regular expressions ! 
+ - break out the code that hits the database from that which parses the logbook so that the file-reading and parsing can be parallelized, while writing to the database remains serialized (sqlite is single-user). @@ -43,7 +48,7 @@ data for old logbooks? Not worth it.. """ MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200 -BLOG_PARSER_SETTINGS = { +BLOG_PARSER_SETTINGS = { # no default, must be explicit # "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html # "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html # "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html @@ -57,7 +62,7 @@ LOGBOOK_PARSER_SETTINGS = { "1982": ("logbook.html", "parser_html"), } -entries = { +ENTRIES = { "2022": 90, "2019": 55, "2018": 95, @@ -99,18 +104,15 @@ entries = { } logentries = [] # the entire logbook for one year is a single object: a list of entries -noncaveplaces = ["Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"] -trips = {} +noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"] def set_trip_id(year, seq): tid = f"{year}_s{seq:02d}" return tid - rx_tripperson = re.compile(r"(?i)(.*?)$") rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]") - def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): res = [] author = None @@ -161,7 +163,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): return res, author -def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None): +def store_entry_into_database(date, place, title, text, trippeople, expedition, logtime_underground, tid=None): """saves a single logbook entry and related persontrips Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday ! @@ -254,9 +256,11 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_ nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)} # this creates the PersonTrip instance. save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs) + + -def ParseDate(tripdate, year): +def parser_date(tripdate, year): """Interprets dates in the expo logbooks and returns a correct datetime.date object""" dummydate = date(1970, 1, 1) # replace with _EPOCH month = 1 @@ -302,6 +306,7 @@ def parser_html(year, expedition, txt, seq=""): from parser_html_01 format logfiles, believe me. """ global logentries + dupl = {} # extract front material and stash for later use when rebuilding from list of entries headmatch = re.match(r"(?i)(?s).*]*>(.*?)= 2: tripcave = triptitles[0] @@ -381,6 +386,15 @@ def parser_html(year, expedition, txt, seq=""): ltriptext = re.sub(r"
<p>", "<br /><br />", ltriptext).strip()
         triptitle = triptitle.strip()
+        # triptitle must be unique for a given date. We fix this here.
+        check = (ldate, triptitle)
+        if check in dupl:
+            dupl[check] += 1
+            triptitle = f"{triptitle} #{dupl[check]}"
+            print(f" - {triptitle} -- {ldate}")
+        else:
+            dupl[check] = 1
+
         entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tripid1)
         logentries.append(entrytuple)
 
@@ -401,6 +415,7 @@ def parser_blog(year, expedition, txt, sq=""):
     So the content is nested inside the header.
     Attachments (images) come after the content.
+    It's a bugger, but it's out of our control.
     """
     global logentries
@@ -467,9 +482,10 @@
         tripdate = datetime.fromisoformat(datestamp[0:10])
         # print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
 
-        # tripname must have the location then a hyphen at the beginning as it is ignored by export function
+        # triptitle must have the location then a hyphen at the beginning as it is ignored by the export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'.
         location = "Unknown"
-        tripname = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"  # must be unique for a given date
+        # triptitle must be unique for a given date. We can enforce this here.
+        triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"
         tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
         tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
         tripcontent = re.sub(r"width: \d+px", "", tripcontent)
@@ -477,7 +493,7 @@
         tripcontent = re.sub(r"", "", tripcontent)
         tripcontent = f"\n\n\nBlog Author: {trippeople}" + tripcontent
 
-        entrytuple = (tripdate, location, tripname, tripcontent, trippeople, expedition, tu, tid)
+        entrytuple = (tripdate, location, triptitle, tripcontent, trippeople, expedition, tu, tid)
         logentries.append(entrytuple)
 
 def clean_all_logbooks():
@@ -485,53 +501,51 @@
     LogbookEntry.objects.all().delete()
 
 def clean_logbook_for_expedition(expedition):
-    def cleanerrors(year):
-        dataissues = DataIssue.objects.filter(parser="logbooks")
-        for di in dataissues:
-            ph = year
-            if re.search(ph, di.message) is not None:  # SLOW just to delete issues for one year
-                # print(f' - CLEANING dataissue {di.message}')
-                di.delete()
+    """Only used when loading a single logbook. Deletes database LogBookEntries and
+    DataIssues for this expedition year.
+    """
+    lbes = LogbookEntry.objects.filter(expedition=expedition).delete()
+    dataissues = DataIssue.objects.filter(parser="logbooks")
+    for di in dataissues:
+        ph = expedition.year
+        if re.search(ph, di.message) is not None:  # SLOW just to delete issues for one year
+            # print(f' - CLEANING dataissue {di.message}')
+            di.delete()
-
-
-    year = expedition.year
-    cleanerrors(year)
-
-    lbes = LogbookEntry.objects.filter(expedition=expedition)  # must be a quicker way
-    for lbe in lbes:
-        lbe.delete()
-
-def parse_logbook_for_expedition(expedition):
+def parse_logbook_for_expedition(expedition, blog=False):
     """Parses all logbook entries for one expedition
-    if clean==True then it deletes all entries for this year first.
     """
     global logentries
-    # absolutely horrid. REFACTOR THIS (all my fault..)
-    global entries
-
+    global ENTRIES
+    logentries = []
+
     logbook_parseable = False
-    yearlinks = LOGBOOK_PARSER_SETTINGS
-    expologbase = os.path.join(settings.EXPOWEB, "years")
-    logentries = []
-
+    expologbase = Path(settings.EXPOWEB, "years")
+
     year = expedition.year
-    expect = entries[year]
+    expect = ENTRIES[year]
     # print(" - Logbook for: " + year)
 
-    if year in yearlinks:
-        yearfile, yearparser = yearlinks[year]
-        logbookpath = Path(yearfile)
-        expedition.logbookfile = yearfile
-        parsefunc = yearparser
-        # print(f" - Logbook file {yearfile} using parser {yearparser}")
-
+    if year in LOGBOOK_PARSER_SETTINGS:
+        yearfile, parsefunc = LOGBOOK_PARSER_SETTINGS[year]
+        expedition.logbookfile = yearfile  # don't change this if a blog
     else:
-        logbookpath = Path(DEFAULT_LOGBOOK_FILE)
-        expedition.logbookfile = DEFAULT_LOGBOOK_FILE
+        yearfile = DEFAULT_LOGBOOK_FILE
+        expedition.logbookfile = DEFAULT_LOGBOOK_FILE  # don't change this if a blog
         parsefunc = DEFAULT_LOGBOOK_PARSER
+
+    if blog:
+        print(f" - BLOG file {yearfile} using parser {parsefunc}")
+        if year not in BLOG_PARSER_SETTINGS:
+            message = f" ! - Expecting blog parser but none specified for {year}"
+            DataIssue.objects.create(parser="logbooks", message=message)
+            print(message)
+        else:
+            yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]
 
-    expedition.save()
+    logbookpath = Path(yearfile)
+    # print(f" - Logbook file {yearfile} using parser {parsefunc}")
+    # expedition.save()
 
     for sq in ["", "2", "3", "4"]:  # cope with blog saved as many separate files
         lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
@@ -556,17 +570,10 @@
             print(f" - {year} parsing with {parsefunc} - {lb}")
             parser(year, expedition, txt, sq)  # this launches the right parser for this year
         # --------------------
-    dupl = {}
-    for entrytuple in logentries:
-        date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
-        check = (date, triptitle)
-        if check in dupl:
-            dupl[check] += 1
-            triptitle = f"{triptitle} #{dupl[check]}"
-            print(f" - {triptitle} -- {date}")
-        else:
-            dupl[check] = 1
-        EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+    # move database storage into separate step
+    # for entrytuple in logentries:
+    #     date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+    #     store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
 
     if len(logentries) == expect:
         # print(f"OK {year} {len(logentries):5d} is {expect}\n")
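The new blog keyword above replaces the old trick of temporarily swapping entries into LOGBOOK_PARSER_SETTINGS. Assuming, as LoadLogbooks() below does, that parse_logbook_for_expedition() returns the number of entries it parsed, a caller now loads a year's logbook and its blog counterpart with two plain calls; a usage sketch (the year is illustrative):

expo = Expedition.objects.get(year="2017")
n = parse_logbook_for_expedition(expo)  # logbook.html, via LOGBOOK_PARSER_SETTINGS or the default
if expo.year in BLOG_PARSER_SETTINGS:
    # the same year's UK Caving blog dump, parsed with parser_blog
    n += parse_logbook_for_expedition(expo, blog=True)
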
@@ -578,39 +585,57 @@
 def LoadLogbook(year):
-    """One off logbook for testing purposes"""
-    global LOGBOOK_PARSER_SETTINGS
+    """One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
+    This is inside an atomic transaction"""
+    global logentries
 
     nlbe = {}
     expo = Expedition.objects.get(year=year)
     year = expo.year  # some type funny
     clean_logbook_for_expedition(expo)
+    logentries = []
 
     nlbe[expo] = parse_logbook_for_expedition(expo)  # this actually loads the logbook for one expo
     if year in BLOG_PARSER_SETTINGS:
-        print("BLOG parsing")
-        LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year]
-        nlbe[expo] = parse_logbook_for_expedition(expo)  # this loads the blog logbook for one expo
+        nlbe[expo] = parse_logbook_for_expedition(expo, blog=True)  # this loads the blog logbook
     else:
         print(
-            f"Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
+            f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
         )
-
-
+    for entrytuple in logentries:
+        date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+        if expo == expedition:
+            #print(f" - {triptitle}")
+            store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+    expo.save()  # to save logbook name property
+
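Both loaders can now rely on triptitle being unique for a given date, because parser_html() earlier in this patch disambiguates repeats at parse time rather than at storage time. The dupl bookkeeping it uses reduces to this standalone sketch:

def disambiguate(pairs):
    """Sketch of the dupl logic in parser_html: repeated (date, title)
    pairs become 'title #2', 'title #3', ... so each day's titles are unique."""
    dupl = {}
    out = []
    for ldate, triptitle in pairs:
        check = (ldate, triptitle)
        if check in dupl:
            dupl[check] += 1
            triptitle = f"{triptitle} #{dupl[check]}"
        else:
            dupl[check] = 1
        out.append((ldate, triptitle))
    return out

# disambiguate([("2016-07-12", "Balkonhöhle"), ("2016-07-12", "Balkonhöhle")])
# -> [("2016-07-12", "Balkonhöhle"), ("2016-07-12", "Balkonhöhle #2")]
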
 def LoadLogbooks():
     """This is the master function for parsing all logbooks into the Troggle database.
     This should be rewritten to use coroutines to load all logbooks from disc in
     parallel, but must be serialised to write to database as sqlite is single-user.
+
+    This is inside an atomic transaction. Maybe it shouldn't be..
     """
-    global entries
+    global ENTRIES
+    global logentries
+    allentries = []
+    mem1 = get_process_memory()
+    print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
+    start = time.time()
 
     clean_all_logbooks()
     expos = Expedition.objects.all()
     if len(expos) <= 1:
-        message = " ! - No expeditions found. Load 'people' first"
+        message = " ! - No expeditions found. Attempting to load 'people' first"
         DataIssue.objects.create(parser="logbooks", message=message)
         print(message)
-        return
+        load_people_expos()
+        expos = Expedition.objects.all()
+        if len(expos) <= 1:
+            message = " ! - No expeditions found, even after attempting to load 'people'. Abort."
+            DataIssue.objects.create(parser="logbooks", message=message)
+            print(message)
+            return
 
     noexpo = [
         "1986",
@@ -625,7 +650,7 @@
     loglist = []
     bloglist = []
 
-    for expo in expos:  # pointless as we explicitly know the years in this code.
+    for expo in expos:
         year = expo.year
         if year in sqlfail:
             print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
             message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
             print(message)
 
         if year not in nologbook:
-            if year in entries:
+            if year in ENTRIES:
                 loglist.append(expo)
             else:
                 print(" - No Logbook yet for: " + year)  # catch case when preparing for next expo
 
     for ex in loglist:
         nlbe[ex] = parse_logbook_for_expedition(ex)  # this loads the logbook for one expo
+        allentries += logentries
 
     for b in bloglist:
-        if str(b) in LOGBOOK_PARSER_SETTINGS:
-            orig = LOGBOOK_PARSER_SETTINGS[str(b)]
-        else:
-            orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
-        LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
         print(f" - BLOG: {b}")
-        nlbe[b] = parse_logbook_for_expedition(b)  # no clean. loads the blog logbook for one expo
-        LOGBOOK_PARSER_SETTINGS[str(b)] = orig
-
-    # tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
-    # yt = 0
-    # for r in map(parse_logbook_for_expedition, loglist):
-    #     yt = r
+        nlbe[b] += parse_logbook_for_expedition(b, blog=True)  # loads the blog logbook for one expo
+        allentries += logentries
 
     yt = 0
-    for e in nlbe:
-        yt += nlbe[e]
+    for exp in nlbe:
+        yt += nlbe[exp]
     print(f"total {yt:,} log entries parsed in all expeditions")
+    print(f"total {len(allentries):,} log entries in complete dict")
+    mem = get_process_memory()
+    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
+    duration = time.time() - start
+    print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
+
+    # Now we serially store the parsed data in the database, updating 3 types of object:
+    # - Expedition (the 'logbook.html' value)
+    # - LogBookEntry (text, who, when etc.)
+    # - PersonTrip (who was on that specific trip mentioned in the logbook entry)
+    for entrytuple in allentries:
+        date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+        store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+    for expo in expos:
+        expo.save()  # to save logbook name property
+    mem = get_process_memory()
+    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
+    duration = time.time() - start
+    print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
 
 # dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
 # expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
 # titleRegex = re.compile(r'<h1>(.*?)</h1>', re.S)
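The todo list and the LoadLogbooks() docstring point the same way: with parsing now decoupled from storage, the parse phase could run in worker threads while the sqlite writes stay serialised on one thread. A rough sketch of that follow-up - not part of this patch - assuming parse_logbook_for_expedition() is reworked to return its list of entry tuples instead of appending to the global logentries:

from concurrent.futures import ThreadPoolExecutor

def load_logbooks_parallel(loglist):
    # Phase 1: file reading and regex parsing fan out to worker threads.
    # Assumes a reworked parse_logbook_for_expedition(expo) that returns
    # [entrytuple, ...] rather than mutating a global.
    with ThreadPoolExecutor(max_workers=4) as pool:
        parsed = list(pool.map(parse_logbook_for_expedition, loglist))

    # Phase 2: database writes stay on this single thread,
    # because sqlite is single-user.
    for entries in parsed:
        for entrytuple in entries:
            store_entry_into_database(*entrytuple)

Threads (rather than processes) would suffice here because the heavy work is file I/O and regex matching; keeping the store loop on one thread preserves sqlite's single-writer constraint that defeated the earlier map() experiment noted above.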