refactored logbooks parser

2025-12-18 04:17:29 +00:00 · 2023-01-27 23:21:07 +00:00
parent 6565b3f9c4
commit e01bd39609
4 changed files with 130 additions and 91 deletions
--- a/core/models/caves.py
+++ b/core/models/caves.py
@@ -549,6 +549,10 @@ class PersonTrip(TroggleModel):
    logbook_entry    = models.ForeignKey(LogbookEntry,on_delete=models.CASCADE)
    is_logbook_entry_author = models.BooleanField(default=False)

+    class Meta:
+        ordering = ('-personexpedition',)
+        #order_with_respect_to = 'personexpedition'
+        
    def persontrip_next(self):
        futurePTs = PersonTrip.objects.filter(personexpedition = self.personexpedition, logbook_entry__date__gt = self.logbook_entry.date).order_by('logbook_entry__date').all()
        if len(futurePTs) > 0:
--- a/core/views/logbooks.py
+++ b/core/views/logbooks.py
@@ -18,7 +18,7 @@ from troggle.core.models.caves import LogbookEntry, PersonTrip
 from troggle.core.models.survex import SurvexBlock, Wallet
 from troggle.core.models.troggle import Expedition, Person, PersonExpedition
 from troggle.core.utils import TROG
-from troggle.parsers.logbooks import parse_logbook_for_expedition
+from troggle.parsers.imports import import_logbook
 from troggle.parsers.people import GetPersonExpeditionNameLookup

 from .auth import login_required_if_public
@@ -69,7 +69,7 @@ def expedition(request, expeditionname):
                #print(f'! - delete entry: "{entry}"')
                entry.delete() 
            entries = this_expedition.logbookentry_set.all()
-            LoadLogbookForExpedition(this_expedition) 
+            import_logbook(year=this_expedition.year)
        logged_in = True
    else:
        logged_in = False
--- a/parsers/imports.py
+++ b/parsers/imports.py
@@ -40,7 +40,7 @@ def import_logbooks():
        troggle.parsers.logbooks.LoadLogbooks()


-def import_logbook(year=2022):
+def import_logbook(year=2016):
    print(f"-- Importing Logbook {year}")
    with transaction.atomic():
        troggle.parsers.logbooks.LoadLogbook(year)
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,5 +1,8 @@
 import os
 import re
+import sys
+import time
+
 from datetime import date, datetime
 from pathlib import Path
 from random import randint
@@ -7,10 +10,10 @@ from random import randint
 from django.conf import settings
 from django.template.defaultfilters import slugify

-from parsers.people import GetPersonExpeditionNameLookup
+from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
 from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
 from troggle.core.models.troggle import DataIssue, Expedition
-from troggle.core.utils import save_carefully
+from troggle.core.utils import save_carefully, get_process_memory

 """
 Parses and imports logbooks in all their wonderful confusion
@@ -20,6 +23,8 @@ Parses and imports logbooks in all their wonderful confusion
 todo = """
 - refactor everything with some urgency, esp. parse_logbook_for_expedition()

+- pre-compile all the heavily used regular expressions !
+
 - break out the code that hits the database from that which parses the logbook
 so that the file-reading and parsing can be parallelized, while writing to the
 database remains serialized (sqlite is single-user).
@@ -43,7 +48,7 @@ data for old logbooks? Not worth it..

 """
 MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
-BLOG_PARSER_SETTINGS = {
+BLOG_PARSER_SETTINGS = { # no default, must be explicit
    #  "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
    #  "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
    #  "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
@@ -57,7 +62,7 @@ LOGBOOK_PARSER_SETTINGS = {
    "1982": ("logbook.html", "parser_html"),
 }

-entries = {
+ENTRIES = {
    "2022": 90,
    "2019": 55,
    "2018": 95,
@@ -99,18 +104,15 @@ entries = {
 }

 logentries = []  # the entire logbook for one year is a single object: a list of entries
-noncaveplaces = ["Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
-trips = {}
+noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]

 def set_trip_id(year, seq):
    tid = f"{year}_s{seq:02d}"
    return tid

-
 rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$")
 rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")

-
 def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
    res = []
    author = None
@@ -161,7 +163,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
    return res, author


-def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
+def store_entry_into_database(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
    """saves a single logbook entry and related persontrips
    Does NOT save the expeditionday_id  - all NULLs. why? Because we are deprecating expeditionday !

@@ -256,7 +258,9 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
        save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
        
    
-def ParseDate(tripdate, year):
+
+
+def parser_date(tripdate, year):
    """Interprets dates in the expo logbooks and returns a correct datetime.date object"""
    dummydate = date(1970, 1, 1) # replace with _EPOCH
    month = 1
@@ -302,6 +306,7 @@ def parser_html(year, expedition, txt, seq=""):
    from parser_html_01 format logfiles, believe me.
    """
    global logentries
+    dupl = {}

    # extract front material and stash for later use when rebuilding from list of entries
    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
@@ -370,7 +375,7 @@ def parser_html(year, expedition, txt, seq=""):
                DataIssue.objects.create(parser="logbooks", message=msg)
                continue

-        ldate = ParseDate(tripdate.strip(), year)
+        ldate = parser_date(tripdate.strip(), year)
        triptitles = triptitle.split(" - ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
@@ -381,6 +386,15 @@ def parser_html(year, expedition, txt, seq=""):
        ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()

        triptitle = triptitle.strip()
+        # triptitle must be unique for a given date. We fix this here.
+        check = (ldate, triptitle)
+        if check in dupl:
+            dupl[check] += 1
+            triptitle = f"{triptitle} #{dupl[check]}"
+            print(f"  - {triptitle} -- {date}")
+        else:
+            dupl[check] = 1
+            
        entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tripid1)
        logentries.append(entrytuple)

@@ -401,6 +415,7 @@ def parser_blog(year, expedition, txt, sq=""):
            </article>
        </article>
    So the content is nested inside the header. Attachments (images) come after the content.
+    It's a bugger, but it's out of our control.
    """
    global logentries

@@ -467,9 +482,10 @@ def parser_blog(year, expedition, txt, sq=""):
            tripdate = datetime.fromisoformat(datestamp[0:10])
        # print(f" -  tid: {tid} '{trippeople}' '{tripdate}'")

-        # tripname must have the location then a hyphen at the beginning as it is ignored by export function
+        # triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'.
        location = "Unknown"
-        tripname = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"  # must be unique for a given date
+        # triptitle must be unique for a given date. We can enforce this here.
+        triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"  
        tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
        tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
        tripcontent = re.sub(r"width: \d+px", "", tripcontent)
@@ -477,7 +493,7 @@ def parser_blog(year, expedition, txt, sq=""):
        tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
        tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent

-        entrytuple = (tripdate, location, tripname, tripcontent, trippeople, expedition, tu, tid)
+        entrytuple = (tripdate, location, triptitle, tripcontent, trippeople, expedition, tu, tid)
        logentries.append(entrytuple)

 def clean_all_logbooks():
@@ -485,53 +501,51 @@ def clean_all_logbooks():
    LogbookEntry.objects.all().delete()    
       
 def clean_logbook_for_expedition(expedition):
-    def cleanerrors(year):
+    """Only used when loading a single logbook. Deletes database LogBookEntries and  
+    DataIssues for this expedition year.
+    """
+    lbes = LogbookEntry.objects.filter(expedition=expedition).delete() 
    dataissues = DataIssue.objects.filter(parser="logbooks")
    for di in dataissues:
-            ph = year
+        ph = expedition.year
        if re.search(ph, di.message) is not None:  # SLOW just to delete issues for one year
            # print(f'   - CLEANING dataissue {di.message}')
            di.delete()

-            
-
-    year = expedition.year
-    cleanerrors(year)
-    
-    lbes = LogbookEntry.objects.filter(expedition=expedition) # must be a quicker way
-    for lbe in lbes:
-        lbe.delete()
-
-def parse_logbook_for_expedition(expedition):
+def parse_logbook_for_expedition(expedition, blog=False):
    """Parses all logbook entries for one expedition
-    if clean==True then it deletes all entries for this year first.
    """
    global logentries
-    # absolutely horrid. REFACTOR THIS (all my fault..)
-    global entries
-
-    logbook_parseable = False
-    yearlinks = LOGBOOK_PARSER_SETTINGS
-    expologbase = os.path.join(settings.EXPOWEB, "years")
+    global ENTRIES
    logentries = [] 
    
+    logbook_parseable = False
+    expologbase = Path(settings.EXPOWEB, "years")
+ 
    year = expedition.year
-    expect = entries[year]
+    expect = ENTRIES[year]
    # print(" - Logbook for: " + year)

-    if year in yearlinks:
-        yearfile, yearparser = yearlinks[year]
-        logbookpath = Path(yearfile)
-        expedition.logbookfile = yearfile
-        parsefunc = yearparser
-        # print(f" - Logbook file {yearfile} using parser {yearparser}")
-
+    if year in LOGBOOK_PARSER_SETTINGS:
+        yearfile, parsefunc = LOGBOOK_PARSER_SETTINGS[year]
+        expedition.logbookfile = yearfile # don't change this if a blog
    else:
-        logbookpath = Path(DEFAULT_LOGBOOK_FILE)
-        expedition.logbookfile = DEFAULT_LOGBOOK_FILE
+        yearfile = DEFAULT_LOGBOOK_FILE
+        expedition.logbookfile = DEFAULT_LOGBOOK_FILE # don't change this if a blog
        parsefunc = DEFAULT_LOGBOOK_PARSER
        
-    expedition.save()
+    if blog:
+        print(f" - BLOG file {yearfile} using parser {parsefunc}")
+        if year not in  BLOG_PARSER_SETTINGS:
+            message = f" ! - Expecting blog parser buut none specified for {year}"
+            DataIssue.objects.create(parser="logbooks", message=message)
+            print(message)
+        else:
+            yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]
+
+    logbookpath = Path(yearfile)
+    # print(f" - Logbook file {yearfile} using parser {parsefunc}")
+    # expedition.save()

    for sq in ["", "2", "3", "4"]:  # cope with blog saved as many separate files
        lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
@@ -556,17 +570,10 @@ def parse_logbook_for_expedition(expedition):
            print(f" - {year} parsing with {parsefunc} - {lb}")
            parser(year, expedition, txt, sq)  # this launches the right parser for this year
            # --------------------
-        dupl = {}
-        for entrytuple in logentries:
-            date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
-            check = (date, triptitle)
-            if check in dupl:
-                dupl[check] += 1
-                triptitle = f"{triptitle} #{dupl[check]}"
-                print(f"  - {triptitle} -- {date}")
-            else:
-                dupl[check] = 1
-            EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+        # move database storage into separate step
+        # for entrytuple in logentries:
+            # date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+            # store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)

    if len(logentries) == expect:
        # print(f"OK  {year} {len(logentries):5d} is {expect}\n")
@@ -578,36 +585,54 @@ def parse_logbook_for_expedition(expedition):


 def LoadLogbook(year):
-    """One off logbook for testing purposes"""
-    global LOGBOOK_PARSER_SETTINGS
+    """One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
+    This is inside an atomic transaction"""
+    global logentries
    nlbe = {}

    expo = Expedition.objects.get(year=year)
    year = expo.year  # some type funny
    clean_logbook_for_expedition(expo)
+    logentries = []
    
    nlbe[expo] = parse_logbook_for_expedition(expo)  # this actually loads the logbook for one expo
    if year in BLOG_PARSER_SETTINGS:
-        print("BLOG parsing")
-        LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year]
-        nlbe[expo] = parse_logbook_for_expedition(expo)  # this  loads the blog logbook for one expo
+         nlbe[expo] = parse_logbook_for_expedition(expo, blog=True)  # this  loads the blog logbook
    else:
        print(
-            f"Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
+            f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
        )
-
+    for entrytuple in logentries:
+        date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+        if expo == expedition:
+            #print(f" - {triptitle}")
+            store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+    expedition.save() # to save logbook name property
    
 def LoadLogbooks():
    """This is the master function for parsing all logbooks into the Troggle database.
    This should be rewritten to use coroutines to load all logbooks from disc in parallel,
    but must be serialised to write to database as sqlite is single-user.
+    
+    This is inside an atomic transaction. Maybe it shouldn't be..
    """
-    global entries
+    global ENTRIES
+    global logentries
+    allentries = []
+    mem1 = get_process_memory()
+    print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
+    start = time.time()

    clean_all_logbooks()
    expos = Expedition.objects.all()
    if len(expos) <= 1:
-        message = " ! - No expeditions found. Load 'people' first"
+        message = " ! - No expeditions found. Attempting to 'people' first"
+        DataIssue.objects.create(parser="logbooks", message=message)
+        print(message)
+        load_people_expos()
+        expos = Expedition.objects.all()
+        if len(expos) <= 1:
+            message = " ! - No expeditions found, even after attempting to load 'people'. Abort."
            DataIssue.objects.create(parser="logbooks", message=message)
            print(message)
            return
@@ -625,7 +650,7 @@ def LoadLogbooks():
    loglist = []
    bloglist = []

-    for expo in expos:  # pointless as we explicitly know the years in this code.
+    for expo in expos: 
        year = expo.year
        if year in sqlfail:
            print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
@@ -634,7 +659,7 @@ def LoadLogbooks():
            print(message)

        if year not in nologbook:
-            if year in entries:
+            if year in ENTRIES:
                loglist.append(expo)
            else:
                print(" - No Logbook yet for: " + year)  # catch case when preparing for next expo
@@ -644,28 +669,38 @@ def LoadLogbooks():

    for ex in loglist:
        nlbe[ex] = parse_logbook_for_expedition(ex)  # this  loads the logbook for one expo
+        allentries += logentries

    for b in bloglist:
-        if str(b) in LOGBOOK_PARSER_SETTINGS:
-            orig = LOGBOOK_PARSER_SETTINGS[str(b)]
-        else:
-            orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
-        LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
        print(f" - BLOG: {b}")
-        nlbe[b] = parse_logbook_for_expedition(b)  # no clean. loads the blog logbook for one expo
-        LOGBOOK_PARSER_SETTINGS[str(b)] = orig
-
-    # tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
-    # yt = 0
-    # for r in map(parse_logbook_for_expedition, loglist):
-    # yt = r
+        nlbe[b] += parse_logbook_for_expedition(b, blog=True)  # loads the blog logbook for one expo
+        allentries += logentries

    yt = 0
-    for e in nlbe:
-        yt += nlbe[e]
+    for exp in nlbe:
+        yt += nlbe[exp]
    print(f"total {yt:,} log entries parsed in all expeditions")

+    print(f"total {len(allentries):,} log entries in complete dict")
+    mem = get_process_memory()
+    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
+    duration = time.time() - start
+    print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
   
+    # Now we serially store the parsed data in the database, updating 3 types of object:
+    # - Expedition (the 'logbook.html' value)
+    # - LogBookEntry (text, who when etc.)
+    # - PersonTrip (who was on that specific trip mentioned in the logbook entry)
+    for entrytuple in allentries:
+        date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+        store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+
+    for expo in expos: 
+        expedition.save() # to save logbook name property
+    mem = get_process_memory()
+    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
+    duration = time.time() - start
+    print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
 # dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
 # expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
 # titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)