mirror of https://expo.survex.com/repositories/troggle/.git synced 2024-11-25 08:41:51 +00:00

Parser fixed to work on 2023 UKcaving blog

This commit is contained in:
Philip Sargent 2023-08-31 00:09:02 +03:00
parent df86103407
commit 069a1d57c9
3 changed files with 29 additions and 16 deletions

View File

@@ -36,7 +36,7 @@ def import_logbooks():
with transaction.atomic():
troggle.parsers.logbooks.LoadLogbooks()
def import_logbook(year=2022):
def import_logbook(year=2023):
print(f"-- Importing Logbook {year}")
with transaction.atomic():
troggle.parsers.logbooks.LoadLogbook(year)
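For orientation, a minimal sketch (not part of this commit) of how the single-year import can be exercised from a Django shell, mirroring the calls in import_logbook() above; it assumes a configured troggle/Django environment:

from django.db import transaction
import troggle.parsers.logbooks

# Same pattern as import_logbook(year=2023) above: wrap the load in a
# transaction so a failed parse leaves the database unchanged.
with transaction.atomic():
    troggle.parsers.logbooks.LoadLogbook(2023)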

View File

@@ -10,7 +10,7 @@ from random import randint
from django.conf import settings
from django.template.defaultfilters import slugify
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos, known_foreigner
from troggle.core.models.caves import GetCaveLookup
from troggle.core.models.logbooks import LogbookEntry, PersonLogEntry
from troggle.core.models.troggle import DataIssue, Expedition
@@ -44,6 +44,7 @@ e.g. cave descriptions
"""
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
BLOG_PARSER_SETTINGS = { # no default, must be explicit
# "2023": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
@@ -59,7 +60,7 @@ LOGBOOK_PARSER_SETTINGS = {
LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB
ENTRIES = {
"2023": 63,
"2023": 82,
"2022": 93,
"2019": 55,
"2018": 95,
@@ -143,6 +144,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
DataIssue.objects.create(parser="logbooks", message=message)
res.append((personyear, nickname_used, logtime_underground))
except:
# This should not happen. We do not raise exceptions in that function
message = f" ! - {expedition.year} EXCEPTION: '{tripperson}' ({nickname_used}) in entry {tid=} for this year."
print(message)
DataIssue.objects.create(parser="logbooks", message=message)
@@ -184,10 +186,10 @@ def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
# trippersons is a list of tuples (personyear, nickname_used, logtime_underground)
except:
message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
message = f" ! - {expedition.year} Logentry: {title} - GetTripPersons FAIL to recognise nickname"
DataIssue.objects.create(parser="logbooks", message=message)
print(message)
# raise
raise
return "", ""
if not author:
@@ -424,7 +426,8 @@ def parser_blog(year, expedition, txt, sq=""):
logentries = []
tripheads = re.findall(
r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
# note use of the non-capturing group (?: ... ) regex idiom here
r"<article class=\"message message--post js-post js-inlineModContainer\s*(?:is-unread)*\s*\"\s*([\s\S]*?)(?=</article)", txt
)
if not (tripheads):
message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
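To see what the added (?:is-unread)* alternative buys, here is a small illustrative check (the HTML fragments are hypothetical stand-ins, not real blog markup): XenForo marks unread posts with an extra is-unread class, which the previous pattern could not match.

import re

# Hypothetical minimal article headers, for illustration only.
read_post = '<article class="message message--post js-post js-inlineModContainer  " >trip text</article>'
unread_post = '<article class="message message--post js-post js-inlineModContainer is-unread " >trip text</article>'

pattern = r"<article class=\"message message--post js-post js-inlineModContainer\s*(?:is-unread)*\s*\"\s*([\s\S]*?)(?=</article)"
for html in (read_post, unread_post):
    print(bool(re.findall(pattern, html)))  # both print True; the old pattern rejects unread_post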
@@ -439,10 +442,10 @@ def parser_blog(year, expedition, txt, sq=""):
print(message)
if len(tripheads) != len(tripparas):
print(f"{len(tripheads)} != {len(tripparas)}")
print(f"{len(tripheads)} - {len(tripparas)}")
print(f"{len(tripheads)} != {len(tripparas)} not the same number of headers {len(tripheads)} as paras {len(tripparas)} !")
# print(f"{len(tripheads)} - {len(tripparas)}")
location = "Plateau" # best guess, fix manually later
#location = "Plateau" # best guess, fix manually later
tu = 0 # no logged time underground in a blog entry
logbook_entry_count = 0
for i in range(0, len(tripparas)):
@@ -497,9 +500,11 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
logtime_underground = 0
trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid)
# print(f" - author: {author}")
tripcave = tidy_trip_cave(place)
tripcontent = tidy_trip_image_urls(tripcontent, date)
tripcontent = tidy_trip_image_urls(tripcontent, year)
tid = tidy_tid(tid, triptitle)
entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
@@ -544,13 +549,13 @@ def parse_logbook_for_expedition(expedition, blog=False):
parsefunc = DEFAULT_LOGBOOK_PARSER
if blog:
print(f" - BLOG file {yearfile} using parser {parsefunc}")
if year not in BLOG_PARSER_SETTINGS:
message = f" ! - Expecting blog parser but none specified for {year}"
DataIssue.objects.create(parser="logbooks", message=message)
print(message)
else:
yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]
print(f" - BLOG file {yearfile} using parser {parsefunc}")
logbookpath = Path(yearfile)
# print(f" - Logbook file {yearfile} using parser {parsefunc}")
@@ -575,7 +580,7 @@ def parse_logbook_for_expedition(expedition, blog=False):
if logbook_parseable:
# --------------------
parser = globals()[parsefunc]
# print(f" - {year} parsing with {parsefunc} - {lb}")
print(f" - {year} parsing with {parsefunc} - {lb}")
print(" .", end="")
logentries = parser(year, expedition, txt, sq) # this launches the right parser
# --------------------
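The now-uncommented print sits next to the dynamic dispatch: parsefunc is a string, and globals()[parsefunc] resolves it to the callable parser. A stripped-down sketch of that idiom, with invented toy parser bodies:

# Toy illustration of looking a parser up by its string name, as
# globals()[parsefunc] does above; the bodies here are placeholders.
def parser_html(year, expedition, txt, sq=""):
    return [("entry parsed from logbook.html", year)]

def parser_blog(year, expedition, txt, sq=""):
    return [("entry parsed from ukcavingblog.html", year)]

parsefunc = "parser_blog"      # e.g. the name stored in BLOG_PARSER_SETTINGS
parser = globals()[parsefunc]  # resolve the function object by name
logentries = parser(2023, None, "<html>...</html>")  # this launches the right parser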
@@ -599,8 +604,10 @@ def LoadLogbook(year):
logentries = []
logentries = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
print(f" - Loaded logbook. {len(logentries)} entries." )
if year in BLOG_PARSER_SETTINGS:
logentries += parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
print(f" - Loading blog.." )
logentries += parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
else:
print(
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
@@ -608,7 +615,7 @@ def LoadLogbook(year):
for entrytuple in logentries:
date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
if expo == expedition: # unneeded check, we zeroed it before filling it
#print(f" - {triptitle}")
# print(f" -- {triptitle}")
store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
else:
print(f" ! unexpected log entry labelled as '{expedition}' {tid}" )

View File

@@ -162,7 +162,7 @@ foreign_friends = [
def known_foreigner(id):
"""Is this someone from ARGE or a known Austrian? Name has to be exact, no soft matching
APPARENTLY NOT YET USED
APPARENTLY NOT YET USED? Yes it is: in logbook Blog parsing
instead foreigners have names prefixed by * in the logbook.html ?"""
global foreign_friends
@@ -321,6 +321,12 @@ def GetPersonExpeditionNameLookup(expedition):
possnames.append("nobrotson")
if f"{f} {l}" == "Todd Rye".lower():
possnames.append("samouse1")
if f"{f} {l}" == "Jono Lester".lower():
possnames.append("ILoveCaves")
if f"{f} {l}" == "Joel Stobbart".lower():
possnames.append("El Stobbarto")
if f"{f} {l}" == "Rob Watson".lower():
possnames.append("nobrotson")
for i in [3, 4, 5, 6]:
lim = min(i, len(f) + 1) # short form, e.g. Dan for Daniel.
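For context, this block collects plausible name strings (possnames) per person, including forum handles like the three added above and short first-name forms such as Dan for Daniel. A hedged, simplified sketch of how such a lookup could map a blog username back to a person; the structure below is an illustration, not the exact troggle code:

# Simplified illustration only: map every known possible name to its person.
possnames_by_person = {
    "Jono Lester": ["jono lester", "jono", "ilovecaves"],
    "Joel Stobbart": ["joel stobbart", "joel", "el stobbarto"],
    "Rob Watson": ["rob watson", "rob", "nobrotson"],
}
lookup = {}
for person, names in possnames_by_person.items():
    for name in names:
        lookup[name] = person

print(lookup.get("ILoveCaves".lower()))  # -> Jono Lester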