From 069a1d57c9bce3b3d6d2c8283c52100f869aefb0 Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Thu, 31 Aug 2023 00:09:02 +0300 Subject: [PATCH] Parser fixed to work on 2023 UKcaving blog --- parsers/imports.py | 2 +- parsers/logbooks.py | 35 +++++++++++++++++++++-------------- parsers/people.py | 8 +++++++- 3 files changed, 29 insertions(+), 16 deletions(-) diff --git a/parsers/imports.py b/parsers/imports.py index 9191194..abf4b77 100644 --- a/parsers/imports.py +++ b/parsers/imports.py @@ -36,7 +36,7 @@ def import_logbooks(): with transaction.atomic(): troggle.parsers.logbooks.LoadLogbooks() -def import_logbook(year=2022): +def import_logbook(year=2023): print(f"-- Importing Logbook {year}") with transaction.atomic(): troggle.parsers.logbooks.LoadLogbook(year) diff --git a/parsers/logbooks.py b/parsers/logbooks.py index f4ee8d3..6fa0c95 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -10,7 +10,7 @@ from random import randint from django.conf import settings from django.template.defaultfilters import slugify -from parsers.people import GetPersonExpeditionNameLookup, load_people_expos +from parsers.people import GetPersonExpeditionNameLookup, load_people_expos, known_foreigner from troggle.core.models.caves import GetCaveLookup from troggle.core.models.logbooks import LogbookEntry, PersonLogEntry from troggle.core.models.troggle import DataIssue, Expedition @@ -44,6 +44,7 @@ e.g. cave descriptions """ MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200 BLOG_PARSER_SETTINGS = { # no default, must be explicit + # "2023": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html # "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html # "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html # "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html @@ -59,7 +60,7 @@ LOGBOOK_PARSER_SETTINGS = { LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB ENTRIES = { - "2023": 63, + "2023": 82, "2022": 93, "2019": 55, "2018": 95, @@ -143,6 +144,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): DataIssue.objects.create(parser="logbooks", message=message) res.append((personyear, nickname_used, logtime_underground)) except: + # This should not happen. We do not raise exceptions in that function message = f" ! - {expedition.year} EXCEPTION: '{tripperson}' ({nickname_used}) in entry {tid=} for this year." print(message) DataIssue.objects.create(parser="logbooks", message=message) @@ -184,10 +186,10 @@ def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid): trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid) # trippersons is a list of tuples (personyear, nickname_used, logtime_underground) except: - message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL" + message = f" ! - {expedition.year} Logentry: {title} - GetTripPersons FAIL to recognise nickname" DataIssue.objects.create(parser="logbooks", message=message) print(message) - # raise + raise return "", "" if not author: @@ -424,7 +426,8 @@ def parser_blog(year, expedition, txt, sq=""): logentries = [] tripheads = re.findall( - r"
", "", tripcontent) tripcontent = f"\n\n\nBlog Author: {trippeople}" + tripcontent - trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid) + logtime_underground = 0 + trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid) + # print(f" - author: {author}") tripcave = tidy_trip_cave(place) - tripcontent = tidy_trip_image_urls(tripcontent, date) + tripcontent = tidy_trip_image_urls(tripcontent, year) tid = tidy_tid(tid, triptitle) entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid) @@ -544,13 +549,13 @@ def parse_logbook_for_expedition(expedition, blog=False): parsefunc = DEFAULT_LOGBOOK_PARSER if blog: - print(f" - BLOG file {yearfile} using parser {parsefunc}") if year not in BLOG_PARSER_SETTINGS: message = f" ! - Expecting blog parser buut none specified for {year}" DataIssue.objects.create(parser="logbooks", message=message) print(message) else: yearfile, parsefunc = BLOG_PARSER_SETTINGS[year] + print(f" - BLOG file {yearfile} using parser {parsefunc}") logbookpath = Path(yearfile) # print(f" - Logbook file {yearfile} using parser {parsefunc}") @@ -575,7 +580,7 @@ def parse_logbook_for_expedition(expedition, blog=False): if logbook_parseable: # -------------------- parser = globals()[parsefunc] - # print(f" - {year} parsing with {parsefunc} - {lb}") + print(f" - {year} parsing with {parsefunc} - {lb}") print(" .", end="") logentries = parser(year, expedition, txt, sq) # this launches the right parser # -------------------- @@ -599,8 +604,10 @@ def LoadLogbook(year): logentries = [] logentries = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo + print(f" - Loaded logbook. {len(logentries)} entries." ) if year in BLOG_PARSER_SETTINGS: - logentries += parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook + print(f" - Loading blog.." ) + logentries += parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook else: print( f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}" @@ -608,7 +615,7 @@ def LoadLogbook(year): for entrytuple in logentries: date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple if expo == expedition: # unneeded check, we zeroed it before filling it - #print(f" - {triptitle}") + # print(f" -- {triptitle}") store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid) else: print(f" ! unexpected log entry labelled as '{expedition}' {tid}" ) diff --git a/parsers/people.py b/parsers/people.py index 80e36c3..fb90f38 100644 --- a/parsers/people.py +++ b/parsers/people.py @@ -162,7 +162,7 @@ foreign_friends = [ def known_foreigner(id): """If this someone from ARGE or a known Austrian? Name has to be exact, no soft matching - APPARENTLY NOT YET USED + APPARENTLY NOT YET USED? Yes it is: in logbook Blog parsing instead foreigners have names prefixed by * in the logbook.html ?""" global foreign_friends @@ -321,6 +321,12 @@ def GetPersonExpeditionNameLookup(expedition): possnames.append("nobrotson") if f"{f} {l}" == "Todd Rye".lower(): possnames.append("samouse1") + if f"{f} {l}" == "Jono Lester".lower(): + possnames.append("ILoveCaves") + if f"{f} {l}" == "Joel Stobbart".lower(): + possnames.append("El Stobbarto") + if f"{f} {l}" == "Rob Watson".lower(): + possnames.append("nobrotson") for i in [3, 4, 5, 6]: lim = min(i, len(f) + 1) # short form, e.g. Dan for Daniel.