mirror of https://expo.survex.com/repositories/troggle/.git synced 2024-11-25 08:41:51 +00:00

Parser fixed to work on 2023 UKcaving blog

This commit is contained in:
Philip Sargent 2023-08-31 00:09:02 +03:00
parent df86103407
commit 069a1d57c9
3 changed files with 29 additions and 16 deletions

View File

@@ -36,7 +36,7 @@ def import_logbooks():
with transaction.atomic():
troggle.parsers.logbooks.LoadLogbooks()
def import_logbook(year=2022):
def import_logbook(year=2023):
print(f"-- Importing Logbook {year}")
with transaction.atomic():
troggle.parsers.logbooks.LoadLogbook(year)
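For orientation, a minimal sketch (not part of this commit) of how the single-year import can be exercised from a Django shell, mirroring the calls in import_logbook() above; it assumes a configured troggle/Django environment:

from django.db import transaction
import troggle.parsers.logbooks

# Same pattern as import_logbook(year=2023) above: wrap the load in a
# transaction so a failed parse leaves the database unchanged.
with transaction.atomic():
    troggle.parsers.logbooks.LoadLogbook(2023)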

View File

@@ -10,7 +10,7 @@ from random import randint
from django.conf import settings
from django.template.defaultfilters import slugify
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos, known_foreigner
from troggle.core.models.caves import GetCaveLookup
from troggle.core.models.logbooks import LogbookEntry, PersonLogEntry
from troggle.core.models.troggle import DataIssue, Expedition
@@ -44,6 +44,7 @@ e.g. cave descriptions
"""
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
BLOG_PARSER_SETTINGS = { # no default, must be explicit
# "2023": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
@@ -59,7 +60,7 @@ LOGBOOK_PARSER_SETTINGS = {
LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB
ENTRIES = {
"2023": 63,
"2023": 82,
"2022": 93,
"2019": 55,
"2018": 95,
@@ -143,6 +144,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
DataIssue.objects.create(parser="logbooks", message=message)
res.append((personyear, nickname_used, logtime_underground))
except:
# This should not happen. We do not raise exceptions in that function
message = f" ! - {expedition.year} EXCEPTION: '{tripperson}' ({nickname_used}) in entry {tid=} for this year."
print(message)
DataIssue.objects.create(parser="logbooks", message=message)
@@ -184,10 +186,10 @@ def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
# trippersons is a list of tuples (personyear, nickname_used, logtime_underground)
except:
message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
message = f" ! - {expedition.year} Logentry: {title} - GetTripPersons FAIL to recognise nickname"
DataIssue.objects.create(parser="logbooks", message=message)
print(message)
# raise
raise
return "", ""
if not author:
@@ -424,7 +426,8 @@ def parser_blog(year, expedition, txt, sq=""):
logentries = []
tripheads = re.findall(
r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
# note use of the non-capturing group (?: ... ) regex idiom here
r"<article class=\"message message--post js-post js-inlineModContainer\s*(?:is-unread)*\s*\"\s*([\s\S]*?)(?=</article)", txt
)
if not (tripheads):
message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
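To see what the added (?:is-unread)* alternative buys, here is a small illustrative check (the HTML fragments are hypothetical stand-ins, not real blog markup): XenForo marks unread posts with an extra is-unread class, which the previous pattern could not match.

import re

# Hypothetical minimal article headers, for illustration only.
read_post = '<article class="message message--post js-post js-inlineModContainer  " >trip text</article>'
unread_post = '<article class="message message--post js-post js-inlineModContainer is-unread " >trip text</article>'

pattern = r"<article class=\"message message--post js-post js-inlineModContainer\s*(?:is-unread)*\s*\"\s*([\s\S]*?)(?=</article)"
for html in (read_post, unread_post):
    print(bool(re.findall(pattern, html)))  # both print True; the old pattern rejects unread_post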
@@ -439,10 +442,10 @@ def parser_blog(year, expedition, txt, sq=""):
print(message)
if len(tripheads) != len(tripparas):
print(f"{len(tripheads)} != {len(tripparas)}")
print(f"{len(tripheads)} - {len(tripparas)}")
print(f"{len(tripheads)} != {len(tripparas)} not the same number of headers {len(tripheads)} as paras {len(tripparas)} !")
# print(f"{len(tripheads)} - {len(tripparas)}")
location = "Plateau" # best guess, fix manually later
#location = "Plateau" # best guess, fix manually later
tu = 0 # no logged time underground in a blog entry
logbook_entry_count = 0
for i in range(0, len(tripparas)):
@@ -497,9 +500,11 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
logtime_underground = 0
trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid)
# print(f" - author: {author}")
tripcave = tidy_trip_cave(place)
tripcontent = tidy_trip_image_urls(tripcontent, date)
tripcontent = tidy_trip_image_urls(tripcontent, year)
tid = tidy_tid(tid, triptitle)
entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
@@ -544,13 +549,13 @@ def parse_logbook_for_expedition(expedition, blog=False):
parsefunc = DEFAULT_LOGBOOK_PARSER
if blog:
print(f" - BLOG file {yearfile} using parser {parsefunc}")
if year not in BLOG_PARSER_SETTINGS:
message = f" ! - Expecting blog parser but none specified for {year}"
DataIssue.objects.create(parser="logbooks", message=message)
print(message)
else:
yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]
print(f" - BLOG file {yearfile} using parser {parsefunc}")
logbookpath = Path(yearfile)
# print(f" - Logbook file {yearfile} using parser {parsefunc}")
@@ -575,7 +580,7 @@ def parse_logbook_for_expedition(expedition, blog=False):
if logbook_parseable:
# --------------------
parser = globals()[parsefunc]
# print(f" - {year} parsing with {parsefunc} - {lb}")
print(f" - {year} parsing with {parsefunc} - {lb}")
print(" .", end="")
logentries = parser(year, expedition, txt, sq) # this launches the right parser
# --------------------
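The now-uncommented print sits next to the dynamic dispatch: parsefunc is a string, and globals()[parsefunc] resolves it to the callable parser. A stripped-down sketch of that idiom, with invented toy parser bodies:

# Toy illustration of looking a parser up by its string name, as
# globals()[parsefunc] does above; the bodies here are placeholders.
def parser_html(year, expedition, txt, sq=""):
    return [("entry parsed from logbook.html", year)]

def parser_blog(year, expedition, txt, sq=""):
    return [("entry parsed from ukcavingblog.html", year)]

parsefunc = "parser_blog"      # e.g. the name stored in BLOG_PARSER_SETTINGS
parser = globals()[parsefunc]  # resolve the function object by name
logentries = parser(2023, None, "<html>...</html>")  # this launches the right parser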
@@ -599,8 +604,10 @@ def LoadLogbook(year):
logentries = []
logentries = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
print(f" - Loaded logbook. {len(logentries)} entries." )
if year in BLOG_PARSER_SETTINGS:
logentries += parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
print(f" - Loading blog.." )
logentries += parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
else:
print(
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
@@ -608,7 +615,7 @@ def LoadLogbook(year):
for entrytuple in logentries:
date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
if expo == expedition: # unneeded check, we zeroed it before filling it
#print(f" - {triptitle}")
# print(f" -- {triptitle}")
store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
else:
print(f" ! unexpected log entry labelled as '{expedition}' {tid}" )

View File

@@ -162,7 +162,7 @@ foreign_friends = [
def known_foreigner(id):
"""Is this someone from ARGE or a known Austrian? Name has to be exact, no soft matching
APPARENTLY NOT YET USED
APPARENTLY NOT YET USED? Yes it is: in logbook Blog parsing
instead foreigners have names prefixed by * in the logbook.html ?"""
global foreign_friends
@@ -321,6 +321,12 @@ def GetPersonExpeditionNameLookup(expedition):
possnames.append("nobrotson")
if f"{f} {l}" == "Todd Rye".lower():
possnames.append("samouse1")
if f"{f} {l}" == "Jono Lester".lower():
possnames.append("ILoveCaves")
if f"{f} {l}" == "Joel Stobbart".lower():
possnames.append("El Stobbarto")
if f"{f} {l}" == "Rob Watson".lower():
possnames.append("nobrotson")
for i in [3, 4, 5, 6]:
lim = min(i, len(f) + 1) # short form, e.g. Dan for Daniel.
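For context, this block collects plausible name strings (possnames) per person, including forum handles like the three added above and short first-name forms such as Dan for Daniel. A hedged, simplified sketch of how such a lookup could map a blog username back to a person; the structure below is an illustration, not the exact troggle code:

# Simplified illustration only: map every known possible name to its person.
possnames_by_person = {
    "Jono Lester": ["jono lester", "jono", "ilovecaves"],
    "Joel Stobbart": ["joel stobbart", "joel", "el stobbarto"],
    "Rob Watson": ["rob watson", "rob", "nobrotson"],
}
lookup = {}
for person, names in possnames_by_person.items():
    for name in names:
        lookup[name] = person

print(lookup.get("ILoveCaves".lower()))  # -> Jono Lester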