mirror of
https://expo.survex.com/repositories/troggle/.git
synced 2024-11-25 16:51:54 +00:00
Parser fixed to work on 2023 UKcaving blog
This commit is contained in:
parent
df86103407
commit
069a1d57c9
@ -36,7 +36,7 @@ def import_logbooks():
|
||||
with transaction.atomic():
|
||||
troggle.parsers.logbooks.LoadLogbooks()
|
||||
|
||||
def import_logbook(year=2022):
|
||||
def import_logbook(year=2023):
|
||||
print(f"-- Importing Logbook {year}")
|
||||
with transaction.atomic():
|
||||
troggle.parsers.logbooks.LoadLogbook(year)
|
||||
|
@ -10,7 +10,7 @@ from random import randint
|
||||
from django.conf import settings
|
||||
from django.template.defaultfilters import slugify
|
||||
|
||||
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
|
||||
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos, known_foreigner
|
||||
from troggle.core.models.caves import GetCaveLookup
|
||||
from troggle.core.models.logbooks import LogbookEntry, PersonLogEntry
|
||||
from troggle.core.models.troggle import DataIssue, Expedition
|
||||
@ -44,6 +44,7 @@ e.g. cave descriptions
|
||||
"""
|
||||
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
|
||||
BLOG_PARSER_SETTINGS = { # no default, must be explicit
|
||||
# "2023": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
||||
# "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
||||
# "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
||||
# "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
||||
@ -59,7 +60,7 @@ LOGBOOK_PARSER_SETTINGS = {
|
||||
LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB
|
||||
|
||||
ENTRIES = {
|
||||
"2023": 63,
|
||||
"2023": 82,
|
||||
"2022": 93,
|
||||
"2019": 55,
|
||||
"2018": 95,
|
||||
@ -143,6 +144,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
|
||||
DataIssue.objects.create(parser="logbooks", message=message)
|
||||
res.append((personyear, nickname_used, logtime_underground))
|
||||
except:
|
||||
# This should not happen. We do not raise exceptions in that function
|
||||
message = f" ! - {expedition.year} EXCEPTION: '{tripperson}' ({nickname_used}) in entry {tid=} for this year."
|
||||
print(message)
|
||||
DataIssue.objects.create(parser="logbooks", message=message)
|
||||
@ -184,10 +186,10 @@ def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
|
||||
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
|
||||
# trippersons is a list of tuples (personyear, nickname_used, logtime_underground)
|
||||
except:
|
||||
message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
|
||||
message = f" ! - {expedition.year} Logentry: {title} - GetTripPersons FAIL to recognise nickname"
|
||||
DataIssue.objects.create(parser="logbooks", message=message)
|
||||
print(message)
|
||||
# raise
|
||||
raise
|
||||
return "", ""
|
||||
|
||||
if not author:
|
||||
@ -424,7 +426,8 @@ def parser_blog(year, expedition, txt, sq=""):
|
||||
logentries = []
|
||||
|
||||
tripheads = re.findall(
|
||||
r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
|
||||
# note use of the non-capturing group (?:...) regex idiom here
|
||||
r"<article class=\"message message--post js-post js-inlineModContainer\s*(?:is-unread)*\s*\"\s*([\s\S]*?)(?=</article)", txt
|
||||
)
|
||||
if not (tripheads):
|
||||
message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
|
||||
@ -439,10 +442,10 @@ def parser_blog(year, expedition, txt, sq=""):
|
||||
print(message)
|
||||
|
||||
if len(tripheads) != len(tripparas):
|
||||
print(f"{len(tripheads)} != {len(tripparas)}")
|
||||
print(f"{len(tripheads)} - {len(tripparas)}")
|
||||
print(f"{len(tripheads)} != {len(tripparas)} not the same number of headers {len(tripheads)} as paras {len(tripparas)} !")
|
||||
# print(f"{len(tripheads)} - {len(tripparas)}")
|
||||
|
||||
location = "Plateau" # best guess, fix manually later
|
||||
#location = "Plateau" # best guess, fix manually later
|
||||
tu = 0 # no logged time underground in a blog entry
|
||||
logbook_entry_count = 0
|
||||
for i in range(0, len(tripparas)):
|
||||
@ -497,9 +500,11 @@ def parser_blog(year, expedition, txt, sq=""):
|
||||
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
|
||||
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
|
||||
|
||||
trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
|
||||
logtime_underground = 0
|
||||
trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid)
|
||||
# print(f" - author: {author}")
|
||||
tripcave = tidy_trip_cave(place)
|
||||
tripcontent = tidy_trip_image_urls(tripcontent, date)
|
||||
tripcontent = tidy_trip_image_urls(tripcontent, year)
|
||||
tid = tidy_tid(tid, triptitle)
|
||||
|
||||
entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
|
||||
@ -544,13 +549,13 @@ def parse_logbook_for_expedition(expedition, blog=False):
|
||||
parsefunc = DEFAULT_LOGBOOK_PARSER
|
||||
|
||||
if blog:
|
||||
print(f" - BLOG file {yearfile} using parser {parsefunc}")
|
||||
if year not in BLOG_PARSER_SETTINGS:
|
||||
message = f" ! - Expecting blog parser buut none specified for {year}"
|
||||
DataIssue.objects.create(parser="logbooks", message=message)
|
||||
print(message)
|
||||
else:
|
||||
yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]
|
||||
print(f" - BLOG file {yearfile} using parser {parsefunc}")
|
||||
|
||||
logbookpath = Path(yearfile)
|
||||
# print(f" - Logbook file {yearfile} using parser {parsefunc}")
|
||||
@ -575,7 +580,7 @@ def parse_logbook_for_expedition(expedition, blog=False):
|
||||
if logbook_parseable:
|
||||
# --------------------
|
||||
parser = globals()[parsefunc]
|
||||
# print(f" - {year} parsing with {parsefunc} - {lb}")
|
||||
print(f" - {year} parsing with {parsefunc} - {lb}")
|
||||
print(" .", end="")
|
||||
logentries = parser(year, expedition, txt, sq) # this launches the right parser
|
||||
# --------------------
|
||||
@ -599,7 +604,9 @@ def LoadLogbook(year):
|
||||
logentries = []
|
||||
|
||||
logentries = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
|
||||
print(f" - Loaded logbook. {len(logentries)} entries." )
|
||||
if year in BLOG_PARSER_SETTINGS:
|
||||
print(f" - Loading blog.." )
|
||||
logentries += parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
|
||||
else:
|
||||
print(
|
||||
@ -608,7 +615,7 @@ def LoadLogbook(year):
|
||||
for entrytuple in logentries:
|
||||
date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
|
||||
if expo == expedition: # unneeded check, we zeroed it before filling it
|
||||
#print(f" - {triptitle}")
|
||||
# print(f" -- {triptitle}")
|
||||
store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
|
||||
else:
|
||||
print(f" ! unexpected log entry labelled as '{expedition}' {tid}" )
|
||||
|
@ -162,7 +162,7 @@ foreign_friends = [
|
||||
def known_foreigner(id):
|
|
"""Is this someone from ARGE or a known Austrian? Name has to be exact, no soft matching
|
||||
|
||||
APPARENTLY NOT YET USED
|
||||
APPARENTLY NOT YET USED? Yes it is: in logbook Blog parsing
|
||||
instead foreigners have names prefixed by * in the logbook.html ?"""
|
||||
global foreign_friends
|
||||
|
||||
@ -321,6 +321,12 @@ def GetPersonExpeditionNameLookup(expedition):
|
||||
possnames.append("nobrotson")
|
||||
if f"{f} {l}" == "Todd Rye".lower():
|
||||
possnames.append("samouse1")
|
||||
if f"{f} {l}" == "Jono Lester".lower():
|
||||
possnames.append("ILoveCaves")
|
||||
if f"{f} {l}" == "Joel Stobbart".lower():
|
||||
possnames.append("El Stobbarto")
|
||||
if f"{f} {l}" == "Rob Watson".lower():
|
||||
possnames.append("nobrotson")
|
||||
|
||||
for i in [3, 4, 5, 6]:
|
||||
lim = min(i, len(f) + 1) # short form, e.g. Dan for Daniel.
|
||||
|
Loading…
Reference in New Issue
Block a user