2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2024-11-22 07:11:52 +00:00

Fixed parsers

This commit is contained in:
Philip Sargent 2023-09-02 17:49:37 +03:00
parent c9729c046c
commit 1a8bc17f80
5 changed files with 49 additions and 33 deletions

View File

@ -69,8 +69,8 @@ def alphabet_suffix(n):
if not alphabet: if not alphabet:
alphabet = list(string.ascii_lowercase) alphabet = list(string.ascii_lowercase)
if n < len(alphabet): if n < len(alphabet) and n > 0:
suffix = alphabet[n] suffix = alphabet[n-1]
else: else:
suffix = "_X_" + random.choice(string.ascii_lowercase) + random.choice(string.ascii_lowercase) suffix = "_X_" + random.choice(string.ascii_lowercase) + random.choice(string.ascii_lowercase)
return suffix return suffix

View File

@ -4,6 +4,7 @@ from django.contrib.auth import authenticate
from django.contrib.auth import forms as auth_forms from django.contrib.auth import forms as auth_forms
from django.contrib.auth import login, logout from django.contrib.auth import login, logout
from django.contrib.auth.decorators import login_required from django.contrib.auth.decorators import login_required
from django.http import HttpResponse, HttpResponseRedirect
from django.shortcuts import redirect, render from django.shortcuts import redirect, render
from django.utils.http import url_has_allowed_host_and_scheme from django.utils.http import url_has_allowed_host_and_scheme
@ -22,7 +23,7 @@ class login_required_if_public(object):
def __call__(self, *args, **kwargs): def __call__(self, *args, **kwargs):
return self.f(*args, **kwargs) return self.f(*args, **kwargs)
# This is copied from CUYC.cuy.website.view.auth # This is copied from CUYC.cuy.website.view.auth
# If we want to do the whole online-email thing, we would also need to copy across the code in these # If we want to do the whole online-email thing, we would also need to copy across the code in these

View File

@ -8,6 +8,7 @@ from django.shortcuts import render, redirect
import settings import settings
from troggle.core.models.caves import GetCaveLookup
from troggle.core.models.logbooks import LogbookEntry, writelogbook, PersonLogEntry from troggle.core.models.logbooks import LogbookEntry, writelogbook, PersonLogEntry
from troggle.core.models.survex import DrawingFile from troggle.core.models.survex import DrawingFile
from troggle.core.models.troggle import DataIssue, Expedition, PersonExpedition from troggle.core.models.troggle import DataIssue, Expedition, PersonExpedition
@ -340,7 +341,7 @@ def logbookedit(request, year=None, slug=None):
"textrows": rows, "textrows": rows,
}, },
) )
else: # no slug else: # no slug or bad slug for an lbe which does not exist
# NEW logbook entry # NEW logbook entry
return render( return render(
request, request,

View File

@ -60,8 +60,8 @@ LOGBOOK_PARSER_SETTINGS = {
LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB
ENTRIES = { ENTRIES = {
"2023": 81, "2023": 83,
"2022": 93, "2022": 94,
"2019": 55, "2019": 55,
"2018": 95, "2018": 95,
"2017": 74, "2017": 74,
@ -127,7 +127,7 @@ def reset_trip_id(date):
suffix = alphabet_suffix(n) suffix = alphabet_suffix(n)
tid = f"{date}{suffix}" tid = f"{date}{suffix}"
# print(tid) # print(already, n, tid)
return tid return tid
rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$") rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$")
@ -136,6 +136,7 @@ rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None): def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
res = [] res = []
author = None author = None
guests = []
# print(f'# {tid}') # print(f'# {tid}')
# print(f" - {tid} '{trippeople}' ") # print(f" - {tid} '{trippeople}' ")
@ -154,11 +155,12 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
try: try:
personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower()) personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
if not personyear: if not personyear:
if known_foreigner(tripperson): guests.append(nickname_used)
message = f" ! - {expedition.year} Known foreigner: '{tripperson}' in entry {tid=}" if known_foreigner(nickname_used):
message = f" ! - {expedition.year} Known foreigner: '{nickname_used}' in entry {tid=}"
print(message) print(message)
else: else:
message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this year." message = f" ! - {expedition.year} No name match for: '{nickname_used}' in entry {tid=} for this year."
print(message) print(message)
DataIssue.objects.create(parser="logbooks", message=message) DataIssue.objects.create(parser="logbooks", message=message)
res.append((personyear, nickname_used, logtime_underground)) res.append((personyear, nickname_used, logtime_underground))
@ -170,10 +172,9 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
raise raise
if author_u: if author_u:
author = personyear author = personyear
else: else: # *guest
# a person but with * prefix. Ignored everywhere. guests.append(tripperson)
# print(f" ! - {expedition.year} * person : {tripperson}") # print(f" ! - {expedition.year} * GUEST : {tripperson}")
pass
if not author: if not author:
if not res: if not res:
@ -181,7 +182,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
author = res[-1][0] # the previous valid person and a time of 0 hours author = res[-1][0] # the previous valid person and a time of 0 hours
# print(f" - {tid} [{author.person}] '{res[0][0].person}'...") # print(f" - {tid} [{author.person}] '{res[0][0].person}'...")
return res, author return res, author, guests
def tidy_time_underground(logtime_underground): def tidy_time_underground(logtime_underground):
# Nasty hack, must tidy this up.. # Nasty hack, must tidy this up..
@ -202,7 +203,7 @@ def tidy_time_underground(logtime_underground):
def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid): def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
try: try:
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid) trippersons, author, guests = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
# trippersons is a list of tuples (personyear, nickname_used, logtime_underground) # trippersons is a list of tuples (personyear, nickname_used, logtime_underground)
except: except:
message = f" ! - {expedition.year} Logentry: {title} - GetTripPersons FAIL to recognise nickname" message = f" ! - {expedition.year} Logentry: {title} - GetTripPersons FAIL to recognise nickname"
@ -216,7 +217,7 @@ def tidy_trip_persons(trippeople, title, expedition, logtime_underground, tid):
DataIssue.objects.create(parser="logbooks", message=message) DataIssue.objects.create(parser="logbooks", message=message)
print(message) print(message)
return trippersons, author return trippersons, author, guests
def tidy_trip_cave(place): def tidy_trip_cave(place):
# GetCaveLookup() need to work better. None of this data is *used* though? # GetCaveLookup() need to work better. None of this data is *used* though?
@ -251,16 +252,18 @@ def tidy_tid(tid, title):
tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_") tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
return tid return tid
def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid): def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, guests, expedition, logtime_underground, tid):
"""saves a single logbook entry and related personlogentry items """saves a single logbook entry and related personlogentry items
We could do a bulk update to save all the entries, but then we would need to do a query on We could do a bulk update to save all the entries, but then we would need to do a query on
each one to get the primary key to asign to the PersonLogEntries. So overall probably not much each one to get the primary key to assign to the PersonLogEntries. So overall probably not much
faster ? faster ?
""" """
other_people = ", ".join(guests) # join list members separated by comma
nonLookupAttribs = { nonLookupAttribs = {
"place": place, "place": place,
"other_people": other_people, # *Ol's Mum, foreigners..
"text": text, "text": text,
"expedition": expedition, "expedition": expedition,
"time_underground": logtime_underground, "time_underground": logtime_underground,
@ -324,6 +327,17 @@ def parser_date(tripdate, year):
def parser_html(year, expedition, txt, seq=""): def parser_html(year, expedition, txt, seq=""):
"""This uses some of the more obscure capabilities of regular expressions, """This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html see https://docs.python.org/3/library/re.html
e.g.
* is greedy
*? is non-greedy
(?x) flag means VERBOSE
(?: ) non-capturing parentheses
\s whitespace
\S NOT whitespace
You can't see it here, but a round-trip export-then-import will move You can't see it here, but a round-trip export-then-import will move
the endmatter up to the frontmatter. This made sense when translating the endmatter up to the frontmatter. This made sense when translating
@ -357,7 +371,7 @@ def parser_html(year, expedition, txt, seq=""):
for trippara in tripparas: for trippara in tripparas:
logbook_entry_count += 1 logbook_entry_count += 1
tid = set_trip_seq_id(year, logbook_entry_count) tid = set_trip_seq_id(year, logbook_entry_count)
# print(f' - new tid:{tid} lbe count: {logbook_entry_count}') # print(f' - new seq tid:{tid} lbe count: {logbook_entry_count}')
s = re.match( s = re.match(
r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
@ -367,15 +381,17 @@ def parser_html(year, expedition, txt, seq=""):
\s*<div\s+class="triptitle">\s*(.*?)</div> \s*<div\s+class="triptitle">\s*(.*?)</div>
([\s\S]*?) ([\s\S]*?)
\s*(?:<div\s+class="timeug">\s*(.*?)</div>)? \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
\s*(?:<div\s+class="editentry"\s*.*?</div>)?
\s*$ \s*$
""", """,
trippara, trippara,
) )
if s: if s:
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups() tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
# print(f"#{logbook_entry_count} {tu} {len(triptext)} ")
else: else:
# if not re.search(r"Rigging Guide", trippara): # if not re.search(r"Rigging Guide", trippara):
msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'" msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:75]}'..."
print(msg) print(msg)
DataIssue.objects.create(parser="logbooks", message=msg) DataIssue.objects.create(parser="logbooks", message=msg)
continue continue
@ -403,12 +419,12 @@ def parser_html(year, expedition, txt, seq=""):
dupl[check] = 1 dupl[check] = 1
tu = tidy_time_underground(tu) tu = tidy_time_underground(tu)
trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid) trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, tu, tid)
tripcave = tidy_trip_cave(place) tripcave = tidy_trip_cave(place)
tripcontent = tidy_trip_image_urls(tripcontent, ldate) tripcontent = tidy_trip_image_urls(tripcontent, ldate)
tid = tidy_tid(tid, triptitle) tid = tidy_tid(tid, triptitle)
entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid) entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid)
logentries.append(entrytuple) logentries.append(entrytuple)
return logentries return logentries
@ -509,13 +525,13 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
logtime_underground = 0 logtime_underground = 0
trippersons, author = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid) trippersons, author, guests = tidy_trip_persons(trippeople, triptitle, expedition, logtime_underground, tid)
# print(f" - author: {author}") # print(f" - author: {author}")
tripcave = tidy_trip_cave(place) tripcave = tidy_trip_cave(place)
tripcontent = tidy_trip_image_urls(tripcontent, year) tripcontent = tidy_trip_image_urls(tripcontent, year)
tid = tidy_tid(tid, triptitle) tid = tidy_tid(tid, triptitle)
entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid) entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, guests, expedition, tu, tid)
logentries.append(entrytuple) logentries.append(entrytuple)
return logentries return logentries
@ -621,10 +637,10 @@ def LoadLogbook(year):
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}" f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
) )
for entrytuple in logentries: for entrytuple in logentries:
date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
if expo == expedition: # unneeded check, we zeroed it before filling it if expo == expedition: # unneeded check, we zeroed it before filling it
# print(f" -- {triptitle}") # print(f" -- {triptitle}")
store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid) store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid)
else: else:
print(f" ! unexpected log entry labelled as '{expedition}' {tid}" ) print(f" ! unexpected log entry labelled as '{expedition}' {tid}" )
expo.save() # to save logbook name property expo.save() # to save logbook name property
@ -708,8 +724,8 @@ def LoadLogbooks():
# - LogBookEntry (text, who when etc.) # - LogBookEntry (text, who when etc.)
# - PersonLogEntry (who was on that specific trip mentione din the logbook entry) # - PersonLogEntry (who was on that specific trip mentione din the logbook entry)
for entrytuple in allentries: for entrytuple in allentries:
date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid) store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid)
for expo in expos: for expo in expos:
expo.save() # to save logbook name property expo.save() # to save logbook name property

View File

@ -22,13 +22,11 @@ See troggle/code/views/other.py and core.models/logbooks.py writelogbook(year, f
<hr /> <hr />
<div class="tripdate" id="{{logbook_entry.slug}}">{{logbook_entry.date|date:'Y-m-d'}}</div> <div class="tripdate" id="{{logbook_entry.slug}}">{{logbook_entry.date|date:'Y-m-d'}}</div>
<div class="trippeople">{% for personlogentry in logbook_entry.personlogentry_set.all %}{% if personlogentry.is_logbook_entry_author %}<u>{{personlogentry.personexpedition.person|safe}}</u>{% else %}{{ personlogentry.personexpedition.person|safe }}{% endif %}, {% endfor %}</div> <div class="trippeople">{% for personlogentry in logbook_entry.personlogentry_set.all %}{% if personlogentry.is_logbook_entry_author %}<u>{{personlogentry.personexpedition.person|safe}}</u>{% else %}{{ personlogentry.personexpedition.person|safe }}{% endif %}, {% endfor %}{% if logbook_entry.other_people %}, {{logbook_entry.other_people}}{% endif %}</div>
<div class="triptitle">{{logbook_entry.title|safe}}</div> <div class="triptitle">{{logbook_entry.title|safe}}</div>
<br />
<a href="/logbookedit/{{logbook_entry.slug}}">Edit this entry</a>
<br />
{{logbook_entry.text|safe}} {{logbook_entry.text|safe}}
<div class="timeug">T/U: {{logbook_entry.time_underground|safe}} hours</div> <div class="timeug">T/U: {{logbook_entry.time_underground|safe}} hours</div>
<div class="editentry"><br /><a href="/logbookedit/{{logbook_entry.slug}}">Edit this entry</a><br /></div>
{% endfor %} {% endfor %}
<hr /> <hr />
</body> </body>