mirror of
https://expo.survex.com/repositories/troggle/.git
synced 2025-04-03 17:31:47 +01:00
bugfix and remove swapped order for title/people
This commit is contained in:
parent
98412c140d
commit
33a08bed4f
@ -1,5 +1,4 @@
|
|||||||
import subprocess
|
import subprocess
|
||||||
import string
|
|
||||||
from datetime import datetime
|
from datetime import datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
|
|
||||||
|
@ -23,7 +23,7 @@ Parses and imports logbooks in all their wonderful confusion
|
|||||||
https://expo.survex.com/handbook/computing/logbooks-parsing.html
|
https://expo.survex.com/handbook/computing/logbooks-parsing.html
|
||||||
"""
|
"""
|
||||||
todo = """
|
todo = """
|
||||||
- make id= for each entry persistent and unchanging, and check cross-references in other logbooks and other HTML fragments
|
- check cross-references in other logbooks and other HTML fragments
|
||||||
e.g. cave descriptions
|
e.g. cave descriptions
|
||||||
|
|
||||||
- Most of the time is during the database writing (6s out of 8s).
|
- Most of the time is during the database writing (6s out of 8s).
|
||||||
@ -41,7 +41,6 @@ e.g. cave descriptions
|
|||||||
file_in = open(logbookfile,'rb')
|
file_in = open(logbookfile,'rb')
|
||||||
txt = file_in.read().decode("latin1")
|
txt = file_in.read().decode("latin1")
|
||||||
|
|
||||||
|
|
||||||
"""
|
"""
|
||||||
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
|
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
|
||||||
BLOG_PARSER_SETTINGS = { # no default, must be explicit
|
BLOG_PARSER_SETTINGS = { # no default, must be explicit
|
||||||
@ -61,7 +60,7 @@ LOGBOOK_PARSER_SETTINGS = {
|
|||||||
LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB
|
LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB
|
||||||
|
|
||||||
ENTRIES = {
|
ENTRIES = {
|
||||||
"2023": 82,
|
"2023": 81,
|
||||||
"2022": 93,
|
"2022": 93,
|
||||||
"2019": 55,
|
"2019": 55,
|
||||||
"2018": 95,
|
"2018": 95,
|
||||||
@ -111,7 +110,7 @@ tripsdate = {}
|
|||||||
|
|
||||||
|
|
||||||
def set_trip_seq_id(year, seq):
|
def set_trip_seq_id(year, seq):
|
||||||
'''We have not parsed the trip date yet, so this is a sequence numer
|
'''We have not parsed the trip date yet, so this is a sequence number
|
||||||
'''
|
'''
|
||||||
tid = f"{year}_s{seq:02d}"
|
tid = f"{year}_s{seq:02d}"
|
||||||
return tid
|
return tid
|
||||||
@ -269,7 +268,7 @@ def store_entry_into_database(date, place, tripcave, title, text, trippersons, a
|
|||||||
}
|
}
|
||||||
lookupAttribs = {"slug": tid, "date": date, "title": title}
|
lookupAttribs = {"slug": tid, "date": date, "title": title}
|
||||||
if LogbookEntry.objects.filter(slug=tid).exists():
|
if LogbookEntry.objects.filter(slug=tid).exists():
|
||||||
# oops.
|
# oops. Our code should already have ensured this is unique.
|
||||||
message = " ! - DUPLICATE SLUG for logbook entry " + tripdate + " - " + slug
|
message = " ! - DUPLICATE SLUG for logbook entry " + tripdate + " - " + slug
|
||||||
DataIssue.objects.create(parser="logbooks", message=message)
|
DataIssue.objects.create(parser="logbooks", message=message)
|
||||||
slug = slug + "_" + unique_slug(text,2)
|
slug = slug + "_" + unique_slug(text,2)
|
||||||
@ -374,31 +373,12 @@ def parser_html(year, expedition, txt, seq=""):
|
|||||||
)
|
)
|
||||||
if s:
|
if s:
|
||||||
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
|
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
|
||||||
else: # allow title and people to be swapped in order
|
else:
|
||||||
msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'"
|
# if not re.search(r"Rigging Guide", trippara):
|
||||||
|
msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'"
|
||||||
print(msg)
|
print(msg)
|
||||||
DataIssue.objects.create(parser="logbooks", message=msg)
|
DataIssue.objects.create(parser="logbooks", message=msg)
|
||||||
|
continue
|
||||||
s2 = re.match(
|
|
||||||
r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
|
|
||||||
\s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
|
|
||||||
\s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
|
|
||||||
\s*<div\s+class="triptitle">\s*(.*?)</div>
|
|
||||||
\s*<div\s+class="trippeople">\s*(.*?)</div>
|
|
||||||
([\s\S]*?)
|
|
||||||
\s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
|
|
||||||
\s*$
|
|
||||||
""",
|
|
||||||
trippara,
|
|
||||||
)
|
|
||||||
if s2:
|
|
||||||
tripid, tripid1, tripdate, triptitle, trippeople, triptext, tu = s2.groups()
|
|
||||||
else:
|
|
||||||
# if not re.search(r"Rigging Guide", trippara):
|
|
||||||
msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'"
|
|
||||||
print(msg)
|
|
||||||
DataIssue.objects.create(parser="logbooks", message=msg)
|
|
||||||
continue
|
|
||||||
|
|
||||||
ldate = parser_date(tripdate.strip(), year)
|
ldate = parser_date(tripdate.strip(), year)
|
||||||
|
|
||||||
@ -408,12 +388,12 @@ def parser_html(year, expedition, txt, seq=""):
|
|||||||
if len(triptitles) >= 2:
|
if len(triptitles) >= 2:
|
||||||
place = triptitles[0]
|
place = triptitles[0]
|
||||||
else:
|
else:
|
||||||
place = "UNKNOWN"
|
place = "Unknown"
|
||||||
tripcontent = re.sub(r"</p>", "", triptext)
|
tripcontent = re.sub(r"</p>", "", triptext)
|
||||||
tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()
|
tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()
|
||||||
|
|
||||||
triptitle = triptitle.strip()
|
triptitle = triptitle.strip()
|
||||||
# triptitle must be unique for a given date. We fix this here. [Why?!]
|
# triptitle must be unique for a given date. [Why?!] We fix this here.
|
||||||
check = (ldate, triptitle)
|
check = (ldate, triptitle)
|
||||||
if check in dupl:
|
if check in dupl:
|
||||||
dupl[check] += 1
|
dupl[check] += 1
|
||||||
|
Loading…
x
Reference in New Issue
Block a user