diff --git a/core/views/uploads.py b/core/views/uploads.py
index 5c2f4f4..bc441f2 100644
--- a/core/views/uploads.py
+++ b/core/views/uploads.py
@@ -1,5 +1,4 @@
 import subprocess
-import string
 from datetime import datetime
 from pathlib import Path
 
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 511ed47..fdc68ad 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -23,7 +23,7 @@ Parses and imports logbooks in all their wonderful confusion
 https://expo.survex.com/handbook/computing/logbooks-parsing.html
 """
 todo = """
-- make id= for each entry persistent and unchanging, and check cross-references in other logbooks and other HTML frahments
+- check cross-references in other logbooks and other HTML frahments
 e.g. cave descriptions
 
 - Most of the time is during the database writing (6s out of 8s).
@@ -41,7 +41,6 @@ e.g. cave descriptions
 file_in = open(logbookfile,'rb')
 txt = file_in.read().decode("latin1")
 
-
 """
 MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
 BLOG_PARSER_SETTINGS = { # no default, must be explicit
@@ -61,7 +60,7 @@ LOGBOOK_PARSER_SETTINGS = {
 
 LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB
 ENTRIES = {
-    "2023": 82,
+    "2023": 81,
     "2022": 93,
     "2019": 55,
     "2018": 95,
@@ -111,7 +110,7 @@ tripsdate = {}
 
 
 def set_trip_seq_id(year, seq):
-    '''We have not parsed the trip date yet, so this is a sequence numer
+    '''We have not parsed the trip date yet, so this is a sequence number
     '''
     tid = f"{year}_s{seq:02d}"
     return tid
@@ -269,7 +268,7 @@ def store_entry_into_database(date, place, tripcave, title, text, trippersons, a
     }
     lookupAttribs = {"slug": tid, "date": date, "title": title}
     if LogbookEntry.objects.filter(slug=tid).exists():
-        # oops.
+        # oops. Our code should already have ensured this is unique.
         message = " ! - DUPLICATE SLUG for logbook entry " + tripdate + " - " + slug
         DataIssue.objects.create(parser="logbooks", message=message)
         slug = slug + "_" + unique_slug(text,2)
@@ -374,31 +373,12 @@ def parser_html(year, expedition, txt, seq=""):
         )
         if s:
             tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
-        else: # allow title and people to be swapped in order
-            msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'"
+        else:
+            # if not re.search(r"Rigging Guide", trippara):
+            msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'"
             print(msg)
             DataIssue.objects.create(parser="logbooks", message=msg)
-
-            s2 = re.match(
-                r"""(?x)(?:\s*)? # second date
-                \s*(?:\s*)?
-                \s*)?
-                \s*
-                ", "
-                ", tripcontent).strip()
-        triptitle = triptitle.strip()
-        # triptitle must be unique for a given date. We fix this here. [Why?!]
+        # triptitle must be unique for a given date. [Why?!] We fix this here.
         check = (ldate, triptitle)
         if check in dupl:
             dupl[check] += 1
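
Reviewer note on the duplicate handling these hunks touch: entries are keyed by (date, title), and a slug that already exists gets a suffix derived from the entry text via unique_slug(). The sketch below only illustrates the shape of that idea; it is not the troggle implementation, and disambiguate_title / short_text_hash / make_slug are hypothetical stand-ins for the dupl counter and unique_slug() seen in the diff.

# Sketch only -- not troggle code. Illustrates the two disambiguation steps
# referenced above: numbering repeated titles on the same date, and suffixing
# a slug with a short text-derived token when it is already taken.
import hashlib
from collections import defaultdict

dupl = defaultdict(int)  # (date, title) -> occurrences seen so far in this parse


def disambiguate_title(date, title):
    """Make a trip title unique for a given date by numbering repeats."""
    check = (date, title.strip())
    dupl[check] += 1
    if dupl[check] > 1:
        return f"{title.strip()} #{dupl[check]}"
    return title.strip()


def short_text_hash(text, length=2):
    """Short, stable suffix from the entry text (stand-in for unique_slug())."""
    return hashlib.sha1(text.encode("utf-8")).hexdigest()[:length]


def make_slug(existing, tid, text):
    """Append a text-derived suffix when the proposed slug is already taken."""
    if tid in existing:
        tid = f"{tid}_{short_text_hash(text)}"
    existing.add(tid)
    return tid

In the real code the existence check goes through LogbookEntry.objects.filter(slug=tid).exists() and records a DataIssue, as shown in the store_entry_into_database hunk above.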