From 33a08bed4fa677cfaafa45071b842109d1b26d6c Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Fri, 1 Sep 2023 20:57:23 +0300 Subject: [PATCH] bugfix and remove swapped order for title/people --- core/views/uploads.py | 1 - parsers/logbooks.py | 40 ++++++++++------------------------------ 2 files changed, 10 insertions(+), 31 deletions(-) diff --git a/core/views/uploads.py b/core/views/uploads.py index 5c2f4f4..bc441f2 100644 --- a/core/views/uploads.py +++ b/core/views/uploads.py @@ -1,5 +1,4 @@ import subprocess -import string from datetime import datetime from pathlib import Path diff --git a/parsers/logbooks.py b/parsers/logbooks.py index 511ed47..fdc68ad 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -23,7 +23,7 @@ Parses and imports logbooks in all their wonderful confusion https://expo.survex.com/handbook/computing/logbooks-parsing.html """ todo = """ -- make id= for each entry persistent and unchanging, and check cross-references in other logbooks and other HTML frahments +- check cross-references in other logbooks and other HTML frahments e.g. cave descriptions - Most of the time is during the database writing (6s out of 8s). @@ -41,7 +41,6 @@ e.g. cave descriptions file_in = open(logbookfile,'rb') txt = file_in.read().decode("latin1") - """ MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200 BLOG_PARSER_SETTINGS = { # no default, must be explicit @@ -61,7 +60,7 @@ LOGBOOK_PARSER_SETTINGS = { LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB ENTRIES = { - "2023": 82, + "2023": 81, "2022": 93, "2019": 55, "2018": 95, @@ -111,7 +110,7 @@ tripsdate = {} def set_trip_seq_id(year, seq): - '''We have not parsed the trip date yet, so this is a sequence numer + '''We have not parsed the trip date yet, so this is a sequence number ''' tid = f"{year}_s{seq:02d}" return tid @@ -269,7 +268,7 @@ def store_entry_into_database(date, place, tripcave, title, text, trippersons, a } lookupAttribs = {"slug": tid, "date": date, "title": title} if LogbookEntry.objects.filter(slug=tid).exists(): - # oops. + # oops. Our code should already have ensured this is unique. message = " ! - DUPLICATE SLUG for logbook entry " + tripdate + " - " + slug DataIssue.objects.create(parser="logbooks", message=message) slug = slug + "_" + unique_slug(text,2) @@ -374,31 +373,12 @@ def parser_html(year, expedition, txt, seq=""): ) if s: tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups() - else: # allow title and people to be swapped in order - msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'" + else: + # if not re.search(r"Rigging Guide", trippara): + msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'" print(msg) DataIssue.objects.create(parser="logbooks", message=msg) - - s2 = re.match( - r"""(?x)(?:\s*.*?\s*

)? # second date - \s*(?:\s*)? - \s*(.*?)(?:

)? - \s*\s*(.*?) - \s*\s*(.*?) - ([\s\S]*?) - \s*(?:\s*(.*?))? - \s*$ - """, - trippara, - ) - if s2: - tripid, tripid1, tripdate, triptitle, trippeople, triptext, tu = s2.groups() - else: - # if not re.search(r"Rigging Guide", trippara): - msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'" - print(msg) - DataIssue.objects.create(parser="logbooks", message=msg) - continue + continue ldate = parser_date(tripdate.strip(), year) @@ -408,12 +388,12 @@ def parser_html(year, expedition, txt, seq=""): if len(triptitles) >= 2: place = triptitles[0] else: - place = "UNKNOWN" + place = "Unknown" tripcontent = re.sub(r"

", "", triptext) tripcontent = re.sub(r"

", "

", tripcontent).strip() triptitle = triptitle.strip() - # triptitle must be unique for a given date. We fix this here. [Why?!] + # triptitle must be unique for a given date. [Why?!] We fix this here. check = (ldate, triptitle) if check in dupl: dupl[check] += 1