From 33a08bed4fa677cfaafa45071b842109d1b26d6c Mon Sep 17 00:00:00 2001
From: Philip Sargent <philip.sargent@gmail.com>
Date: Fri, 1 Sep 2023 20:57:23 +0300
Subject: [PATCH] bugfix and remove swapped order for title/people

---
 core/views/uploads.py |  1 -
 parsers/logbooks.py   | 40 ++++++++++------------------------------
 2 files changed, 10 insertions(+), 31 deletions(-)
diff --git a/core/views/uploads.py b/core/views/uploads.py
index 5c2f4f4..bc441f2 100644
--- a/core/views/uploads.py
+++ b/core/views/uploads.py
@@ -1,5 +1,4 @@
 import subprocess
-import string
 from datetime import datetime
 from pathlib import Path
 
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 511ed47..fdc68ad 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -23,7 +23,7 @@ Parses and imports logbooks in all their wonderful confusion
     https://expo.survex.com/handbook/computing/logbooks-parsing.html
 """
 todo = """
-- make id= for each entry persistent and unchanging, and check cross-references in other logbooks and other HTML frahments
+- check cross-references in other logbooks and other HTML fragments
 e.g. cave descriptions
 
 - Most of the time is during the database writing (6s out of 8s).
@@ -41,7 +41,6 @@ e.g. cave descriptions
             file_in = open(logbookfile,'rb')
             txt = file_in.read().decode("latin1")
             
-
 """
 MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
 BLOG_PARSER_SETTINGS = { # no default, must be explicit
@@ -61,7 +60,7 @@ LOGBOOK_PARSER_SETTINGS = {
 LOGBOOKS_DIR = "years" # subfolder of settings.EXPOWEB
 
 ENTRIES = {
-    "2023": 82,
+    "2023": 81,
     "2022": 93,
     "2019": 55,
     "2018": 95,
@@ -111,7 +110,7 @@ tripsdate = {}
 
 
 def set_trip_seq_id(year, seq):
-    '''We have not parsed the trip date yet, so this is a sequence numer
+    '''We have not parsed the trip date yet, so this is a sequence number
     '''
     tid = f"{year}_s{seq:02d}"
     return tid
@@ -269,7 +268,7 @@ def store_entry_into_database(date, place, tripcave, title, text, trippersons, a
     }
     lookupAttribs = {"slug": tid, "date": date, "title": title}
     if LogbookEntry.objects.filter(slug=tid).exists():
-        # oops.
+        # oops. Our code should already have ensured this is unique.
         message = " ! - DUPLICATE SLUG for logbook entry " + tripdate + " - " + slug
         DataIssue.objects.create(parser="logbooks", message=message)
         slug = slug + "_" + unique_slug(text,2)
@@ -374,31 +373,12 @@ def parser_html(year, expedition, txt, seq=""):
         )
         if s:
             tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
-        else:  # allow title and people to be swapped in order
-            msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'"
+        else:
+            # if not re.search(r"Rigging Guide", trippara):
+            msg = f" !- Logbook. Can't parse entry, skipping:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'"
             print(msg)
             DataIssue.objects.create(parser="logbooks", message=msg)
-
-            s2 = re.match(
-                r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
-                                \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
-                                \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
-                                \s*<div\s+class="triptitle">\s*(.*?)</div>
-                                \s*<div\s+class="trippeople">\s*(.*?)</div>
-                                ([\s\S]*?)
-                                \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
-                                \s*$
-                         """,
-                trippara,
-            )
-            if s2:
-                tripid, tripid1, tripdate, triptitle, trippeople, triptext, tu = s2.groups()
-            else:
-                # if not re.search(r"Rigging Guide", trippara):
-                msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:55]}'...'{trippara}'"
-                print(msg)
-                DataIssue.objects.create(parser="logbooks", message=msg)
-                continue
+            continue
 
         ldate = parser_date(tripdate.strip(), year)
         
@@ -408,12 +388,12 @@ def parser_html(year, expedition, txt, seq=""):
         if len(triptitles) >= 2:
             place = triptitles[0]
         else:
-            place = "UNKNOWN"
+            place = "Unknown"
         tripcontent = re.sub(r"</p>", "", triptext)
         tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()
 
         triptitle = triptitle.strip()
-        # triptitle must be unique for a given date. We fix this here. [Why?!]
+        # triptitle must be unique for a given date. [Why?!] We fix this here.
         check = (ldate, triptitle)
         if check in dupl:
             dupl[check] += 1