From 7e9bb737771bb031d7db7864a5267b75da8e08c0 Mon Sep 17 00:00:00 2001
From: Philip Sargent <philip.sargent@gmail.com>
Date: Mon, 19 Dec 2022 11:38:34 +0000
Subject: [PATCH] Vital fix to stop parsing terminating too early

---
 parsers/logbooks.py | 21 ++++++++++++++++-----
 1 file changed, 16 insertions(+), 5 deletions(-)
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index aa4ec92..e37780c 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,8 +1,6 @@
 import csv
 import os
 import re
-# import pickle
-# import shelve
 import time
 from random import randint
 from datetime import datetime, date
@@ -19,9 +17,8 @@ from parsers.people import GetPersonExpeditionNameLookup
 
 '''
 Parses and imports logbooks in all their wonderful confusion
-
-# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
-# it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc)
+   See detailed explanation of the complete process:
+    https://expo.survex.com/handbook/computing/logbooks-parsing.html
 '''
 todo='''
 - refactor everything with some urgency, esp. LoadLogbookForExpedition()
@@ -292,6 +289,9 @@ def ParseDate(tripdate, year):
         
 # 2002, 2004 - now
 def parser_html(year, expedition, txt, seq=""):
+    '''This uses some of the more obscure capabilities of regular expressions,
+    see https://docs.python.org/3/library/re.html
+    '''
     global logentries
     global logdataissues
 
@@ -367,6 +367,9 @@ def parser_html(year, expedition, txt, seq=""):
 # main parser for 1991 - 2001.  simpler because the data has been hacked so much to fit it
 # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
 def parser_html_01(year, expedition, txt, seq=""):
+    '''This uses some of the more obscure capabilities of regular expressions,
+    see https://docs.python.org/3/library/re.html
+    '''
     global logentries
     global logdataissues
     errorcount = 0
@@ -488,6 +491,12 @@ def parser_html_01(year, expedition, txt, seq=""):
 def parser_blog(year, expedition, txt, sq=""):
     '''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
     Note that the entries have dates and authors, but no titles.
+    See detailed explanation of the complete process:
+    https://expo.survex.com/handbook/computing/logbooks-parsing.html
+    https://expo.survex.com/handbook/computing/log-blog-parsing.html
+    
+    This uses some of the more obscure capabilities of regular expressions,
+    see https://docs.python.org/3/library/re.html
     '''
     global logentries
     global logdataissues
@@ -498,6 +507,7 @@ def parser_blog(year, expedition, txt, sq=""):
         message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
         print(message)
 
+    # (?=...) is a lookahead assertion: it must match but consumes no text, see https://docs.python.org/3/library/re.html
     tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
     if not ( tripparas ) :
         message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
@@ -554,6 +564,7 @@ def parser_blog(year, expedition, txt, sq=""):
         tripcontent = re.sub(r"height=\"\d+\"","",tripcontent)
         tripcontent = re.sub(r"width: \d+px","",tripcontent)
         tripcontent = re.sub(r"\n\n+","\n\n",tripcontent)
+        tripcontent = re.sub(r"<hr\s*>","",tripcontent)
         tripcontent =  f"\n\nBlog Author: {trippeople}" + tripcontent
 
         entrytuple = (tripdate, location, tripname, tripcontent,