
Vital fix to stop parsing from terminating too early

Philip Sargent 2022-12-19 11:38:34 +00:00
parent 43a98b4421
commit 7e9bb73777


@ -1,8 +1,6 @@
import csv
import os
import re
# import pickle
# import shelve
import time
from random import randint
from datetime import datetime, date
@ -19,9 +17,8 @@ from parsers.people import GetPersonExpeditionNameLookup
'''
Parses and imports logbooks in all their wonderful confusion
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc)
See detailed explanation of the complete process:
https://expo.survex.com/handbook/computing/logbooks-parsing.html
'''
todo='''
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
@ -292,6 +289,9 @@ def ParseDate(tripdate, year):
# 2002, 2004 - now
def parser_html(year, expedition, txt, seq=""):
'''This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
'''
global logentries
global logdataissues
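
The "more obscure capabilities" the docstring points to include the [\s\S] character class (used later in this file), which matches any character, newline included, without the re.DOTALL flag. A minimal illustrative sketch, not part of this file, with an invented sample string:

import re

entry = "<div>line one\nline two</div>"
# '.' does not cross newlines by default, so the first pattern finds nothing;
# the class [\s\S] matches any character, newline included, without re.DOTALL.
print(re.findall(r"<div>(.*)</div>", entry))       # []
print(re.findall(r"<div>([\s\S]*)</div>", entry))  # ['line one\nline two']
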
@ -367,6 +367,9 @@ def parser_html(year, expedition, txt, seq=""):
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
def parser_html_01(year, expedition, txt, seq=""):
'''This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
'''
global logentries
global logdataissues
errorcount = 0
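
The same docstring note applies here; another of the less common constructs this file relies on is the non-greedy *? quantifier, which stops at the earliest possible end rather than the latest. A short illustrative sketch, not part of this file:

import re

txt = "<b>one</b> and <b>two</b>"
# Greedy .* runs on to the last </b>; non-greedy .*? stops at the first one.
print(re.findall(r"<b>(.*)</b>", txt))   # ['one</b> and <b>two']
print(re.findall(r"<b>(.*?)</b>", txt))  # ['one', 'two']
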
@ -488,6 +491,12 @@ def parser_html_01(year, expedition, txt, seq=""):
def parser_blog(year, expedition, txt, sq=""):
'''Parses the format of web pages collected as "Save As HTML" from the UK Caving blog website.
Note that the entries have dates and authors, but no titles.
See detailed explanation of the complete process:
https://expo.survex.com/handbook/computing/logbooks-parsing.html
https://expo.survex.com/handbook/computing/log-blog-parsing.html
This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
'''
global logentries
global logdataissues
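
A minimal illustrative sketch, not part of this file, of the chunking step the docstring describes, using the article pattern that appears in the hunk below on an invented page fragment (the real function also parses a date and author from each entry's header):

import re

# Invented sample of a 'Save As HTML' blog page; only the body extraction is shown.
page = (
    '<article class="message-body js-selectToQuote"> Day trip to 204. </article>'
    '<article class="message-body js-selectToQuote"> Rigging trip. </article>'
)
tripparas = re.findall(
    r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", page
)
for i, body in enumerate(tripparas, 1):
    print(i, body.strip())  # prints '1 Day trip to 204.' then '2 Rigging trip.'
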
@ -498,6 +507,7 @@ def parser_blog(year, expedition, txt, sq=""):
message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
print(message)
# (?= is a non-consuming match, see https://docs.python.org/3/library/re.html
tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
if not ( tripparas ) :
message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
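
To make the comment about (?= concrete: the lookahead asserts that the closing tag follows but consumes nothing, so the match ends just before it. A short illustrative sketch, not part of this file, with invented sample text:

import re

text = "<p>entry body</p>"
m = re.search(r"<p>([\s\S]*?)(?=</p)", text)
print(m.group(1))      # 'entry body'
print(text[m.end():])  # '</p>' -- none of the closing tag was consumed
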
@ -554,6 +564,7 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = re.sub(r"height=\"\d+\"","",tripcontent)
tripcontent = re.sub(r"width: \d+px","",tripcontent)
tripcontent = re.sub(r"\n\n+","\n\n",tripcontent)
tripcontent = re.sub(r"<hr\s*>","",tripcontent)
tripcontent = f"\n\nBlog Author: {trippeople}" + tripcontent
entrytuple = (tripdate, location, tripname, tripcontent,