forked from expo/troggle
Vital fix to stop parsing terminating too early
@@ -1,8 +1,6 @@
 import csv
 import os
 import re
-# import pickle
-# import shelve
 import time
 from random import randint
 from datetime import datetime, date
@@ -19,9 +17,8 @@ from parsers.people import GetPersonExpeditionNameLookup
 
 '''
 Parses and imports logbooks in all their wonderful confusion
-# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
-# it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc)
+See detailed explanation of the complete process:
+https://expo.survex.com/handbook/computing/logbooks-parsing.html
 '''
 todo='''
 - refactor everything with some urgency, esp. LoadLogbookForExpedition()
@@ -292,6 +289,9 @@ def ParseDate(tripdate, year):
 
 # 2002, 2004 - now
 def parser_html(year, expedition, txt, seq=""):
+'''This uses some of the more obscure capabilities of regular expressions,
+see https://docs.python.org/3/library/re.html
+'''
 global logentries
 global logdataissues
 
@@ -367,6 +367,9 @@ def parser_html(year, expedition, txt, seq=""):
 # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
 # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
 def parser_html_01(year, expedition, txt, seq=""):
+'''This uses some of the more obscure capabilities of regular expressions,
+see https://docs.python.org/3/library/re.html
+'''
 global logentries
 global logdataissues
 errorcount = 0
@@ -488,6 +491,12 @@ def parser_html_01(year, expedition, txt, seq=""):
 def parser_blog(year, expedition, txt, sq=""):
 '''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
 Note that the entries have dates and authors, but no titles.
+See detailed explanation of the complete process:
+https://expo.survex.com/handbook/computing/logbooks-parsing.html
+https://expo.survex.com/handbook/computing/log-blog-parsing.html
+
+This uses some of the more obscure capabilities of regular expressions,
+see https://docs.python.org/3/library/re.html
 '''
 global logentries
 global logdataissues
@@ -498,6 +507,7 @@ def parser_blog(year, expedition, txt, sq=""):
 message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
 print(message)
 
+# (?= is a non-consuming match, see https://docs.python.org/3/library/re.html
 tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
 if not ( tripparas ) :
 message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
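
The comment added in this hunk documents the lookahead assertion already used in the findall pattern above. As an illustration only (the HTML fragment below is a made-up stand-in for a "Save As HTML" blog page, not real UK Caving content), this sketch shows how the non-consuming (?=</article) stops each non-greedy capture at the article boundary, so re.findall returns one body per article instead of one over-long match:

import re

sample_html = '''
<article class="message-body js-selectToQuote">
First trip report body.
</article>
<article class="message-body js-selectToQuote">
Second trip report body.
</article>
'''

# Same pattern as in parser_blog: the lookahead (?=</article) checks for the closing
# tag without consuming it, so matching can resume at the next <article ...> block.
bodies = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", sample_html)
print(bodies)  # ['First trip report body.\n', 'Second trip report body.\n']
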
@@ -554,6 +564,7 @@ def parser_blog(year, expedition, txt, sq=""):
 tripcontent = re.sub(r"height=\"\d+\"","",tripcontent)
 tripcontent = re.sub(r"width: \d+px","",tripcontent)
 tripcontent = re.sub(r"\n\n+","\n\n",tripcontent)
+tripcontent = re.sub(r"<hr\s*>","",tripcontent)
 tripcontent = f"\n\nBlog Author: {trippeople}" + tripcontent
 
 entrytuple = (tripdate, location, tripname, tripcontent,
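
The substitution chain in this hunk, including the newly added <hr> stripping, is easy to exercise on its own. A minimal sketch with an invented input string (not real blog content), just to show what each re.sub removes:

import re

# Hypothetical scrap of saved blog HTML, for demonstration only.
tripcontent = 'Went to top camp.<hr ><img height="480" style="width: 640px">\n\n\n\nCame back.'

tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)  # drop fixed image heights
tripcontent = re.sub(r"width: \d+px", "", tripcontent)    # drop inline pixel widths
tripcontent = re.sub(r"\n\n+", "\n\n", tripcontent)       # collapse runs of blank lines
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)         # the new step: strip horizontal rules

print(repr(tripcontent))  # 'Went to top camp.<img  style="">\n\nCame back.'
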