Vital fix to stop parsing from terminating too early

Philip Sargent 2022-12-19 11:38:34 +00:00
parent 43a98b4421
commit 7e9bb73777


@@ -1,8 +1,6 @@
import csv
import os
import re
# import pickle
# import shelve
import time
from random import randint
from datetime import datetime, date
@@ -19,9 +17,8 @@ from parsers.people import GetPersonExpeditionNameLookup
'''
Parses and imports logbooks in all their wonderful confusion
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc.)
See detailed explanation of the complete process:
https://expo.survex.com/handbook/computing/logbooks-parsing.html
'''
todo='''
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
@@ -292,6 +289,9 @@ def ParseDate(tripdate, year):
# 2002, 2004 - now
def parser_html(year, expedition, txt, seq=""):
'''This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
'''
global logentries
global logdataissues
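
For readers who have not met the re features this docstring alludes to, here is a minimal sketch, assuming invented <div class="tripdate"> markup (the real 2002-onwards logbook HTML differs): a non-greedy capture ended by a non-consuming lookahead, so each match stops at the start of the next entry instead of swallowing the rest of the file.

import re

# Invented markup for illustration only; not the real logbook format.
page = '''
<div class="tripdate" id="t2002-07-15a">2002-07-15</div>
<div class="trippeople">Alice, Bob</div>
<div class="triptitle">Surveying in 204</div>
<div class="tripdate" id="t2002-07-16b">2002-07-16</div>
<div class="trippeople">Carol</div>
<div class="triptitle">Rigging the entrance pitch</div>
'''

# Non-greedy ([\s\S]*?) plus the lookahead (?=...) ends each match at the
# next tripdate div (or end of text) rather than at the last one in the file.
entries = re.findall(
    r'<div class="tripdate" id="(.*?)">(.*?)</div>([\s\S]*?)(?=<div class="tripdate"|\Z)',
    page,
)
for trip_id, tripdate, body in entries:
    print(trip_id, tripdate, len(body))

A greedy ([\s\S]*) would instead return a single match spanning every entry, collapsing the per-trip split.
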
@@ -367,6 +367,9 @@ def parser_html(year, expedition, txt, seq=""):
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
def parser_html_01(year, expedition, txt, seq=""):
'''This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
'''
global logentries
global logdataissues
errorcount = 0
@@ -488,6 +491,12 @@ def parser_html_01(year, expedition, txt, seq=""):
def parser_blog(year, expedition, txt, sq=""):
'''Parses the format of web pages collected as 'Save As HTML' from the UK Caving blog website.
Note that the entries have dates and authors, but no titles.
See detailed explanation of the complete process:
https://expo.survex.com/handbook/computing/logbooks-parsing.html
https://expo.survex.com/handbook/computing/log-blog-parsing.html
This uses some of the more obscure capabilities of regular expressions,
see https://docs.python.org/3/library/re.html
'''
global logentries
global logdataissues
@@ -498,6 +507,7 @@ def parser_blog(year, expedition, txt, sq=""):
message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
print(message)
# (?= is a non-consuming match, see https://docs.python.org/3/library/re.html
tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
if not ( tripparas ) :
message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
@@ -554,6 +564,7 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = re.sub(r"height=\"\d+\"","",tripcontent)
tripcontent = re.sub(r"width: \d+px","",tripcontent)
tripcontent = re.sub(r"\n\n+","\n\n",tripcontent)
tripcontent = re.sub(r"<hr\s*>","",tripcontent)
tripcontent = f"\n\nBlog Author: {trippeople}" + tripcontent
entrytuple = (tripdate, location, tripname, tripcontent,
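
As a closing illustration, a minimal sketch of the clean-up chain shown above, including the <hr\s*> substitution, which appears to be the one-line addition in this hunk; trippeople and the HTML fragment are placeholders invented for the example.

import re

trippeople = "Anathema Device"  # placeholder author, not real data
tripcontent = (
    '<img src="cave.jpg" height="480">\n\n\n\n'
    '<div style="width: 640px">Rigged the pitch.</div>\n'
    '<hr >\nDerigged on the way out.\n'
)

# Strip image sizing, squeeze runs of blank lines, drop horizontal rules,
# then prefix the author line as the parser does before storing the entry.
tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
tripcontent = re.sub(r"width: \d+px", "", tripcontent)
tripcontent = re.sub(r"\n\n+", "\n\n", tripcontent)
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
tripcontent = f"\n\nBlog Author: {trippeople}" + tripcontent
print(tripcontent)
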