From 7e9bb737771bb031d7db7864a5267b75da8e08c0 Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Mon, 19 Dec 2022 11:38:34 +0000 Subject: [PATCH] Vital fix to stop parsing terminating too early --- parsers/logbooks.py | 21 ++++++++++++++++----- 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/parsers/logbooks.py b/parsers/logbooks.py index aa4ec92..e37780c 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -1,8 +1,6 @@ import csv import os import re -# import pickle -# import shelve import time from random import randint from datetime import datetime, date @@ -19,9 +17,8 @@ from parsers.people import GetPersonExpeditionNameLookup ''' Parses and imports logbooks in all their wonderful confusion - -# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and -# it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc) + See detailed explanation of the complete process: + https://expo.survex.com/handbook/computing/logbooks-parsing.html ''' todo=''' - refactor everything with some urgency, esp. LoadLogbookForExpedition() @@ -292,6 +289,9 @@ def ParseDate(tripdate, year): # 2002, 2004 - now def parser_html(year, expedition, txt, seq=""): + '''This uses some of the more obscure capabilities of regular expressions, + see https://docs.python.org/3/library/re.html + ''' global logentries global logdataissues @@ -367,6 +367,9 @@ def parser_html(year, expedition, txt, seq=""): # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place def parser_html_01(year, expedition, txt, seq=""): + '''This uses some of the more obscure capabilities of regular expressions, + see https://docs.python.org/3/library/re.html + ''' global logentries global logdataissues errorcount = 0 @@ -488,6 +491,12 @@ def parser_html_01(year, expedition, txt, seq=""): def parser_blog(year, expedition, txt, sq=""): '''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website. Note that the entries have dates and authors, but no titles. + See detailed explanation of the complete process: + https://expo.survex.com/handbook/computing/logbooks-parsing.html + https://expo.survex.com/handbook/computing/log-blog-parsing.html + + This uses some of the more obscure capabilities of regular expressions, + see https://docs.python.org/3/library/re.html ''' global logentries global logdataissues @@ -498,6 +507,7 @@ def parser_blog(year, expedition, txt, sq=""): message = f" ! - Skipping on failure to parse article header: {txt[:500]}" print(message) + # (?= is a non-consuming match, see https://docs.python.org/3/library/re.html tripparas = re.findall(r"
\s*([\s\S]*?)(?=","",tripcontent) tripcontent = f"\n\nBlog Author: {trippeople}" + tripcontent entrytuple = (tripdate, location, tripname, tripcontent,