forked from expo/troggle
Vital fix to stop parsing terminating too early
This commit is contained in:
parent
43a98b4421
commit
7e9bb73777
@ -1,8 +1,6 @@
|
||||
import csv
|
||||
import os
|
||||
import re
|
||||
# import pickle
|
||||
# import shelve
|
||||
import time
|
||||
from random import randint
|
||||
from datetime import datetime, date
|
||||
@ -19,9 +17,8 @@ from parsers.people import GetPersonExpeditionNameLookup
|
||||
|
||||
'''
|
||||
Parses and imports logbooks in all their wonderful confusion
|
||||
|
||||
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
|
||||
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc)
|
||||
See detailed explanation of the complete process:
|
||||
https://expo.survex.com/handbook/computing/logbooks-parsing.html
|
||||
'''
|
||||
todo='''
|
||||
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
|
||||
@ -292,6 +289,9 @@ def ParseDate(tripdate, year):
|
||||
|
||||
# 2002, 2004 - now
|
||||
def parser_html(year, expedition, txt, seq=""):
|
||||
'''This uses some of the more obscure capabilities of regular expressions,
|
||||
see https://docs.python.org/3/library/re.html
|
||||
'''
|
||||
global logentries
|
||||
global logdataissues
|
||||
|
||||
@ -367,6 +367,9 @@ def parser_html(year, expedition, txt, seq=""):
|
||||
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
|
||||
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
|
||||
def parser_html_01(year, expedition, txt, seq=""):
|
||||
'''This uses some of the more obscure capabilities of regular expressions,
|
||||
see https://docs.python.org/3/library/re.html
|
||||
'''
|
||||
global logentries
|
||||
global logdataissues
|
||||
errorcount = 0
|
||||
@ -488,6 +491,12 @@ def parser_html_01(year, expedition, txt, seq=""):
|
||||
def parser_blog(year, expedition, txt, sq=""):
|
||||
'''Parses the format of web pages collected as "Save As HTML" from the UK Caving blog website.
|
||||
Note that the entries have dates and authors, but no titles.
|
||||
See detailed explanation of the complete process:
|
||||
https://expo.survex.com/handbook/computing/logbooks-parsing.html
|
||||
https://expo.survex.com/handbook/computing/log-blog-parsing.html
|
||||
|
||||
This uses some of the more obscure capabilities of regular expressions,
|
||||
see https://docs.python.org/3/library/re.html
|
||||
'''
|
||||
global logentries
|
||||
global logdataissues
|
||||
@ -498,6 +507,7 @@ def parser_blog(year, expedition, txt, sq=""):
|
||||
message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
|
||||
print(message)
|
||||
|
||||
# (?=...) is a lookahead assertion (a non-consuming match), see https://docs.python.org/3/library/re.html
|
||||
tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
|
||||
if not ( tripparas ) :
|
||||
message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
|
||||
@ -554,6 +564,7 @@ def parser_blog(year, expedition, txt, sq=""):
|
||||
tripcontent = re.sub(r"height=\"\d+\"","",tripcontent)
|
||||
tripcontent = re.sub(r"width: \d+px","",tripcontent)
|
||||
tripcontent = re.sub(r"\n\n+","\n\n",tripcontent)
|
||||
tripcontent = re.sub(r"<hr\s*>","",tripcontent)
|
||||
tripcontent = f"\n\nBlog Author: {trippeople}" + tripcontent
|
||||
|
||||
entrytuple = (tripdate, location, tripname, tripcontent,
|
||||
|
Loading…
Reference in New Issue
Block a user