forked from expo/troggle
Vital fix to stop parsing terminating too early
@@ -1,8 +1,6 @@
 import csv
 import os
 import re
-# import pickle
-# import shelve
 import time
 from random import randint
 from datetime import datetime, date
@@ -19,9 +17,8 @@ from parsers.people import GetPersonExpeditionNameLookup
 
 '''
 Parses and imports logbooks in all their wonderful confusion
-# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
-# it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc)
+See detailed explanation of the complete process:
+https://expo.survex.com/handbook/computing/logbooks-parsing.html
 '''
 todo='''
 - refactor everything with some urgency, esp. LoadLogbookForExpedition()
@@ -292,6 +289,9 @@ def ParseDate(tripdate, year):
 
 # 2002, 2004 - now
 def parser_html(year, expedition, txt, seq=""):
+'''This uses some of the more obscure capabilities of regular expressions,
+see https://docs.python.org/3/library/re.html
+'''
 global logentries
 global logdataissues
 
@@ -367,6 +367,9 @@ def parser_html(year, expedition, txt, seq=""):
 # main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
 # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
 def parser_html_01(year, expedition, txt, seq=""):
+'''This uses some of the more obscure capabilities of regular expressions,
+see https://docs.python.org/3/library/re.html
+'''
 global logentries
 global logdataissues
 errorcount = 0
@@ -488,6 +491,12 @@ def parser_html_01(year, expedition, txt, seq=""):
 def parser_blog(year, expedition, txt, sq=""):
 '''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
 Note that the entries have dates and authors, but no titles.
+See detailed explanation of the complete process:
+https://expo.survex.com/handbook/computing/logbooks-parsing.html
+https://expo.survex.com/handbook/computing/log-blog-parsing.html
+
+This uses some of the more obscure capabilities of regular expressions,
+see https://docs.python.org/3/library/re.html
 '''
 global logentries
 global logdataissues
@@ -498,6 +507,7 @@ def parser_blog(year, expedition, txt, sq=""):
 message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
 print(message)
 
+# (?= is a non-consuming match, see https://docs.python.org/3/library/re.html
 tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
 if not ( tripparas ) :
 message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
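
The comment added in this hunk documents the lookahead assertion already used in the findall pattern above. As an illustration only (the HTML fragment below is a made-up stand-in for a "Save As HTML" blog page, not real UK Caving content), this sketch shows how the non-consuming (?=</article) stops each non-greedy capture at the article boundary, so re.findall returns one body per article instead of one over-long match:

import re

sample_html = '''
<article class="message-body js-selectToQuote">
First trip report body.
</article>
<article class="message-body js-selectToQuote">
Second trip report body.
</article>
'''

# Same pattern as in parser_blog: the lookahead (?=</article) checks for the closing
# tag without consuming it, so matching can resume at the next <article ...> block.
bodies = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", sample_html)
print(bodies)  # ['First trip report body.\n', 'Second trip report body.\n']
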
@@ -554,6 +564,7 @@ def parser_blog(year, expedition, txt, sq=""):
 tripcontent = re.sub(r"height=\"\d+\"","",tripcontent)
 tripcontent = re.sub(r"width: \d+px","",tripcontent)
 tripcontent = re.sub(r"\n\n+","\n\n",tripcontent)
+tripcontent = re.sub(r"<hr\s*>","",tripcontent)
 tripcontent = f"\n\nBlog Author: {trippeople}" + tripcontent
 
 entrytuple = (tripdate, location, tripname, tripcontent,
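
The substitution chain in this hunk, including the newly added <hr> stripping, is easy to exercise on its own. A minimal sketch with an invented input string (not real blog content), just to show what each re.sub removes:

import re

# Hypothetical scrap of saved blog HTML, for demonstration only.
tripcontent = 'Went to top camp.<hr ><img height="480" style="width: 640px">\n\n\n\nCame back.'

tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)  # drop fixed image heights
tripcontent = re.sub(r"width: \d+px", "", tripcontent)    # drop inline pixel widths
tripcontent = re.sub(r"\n\n+", "\n\n", tripcontent)       # collapse runs of blank lines
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)         # the new step: strip horizontal rules

print(repr(tripcontent))  # 'Went to top camp.<img  style="">\n\nCame back.'
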