forked from expo/troggle
Updates to make 2018 blog merge work (faster)
This commit is contained in:
@@ -51,7 +51,7 @@ MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
|
||||
BLOG_PARSER_SETTINGS = {
|
||||
# "2022": ("ukcavingblog.html", "parser_blog"),
|
||||
"2019": ("ukcavingblog.html", "parser_blog"),
|
||||
"2018": ("ukcavingblog.html", "parser_blog"),
|
||||
# "2018": ("ukcavingblog.html", "parser_blog"),
|
||||
# "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
||||
}
|
||||
DEFAULT_LOGBOOK_FILE = "logbook.html"
|
||||
@@ -83,7 +83,7 @@ LOGBOOK_PARSER_SETTINGS = {
|
||||
"1982": ("log.htm", "parser_html_01"),
|
||||
}
|
||||
|
||||
entries = { "2022": 86, "2019": 56, "2018": 86, "2017": 76, "2016": 83, "2015": 79,
|
||||
entries = { "2022": 86, "2019": 56, "2018": 100, "2017": 76, "2016": 83, "2015": 79,
|
||||
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53,
|
||||
"2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
|
||||
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
|
||||
@@ -138,6 +138,9 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
|
||||
tripperson = "Nadia"
|
||||
if tripperson =="tcacrossley":
|
||||
tripperson = "Tom Crossley"
|
||||
if tripperson =="Samouse1":
|
||||
tripperson = "Todd Rye"
|
||||
|
||||
|
||||
|
||||
personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
|
||||
@@ -497,6 +500,13 @@ def parser_blog(year, expedition, txt, sq=""):
|
||||
|
||||
This uses some of the more obscure capabilities of regular expressions,
|
||||
see https://docs.python.org/3/library/re.html
|
||||
|
||||
BLOG entries have this structure:
|
||||
<article ... data-author="Tinywoman" data-content="post-298780" id="js-post-298780">
|
||||
<article class="message-body js-selectToQuote">
|
||||
</article>
|
||||
</article>
|
||||
So the content is nested inside the header. Attachments (images) come after the content.
|
||||
'''
|
||||
global logentries
|
||||
global logdataissues
|
||||
@@ -508,19 +518,26 @@ def parser_blog(year, expedition, txt, sq=""):
|
||||
print(message)
|
||||
|
||||
# (?=...) is a lookahead assertion: it matches without consuming any text, see https://docs.python.org/3/library/re.html
|
||||
tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
|
||||
tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(</article[^>]*>)([\s\S]*?)(?=</article)", txt)
|
||||
if not ( tripparas ) :
|
||||
message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
|
||||
print(message)
|
||||
|
||||
if (len(tripheads) !=len(tripparas)):
|
||||
print(f"{len(tripheads)} != {len(tripparas)}")
|
||||
print(f"{len(tripheads)} - {len(tripparas)}")
|
||||
|
||||
location = "Plateau" # best guess, fix manually later
|
||||
tu = 0
|
||||
logbook_entry_count = 0
|
||||
for i in range(0, len(tripparas)):
|
||||
tripcontent = tripparas[i]
|
||||
tripstuff = tripparas[i]
|
||||
attach = tripstuff[2]
|
||||
# note the use of the non-greedy *? regex idiom here
|
||||
attach = re.sub(r"<div class=\"file-content\">[\s\S]*?(?=</li>)","",attach)
|
||||
attach = re.sub(r"<footer[\s\S]*(</footer>)","",attach)
|
||||
tripcontent = tripstuff[0] + attach
|
||||
#print(f"{i} - {len(tripstuff)} - {tripstuff[1]}")
|
||||
triphead = tripheads[i]
|
||||
logbook_entry_count += 1
|
||||
tid = set_trip_id(year,logbook_entry_count) +"_blog" + sq
|
||||
@@ -684,8 +701,15 @@ def LoadLogbook(year):
|
||||
nlbe={}
|
||||
TROG['pagecache']['expedition'][year] = None # clear cache
|
||||
|
||||
expo = Expedition.objects.get(year=year)
|
||||
expo = Expedition.objects.get(year=year)
|
||||
year = expo.year # some type funny
|
||||
nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
|
||||
if year in BLOG_PARSER_SETTINGS:
|
||||
print("BLOG parsing")
|
||||
LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year]
|
||||
nlbe[expo] = LoadLogbookForExpedition(expo, clean=False) # this loads the blog logbook for one expo
|
||||
else:
|
||||
print(f" {year} not in {BLOG_PARSER_SETTINGS}")
|
||||
|
||||
def LoadLogbooks():
|
||||
""" This is the master function for parsing all logbooks into the Troggle database.
|
||||
|
||||
Reference in New Issue
Block a user