troggle-unchained/parsers/logbooks.py

783 lines
33 KiB
Python
Raw Normal View History

2011-07-11 02:10:22 +01:00
import csv
import os
2020-05-28 02:20:50 +01:00
import re
2022-11-23 00:36:44 +00:00
# import pickle
# import shelve
import time
from random import randint
from datetime import datetime, date
from pathlib import Path
2011-07-11 02:10:22 +01:00
2020-05-28 02:20:50 +01:00
from django.conf import settings
from django.template.defaultfilters import slugify
from django.utils.timezone import get_current_timezone, make_aware
2021-04-13 00:43:57 +01:00
from troggle.core.models.troggle import DataIssue, Expedition
2021-04-13 00:11:08 +01:00
from troggle.core.utils import TROG, save_carefully
2021-04-13 00:47:17 +01:00
from troggle.core.models.caves import Cave, LogbookEntry, PersonTrip, GetCaveLookup
2020-05-28 02:20:50 +01:00
from parsers.people import GetPersonExpeditionNameLookup
2011-07-11 02:10:22 +01:00
2021-04-13 01:37:42 +01:00
'''
Parses and imports logbooks in all their wonderful confusion
2011-07-11 02:10:22 +01:00
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc)
2021-04-13 01:37:42 +01:00
'''
# Developer to-do notes, kept as a module-level string rather than as comments.
# The name is only assigned here; no code visible in this module reads it.
# NOTE(review): the bare date/time lines inside the string look like
# version-control annotations picked up when this copy of the file was made.
# They are part of the string literal, so they are left untouched here -
# confirm against the repository and remove them there if they are not wanted.
todo='''
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
- remove the TROG and lbo things since we need the database for multiuser access? Or not?
2022-08-30 15:58:49 +01:00
- profile the code to find bad repetitive things, of which there are many.
- far too many uses of Django field dereferencing to get values, which is SLOW
2022-03-24 01:05:50 +00:00
2022-08-30 15:58:49 +01:00
- Logbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
2021-11-05 21:01:10 +00:00
- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
2022-08-30 15:58:49 +01:00
we keep only a modern HTML05 format. Then we can retire the old parsers and reduce the
volume of code here substantially.
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
- We should ensure logbook.html is utf-8 and stop this crap:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
- this is a slow and uncertain function: cave = getCaveByReference(caveRef)
2022-08-30 15:58:49 +01:00
2022-12-07 18:22:09 +00:00
- use Fixtures https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-loaddata to cache
data for old logbooks. New design needed, with a mechanism for flagging fixtures as outdated after edits.
'''
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200

# year -> (file name, name of the parser function) for the saved blog pages.
# LoadLogbooks() temporarily swaps these into LOGBOOK_PARSER_SETTINGS.
BLOG_PARSER_SETTINGS = {
    # "2017": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
    "2019": ("ukcavingblog.html", "parser_blog"),
    "2022": ("ukcavingblog.html", "parser_blog"),
}

# Used for any year that has no entry of its own in LOGBOOK_PARSER_SETTINGS.
DEFAULT_LOGBOOK_FILE = "logbook.html"
DEFAULT_LOGBOOK_PARSER = "parser_html"
# year -> (logbook file name, name of the parser function in this module).
# All years since 2010 use the default value for Logbook parser
# but several don't work, and are skipped by the parsing code, e.g. 1983
# Each year appears exactly once: a repeated key in a dict literal is silently
# overwritten by the later occurrence, which hides editing mistakes.
LOGBOOK_PARSER_SETTINGS = {
    "2019": ("logbook.html", "parser_html"),
    "2010": ("logbook.html", "parser_html"),
    "2009": ("2009logbook.txt", "wiki_parser"),
    "2008": ("2008logbook.txt", "wiki_parser"),
    "2007": ("logbook.html", "parser_html"),
    # "2006": ("logbook/logbook_06.txt", "wiki_parser"),
    "2006": ("logbook.html", "parser_html"),
    "2005": ("logbook.html", "parser_html"),
    "2004": ("logbook.html", "parser_html"),
    "2003": ("logbook.html", "parser_html"),
    "2002": ("logbook.html", "parser_html"),
    "2001": ("log.htm", "parser_html_01"),
    "2000": ("log.htm", "parser_html_01"),
    "1999": ("log.htm", "parser_html_01"),
    "1998": ("log.htm", "parser_html_01"),
    "1997": ("log.htm", "parser_html_01"),
    "1996": ("log.htm", "parser_html_01"),
    "1995": ("log.htm", "parser_html_01"),
    "1994": ("log.htm", "parser_html_01"),
    "1993": ("log.htm", "parser_html_01"),
    "1992": ("log.htm", "parser_html_01"),
    "1991": ("log.htm", "parser_html_01"),
    "1990": ("log.htm", "parser_html_01"),
    "1989": ("log.htm", "parser_html_01"),  # crashes MySQL
    "1988": ("log.htm", "parser_html_01"),  # crashes MySQL
    "1987": ("log.htm", "parser_html_01"),  # crashes MySQL
    "1985": ("log.htm", "parser_html_01"),
    "1984": ("log.htm", "parser_html_01"),
    "1983": ("log.htm", "parser_html_01"),
    "1982": ("log.htm", "parser_html_01"),
}
# year -> number of logbook entries expected for that year.
# LoadLogbookForExpedition() compares the parsed count against this and prints
# a "Mismatch" line when they differ.
entries = {
    "2022": 64, "2019": 56, "2018": 75, "2017": 76, "2016": 81, "2015": 79,
    "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22,
    "2009": 52, "2008": 49, "2007": 111, "2006": 60, "2005": 55,
    "2004": 76, "2003": 42, "2002": 31, "2001": 48, "2000": 54,
    "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42,
    "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87,
    "1989": 1, "1988": 1, "1987": 1,
    "1985": 24, "1984": 32, "1983": 52, "1982": 42,
}
2022-03-24 01:05:50 +00:00
# Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.

# The entire logbook for one year is a single object: a list of entry tuples.
# The parser functions append to it; LoadLogbookForExpedition() resets it.
logentries = []

# Place names for which EnterLogIntoDbase() does not attempt a cave lookup.
noncaveplaces = [
    "QMplaceholder",
    "Journey",
    "Loser Plateau",
    "UNKNOWN",
    'plateau',
    'base camp',
    'basecamp',
    'top camp',
    'topcamp',
]

# Shared dict of parse problems: key -> message.
logdataissues = TROG['issues']['logdataissues']

trips = {}
2011-07-11 02:10:22 +01:00
#
# the logbook loading section
#
def set_trip_id(year, seq):
    """Return the trip id '<year>_s<NN>' for entry number ``seq`` of ``year``.

    ``seq`` must be an integer; it is zero-padded to at least two digits.
    """
    return "{}_s{:02d}".format(year, seq)
# A name wrapped in <u>...</u> marks the author of the entry.
rx_tripperson = re.compile(r'(?i)<u>(.*?)</u>$')
# Anything in round or square brackets is stripped from a name before lookup.
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")


def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
    """Turn the raw 'people' text of a logbook entry into person records.

    The text is split on commas, '+', '&amp;', a bare '&' and ' and '. Names
    that are empty or start with '*' are ignored. Bracketed text is removed,
    a few known nicknames are expanded, and the lower-cased result is looked
    up with GetPersonExpeditionNameLookup(expedition).

    Returns:
        (res, author), where res is a list of (personexpedition,
        logtime_underground) tuples. author is the person whose name was
        underlined, or the last entry of res when nobody was underlined.
        When no usable name is found at all the return value is ("", 0).

    NOTE(review): a name with no match is recorded as a DataIssue but is still
    appended to the result with None as the person, so the fallback author can
    be None as well - confirm that this is what the callers expect.
    """
    nicknames = {
        "Wiggy": "Phil Wigglesworth",
        "Animal": "Mike Richardson",
        "MikeTA": "Mike Richardson",
        "cavingpig": "Elaine Oliver",
        "nobrotson": "Rob Watson",
    }
    res = []
    author = None

    for chunk in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
        tripperson = chunk.strip()
        mul = rx_tripperson.match(tripperson)
        if mul:
            tripperson = mul.group(1).strip()

        # skip blanks and names flagged with a leading '*'
        if not tripperson or tripperson[0] == '*':
            continue

        tripperson = rx_round_bracket.sub("", tripperson).strip()
        tripperson = nicknames.get(tripperson, tripperson)

        personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
        if not personyear:
            message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this expedition year."
            print(message)
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid] = message

        res.append((personyear, logtime_underground))
        if mul:
            author = personyear

    if author:
        return res, author
    if not res:
        return "", 0
    # nobody was underlined: fall back to the last person listed
    return res, res[-1][0]
2022-08-30 15:58:49 +01:00
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
    """Save one logbook entry and the PersonTrip rows for the people on it.

    Does NOT save the expeditionday_id - all NULLs. Why? Because we are
    deprecating expeditionday!

    troggle.log shows that we are creating lots of duplicates, which is no
    problem with SQL as they just overwrite, but we are saving the same thing
    too many times.

    Args:
        date: date of the trip; its string form supplies the year used in image paths.
        place: place name; looked up as a cave unless it is one of ``noncaveplaces``.
        title: entry title; together with ``date`` it is the lookup key of the entry.
        text: entry body; ``src`` attributes are re-rooted under ``/years/<year>/``.
        trippeople: raw people text, handed to GetTripPersons().
        expedition: the expedition the entry belongs to.
        logtime_underground: value stored as time_underground on every PersonTrip.
        tid: trip id used as the slug; when None a random slug is generated.

    Raises:
        Any exception raised by GetTripPersons(), after recording a DataIssue.
    """
    try:
        trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
    except Exception:
        message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues["title"] = message
        print(message)
        raise

    if not author:
        message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues["title"] = message
        print(message)
        # deliberately no return: the entry is still saved without an author

    # This needs attention. The slug field is derived from 'title'
    # both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
    lplace = place.lower()
    cave = None
    # The place is lower-cased, so the comparison list must be lower-cased as
    # well, otherwise "UNKNOWN", "Journey" etc. can never match.
    if lplace not in {name.lower() for name in noncaveplaces}:
        cave = GetCaveLookup().get(lplace)

    # Re-root image links under the year's directory, then undo any doubling
    # for links that already carried the prefix.
    y = str(date)[:4]
    text = text.replace(' src="', f' src="/years/{y}/')
    text = text.replace(" src='", f" src='/years/{y}/")
    text = text.replace(f' src="/years/{y}//years/{y}/', f' src="/years/{y}/')
    text = text.replace(f" src='/years/{y}//years/{y}/", f" src='/years/{y}/")
    text = text.replace('\t', '')
    text = text.replace('\n\n\n', '\n\n')

    # The result is not used. The call is kept because it cannot be seen from
    # here whether get_expedition_day() has side effects.
    expedition.get_expedition_day(date)

    # Check for an existing copy of the current entry, and save
    lookupAttribs = {'date': date, 'title': title}
    # 'cave' is converted to a string doing this, which renders as the cave slug.
    # but it is a db query which we should try to avoid - rewrite this

    # NEW slug for a logbook entry here! Unique id + slugified title fragment
    if tid is not None:
        slug = tid
    else:
        slug = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace('-', '_')
    nonLookupAttribs = {'place': place, 'text': text, 'expedition': expedition, 'cave_slug': str(cave), 'slug': slug}

    # This creates the lbo instance of LogbookEntry
    lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)

    for tripperson, time_underground in trippersons:
        lookupAttribs = {'personexpedition': tripperson, 'logbook_entry': lbo}
        nonLookupAttribs = {'time_underground': time_underground, 'is_logbook_entry_author': (tripperson == author)}
        # this creates the PersonTrip instance.
        save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
2011-07-11 02:10:22 +01:00
def ParseDate(tripdate, year):
    """Interpret a date from an expo logbook and return a datetime.date.

    Two layouts are understood: ISO 'YYYY-MM-DD', and 'D/M/YY' or 'D/M/YYYY'
    where day and month may have one or two digits.

    Args:
        tripdate: the date text from the logbook.
        year: the expedition year as a four-character string.

    Returns:
        The parsed date. When the text cannot be understood, or disagrees with
        ``year``, a DataIssue is recorded and date(1970, 1, 1) is returned.
    """
    dummydate = date(1970, 1, 1)
    month = 1
    day = 1
    try:
        mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
        # The month accepts two digits so that October to December also parse.
        mdategoof = re.match(r"(\d\d?)/(\d\d?)/(20|19)?(\d\d)", tripdate)
        if mdatestandard:
            if mdatestandard.group(1) != year:
                message = f" ! - Bad date (year) in logbook: {tripdate} - {year}"
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues["tripdate"] = message
                return dummydate
            year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
        elif mdategoof:
            century = mdategoof.group(3)
            if century and century != year[:2]:
                message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues["tripdate"] = message
                return dummydate
            # a two-digit year takes its century from the expedition year
            yadd = int(year[:2]) * 100
            day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
        else:
            year = 1970
            message = f" ! - Bad date in logbook: {tripdate} - {year}"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues["tripdate"] = message

        return date(year, month, day)
    except Exception:
        message = f" ! - Failed to parse date in logbook: {tripdate} - {year}"
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues["tripdate"] = message
        # 'datetime' here is the class, not the module, so datetime.date(1970, 1, 1)
        # would itself raise TypeError; return the fallback date directly.
        return dummydate
2011-07-11 02:10:22 +01:00
2020-07-07 19:07:45 +01:00
# (2006 - not any more), 2008 - 2009
def wiki_parser(year, expedition, txt, seq=""):
    """Parse a wiki-format logbook and append its entries to ``logentries``.

    Each entry starts with a '===date|place|people===' header; the text up to
    the next '===' is the body. An entry whose header does not have exactly
    three '|'-separated fields is recorded as a DataIssue and skipped.

    Args:
        year: expedition year as a string.
        expedition: expedition object, stored in each entry tuple.
        txt: the whole logbook file as text.
        seq: not used by this parser; accepted so that all parsers share one signature.
    """
    global logentries
    global logdataissues

    logbook_entry_count = 0
    trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
    for triphead, triptext in trippara:
        logbook_entry_count += 1
        tid = set_trip_id(year, logbook_entry_count)

        tripheadp = triphead.split("|")
        if len(tripheadp) != 3:
            # The header cannot be unpacked, so no trip date is available yet:
            # report the trip id and the fields that were found, then move on.
            message = f" ! - Bad no of items in tripdate in logbook: {tid} - {tripheadp}"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid] = message
            continue

        tripdate, tripplace, trippeople = tripheadp
        tripsplace = tripplace.split(" - ")
        tripcave = tripsplace[0].strip()
        if len(tripsplace) == 1:
            tripsplace = tripsplace[0]
        else:
            tripsplace = tripsplace[1]

        tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
        tu = tul[0][0] if tul else ""

        ldate = ParseDate(tripdate.strip(), year)
        # Pass the generated trip id on, as parser_html_01 and parser_blog do,
        # so that the entry does not end up with an empty id.
        entrytuple = (ldate, tripcave, tripsplace, triptext,
                      trippeople, expedition, tu, tid)
        logentries.append(entrytuple)
2020-05-30 20:31:20 +01:00
# 2002, 2004, 2005, 2007, 2010 - now
# 2006 wiki text is incomplete, but the html all there. So using this parser now.
def parser_html(year, expedition, txt, seq=""):
    """Parse a modern-format HTML logbook and append its entries to ``logentries``.

    Whatever lies between <body> and the first <hr> is written to
    years/<year>/frontmatter.html under settings.EXPOWEB, for later use when
    rebuilding the logbook from the list of entries. If the text has no such
    section, that step is skipped.

    Entries are the chunks between '<hr />' rules. Each must carry tripdate,
    trippeople and triptitle divs; people and title may come in either order.
    A chunk that matches neither order is recorded as a DataIssue and skipped.

    Args:
        year: expedition year as a string.
        expedition: expedition object, stored in each entry tuple.
        txt: the whole logbook file as text.
        seq: not used by this parser; accepted so that all parsers share one signature.
    """
    global logentries
    global logdataissues

    # extract front material and stash for later use when rebuilding from list of entries
    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
    if headmatch:
        headpara = headmatch.groups()[0].strip()
        if headpara:
            frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
            # the logbook is read as UTF-8, so write the extract the same way
            with open(frontpath, "w", encoding="utf-8") as front:
                front.write(headpara + "\n")

    # Compiled once here rather than re-matched from source on every entry.
    rx_people_then_title = re.compile(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
                    \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
                    \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
                    \s*<div\s+class="trippeople">\s*(.*?)</div>
                    \s*<div\s+class="triptitle">\s*(.*?)</div>
                    ([\s\S]*?)
                    \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
                    \s*$
                    ''')
    rx_title_then_people = re.compile(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
                    \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
                    \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
                    \s*<div\s+class="triptitle">\s*(.*?)</div>
                    \s*<div\s+class="trippeople">\s*(.*?)</div>
                    ([\s\S]*?)
                    \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
                    \s*$
                    ''')

    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    logbook_entry_count = 0
    for trippara in tripparas:
        logbook_entry_count += 1
        tid = set_trip_id(year, logbook_entry_count)

        s = rx_people_then_title.match(trippara)
        if s:
            tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
        else:  # allow title and people to be swapped in order
            msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:50]}'..."
            print(msg)
            DataIssue.objects.create(parser='logbooks', message=msg)
            logdataissues[tid] = msg

            s2 = rx_title_then_people.match(trippara)
            if s2:
                tripid, tripid1, tripdate, triptitle, trippeople, triptext, tu = s2.groups()
            else:
                msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:50]}'..."
                print(msg)
                DataIssue.objects.create(parser='logbooks', message=msg)
                logdataissues[tid] = msg
                continue

        ldate = ParseDate(tripdate.strip(), year)
        # the part of the title before " - " is taken as the cave name
        triptitles = triptitle.split(" - ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"

        ltriptext = re.sub(r"</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()

        entrytuple = (ldate, tripcave, triptitle, ltriptext,
                      trippeople, expedition, tu, tripid1)
        logentries.append(entrytuple)
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
def parser_html_01(year, expedition, txt, seq=""):
    """Parse an old-format HTML logbook and append its entries to ``logentries``.

    Whatever lies between <body> and the first <hr> is written to
    years/<year>/frontmatter.html under settings.EXPOWEB. If the text has no
    such section, that step is skipped.

    Entries are the chunks between <hr> rules. The first paragraph of a chunk
    is the header 'date|title|people' (people may be missing), the rest is the
    body. A chunk whose header cannot be read stops the parse of this logbook.

    Args:
        year: expedition year as a string.
        expedition: expedition object, stored in each entry tuple.
        txt: the whole logbook file as text.
        seq: not used by this parser; accepted so that all parsers share one signature.

    Raises:
        Any unexpected exception raised while handling an entry, after it has
        been recorded as a DataIssue.
    """
    global logentries
    global logdataissues
    errorcount = 0

    # extract front material and stash for later use when rebuilding from list of entries
    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
    if headmatch:
        headpara = headmatch.groups()[0].strip()
        if headpara:
            frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
            # the logbook is read as UTF-8, so write the extract the same way
            with open(frontpath, "w", encoding="utf-8") as front:
                front.write(headpara + "\n")

    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    logbook_entry_count = 0
    for trippara in tripparas:
        logbook_entry_count += 1
        tid = set_trip_id(year, logbook_entry_count)
        try:
            s = re.match(r"(?i)(?s)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
            if not s:
                message = f" ! - Skipping logentry {year} failure to parse header: {tid}{trippara[:300]}..."
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid] = message
                print(message)
                break
            tripheader, triptext = s.group(1), s.group(2)

            # drop <a>, <b> and <span> tags from the header line
            tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
            try:
                tripdate, triptitle, trippeople = tripheader.split("|")
            except ValueError:
                message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}> '{tripheader.split('|')}'"
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid] = message
                print(message)
                try:
                    tripdate, triptitle = tripheader.split("|")
                    trippeople = "GUESS ANON"
                except ValueError:
                    message = f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !"
                    DataIssue.objects.create(parser='logbooks', message=message)
                    logdataissues[tid] = message
                    print(message)
                    break

            ldate = ParseDate(tripdate.strip(), year)

            # a paragraph starting with T/U is cut out of the text and kept as tu
            mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
            if mtu:
                tu = mtu.group(1)
                triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
            else:
                tu = ""

            triptitles = triptitle.split(" - ")
            tripcave = triptitles[0].strip()

            ltriptext = triptext
            # strip trailing links, separators and '(same day)' / '(n)' markers
            mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
            if mtail:
                ltriptext = ltriptext[:mtail.start(0)]
            ltriptext = re.sub(r"</p>", "", ltriptext)
            ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
            ltriptext = re.sub(r"</?u>", "_", ltriptext)
            ltriptext = re.sub(r"</?i>", "''", ltriptext)
            ltriptext = re.sub(r"</?b>", "'''", ltriptext)
            ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()

            if ltriptext == "":
                message = " ! - Zero content for logbook entry!: " + tid
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid] = message
                print(message)

            entrytuple = (ldate, tripcave, triptitle, ltriptext,
                          trippeople, expedition, tu, tid)
            logentries.append(entrytuple)

        except Exception:
            message = f" ! - Skipping logentry {year} due to exception in: {tid}"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid] = message
            print(message)
            errorcount += 1
            raise
        # Because of the re-raise above this limit is not reached at present;
        # it only takes effect if the re-raise is removed.
        if errorcount > 5:
            message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid] = message
            print(message)
            return
2020-06-08 21:33:32 +01:00
def parser_blog(year, expedition, txt, sq=""):
    """Parse pages saved with 'Save As HTML' from the UK Caving blog website.

    Note that the entries have dates and authors, but no titles.

    Headers and bodies are found by two separate searches and paired up by
    position. If the counts differ, the mismatch is printed and only the pairs
    that exist are processed. Parsing stops at the first header that lacks an
    author or a timestamp.

    Args:
        year: expedition year as a string.
        expedition: expedition object, stored in each entry tuple.
        txt: the whole saved page as text.
        sq: suffix identifying which of several saved files this is; it is
            added to the trip id and to the generated trip name.
    """
    global logentries
    global logdataissues

    tripheads = re.findall(r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt)
    if not tripheads:
        message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
        print(message)

    tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
    if not tripparas:
        message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
        print(message)

    if len(tripheads) != len(tripparas):
        print(f"{len(tripheads)} != {len(tripparas)}")

    tu = 0
    # zip() stops at the shorter list, so a missing header cannot raise IndexError
    for logbook_entry_count, (triphead, trippara) in enumerate(zip(tripheads, tripparas), start=1):
        tid = set_trip_id(year, logbook_entry_count) + "_blog" + sq

        # data-author="tcacrossley"
        match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
        if not match_author:
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid] = message
            print(message)
            break
        trippeople = match_author.group(1)

        # datetime="2019-07-11T13:16:18+0100"
        match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
        if not match_datetime:
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid] = message
            print(message)
            break
        datestamp = match_datetime.group(1)

        try:
            tripdate = datetime.fromisoformat(datestamp)
        except ValueError:
            message = f" ! - FROMISOFORMAT fail logentry {year}:{logbook_entry_count} {tid} '{datestamp}'"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid] = message
            print(message)
            # fallback, ignore the timestamp bits:
            tripdate = datetime.fromisoformat(datestamp[0:10])

        # tripname must have the location then a hyphen at the beginning as it is ignored by export function
        location = "Unknown"
        tripname = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"  # must be unique for a given date
        tripcontent = trippara + f"\n\nBlog Author: {trippeople}"

        entrytuple = (tripdate, location, tripname, tripcontent,
                      trippeople, expedition, tu, tid)
        logentries.append(entrytuple)
2022-12-14 23:46:14 +00:00
2022-12-15 00:35:48 +00:00
def LoadLogbookForExpedition(expedition, clean=True):
    """Parse the logbook file(s) of one expedition and save the entries.

    The file name and parser are taken from LOGBOOK_PARSER_SETTINGS for the
    expedition's year, or from the DEFAULT_LOGBOOK_* values when the year is
    not listed. The chosen file name is stored on the expedition, which is
    then saved. Up to four files are read: the base name, then the same name
    with "2", "3" and "4" inserted before the suffix, stopping at the first
    one that does not exist.

    Args:
        expedition: the expedition to load; ``expedition.year`` must be a key
            of ``entries``, otherwise a KeyError is raised.
        clean: if true, the DataIssues and logdataissues for this year and all
            existing LogbookEntry rows of this expedition are deleted first.

    Returns:
        The number of entries in ``logentries`` after parsing.

    NOTE(review): this block was re-indented from a copy of the file that had
    lost its indentation. In particular the nesting of the save loop inside
    the per-file loop should be confirmed against version control.
    """
    global logentries
    # absolutely horrid. REFACTOR THIS (all my fault..)
    global logdataissues
    global entries

    logbook_parseable = False
    yearlinks = LOGBOOK_PARSER_SETTINGS
    expologbase = os.path.join(settings.EXPOWEB, "years")
    # start with an empty list: the parsers append to this module global
    logentries=[]

    year = expedition.year
    expect = entries[year]
    # print(" - Logbook for: " + year)

    def cleanerrors(year):
        # Remove earlier problem reports for this year: every 'logbooks'
        # DataIssue whose message matches the year (used as a regex), and
        # every logdataissues entry whose key starts with the year.
        global logdataissues
        dataissues = DataIssue.objects.filter(parser='logbooks')
        for di in dataissues:
            ph = year
            if re.search(ph, di.message) is not None:
                #print(f' - CLEANING dataissue {di.message}')
                di.delete()

        #print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year')
        # collect the keys first: a dict cannot be changed while iterating over it
        dellist = []
        for key, value in logdataissues.items():
            #print(f' - CLEANING logdataissues [{key}]: {value}')
            if key.startswith(year):
                #print(f' - CLEANING logdataissues [{key:12}]: {value} ')
                dellist.append(key)
        for i in dellist:
            del logdataissues[i]

    if (clean):
        cleanerrors(year)

    if year in yearlinks:
        yearfile, yearparser = yearlinks[year]
        logbookpath = Path(yearfile)
        expedition.logbookfile = yearfile
        parsefunc = yearparser
        # print(f" - Logbook file {yearfile} using parser {yearparser}")

    else:
        logbookpath = Path(DEFAULT_LOGBOOK_FILE)
        expedition.logbookfile = DEFAULT_LOGBOOK_FILE
        parsefunc = DEFAULT_LOGBOOK_PARSER

    expedition.save()

    lbes = LogbookEntry.objects.filter(expedition=expedition)
    if (clean):
        for lbe in lbes:
            lbe.delete()

    for sq in ["", "2", "3", "4"]: # cope with blog saved as many separate files
        lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
        if not (lb.is_file()):
            # print(f" ! End of blog. Next blog file in sequence not there:{lb}")
            break
        try:
            # read as bytes and decode explicitly as UTF-8
            with open(lb,'rb') as file_in:
                txt = file_in.read().decode("utf-8")
            logbook_parseable = True
        except (IOError):
            logbook_parseable = False
            print(f" ! Couldn't open logbook as UTF-8 {lb}")
        except:
            logbook_parseable = False
            print(f" ! Very Bad Error opening {lb}")

        if logbook_parseable:
            # --------------------
            # parsefunc is the *name* of a parser; look the function up in this module
            parser = globals()[parsefunc]
            print(f' - {year} parsing with {parsefunc} - {lb}')
            parser(year, expedition, txt, sq) # this launches the right parser for this year
            # --------------------
            # logentries is not reset between files, so this loop covers every
            # entry parsed so far in this call
            for entrytuple in logentries:
                # date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
                try:
                    date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
                except ValueError: # cope with removal of entry_type but still in cache files. Remove in Dec. 2022.
                    date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
                    print(f' - Exception entry_type "{entry_type}" {tripid1}')
                # 0 is passed as the time underground; the parsed logtime_underground is not used
                EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
                        tripid1)

    # compare against the expected number of entries for this year
    if len(logentries) == expect:
        # print(f"OK {year} {len(logentries):5d} is {expect}\n")
        pass
    else:
        print(f"Mismatch {year} {len(logentries):5d} is not {expect}\n")

    return len(logentries)
# def LoadLogbook(year, format="cucc"):
# global LOGBOOK_PARSER_SETTINGS
2022-12-14 23:46:14 +00:00
# nlbe={}
# TROG['pagecache']['expedition'][year] = None # clear cache
# expo = Expedition.objects.get(year=year)
# if (format=="blog"):
# LOGBOOK_PARSER_SETTINGS[str(year)] = BLOG_PARSER_SETTINGS[str(year)]
# # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
2022-12-14 23:46:14 +00:00
# nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
2022-12-14 23:46:14 +00:00
2011-07-11 02:10:22 +01:00
def LoadLogbooks():
    """Master function for parsing all logbooks into the Troggle database.

    All 'logbooks' DataIssues are deleted first. Every expedition whose year
    has an expected entry count in ``entries`` and is not listed as having no
    usable logbook is loaded. After that the blog pages of the years listed in
    BLOG_PARSER_SETTINGS are loaded on top, without cleaning. The total number
    of parsed entries is printed.

    This should be rewritten to use coroutines to load all logbooks from disc
    in parallel, but must be serialised to write to database as sqlite is
    single-user.
    """
    global logdataissues
    global entries

    logdataissues = {}
    DataIssue.objects.filter(parser='logbooks').delete()
    expos = Expedition.objects.all()
    if len(expos) <= 1:
        message = " ! - No expeditions found. Load 'people' first"
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues["sqlfail 0000"] = message
        print(message)
        return

    noexpo = ["1986", "2020", "2021",]  # no expo
    lostlogbook = ["1976", "1977", "1978", "1979", "1980", "1981"]
    sqlfail = ["1987", "1988", "1989"]  # breaks mysql with db constraint fail - debug locally first
    nologbook = noexpo + lostlogbook + sqlfail

    nlbe = {}
    loglist = []
    bloglist = []

    for expo in expos:  # pointless as we explicitly know the years in this code.
        year = expo.year
        TROG['pagecache']['expedition'][year] = None  # clear cache
        if year in sqlfail:
            print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
            message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[f"sqlfail {year}"] = message
            print(message)

        if year not in nologbook:
            if year in entries:
                loglist.append(expo)
            else:
                print(" - No Logbook yet for: " + year)  # catch case when preparing for next expo

        if year in BLOG_PARSER_SETTINGS:
            bloglist.append(expo)

    for ex in loglist:
        nlbe[ex] = LoadLogbookForExpedition(ex)  # this loads the logbook for one expo

    for b in bloglist:
        # The blog is loaded by temporarily pointing this year's parser
        # settings at the blog file, then putting the previous value back.
        blogyear = b.year
        orig = LOGBOOK_PARSER_SETTINGS.get(blogyear, (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER))
        LOGBOOK_PARSER_SETTINGS[blogyear] = BLOG_PARSER_SETTINGS[blogyear]
        print(f" - BLOG: {b}")
        try:
            # Add to the logbook count of the same expedition instead of
            # replacing it, so that the total below covers both sources.
            nlbe[b] = nlbe.get(b, 0) + LoadLogbookForExpedition(b, clean=False)
        finally:
            # restore the setting even if loading the blog fails
            LOGBOOK_PARSER_SETTINGS[blogyear] = orig

    # tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
    yt = sum(nlbe.values())
    print(f"total {yt:,} log entries parsed in all expeditions")
2022-03-02 21:15:24 +00:00
# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
# reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
# personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
# nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
# TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
# locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
# caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
2011-07-11 02:10:22 +01:00