2011-07-11 02:10:22 +01:00
|
|
|
import csv
|
2020-05-28 02:20:50 +01:00
|
|
|
import datetime
|
2011-07-11 02:10:22 +01:00
|
|
|
import os
|
2020-05-28 02:20:50 +01:00
|
|
|
import re
|
|
|
|
import time
|
2020-06-14 10:05:25 +01:00
|
|
|
import pickle
|
2020-06-08 21:33:32 +01:00
|
|
|
import shelve
|
2011-07-11 02:10:22 +01:00
|
|
|
|
2020-05-28 02:20:50 +01:00
|
|
|
from django.conf import settings
|
|
|
|
from django.template.defaultfilters import slugify
|
|
|
|
from django.utils.timezone import get_current_timezone, make_aware
|
|
|
|
|
2020-05-30 12:35:15 +01:00
|
|
|
from troggle.core.models import DataIssue, Expedition
|
2020-07-23 02:16:08 +01:00
|
|
|
from troggle.core.models_caves import Cave, LogbookEntry, PersonTrip, GetCaveLookup
|
2020-05-28 02:20:50 +01:00
|
|
|
from parsers.people import GetPersonExpeditionNameLookup
|
2011-07-11 02:10:22 +01:00
|
|
|
from utils import save_carefully
|
|
|
|
|
|
|
|
#
|
|
|
|
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
|
|
|
|
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc)
|
|
|
|
#
|
|
|
|
|
|
|
|
#
|
|
|
|
# the logbook loading section
|
|
|
|
#
|
2019-07-11 12:29:38 +01:00
|
|
|
def GetTripPersons(trippeople, expedition, logtime_underground):
    """Resolve the names in a logbook "people" string for one trip.

    The string is split on ",", "+", "&amp;", a bare "&" that does not start
    an HTML entity, and " and ". A name wrapped in <u>...</u> marks the
    author of the entry. Empty names and names starting with "*" are
    ignored, and any (...) or [...] annotation is stripped before the
    name is looked up via GetPersonExpeditionNameLookup(expedition).

    Args:
        trippeople: raw text listing the people on the trip.
        expedition: expedition object; its ``year`` is used in messages.
        logtime_underground: value paired with every person in the result.

    Returns:
        ``(res, author)`` where ``res`` is a list of
        ``(personexpedition, logtime_underground)`` tuples and ``author`` is
        the underlined person, or the last person in ``res`` when nobody
        (recognisable) was underlined. ``(None, None)`` when no names were
        found at all.
    """
    res = []
    author = None
    round_bracket_regex = re.compile(r"[\(\[].*?[\)\]]")
    # The original pattern carried a trailing "(?i)"; global inline flags
    # anywhere but the start of a pattern are rejected by Python 3.11+.
    underline_regex = re.compile(r"<u>(.*?)</u>$", re.IGNORECASE)
    # "&amp;" must be tried before the bare "&": a plain "&" alternative
    # listed first would make the entity-guarding lookahead unreachable and
    # split "A &amp; B" into "A" and "amp; B".
    splitter = re.compile(r",|\+|&amp;|&(?!\w+;)| and ")

    for tripperson in splitter.split(trippeople):
        tripperson = tripperson.strip()
        mul = underline_regex.match(tripperson)
        if mul:
            tripperson = mul.group(1).strip()
        if tripperson and tripperson[0] != '*':
            tripperson = re.sub(round_bracket_regex, "", tripperson).strip()
            personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
            if not personyear:
                message = "No name match for: ||'%s'|| in year '%s'" % (tripperson, expedition.year)
                print(message)
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[expedition.year + "~" + tripperson] = message
            # NOTE(review): an unmatched name is still appended with a
            # personyear of None, exactly as before - confirm callers cope.
            res.append((personyear, logtime_underground))
            if mul:
                author = personyear

    if not author:
        if not res:
            return None, None
        # Nobody was underlined: treat the last person listed as the author.
        author = res[-1][0]
    return res, author
|
|
|
|
|
2020-05-28 04:54:53 +01:00
|
|
|
def GetTripCave(place):
    """Find the Cave that a logbook "place" string refers to.

    The place is tried as a kataster number (when it parses as an integer)
    and as an official cave name. Every failure to resolve it is recorded as
    a DataIssue and in ``logdataissues``.

    Args:
        place: the place text taken from a logbook entry.

    Returns:
        The single matching Cave, the first match when the place is
        ambiguous, or None when nothing matches.
    """
    # The messages used to append "tripdate" and "year", neither of which is
    # defined in this function, so every reporting branch raised NameError.
    katastNumRes = []
    try:
        katastNumRes = list(Cave.objects.filter(kataster_number=int(place)))
    except ValueError:
        # place is not a number; fall back to the official-name lookup only.
        message = " ! - ValueError on finding place " + str(place) + " entered. "
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues["author"] = message

    officialNameRes = list(Cave.objects.filter(official_name=place))
    tripCaveRes = officialNameRes + katastNumRes

    if len(tripCaveRes) == 1:
        return tripCaveRes[0]

    if len(tripCaveRes) > 1:
        message = " ! - Ambiguous place " + str(place) + " entered. " + str(tripCaveRes)
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues["author"] = message
        return tripCaveRes[0]

    print((" ", place))
    message = " ! - No cave found for place:" + str(place)
    DataIssue.objects.create(parser='logbooks', message=message)
    logdataissues["author"] = message
    return None
|
2011-07-11 02:10:22 +01:00
|
|
|
|
2020-05-28 04:54:53 +01:00
|
|
|
|
2020-06-08 21:33:32 +01:00
|
|
|
# Module-level state shared by the parsers below.

# The entire logbook for one year is a single object: a list of entry tuples.
logentries = list()

# Place names which are known not to be caves.
noncaveplaces = [
    "QMplaceholder",
    "Journey",
    "Loser Plateau",
    "UNKNOWN",
    'plateau',
    'base camp',
    'basecamp',
    'top camp',
    'topcamp',
]

# Problems found while parsing, keyed by an identifying string.
logdataissues = dict()

# In-memory object store of trips, keyed by trip id.
trips = dict()
|
2020-04-12 22:29:30 +01:00
|
|
|
|
2019-03-31 15:39:53 +01:00
|
|
|
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"):
    """Save a logbook entry and its related persontrips.

    Does NOT save the expeditionday_id - all NULLs. why?

    Args:
        date: date of the trip.
        place: place text; looked up as a cave unless it is a known
            non-cave place.
        title: entry title; also the source of the (truncated) slug.
        text: body of the entry.
        trippeople: raw people string, resolved with GetTripPersons().
        expedition: expedition the entry belongs to.
        logtime_underground: time underground stored on every PersonTrip.
        entry_type: format tag stored on the entry (default "wiki").

    Returns:
        None. The entry is skipped, and a DataIssue recorded, when no author
        can be determined.
    """
    trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
    if not author:
        print(" ! - Skipping logentry: " + title + " - no author for entry")
        message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues["title"] = message
        return

    # This needs attention. The slug field is derived from 'title'
    # both GetCaveLookup() and GetTripCave() need to work together better.
    # None of this data is *used* though?
    lplace = place.lower()
    cave = None
    # noncaveplaces holds mixed-case names ("UNKNOWN", "Journey", ...), so the
    # lower-cased place has to be compared against lower-cased names or those
    # entries can never match.
    if lplace not in {name.lower() for name in noncaveplaces}:
        cave = GetCaveLookup().get(lplace)

    # The result was never used; the call is kept in case it has side
    # effects - NOTE(review): confirm whether it can be dropped.
    expedition.get_expedition_day(date)

    # Check for an existing copy of the current entry, and save.
    lookupAttribs = {'date': date, 'title': title}
    # 'cave' is converted to a string doing this, which renders as the cave slug.
    # but it is a db query which we should try to avoid - rewrite this
    nonLookupAttribs = {
        'place': place,
        'text': text,
        'expedition': expedition,
        'cave_slug': str(cave),
        'slug': slugify(title)[:50],
        'entry_type': entry_type,
    }
    lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)

    for tripperson, time_underground in trippersons:
        lookupAttribs = {'personexpedition': tripperson, 'logbook_entry': lbo}
        nonLookupAttribs = {
            'time_underground': time_underground,
            'is_logbook_entry_author': (tripperson == author),
        }
        # PersonTrip also saved in SetDatesFromLogbookEntries
        save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
|
2011-07-11 02:10:22 +01:00
|
|
|
|
|
|
|
def ParseDate(tripdate, year):
    """Interpret a date from the expo logbooks and return a datetime.date.

    Two layouts are understood: ISO ``YYYY-MM-DD`` and the day-first
    ``D/M/YY`` or ``D/M/YYYY``. A two-digit year takes its century from
    ``year``.

    Args:
        tripdate: the date text as found in the logbook.
        year: the expedition year as a four-character string.

    Returns:
        The parsed ``datetime.date``.

    Raises:
        AssertionError: the date does not belong to ``year`` or cannot be
            parsed at all (a DataIssue is recorded for the latter).
    """
    # Checks are explicit raises rather than assert statements so they still
    # fire when Python runs with -O; the exception type is unchanged.
    mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
    if mdatestandard:
        if mdatestandard.group(1) != year:
            raise AssertionError((tripdate, year))
        year_n, month, day = (int(part) for part in mdatestandard.groups())
        return datetime.date(year_n, month, day)

    # The month used to be matched with "0?(\d)", which rejected 10, 11 and 12.
    mdategoof = re.match(r"(\d\d?)/(\d\d?)/(20|19)?(\d\d)", tripdate)
    if mdategoof:
        century = mdategoof.group(3)
        if century and century != year[:2]:
            raise AssertionError(mdategoof.groups())
        yadd = int(year[:2]) * 100
        day, month = int(mdategoof.group(1)), int(mdategoof.group(2))
        return datetime.date(int(mdategoof.group(4)) + yadd, month, day)

    message = " ! - Bad date in logbook: " + tripdate + " - " + year
    DataIssue.objects.create(parser='logbooks', message=message)
    logdataissues["tripdate"] = message
    raise AssertionError(tripdate)
|
|
|
|
|
2020-07-07 19:07:45 +01:00
|
|
|
# (2006 - not any more), 2008 - 2009
|
2011-07-11 02:10:22 +01:00
|
|
|
def Parselogwikitxt(year, expedition, txt):
    """Parse a wiki-format logbook and store every entry found.

    Entries are introduced by ``===date|place|people===`` headers. Each
    entry is appended to the module-level ``logentries`` list and passed to
    EnterLogIntoDbase() and EnterLogIntoObjStore().

    Args:
        year: expedition year as a string.
        expedition: expedition the logbook belongs to.
        txt: full text of the logbook file.

    Raises:
        AssertionError: a header does not have exactly three "|" separated
            parts, or its date cannot be parsed by ParseDate().
    """
    tu_regex = re.compile(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?")
    trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)

    for logbook_entry_count, (triphead, triptext) in enumerate(trippara, start=1):
        tripheadp = triphead.split("|")
        if len(tripheadp) != 3:
            # Explicit raise (same type as the old assert) so it survives -O.
            raise AssertionError((tripheadp, triptext))
        tripdate, tripplace, trippeople = tripheadp
        tripcave = tripplace.split(" - ")[0].strip()

        tul = tu_regex.findall(triptext)
        tu = tul[0][0] if tul else ""

        ldate = ParseDate(tripdate.strip(), year)
        tripid = ""

        # The third slot is the entry title: it is replayed from the cache as
        # the "title" argument of EnterLogIntoDbase, so it must be the string
        # tripplace (as passed below), not the list produced by splitting it.
        entrytuple = (ldate, tripcave, tripplace, triptext,
                      trippeople, expedition, tu, "wiki", tripid)
        logentries.append(entrytuple)

        EnterLogIntoDbase(date=ldate, place=tripcave, title=tripplace, text=triptext,
                          trippeople=trippeople, expedition=expedition,
                          logtime_underground=0)

        EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople,
                             tu, "wiki", tripid, logbook_entry_count)
|
|
|
|
|
|
|
|
|
|
|
|
def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq):
    """Record one trip in the in-memory ``trips`` store.

    The trip is keyed by ``tripid1`` when that is given, otherwise by an id
    built from the date and ``seq``. If the key is already taken, the clash
    is reported as a DataIssue and in ``logdataissues``, and the trip is
    stored under a fresh "d"-prefixed id instead.

    This will need additional functions to replicate the persontrip
    calculation and storage. For the moment we leave all that to be done in
    the django db.
    """
    global trips  # should be a singleton class object in models.py eventually
    global logdataissues

    has_given_id = tripid1 is not None and tripid1 != ""
    tid = tripid1 if has_given_id else "n{}-s{:02d}".format(str(date), seq)

    if tid in trips:
        earlier = trips[tid]
        msg = " ! DUPLICATE id .{}. {} ~{}~".format(tid, earlier[0], earlier[1])
        print(msg)
        DataIssue.objects.create(parser='logbooks', message=msg)
        # Store under a de-duplicated id and file the issue under that id.
        tid = "d{}-s{:02d}".format(str(date), seq)
        logdataissues[tid] = msg

    trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu, formattype)

    # TODO: copy the author/person checking from EnterLogIntoDbase() here.
    # GetTripPersons is a db query, so that data needs to be in the ObjStore
    # first (or a different design is needed).
|
2020-06-08 21:33:32 +01:00
|
|
|
|
2020-05-30 20:31:20 +01:00
|
|
|
# 2002, 2004, 2005, 2007, 2010 - now
|
2020-07-07 19:07:45 +01:00
|
|
|
# 2006 wiki text is incomplete, but the html is all there. So using this parser now.
|
2011-07-11 02:10:22 +01:00
|
|
|
def Parseloghtmltxt(year, expedition, txt):
    """Parse an HTML logbook that marks up entries with tripdate/trippeople/triptitle divs.

    Entries are separated by ``<hr />``. Each parsed entry is appended to the
    module-level ``logentries`` list and passed to EnterLogIntoDbase() and
    EnterLogIntoObjStore(). Sections that do not match the expected layout
    are skipped; they are reported as a DataIssue unless they contain
    "Rigging Guide".

    Args:
        year: expedition year as a string.
        expedition: expedition the logbook belongs to.
        txt: full text of the logbook file.
    """
    # Compiled once rather than on every iteration.
    entry_regex = re.compile(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
                            \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
                            \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
                            \s*<div\s+class="trippeople">\s*(.*?)</div>
                            \s*<div\s+class="triptitle">\s*(.*?)</div>
                            ([\s\S]*?)
                            \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
                            \s*$
                            ''')

    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for logbook_entry_count, trippara in enumerate(tripparas, start=1):
        s = entry_regex.match(trippara)
        if not s:
            if not re.search(r"Rigging Guide", trippara):
                msg = " !- Logbook. Can't parse: {} entry:{}".format(trippara, logbook_entry_count)
                print(msg)
                DataIssue.objects.create(parser='logbooks', message=msg)
                # This used the undefined name "tid" as the key and raised
                # NameError; key the issue by year and entry number instead.
                logdataissues[str(year) + "." + str(logbook_entry_count)] = msg
            continue

        # The first group (id of an optional leading anchor) is not used.
        _anchor_id, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
        ldate = ParseDate(tripdate.strip(), year)

        # "Cave - description" titles give the cave; anything else is unknown.
        triptitles = triptitle.split(" - ")
        tripcave = triptitles[0] if len(triptitles) >= 2 else "UNKNOWN"

        ltriptext = re.sub(r"</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip()

        entrytuple = (ldate, tripcave, triptitle, ltriptext,
                      trippeople, expedition, tu, "html", tripid1)
        logentries.append(entrytuple)

        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, logtime_underground=0,
                          entry_type="html")

        EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
                             "html", tripid1, logbook_entry_count)
|
|
|
|
|
2019-03-30 13:58:38 +00:00
|
|
|
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
|
2021-02-06 00:18:48 +00:00
|
|
|
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand..
|
2011-07-11 02:10:22 +01:00
|
|
|
def Parseloghtml01(year, expedition, txt):
    """Parse the older HTML logbook layout with a "date|title|people" header paragraph.

    Entries are separated by ``<hr>``. The first paragraph of each entry is
    the header; the rest is the text, from which a "T/U" paragraph is
    extracted when present. Each parsed entry is appended to the module-level
    ``logentries`` list and passed to EnterLogIntoDbase() and
    EnterLogIntoObjStore(). An entry that raises while being processed is
    reported and skipped; after more than five such errors the logbook is
    abandoned.

    Args:
        year: expedition year as a string.
        expedition: expedition the logbook belongs to.
        txt: full text of the logbook file.
    """
    errorcount = 0
    # Flags are passed explicitly: the old pattern ended in "(?i)", which
    # Python 3.11+ rejects, and it was not a raw string.
    header_regex = re.compile(r"\s*(?:<p>)?(.*?)</?p>(.*)$", re.DOTALL | re.IGNORECASE)
    tail_regex = re.compile(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$')

    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    for logbook_entry_count, trippara in enumerate(tripparas, start=1):
        # Built before the try so the error handler can always refer to it.
        tripentry = str(year) + "." + str(logbook_entry_count)
        try:
            s = header_regex.match(trippara)
            if not s:
                message = " ! - Skipping logentry on failure to parse header: " + tripentry + trippara[:300] + "..."
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tripentry] = message
                print(message)
                # NOTE(review): this abandons the rest of the logbook although
                # the message says "Skipping logentry" - confirm whether
                # "continue" was intended.
                break
            tripheader, triptext = s.group(1), s.group(2)
            mtripid = re.search(r'<a id="(.*?)"', tripheader)
            tripid = mtripid.group(1) if mtripid else ""
            tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)

            tripdate, triptitle, trippeople = tripheader.split("|")
            ldate = ParseDate(tripdate.strip(), year)

            # Pull the time-underground paragraph out of the text, if any.
            mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
            if mtu:
                tu = mtu.group(1)
                triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
            else:
                tu = ""

            tripcave = triptitle.split(" - ")[0].strip()

            ltriptext = triptext
            # Trim trailing links, separators and "(same day)" style notes.
            mtail = tail_regex.search(ltriptext)
            if mtail:
                ltriptext = ltriptext[:mtail.start(0)]
            ltriptext = re.sub(r"</p>", "", ltriptext)
            ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
            ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
            ltriptext = re.sub(r"</?u>", "_", ltriptext)
            ltriptext = re.sub(r"</?i>", "''", ltriptext)
            ltriptext = re.sub(r"</?b>", "'''", ltriptext)

            entrytuple = (ldate, tripcave, triptitle, ltriptext,
                          trippeople, expedition, tu, "html01", tripid)
            logentries.append(entrytuple)

            EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                              trippeople=trippeople, expedition=expedition, logtime_underground=0,
                              entry_type="html")

            EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
                                 "html01", tripid, logbook_entry_count)
        except Exception:
            # Was a bare "except:", which also swallowed KeyboardInterrupt
            # and SystemExit.
            message = " ! - Skipping logentry due to exception in: " + tripentry
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tripentry] = message
            print(message)
            errorcount += 1
            if errorcount > 5:
                message = " !!- TOO MANY ERRORS - aborting logbook: " + str(year)
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tripentry] = message
                print(message)
                return
|
2020-06-08 21:33:32 +01:00
|
|
|
|
2019-03-30 13:58:38 +00:00
|
|
|
# parser for 2003
|
2011-07-11 02:10:22 +01:00
|
|
|
def Parseloghtml03(year, expedition, txt):
    """Parse the 2003 HTML logbook layout with a "date -- title -- people" header.

    Entries are separated by ``<hr />``. The first paragraph is the header,
    optionally ending in a "T/U" or "Time underwater" part. Each parsed entry
    is appended to the module-level ``logentries`` list and passed to
    EnterLogIntoDbase() and EnterLogIntoObjStore(). Entries whose header does
    not have exactly three parts are reported and skipped.

    Args:
        year: expedition year as a string.
        expedition: expedition the logbook belongs to.
        txt: full text of the logbook file.

    Raises:
        AssertionError: a section has no leading ``<p>...</p>`` header, or
            its date cannot be parsed by ParseDate().
    """
    # Raw string: "\s" in a normal string literal is an invalid escape.
    header_regex = re.compile(r"(?s)\s*<p>(.*?)</p>(.*)$")

    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for logbook_entry_count, trippara in enumerate(tripparas, start=1):
        s = header_regex.match(trippara)
        if not s:
            # Explicit raise (same type as the old assert) so it survives -O.
            raise AssertionError(trippara)
        tripheader, triptext = s.group(1), s.group(2)
        # As written this substitution replaced a space with a space; it is
        # taken to be the non-breaking-space clean-up, so both the entity and
        # the character are handled - NOTE(review): confirm.
        tripheader = re.sub(r"&nbsp;|\xa0", " ", tripheader)
        tripheader = re.sub(r"\s+", " ", tripheader).strip()
        sheader = tripheader.split(" -- ")
        tu = ""
        if re.match(r"T/U|Time underwater", sheader[-1]):
            tu = sheader.pop()
        if len(sheader) != 3:
            # This used to print a warning and then crash on the unpacking
            # below; record the problem and move on to the next entry.
            print((" ! Header not three pieces", sheader))
            message = " ! Header not three pieces " + str(sheader)
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[str(year) + "." + str(logbook_entry_count)] = message
            continue
        tripdate, triptitle, trippeople = sheader
        ldate = ParseDate(tripdate.strip(), year)

        # "Cave , description" titles give the cave; anything else is unknown.
        triptitles = triptitle.split(" , ")
        tripcave = triptitles[0] if len(triptitles) >= 2 else "UNKNOWN"

        ltriptext = re.sub(r"</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
        ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)

        tid = "n{}-s{:02d}".format(str(ldate), logbook_entry_count)

        entrytuple = (ldate, tripcave, triptitle, ltriptext,
                      trippeople, expedition, tu, "html03", tid)
        logentries.append(entrytuple)

        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle,
                          text=ltriptext, trippeople=trippeople, expedition=expedition,
                          logtime_underground=0, entry_type="html")

        EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
                             "html03", tid, logbook_entry_count)
|
|
|
|
|
2011-07-11 02:10:22 +01:00
|
|
|
|
|
|
|
def SetDatesFromLogbookEntries(expedition):
    """Chain together each person's trips on an expedition in date order.

    For every personexpedition of ``expedition`` the persontrips are read
    ordered by logbook entry date, and each one gets ``persontrip_prev`` and
    ``persontrip_next`` set to its neighbours (None at either end) and is
    saved.
    """
    # Probably a faster way to do this. This uses a lot of db queries, but we
    # have all this in memory..
    for personexpedition in expedition.personexpedition_set.all():
        ordered_trips = personexpedition.persontrip_set.order_by('logbook_entry__date')
        # sequencing is difficult to do
        previous = None
        for current in ordered_trips:
            current.persontrip_prev = previous
            if previous:
                # Now that its successor is known, complete the earlier trip.
                previous.persontrip_next = current
                previous.save()
            current.persontrip_next = None
            previous = current
            # also saved in EnterLogIntoDbase. MERGE these to speed up import.
            current.save()
|
2019-03-06 23:20:34 +00:00
|
|
|
|
|
|
|
|
2020-06-08 21:33:32 +01:00
|
|
|
def _discard_logbook_cache(cache_filename):
    """Delete a logbook cache file; return True if it was removed."""
    try:
        os.remove(cache_filename)
    except OSError:
        # Best effort, as before: a cache that cannot be removed is ignored.
        return False
    return True


def _load_logbook_cache(cache_filename, logbookfile, expedition, expect):
    """Return the cached list of log entries, or None if the cache cannot be used.

    The cache is rejected (and deleted) when it is more than 2 seconds older
    than the logbook file, more than 30 days old, unreadable, or when its
    recorded expedition or entry count does not match what is expected.
    """
    try:
        cache_t = os.path.getmtime(cache_filename)
        logbook_t = os.path.getmtime(logbookfile)
    except OSError:
        # No cache (or no logbook file): the caller falls back to parsing.
        return None

    if logbook_t - cache_t > 2 or time.time() - cache_t > 30*24*60*60:
        print(" - ! Cache is either stale or more than 30 days old. Deleting it.")
        if _discard_logbook_cache(cache_filename):
            print(" ! Removed stale or corrupt cache file")
        return None

    print(" - Reading cache: " + cache_filename, end='')
    entries = None
    try:
        # NOTE(review): pickle must only ever be loaded from trusted files;
        # these caches sit next to the logbooks - confirm that is acceptable.
        with open(cache_filename, "rb") as f:
            year, n, cached = pickle.load(f)
        if year != expedition:
            print(" ! year != expedition ", year, expedition)
        elif len(cached) != n:
            print(" ! len(logentries) != n ", len(cached), n)
        elif n != expect:
            print(" ! n != expect ", n, expect)
        else:
            entries = cached
        if entries is None:
            print(" !- Should be ", expect, " but ", len(cached), " found in cache")
    except Exception:
        # Anything going wrong while unpickling means the cache is corrupt.
        entries = None

    if entries is None:
        print(" ! Failed to load corrupt cache. Deleting it.")
        _discard_logbook_cache(cache_filename)
        return None

    print(" -- Loaded ", len(entries), " log entries")
    return entries


def LoadLogbookForExpedition(expedition, expect):
    """Parse all logbook entries for one expedition.

    If a cache is found it uses it. If not found, or fails sanity checks,
    parses source file.

    Args:
        expedition: the expedition whose logbook is loaded.
        expect: number of entries the logbook is expected to contain; a
            cache holding a different number is rejected.

    Returns:
        The number of log entries loaded (0 when the logbook could not be
        opened or held no entries).
    """
    global logentries
    logentries = []

    yearlinks = settings.LOGBOOK_PARSER_SETTINGS
    expologbase = os.path.join(settings.EXPOWEB, "years")
    if expedition.year in yearlinks:
        logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0])
        parsefunc = yearlinks[expedition.year][1]
    else:
        logbookfile = os.path.join(expologbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE)
        parsefunc = settings.DEFAULT_LOGBOOK_PARSER
    cache_filename = logbookfile + ".cache"

    cached = _load_logbook_cache(cache_filename, logbookfile, expedition, expect)
    if cached is not None:
        logentries = cached
        # Entries are numbered from 1, as the parsers do, so that generated
        # trip ids are the same whether or not the cache was used.
        for seq, entrytuple in enumerate(logentries, start=1):
            # The expedition stored in the tuple is not used: the cache has
            # already been checked to be for the expedition passed in.
            (date, tripcave, triptitle, text, trippeople, _cached_expedition,
             logtime_underground, entry_type, tripid1) = entrytuple
            EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
                              entry_type)
            EnterLogIntoObjStore(expedition.year, date, tripcave, triptitle, text, trippeople,
                                 logtime_underground, entry_type, tripid1, seq)
        SetDatesFromLogbookEntries(expedition)
        return len(logentries)

    try:
        with open(logbookfile, 'rb') as file_in:
            txt = file_in.read().decode("latin1")
    except IOError:
        print((" ! Couldn't open logbook " + logbookfile))
        return len(logentries)
    print((" - Using: " + parsefunc + " to parse " + logbookfile))

    # The parser is named in settings and looked up among this module's functions.
    parser = globals()[parsefunc]
    parser(expedition.year, expedition, txt)
    SetDatesFromLogbookEntries(expedition)
    if logentries:
        print(" - Cacheing ", len(logentries), " log entries")
        with open(cache_filename, "wb") as fc:
            pickle.dump((expedition, len(logentries), logentries), fc, protocol=4)
    else:
        print(" ! NO TRIP entries found in logbook, check the syntax.")
    return len(logentries)
|
2020-05-28 04:54:53 +01:00
|
|
|
|
2011-07-11 02:10:22 +01:00
|
|
|
def LoadLogbooks():
    """This is the master function for parsing all logbooks into the Troggle database.

    Parser settings appropriate for each year are set in settings.py
    LOGBOOK_PARSER_SETTINGS. This should be rewritten to use coroutines to
    load all logbooks from disc in parallel.

    Existing 'logbooks' DataIssues are deleted first. A per-year summary is
    written to "loadlogbk.log" and the in-memory trips are saved to the
    "logbktrips.shelve" shelf.
    """
    global logdataissues

    logdataissues = {}
    DataIssue.objects.filter(parser='logbooks').delete()
    expos = Expedition.objects.all()
    if len(expos) <= 1:
        print(" ! No expeditions found. Load 'people' first.\n")

    nologbook = ["1976", "1977", "1978", "1979", "1980", "1981",
                 "1983", "1984", "1985", "1987", "1988", "1989",
                 "1986", "2020",]
    # Number of entries expected in each year's logbook.
    entries = {"2021": 0, "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
               "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
               "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
               "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
               "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1, "1988": 1, "1987": 1,
               "1985": 1, "1984": 1, "1983": 1, "1982": 42,}
    # Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.

    nlbe = {}
    expd = {}
    # "w" starts a fresh log each run; this replaces deleting the old file
    # and then opening it for append.
    with open("loadlogbk.log", "w") as log:
        for expo in expos:
            if expo.year in nologbook:
                continue
            print((" - Logbook for: " + expo.year))
            # A year missing from the table used to raise KeyError; it now
            # expects 0 entries, which only means a cache is never trusted.
            expected = entries.get(expo.year, 0)
            numentries = LoadLogbookForExpedition(expo, expected)
            log.write("{} {:5d} should be {}\n".format(expo.year, numentries, expected))
            nlbe[expo.year] = numentries
            expd[expo.year] = 0

    print("** total trips in ObjStore:", len(trips))

    # Count the stored trips per year; the year is the first field of each trip.
    for trip in trips.values():
        expd[trip[0]] += 1

    yt = 0
    for y in expd:
        print("{} {}".format(y, expd[y]), nlbe[y])
        yt += expd[y]
    print("total {} log entries in all expeditions".format(yt))

    # Leaving the with-block syncs and closes the shelf.
    with shelve.open('logbktrips.shelve', writeback=True) as odb:
        for lbe in trips:
            odb[lbe] = trips[lbe]
|
2019-03-06 23:20:34 +00:00
|
|
|
|
|
|
|
# Patterns used by parseAutoLogBookEntry() to pick fields out of an
# auto-generated logbook entry file. All use DOTALL so that "." also
# matches newlines.
dateRegex = re.compile(
    r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.DOTALL)
expeditionYearRegex = re.compile(
    r'<span\s+class="expeditionyear">(.*?)</span>', re.DOTALL)
titleRegex = re.compile(
    r'<H1>(.*?)</H1>', re.DOTALL)
reportRegex = re.compile(
    r'<div\s+class="report">(.*)</div>\s*</body>', re.DOTALL)
personRegex = re.compile(
    r'<div\s+class="person">(.*?)</div>', re.DOTALL)
# First group is ",author" for the author of the entry, otherwise empty.
nameAuthorRegex = re.compile(
    r'<span\s+class="name(,author|)">(.*?)</span>', re.DOTALL)
TURegex = re.compile(
    r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.DOTALL)
locationRegex = re.compile(
    r'<span\s+class="location">(.*?)</span>', re.DOTALL)
caveRegex = re.compile(
    r'<span\s+class="cave">(.*?)</span>', re.DOTALL)
|
2011-07-11 02:10:22 +01:00
|
|
|
|
|
|
|
def parseAutoLogBookEntry(filename):
    """Parse one auto-generated logbook entry file and save it to the database.

    The date, expedition year, title, cave, location, report and the list of
    people are picked out of the file with the module-level regexes. Nothing
    is saved unless every field parses.

    Args:
        filename: path of the entry file to read.

    Returns:
        A list of error strings when the file could not be fully parsed,
        otherwise None (the LogbookEntry and its PersonTrips have been saved
        and the entry printed).
    """
    errors = []
    with open(filename, "r") as f:
        contents = f.read()

    dateMatch = dateRegex.search(contents)
    if dateMatch:
        year, month, day = [int(x) for x in dateMatch.groups()]
        date = datetime.date(year, month, day)
    else:
        errors.append(" - Date could not be found")

    expeditionYearMatch = expeditionYearRegex.search(contents)
    if expeditionYearMatch:
        try:
            expedition = Expedition.objects.get(year=expeditionYearMatch.groups()[0])
            personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
        except Expedition.DoesNotExist:
            errors.append(" - Expedition not in database")
    else:
        errors.append(" - Expedition Year could not be parsed")

    titleMatch = titleRegex.search(contents)
    if titleMatch:
        title, = titleMatch.groups()
        if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
            errors.append(" - Title too long")
    else:
        errors.append(" - Title could not be found")

    cave = None
    caveMatch = caveRegex.search(contents)
    if caveMatch:
        caveRef, = caveMatch.groups()
        try:
            # this is a slow and uncertain function:
            # NOTE(review): getCaveByReference is not among the imports
            # visible at the top of this file - confirm where it comes from.
            cave = getCaveByReference(caveRef)
        except AssertionError:
            cave = None
            errors.append(" - Cave not found in database")

    locationMatch = locationRegex.search(contents)
    location = locationMatch.group(1) if locationMatch else None

    if cave is None and location is None:
        errors.append(" - Location nor cave could not be found")

    reportMatch = reportRegex.search(contents)
    if reportMatch:
        report, = reportMatch.groups()
    else:
        errors.append(" - Contents could not be found")

    if errors:
        # Easiest to bail out at this point as we need to make sure that we
        # know which expedition to look for people from.
        return errors

    people = []
    for personMatch in personRegex.findall(contents):
        # Search inside this person's block. Searching the whole file, as
        # before, gave every person the first name and TU in the document.
        personExpo = None
        author = False
        nameAuthorMatch = nameAuthorRegex.search(personMatch)
        if nameAuthorMatch:
            author_flag, name = nameAuthorMatch.groups()
            if name.lower() in personExpeditionNameLookup:
                personExpo = personExpeditionNameLookup[name.lower()]
            else:
                errors.append(" - Person could not be found in database")
            author = bool(author_flag)
        else:
            errors.append(" - Persons name could not be found")

        TU = None
        TUMatch = TURegex.search(personMatch)
        if TUMatch:
            TU, = TUMatch.groups()
        else:
            errors.append(" - TU could not be found")

        if not errors:
            # Keep each person's own expedition record; previously the one
            # from the last loop iteration was used for every PersonTrip.
            people.append((personExpo, author, TU))

    if errors:
        return errors  # Bail out before committing to the database

    logbookEntry = LogbookEntry(date=date,
                                expedition=expedition,
                                title=title, cave=cave, place=location,
                                text=report, slug=slugify(title)[:50],
                                filename=filename)
    logbookEntry.save()
    for personExpo, author, TU in people:
        PersonTrip(personexpedition=personExpo,
                   time_underground=TU,
                   logbook_entry=logbookEntry,
                   is_logbook_entry_author=author).save()
    print(logbookEntry)
|