Deep fix, nonunique ids in logbookentries fixed

This commit is contained in:
Philip Sargent 2021-04-23 16:11:50 +01:00
parent 343d6cf350
commit b7659a477c
3 changed files with 110 additions and 179 deletions

View File

@ -36,7 +36,11 @@ save_carefully() - core function that saves troggle objects in the database
TROG = {
'pagecache' : {
'expedition' : {}
},
'issues' : {
'logdataissues' : {}
}
}
# This is module-level executable. This is a Bad Thing. Especially when it touches the file system.

View File

@ -59,16 +59,16 @@ def expedition(request, expeditionname):
if request.user.is_authenticated:
if "reload" in request.GET:
this_expedition = Expedition.objects.get(year=int(expeditionname))
# Need to delete the exisitng entries or we get duplication
# Need to delete the existing entries or we get duplication
# Need to delete both in the Django ORM and in our own object-store.
entries = this_expedition.logbookentry_set.all()
print(f'! - expo {expeditionname} {len(entries)} entries')
print(f'! - expo {expeditionname} {len(entries)} entries initially')
for entry in entries:
print(f'! - delete entry: "{entry}"')
#print(f'! - delete entry: "{entry}"')
entry.delete()
entries = this_expedition.logbookentry_set.all()
print(f'! - expo {expeditionname} {len(entries)} entries')
LoadLogbookForExpedition(this_expedition, 0) # 0 means re-parse
print(f'! - expo {expeditionname} {len(entries)} entries after deletion')
LoadLogbookForExpedition(this_expedition, 0) # 0 means re-parse as implies cache expected to be 0
logged_in = True
else:
logged_in = False

View File

@ -1,8 +1,8 @@
import csv
import datetime
from datetime import datetime, date, time
import os
import re
import time
#import time
import pickle
import shelve
@ -26,8 +26,6 @@ todo='''
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
- delete all the autoLogbooKEntry stuff when we are absolutely certain what it does
- Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser,
or it is broken/incomplete and need hand-editing.
@ -46,26 +44,35 @@ todo='''
- We should ensure logbook.html is utf-8 and stop this crap:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
- this is a slow and uncertain function: cave = getCaveByReference(caveRef)
'''
logentries = [] # the entire logbook for one year is a single object: a list of entries
noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plateau',
'base camp', 'basecamp', 'top camp', 'topcamp' ]
logdataissues = {}
logdataissues = TROG['issues']['logdataissues']
trips ={}
#
# the logbook loading section
#
def GetTripPersons(trippeople, expedition, logtime_underground, tid="!"):
def set_trip_id(year, seq):
    """Build the identifier used for one logbook entry.

    The result has the form '<year>.s<seq>', where seq is formatted as a
    decimal integer zero-padded to at least two digits.
    """
    return f"{year}.s{seq:02d}"
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
res = [ ]
author = None
round_bracket_regex = re.compile(r"[\(\[].*?[\)\]]")
if tid =="!":
tid = expedition.year + "." + tripperson
#print(f'# {tid}')
for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople):
tripperson = tripperson.strip()
if not tid:
tid = expedition.year + "." + tripperson + datetime.now().strftime("%S%f") # no good. Should be getting the tid
mul = re.match(r"<u>(.*?)</u>$(?i)", tripperson)
if mul:
tripperson = mul.group(1).strip()
@ -86,23 +93,24 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid="!"):
author = res[-1][0]
return res, author
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid="!"):
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None):
""" saves a logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why?
"""
try:
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
except:
message = " ! - Skipping logentry: %s - GetTripPersons FAIL in year '%s'" % (title, expedition.year)
message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["title"]=message
print(message)
return
if not author:
print(" ! - Skipping logentry: " + title + " - no author for entry")
message = " ! - Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
message = f" ! - {expedition.year} Skipping logentry: {title} - - no author for entry in year "
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["title"]=message
print(message)
return
# This needs attention. The slug field is derived from 'title'
@ -113,16 +121,16 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
cave=None
if lplace not in noncaveplaces:
cave = GetCaveLookup().get(lplace)
# message = " ! - '" + lplace + "' place not in noncaveplaces."
# print(message)
# DataIssue.objects.create(parser='logbooks', message=message)
#Check for an existing copy of the current entry, and save
expeditionday = expedition.get_expedition_day(date)
lookupAttribs={'date':date, 'title':title}
# 'cave' is converted to a string doing this, which renders as the cave slug.
# but it is a db query which we should try to avoid - rewrite this
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug':slugify(title)[:50], 'entry_type':entry_type}
#NEW slug for a logbook entry here! Use the unique id, not the title !!!
slug = tid + slugify(title)[:50]
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug, 'entry_type':entry_type}
lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
@ -133,31 +141,37 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
def ParseDate(tripdate, year):
""" Interprets dates in the expo logbooks and returns a correct datetime.date object """
mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
if mdatestandard:
if not (mdatestandard.group(1) == year):
message = " ! - Bad date (year) in logbook: " + tripdate + " - " + year
try:
mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
if mdatestandard:
if not (mdatestandard.group(1) == year):
message = " ! - Bad date (year) in logbook: " + tripdate + " - " + year
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["tripdate"]=message
return datetime.date('1970', '01', '01')
else:
year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
elif mdategoof:
if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]):
message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["tripdate"]=message
return date('1970', '01', '01')
else:
yadd = int(year[:2]) * 100
day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
else:
message = " ! - Bad date in logbook: " + tripdate + " - " + year
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["tripdate"]=message
return datetime.date('1970', '01', '01')
else:
year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
elif mdategoof:
if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]):
message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["tripdate"]=message
return datetime.date('1970', '01', '01')
else:
yadd = int(year[:2]) * 100
day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
else:
message = " ! - Bad date in logbook: " + tripdate + " - " + year
return date(year, month, day)
except:
message = " ! - Failed to parse date in logbook: " + tripdate + " - " + year
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["tripdate"]=message
return datetime.date(year, month, day)
return date(year, month, day)
# (2006 - not any more), 2008 - 2009
def Parselogwikitxt(year, expedition, txt):
@ -168,6 +182,8 @@ def Parselogwikitxt(year, expedition, txt):
trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
for triphead, triptext in trippara:
logbook_entry_count += 1
tid = set_trip_id(year,logbook_entry_count)
tripheadp = triphead.split("|")
if not (len(tripheadp) == 3):
message = " ! - Bad no of items in tripdate in logbook: " + tripdate + " - " + tripheadp
@ -198,30 +214,24 @@ def Parselogwikitxt(year, expedition, txt):
logentries.append(entrytuple)
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople,
expedition=expedition, logtime_underground=0)
expedition=expedition, logtime_underground=0, tid=tid)
EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople,
tu, "wiki", tripid, logbook_entry_count)
tu, "wiki", tripid, logbook_entry_count, tid=tid)
def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq):
def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq, tid=None):
# This will need additional functions to replicate the persontrip calculation and storage. For the
# moment we leave all that to be done in the django db
global trips # should be a singleton TROG eventually
global logdataissues
if tripid1 is None or tripid1 =="":
tid= "n{}-s{:02d}".format(str(date),seq)
#print(" - New id ",tid)
else:
tid= tripid1
if tid in trips:
tyear, tdate, *trest = trips[tid]
msg = f" ! DUPLICATE on {tdate} id: '{tid}'"
print(msg)
DataIssue.objects.create(parser='logbooks', message=msg)
tid= "d{}-s{:02d}".format(str(date),seq)
tid = set_trip_id(str(date),seq)
#print(" - De-dup ",seq, tid)
logdataissues[tid]=msg
trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu, formattype)
@ -247,7 +257,7 @@ def Parseloghtmltxt(year, expedition, txt):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
tid= "n{}-s{:02d}".format(year,logbook_entry_count)
tid = set_trip_id(year,logbook_entry_count)
s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
\s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
@ -285,7 +295,7 @@ def Parseloghtmltxt(year, expedition, txt):
entry_type="html", tid=tid)
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
"html", tripid1, logbook_entry_count)
"html", tripid1, logbook_entry_count, tid=tid)
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand..
@ -298,18 +308,26 @@ def Parseloghtml01(year, expedition, txt):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
tid= f"{year}.s{logbook_entry_count:02d}"
tid = set_trip_id(year,logbook_entry_count)
try:
s = re.match(r"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
if not s:
message = " ! - Skipping logentry on failure to parse header: " + tid + trippara[:300] + "..."
message = " ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
break
tripheader, triptext = s.group(1), s.group(2)
mtripid = re.search(r'<a id="(.*?)"', tripheader)
# if not mtripid:
# # not an error, this is probably just a different year
# message = f" ! - Fail id trip:{tid} header:'{tripheader}'"
# DataIssue.objects.create(parser='logbooks', message=message)
# logdataissues[tid]=message
# print(message)
tripid = mtripid and mtripid.group(1) or ""
#print(f" # - mtripid: {mtripid}")
tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
tripdate, triptitle, trippeople = tripheader.split("|")
@ -336,6 +354,13 @@ def Parseloghtml01(year, expedition, txt):
ltriptext = re.sub(r"</?u>", "_", ltriptext)
ltriptext = re.sub(r"</?i>", "''", ltriptext)
ltriptext = re.sub(r"</?b>", "'''", ltriptext)
if ltriptext == "":
message = " ! - Zero content for logbook entry!: " + tid
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, "html01", tripid)
@ -343,16 +368,16 @@ def Parseloghtml01(year, expedition, txt):
try:
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html")
entry_type="html", tid=tid)
except:
message = " ! - Enter log entry into database FAIL exception in: " + tid
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
try:
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
"html01", tripid, logbook_entry_count)
"html01", tripid, logbook_entry_count, tid=tid)
except:
message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid
DataIssue.objects.create(parser='logbooks', message=message)
@ -360,7 +385,7 @@ def Parseloghtml01(year, expedition, txt):
print(message)
except:
message = " ! - Skipping logentry due to exception in: " + tid
message = f" ! - Skipping logentry {year} due to exception in: {tid}"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
@ -381,11 +406,11 @@ def Parseloghtml03(year, expedition, txt):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
tid= f"{year}.s{logbook_entry_count:02d}"
tid = set_trip_id(year,logbook_entry_count)
s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
if not ( s ) :
message = " ! - Skipping logentry on failure to parse Parseloghtml03: {} {} {}...".format(tid,s,trippara[:300])
message = " ! - Skipping logentry {year} on failure to parse Parseloghtml03: {} {} {}...".format(tid,s,trippara[:300])
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
@ -419,10 +444,10 @@ def Parseloghtml03(year, expedition, txt):
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
text = ltriptext, trippeople=trippeople, expedition=expedition,
logtime_underground=0, entry_type="html")
logtime_underground=0, entry_type="html", tid=tid)
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
"html03", tid, logbook_entry_count)
"html03", tid, logbook_entry_count, tid=tid)
def SetDatesFromLogbookEntries(expedition):
@ -477,16 +502,15 @@ def LoadLogbookForExpedition(expedition, expect):
for di in dataissues:
ph = year
if re.search(ph, di.message) is not None:
print(f' - CLEANING dataissue {di.message}')
#print(f' - CLEANING dataissue {di.message}')
di.delete()
print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year')
#print(f' - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year')
dellist = []
for key, value in logdataissues.items():
# tripentry = year + "." + str(logbook_entry_count)
print(f' - CLEAN [{key}]')
if key.startswith(year + "."):
print(f' - CLEANING logdataissues [{key:12}]: value ')
#print(f' - CLEANING logdataissues [{key}]: {value}')
if key.startswith(year):
#print(f' - CLEANING logdataissues [{key:12}]: {value} ')
dellist.append(key)
for i in dellist:
del logdataissues[i]
@ -547,7 +571,9 @@ def LoadLogbookForExpedition(expedition, expect):
if logbook_parseable:
parser = globals()[parsefunc]
parser(expedition.year, expedition, txt)
parser(expedition.year, expedition, txt) # this launches the parser
SetDatesFromLogbookEntries(expedition)
if len(logentries) >0:
print(" - Cacheing " , len(logentries), " log entries")
@ -625,112 +651,13 @@ def LoadLogbooks():
odb.sync()
odb.close()
dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
# reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
# personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
# nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
# TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
# locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
# caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)
def parseAutoLogBookEntry(filename):
    '''Parse one stand-alone "auto" logbook entry file and save it to the database.

    An AutoLogBookEntry appears to be one that was created online using a form, for a single trip,
    which is then stored in a separate location to the usual logbook.html
    But when importing logbook.html all these individual entries also need to be parsed.

    This is all redundant as we are getting rid of the whole individual trip entry system

    filename -- path of the file holding the entry; it is opened in text mode and read whole.

    Returns a list of error strings if any field could not be parsed or looked up
    (in that case nothing is saved). Returns None after the LogbookEntry and its
    PersonTrip rows have been saved.
    '''
    # Imported locally under an alias so the date class is reachable whichever
    # form of "import datetime" the module uses, and so that no local variable
    # can shadow it.
    import datetime as _datetime

    errors = []

    # 'with' guarantees the handle is closed even if read() raises.
    with open(filename, "r") as f:
        contents = f.read()

    dateMatch = dateRegex.search(contents)
    if dateMatch:
        year, month, day = [int(x) for x in dateMatch.groups()]
        entry_date = _datetime.date(year, month, day)
    else:
        errors.append(" - Date could not be found")

    expeditionYearMatch = expeditionYearRegex.search(contents)
    if expeditionYearMatch:
        try:
            expedition = Expedition.objects.get(year = expeditionYearMatch.groups()[0])
            personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
        except Expedition.DoesNotExist:
            errors.append(" - Expedition not in database")
    else:
        errors.append(" - Expedition Year could not be parsed")

    titleMatch = titleRegex.search(contents)
    if titleMatch:
        title, = titleMatch.groups()
        if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
            errors.append(" - Title too long")
    else:
        errors.append(" - Title could not be found")

    caveMatch = caveRegex.search(contents)
    if caveMatch:
        caveRef, = caveMatch.groups()
        try:
            # this is a slow and uncertain function:
            cave = getCaveByReference(caveRef)
        except Exception:
            cave = None
            errors.append(" - Cave not found in database")
    else:
        cave = None

    locationMatch = locationRegex.search(contents)
    if locationMatch:
        location, = locationMatch.groups()
    else:
        location = None

    if cave is None and location is None:
        errors.append(" - Location nor cave could not be found")

    reportMatch = reportRegex.search(contents)
    if reportMatch:
        report, = reportMatch.groups()
    else:
        errors.append(" - Contents could not be found")

    if errors:
        return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.

    people = []
    # personRegex has a single group, so findall() yields the inner text of
    # each person <div>. Each person's details are looked up in that text,
    # not in the whole file, so that every person gets their own name.
    for personMatch in personRegex.findall(contents):
        nameAuthorMatch = nameAuthorRegex.search(personMatch)
        if nameAuthorMatch:
            author, name = nameAuthorMatch.groups()
            if name.lower() in personExpeditionNameLookup:
                personExpo = personExpeditionNameLookup[name.lower()]
            else:
                errors.append(" - Person could not be found in database")
            author = bool(author)
        else:
            errors.append(" - Persons name could not be found")

        # NOTE(review): presumably the TU span sits inside each person div;
        # fall back to the first TU in the file if it does not - confirm.
        TUMatch = TURegex.search(personMatch) or TURegex.search(contents)
        if TUMatch:
            TU, = TUMatch.groups()
        else:
            errors.append(" - TU could not be found")

        if not errors:
            # Keep the matching personexpedition with each person, so every
            # PersonTrip below is attached to the right one.
            people.append((personExpo, author, TU))

    if errors:
        return errors # Bail out before committing to the database

    logbookEntry = LogbookEntry(date = entry_date,
                                expedition = expedition,
                                title = title, cave = cave, place = location,
                                text = report, slug = slugify(title)[:50],
                                filename = filename)
    logbookEntry.save()

    for personExpo, author, TU in people:
        PersonTrip(personexpedition = personExpo,
                   time_underground = TU,
                   logbook_entry = logbookEntry,
                   is_logbook_entry_author = author).save()
    print(logbookEntry)