Many fixes and speedups

Philip Sargent 2022-08-30 17:58:49 +03:00
parent 6daa96b69e
commit 0853bbdd19


@@ -27,33 +27,36 @@ An idea which no longer seems sensible given that we rely on the database to do
# it can be checked up later from the hard-copy if necessary; or it's not possible to determin (name, trip place, etc)
'''
todo='''
- Put the object store 'trips' and the 'logdataissues' into TROG global object
- Use the .shelve.db cache for all logbooks, not just individually
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
-- far too many uses of Django field dereferencing to get values, which is SLOW
- profile the code to find bad repetitive things, of which there are many.
- Loogbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
- far too many uses of Django field dereferencing to get values, which is SLOW
- Logbooks 1987, 1988, 1989 all crash on MySql - but not sqlite - with db constraint fail. Edit logbook to fix.
- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
we keep only a modern HTML05 format. Then we can retiure the old parsers and reduce the
we keep only a modern HTML05 format. Then we can retire the old parsers and reduce the
volume of code here substantially.
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
- the object store will need additional functions to replicate the persontrip calculation
and storage. For the moment we leave all that to be done in the django db
Concurrent synchronisation would be nice..
- DB lock currently prevents multiple threads for loading logbooks. But asyncio might work..?
- We should ensure logbook.html is utf-8 and stop this crap:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
- this is a slow and uncertain function: cave = getCaveByReference(caveRef)
- the object store will need additional functions to replicate the persontrip calculation
and storage. For the moment we leave all that to be done in the django db
Concurrent synchronisation would be nice..
- DB lock currently prevents multiple threads for loading logbooks. But asyncio might work..?
- Put the object store 'trips' and the 'logdataissues' into TROG global object
'''
logentries = [] # the entire logbook for one year is a single object: a list of entries
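
One todo item above proposes rewriting the parsers to use generators rather than building the whole logentries list in memory. A minimal, self-contained sketch of that idea, with made-up names (generate_entries and the placeholder parsing are not in this file):

def generate_entries(tripparas):
    # Hypothetical generator form: yield one parsed entry at a time instead of
    # appending every entrytuple to the module-level logentries list.
    for seq, trippara in enumerate(tripparas):
        # ... the same regex work the existing Parse* functions do would go here ...
        yield (seq, trippara)          # placeholder for the real entrytuple

for entry in generate_entries(["trip one", "trip two"]):
    print(entry)                       # only one entry is materialised at a time
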
@@ -62,7 +65,7 @@ noncaveplaces = [ "QMplaceholder", "Journey", "Loser Plateau", "UNKNOWN", 'plate
logdataissues = TROG['issues']['logdataissues']
trips ={}
entries = { "2022": 42, "2019": 44, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
entries = { "2022": 62, "2019": 44, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
@@ -101,7 +104,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
if not personyear:
message = f" ! - {expedition.year} No name match for: '{tripperson}' "
message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this expedition year."
print(message)
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
@@ -115,7 +118,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
return res, author
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None):
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
""" saves a logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
@@ -132,7 +135,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
return
if not author:
message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry in year "
message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues["title"]=message
print(message)
@@ -163,9 +166,9 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
slug = tid + "_" + slugify(title)[:10].replace('-','_')
else:
slug = str(randint(1000,10000)) + "_" + slugify(title)[:10].replace('-','_')
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug, 'entry_type':entry_type}
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug}
# This cretes the lbo instance of LogbookEntry
# This creates the lbo instance of LogbookEntry
lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
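
save_carefully() is not shown in this diff, but the call above fixes its contract: it takes a model class, a dict of identifying fields and a dict of remaining fields, and returns (instance, created). A hedged sketch of what a helper with that signature usually reduces to in Django (the real troggle implementation may do extra checking and logging):

def save_carefully_sketch(model, lookupAttribs, nonLookupAttribs):
    # Look the object up by its identifying fields; only apply the non-lookup
    # fields as defaults when a new row has to be created.
    instance, created = model.objects.get_or_create(defaults=nonLookupAttribs, **lookupAttribs)
    return instance, created
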
@@ -253,14 +256,14 @@ def Parselogwikitxt(year, expedition, txt):
trippeople, expedition, tu, "wiki", tripid)
logentries.append(entrytuple)
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople,
expedition=expedition, logtime_underground=0, tid=tid)
# EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople,
# expedition=expedition, logtime_underground=0, tid=tid)
EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople,
tu, "wiki", tripid, logbook_entry_count, tid=tid)
# EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople,
# tu, "wiki", tripid, logbook_entry_count, tid=tid)
def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq, tid=None):
def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, tripid1, seq, tid=None):
'''Called once for each logbook entry as the logbook is parsed
'''
# This will need additional functions to replicate the persontrip calculation and storage. For the
@@ -280,7 +283,7 @@ def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu,
if not tid:
tid = set_trip_id(str(date),seq)
trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu, formattype)
trips[tid] = (year, date, tripcave, triptitle, text, trippeople, tu)
## copy a lot of checking functionality here from EnterLogIntoDbase()
# GetTripPersons is a db query, so this will need to be put in ObjStore before this will work..
@@ -353,12 +356,12 @@ def Parseloghtmltxt(year, expedition, txt):
trippeople, expedition, tu, "html", tripid1)
logentries.append(entrytuple)
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html", tid=tid)
# EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext,
# trippeople=trippeople, expedition=expedition, logtime_underground=0,
# entry_type="html", tid=tid)
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
"html", tripid1, logbook_entry_count, tid=tid)
# EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
# "html", tripid1, logbook_entry_count, tid=tid)
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
@@ -455,24 +458,24 @@ def Parseloghtml01(year, expedition, txt):
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, "html01", tid)
logentries.append(entrytuple)
try:
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html", tid=tid)
except:
message = " ! - Enter log entry into database FAIL exception in: " + tid
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
# try:
# EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
# trippeople=trippeople, expedition=expedition, logtime_underground=0,
# entry_type="html", tid=tid)
# except:
# message = " ! - Enter log entry into database FAIL exception in: " + tid
# DataIssue.objects.create(parser='logbooks', message=message)
# logdataissues[tid]=message
# print(message)
try:
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
"html01", tid, logbook_entry_count, tid=tid)
except:
message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid
DataIssue.objects.create(parser='logbooks', message=message)
logdataissues[tid]=message
print(message)
# try:
# EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
# "html01", tid, logbook_entry_count, tid=tid)
# except:
# message = " ! - Enter log entry into ObjectStore FAIL exception in: " + tid
# DataIssue.objects.create(parser='logbooks', message=message)
# logdataissues[tid]=message
# print(message)
except:
message = f" ! - Skipping logentry {year} due to exception in: {tid}"
@@ -514,7 +517,7 @@ def Parseloghtml03(year, expedition, txt):
if re.match("T/U|Time underwater", sheader[-1]):
tu = sheader.pop()
if len(sheader) != 3:
print((" ! Header not three pieces", sheader))
print(" ! Header not three pieces", sheader)
tripdate, triptitle, trippeople = sheader
ldate = ParseDate(tripdate.strip(), year)
triptitles = triptitle.split(" , ")
@@ -532,12 +535,12 @@ def Parseloghtml03(year, expedition, txt):
trippeople, expedition, tu, "html03", tid)
logentries.append(entrytuple)
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
text = ltriptext, trippeople=trippeople, expedition=expedition,
logtime_underground=0, entry_type="html", tid=tid)
# EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle,
# text = ltriptext, trippeople=trippeople, expedition=expedition,
# logtime_underground=0, entry_type="html", tid=tid)
EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
"html03", tid, logbook_entry_count, tid=tid)
# EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
# "html03", tid, logbook_entry_count, tid=tid)
def SetDatesFromLogbookEntries(expedition):
@@ -618,7 +621,7 @@ def LoadLogbookForExpedition(expedition):
expedition.logbookfile = yearlinks[year][0]
parsefunc = yearlinks[year][1]
else:
logbookpath = os.path.join(expologbase, year, settings.DEFAULT_LOGBOOK_FILE)
logbookpath = Path(expologbase) / year / settings.DEFAULT_LOGBOOK_FILE
expedition.logbookfile = settings.DEFAULT_LOGBOOK_FILE
parsefunc = settings.DEFAULT_LOGBOOK_PARSER
cache_filename = Path(str(logbookpath) + ".cache")
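
The pathlib change above builds the same path that os.path.join did; the Path then has to go back through str() to get the ".cache" suffix appended. A small standard-library illustration with made-up values:

import os
from pathlib import Path

expologbase, year, default = "/expo/logbooks", "2019", "logbook.html"   # illustrative only
old_style = os.path.join(expologbase, year, default)
new_style = Path(expologbase) / year / default
assert str(new_style) == old_style
cache_filename = Path(str(new_style) + ".cache")   # as in the code above
# equivalently: new_style.with_suffix(new_style.suffix + ".cache")
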
@@ -639,13 +642,13 @@ def LoadLogbookForExpedition(expedition):
print(" - ! Cache is > 30 days old")
bad_cache= True
if bad_cache:
print(" - ! Cache is either stale or more than 30 days old. Deleting it.")
print(" - so cache is either stale or more than 30 days old. Deleting it.")
os.remove(cache_filename)
logentries=[]
print(" ! Removed stale or corrupt cache file")
print(" - Deleted stale or corrupt cache file")
raise
# print(" - Reading cache: " + str(cache_filename), end='')
try:
# print(" - Reading cache: " + str(cache_filename), end='')
with open(cache_filename, "rb") as f:
year,n,logentries = pickle.load(f)
if validcache(year,n):
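
The cache being read here is a pickled 3-tuple of (year, count, entries); the matching write happens further down where the "Cacheing" message is printed. A minimal sketch of the round trip, assuming validcache() simply checks the pickled count against the expected-entries table near the top of the file (its body is not in this diff):

import pickle

def write_cache(cache_filename, year, logentries):
    # presumably what the "Cacheing ... log entries" branch stores
    with open(cache_filename, "wb") as f:
        pickle.dump((year, len(logentries), logentries), f)

def read_cache(cache_filename, expected_counts):
    with open(cache_filename, "rb") as f:
        year, n, logentries = pickle.load(f)
    if expected_counts.get(year) != n:      # assumed validcache()-style check
        raise ValueError(f"cache count {n} does not match expected {expected_counts.get(year)}")
    return logentries
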
@@ -660,21 +663,22 @@ def LoadLogbookForExpedition(expedition):
logentries=[]
raise
except :
print(" - Cache de-pickle failure \"" + str(cache_filename) +"\"")
print(" - Cache old or de-pickle failure \"" + str(cache_filename) +"\"")
try:
file_in = open(logbookpath,'rb')
txt = file_in.read().decode("latin1")
txt = file_in.read().decode("utf-8")
file_in.close()
logbook_parseable = True
except (IOError):
logbook_parseable = False
print((" ! Couldn't open logbook " + logbookpath))
print(" ! Couldn't open logbook as UTF-8 " + logbookpath)
if logbook_parseable:
parser = globals()[parsefunc]
print(f' - Using parser {parsefunc}')
parser(year, expedition, txt) # this launches the right parser for this year
print(" - Setting dates from logbook entries")
SetDatesFromLogbookEntries(expedition)
if len(logentries) >0:
print(" - Cacheing " , len(logentries), " log entries")
@@ -686,11 +690,14 @@ def LoadLogbookForExpedition(expedition):
i=0
for entrytuple in logentries:
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
try:
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
except ValueError: # cope with removal of entry_type but still in cache files. Remove in Sept. 2022.
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, entry_type, tripid1 = entrytuple
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, 0,
entry_type, tripid1)
tripid1)
EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, logtime_underground,
entry_type, tripid1, i)
tripid1, i)
i +=1
SetDatesFromLogbookEntries(expedition)