mirror of https://expo.survex.com/repositories/troggle/.git synced 2024-11-22 07:11:52 +00:00

New caching for parsed logbooks. All logbooks load in 75 seconds now.

Philip Sargent 2020-04-12 22:29:30 +01:00
parent 98fd314a62
commit ac9f3cf061
2 changed files with 84 additions and 30 deletions
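The substance of the change is in the second file: parsed logbook entries are collected as plain tuples in a module-level list and pickled to a .cache file alongside each logbook, so the next run can skip the slow parse step entirely (hence the 75-second full load). The cache is discarded if the logbook source is newer than the cache by more than 2 seconds, or if the cache itself is over 30 days old. A minimal sketch of that pattern, written as Python 3 (the committed code is from the Python 2 era) and factored into a hypothetical standalone helper; load_with_cache, parse_function and CACHE_MAX_AGE are illustrative names, not names from the commit:

import os
import pickle
import time

CACHE_MAX_AGE = 30 * 24 * 60 * 60  # 30 days, as in the commit

def load_with_cache(logbook_path, parse_function):
    """Parse logbook_path, or reuse a pickled .cache file beside it."""
    cache_path = logbook_path + ".cache"
    try:
        cache_t = os.path.getmtime(cache_path)  # OSError if no cache yet
        file_t = os.path.getmtime(logbook_path)
        # Stale if the source was edited after the cache was written
        # (2 s slack), or if the cache is more than 30 days old.
        if file_t - cache_t > 2 or time.time() - cache_t > CACHE_MAX_AGE:
            os.remove(cache_path)
        else:
            with open(cache_path, "rb") as f:
                return pickle.load(f)  # cache hit: skip the slow parse
    except (OSError, EOFError, pickle.UnpicklingError):
        pass  # missing, stale-deleted, or corrupt cache: fall through

    entries = parse_function(logbook_path)  # the slow path
    with open(cache_path, "wb") as f:
        pickle.dump(entries, f, 2)  # protocol 2, as in the diff
    return entries

Any failure while reading the cache simply falls through to a full re-parse and a fresh dump, which is the same recover-by-reparsing behaviour the bare except: blocks in the diff implement.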

View File

@@ -127,7 +127,7 @@ def import_auto_logbooks():
print(os.path.join(root, filename))
parsers.logbooks.parseAutoLogBookEntry(os.path.join(root, filename))
#Temporary function until definative source of data transfered.
#Temporary function until definitive source of data transfered.
from django.template.defaultfilters import slugify
from django.template import Context, loader
def dumplogbooks():
@@ -177,16 +177,16 @@ def usage():
caves - read in the caves
folklog - read in the people (folk) and then the logbooks
logbooks - read in just the logbooks
autologbooks - read in autologbooks
autologbooks - read in autologbooks (what are these?)
dumplogbooks - write out autologbooks (not working?)
people - read in the people from folk.csv
QMs - read in the QM files
resetend
scans - NOT the scanned surveynotes ?!
survex - read in the survex files
survexpos
survex - read in the survex files - all the survex blocks
survexpos - just the Pos out of the survex files
surveys - read in the scanned surveynotes
tunnel - read in the Tunnel files
tunnel - read in the Tunnel files - which scans the surveyscans too
""")
if __name__ == "__main__":
@@ -214,10 +214,7 @@ if __name__ == "__main__":
elif "resetend" in sys.argv:
#import_logbooks()
import_QMs()
try:
import_tunnelfiles()
except:
print("Tunnel files parser broken.")
import_tunnelfiles()
import_surveys()
import_descriptions()
parse_descriptions()

View File

@@ -12,8 +12,9 @@ from django.utils.timezone import make_aware
import csv
import re
import datetime
import datetime, time
import os
import pickle
from utils import save_carefully
@@ -78,10 +79,17 @@ def GetTripCave(place): #need to be fuzzier about matching here. Already a very
print("No cave found for place " , place)
return
logentries = [] # the entire logbook is a single object: a list of entries
noncaveplaces = [ "Journey", "Loser Plateau" ]
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"):
""" saves a logbook entry and related persontrips """
global logentries
entrytuple = (date, place, title, text,
trippeople, expedition, logtime_underground, entry_type)
logentries.append(entrytuple)
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
if not author:
print(" - Skipping logentry: " + title + " - no author for entry")
@@ -100,12 +108,14 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
lookupAttribs={'date':date, 'title':title}
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50], 'entry_type':entry_type}
lbo, created=save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)
#logentries.append(models.LogbookEntry)
for tripperson, time_underground in trippersons:
lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
#print nonLookupAttribs
save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)
#logentries.append(models.PersonTrip)
def ParseDate(tripdate, year):
@@ -189,7 +199,7 @@ def Parseloghtmltxt(year, expedition, txt):
trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html")
if logbook_entry_count == 0:
print(" - No trip entrys found in logbook, check the syntax matches htmltxt format")
print(" - No trip entries found in logbook, check the syntax matches htmltxt format")
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
@@ -294,39 +304,86 @@ def SetDatesFromLogbookEntries(expedition):
def LoadLogbookForExpedition(expedition):
""" Parses all logbook entries for one expedition """
global logentries
expowebbase = os.path.join(settings.EXPOWEB, "years")
yearlinks = settings.LOGBOOK_PARSER_SETTINGS
logbook_parseable = False
logbook_cached = False
if expedition.year in yearlinks:
# print " - Valid logbook year: ", expedition.year
year_settings = yearlinks[expedition.year]
file_in = open(os.path.join(expowebbase, year_settings[0]))
txt = file_in.read().decode("latin1")
file_in.close()
parsefunc = year_settings[1]
logbook_parseable = True
print(" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1])
else:
try:
file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE))
bad_cache = False
cache_filename = os.path.join(expowebbase, year_settings[0])+".cache"
now = time.time()
cache_t = os.path.getmtime(cache_filename)
file_t = os.path.getmtime(os.path.join(expowebbase, year_settings[0]))
if file_t - cache_t > 2: # at least 2 secs later
#print " - Cache is stale."
bad_cache= True
if now - cache_t > 30*24*60*60:
#print " - Cache is more than 30 days old."
bad_cache= True
if bad_cache:
print " - Cache is either stale or more than 30 days old. Deleting it."
os.remove(cache_filename)
logentries=[]
raise
print(" - Reading cache: " + cache_filename )
try:
with open(cache_filename, "rb") as f:
logentries = pickle.load(f)
print " - Loaded ", len(logentries), " objects"
logbook_cached = True
except:
print " - Failed to load corrupt cache. Deleting it.\n"
os.remove(cache_filename)
logentries=[]
except:
print(" - Opening logbook: ")
file_in = open(os.path.join(expowebbase, year_settings[0]))
txt = file_in.read().decode("latin1")
file_in.close()
parsefunc = year_settings[1]
logbook_parseable = True
print("No set parser found using default")
parsefunc = settings.DEFAULT_LOGBOOK_PARSER
except (IOError):
logbook_parseable = False
print("Couldn't open default logbook file and nothing in settings for expo " + expedition.year)
print(" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1])
else:
try:
file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE))
txt = file_in.read().decode("latin1")
file_in.close()
logbook_parseable = True
print("No set parser found using default")
parsefunc = settings.DEFAULT_LOGBOOK_PARSER
except (IOError):
logbook_parseable = False
print("Couldn't open default logbook file and nothing in settings for expo " + expedition.year)
if logbook_parseable:
parser = globals()[parsefunc]
parser(expedition.year, expedition, txt)
SetDatesFromLogbookEntries(expedition)
# and this has also stored all the objects in logentries[]
print " - Storing " , len(logentries), " log entries"
with open(cache_filename, "wb") as f:
pickle.dump(logentries, f, 2)
logentries=[] # flush for next year
if logbook_cached:
i=0
for entrytuple in range(len(logentries)):
date, place, title, text, trippeople, expedition, logtime_underground, \
entry_type = logentries[i]
#print " - - obj ", i, date, title
EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
entry_type)
i +=1
#return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database. """
@@ -372,7 +429,7 @@ def parseAutoLogBookEntry(filename):
except models.Expedition.DoesNotExist:
errors.append("Expedition not in database")
else:
errors.append("Expediton Year could not be parsed")
errors.append("Expedition Year could not be parsed")
titleMatch = titleRegex.search(contents)
if titleMatch: