diff --git a/databaseReset.py b/databaseReset.py
index 1eed075..c27ee7c 100644
--- a/databaseReset.py
+++ b/databaseReset.py
@@ -127,7 +127,7 @@ def import_auto_logbooks():
             print(os.path.join(root, filename))
             parsers.logbooks.parseAutoLogBookEntry(os.path.join(root, filename))
 
-#Temporary function until definative source of data transfered.
+#Temporary function until definitive source of data transferred.
 from django.template.defaultfilters import slugify
 from django.template import Context, loader
 def dumplogbooks():
@@ -177,16 +177,16 @@ def usage():
          caves - read in the caves
          folklog - read in the people (folk) and then the logbooks
          logbooks - read in just the logbooks
-         autologbooks - read in autologbooks
+         autologbooks - read in autologbooks (what are these?)
          dumplogbooks - write out autologbooks (not working?)
          people - read in the people from folk.csv
          QMs - read in the QM files
          resetend
         scans - NOT the scanned surveynotes ?!
-         survex - read in the survex files - survexpos
-
+         survex - read in the survex files - all the survex blocks
+         survexpos - just the Pos out of the survex files
          surveys - read in the scanned surveynotes
-         tunnel - read in the Tunnel files
+         tunnel - read in the Tunnel files - which scans the surveyscans too
          """)
 
 if __name__ == "__main__":
@@ -214,10 +214,7 @@
     elif "resetend" in sys.argv:
         #import_logbooks()
         import_QMs()
-        try:
-            import_tunnelfiles()
-        except:
-            print("Tunnel files parser broken.")
+        import_tunnelfiles()
         import_surveys()
         import_descriptions()
         parse_descriptions()
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 9dfa31b..9f47d4f 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -12,8 +12,9 @@ from django.utils.timezone import make_aware
 
 import csv
 import re
-import datetime
+import datetime, time
 import os
+import pickle
 
 from utils import save_carefully
 
@@ -78,10 +79,17 @@ def GetTripCave(place): #need to be fuzzier about matching here. Already a very
         print("No cave found for place " , place)
     return
 
-
+logentries = [] # the entire logbook is a single object: a list of entries
 noncaveplaces = [ "Journey", "Loser Plateau" ]
+
 def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"):
     """ saves a logbook entry and related persontrips """
+    global logentries
+
+    entrytuple = (date, place, title, text,
+                  trippeople, expedition, logtime_underground, entry_type)
+    logentries.append(entrytuple)
+
     trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
     if not author:
         print(" - Skipping logentry: " + title + " - no author for entry")
@@ -100,12 +108,14 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
     lookupAttribs={'date':date, 'title':title}
     nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50], 'entry_type':entry_type}
     lbo, created=save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)
+    #logentries.append(models.LogbookEntry)
+
 
     for tripperson, time_underground in trippersons:
         lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
         nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
-        #print nonLookupAttribs
         save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)
+    #logentries.append(models.PersonTrip)
 
 
 def ParseDate(tripdate, year):
@@ -189,7 +199,7 @@ def Parseloghtmltxt(year, expedition, txt):
                           trippeople=trippeople, expedition=expedition, logtime_underground=0,
                           entry_type="html")
     if logbook_entry_count == 0:
-        print(" - No trip entrys found in logbook, check the syntax matches htmltxt format")
+        print(" - No trip entries found in logbook, check the syntax matches htmltxt format")
 
 
 # main parser for 1991 - 2001.  simpler because the data has been hacked so much to fit it
@@ -293,40 +303,89 @@ def SetDatesFromLogbookEntries(expedition):
 
 
 def LoadLogbookForExpedition(expedition):
     """ Parses all logbook entries for one expedition """
-
+
+    global logentries
+
     expowebbase = os.path.join(settings.EXPOWEB, "years")
     yearlinks = settings.LOGBOOK_PARSER_SETTINGS
 
     logbook_parseable = False
-
+    logbook_cached = False
+
     if expedition.year in yearlinks:
+        # print(" - Valid logbook year: " + expedition.year)
         year_settings = yearlinks[expedition.year]
-        file_in = open(os.path.join(expowebbase, year_settings[0]))
-        txt = file_in.read().decode("latin1")
-        file_in.close()
-        parsefunc = year_settings[1]
-        logbook_parseable = True
-        print(" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1])
-    else:
         try:
-            file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE))
+            bad_cache = False
+            cache_filename = os.path.join(expowebbase, year_settings[0]) + ".cache"
+            now = time.time()
+            cache_t = os.path.getmtime(cache_filename)
+            file_t = os.path.getmtime(os.path.join(expowebbase, year_settings[0]))
+            if file_t - cache_t > 2: # logbook file is at least 2 secs newer than the cache
+                #print(" - Cache is stale.")
+                bad_cache = True
+            if now - cache_t > 30*24*60*60:
+                #print(" - Cache is more than 30 days old.")
+                bad_cache = True
+            if bad_cache:
+                print(" - Cache is either stale or more than 30 days old. Deleting it.")
+                os.remove(cache_filename)
+                logentries = []
+                raise IOError("stale cache")   # caught below: re-parse the logbook
+            print(" - Reading cache: " + cache_filename)
+            try:
+                with open(cache_filename, "rb") as f:
+                    logentries = pickle.load(f)
+                print(" - Loaded " + str(len(logentries)) + " objects")
+                logbook_cached = True
+            except:
+                print(" - Failed to load corrupt cache. Deleting it.\n")
+                os.remove(cache_filename)
+                logentries = []
+                raise   # caught below: re-parse the logbook
+        except:
+            print(" - Opening logbook: ")
+            file_in = open(os.path.join(expowebbase, year_settings[0]))
             txt = file_in.read().decode("latin1")
             file_in.close()
+            parsefunc = year_settings[1]
             logbook_parseable = True
-            print("No set parser found using default")
-            parsefunc = settings.DEFAULT_LOGBOOK_PARSER
-        except (IOError):
-            logbook_parseable = False
-            print("Couldn't open default logbook file and nothing in settings for expo " + expedition.year)
+            print(" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1])
+    else:
+        try:
+            file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE))
+            txt = file_in.read().decode("latin1")
+            file_in.close()
+            cache_filename = os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE) + ".cache" # so the cache dump below has a filename
+            logbook_parseable = True
+            print("No set parser found using default")
+            parsefunc = settings.DEFAULT_LOGBOOK_PARSER
+        except (IOError):
+            logbook_parseable = False
+            print("Couldn't open default logbook file and nothing in settings for expo " + expedition.year)
 
     if logbook_parseable:
         parser = globals()[parsefunc]
         parser(expedition.year, expedition, txt)
         SetDatesFromLogbookEntries(expedition)
+        # parsing has also stored all the entry tuples in logentries[]
+        print(" - Storing " + str(len(logentries)) + " log entries")
+        with open(cache_filename, "wb") as f:
+            pickle.dump(logentries, f, 2)
+        logentries = [] # flush for next year
+
+    if logbook_cached:
+        entries = logentries
+        logentries = []   # EnterLogIntoDbase() re-appends each tuple as it saves it
+        for entrytuple in entries:
+            date, place, title, text, trippeople, expedition, logtime_underground, \
+                entry_type = entrytuple
+            EnterLogIntoDbase(date, place, title, text, trippeople, expedition,
+                              logtime_underground, entry_type)
+        logentries = [] # flush for next year
 
     #return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
 
-
 def LoadLogbooks():
     """ This is the master function for parsing all logbooks into the Troggle database. """
@@ -372,7 +429,7 @@ def parseAutoLogBookEntry(filename):
         except models.Expedition.DoesNotExist:
             errors.append("Expedition not in database")
     else:
-        errors.append("Expediton Year could not be parsed")
+        errors.append("Expedition Year could not be parsed")
 
     titleMatch = titleRegex.search(contents)
     if titleMatch:
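---

Note on the caching scheme introduced in LoadLogbookForExpedition: each year's
logbook parse is memoized as a pickle of (date, place, title, text, trippeople,
expedition, logtime_underground, entry_type) tuples, and the cache is thrown
away when the logbook file is newer than the cache or the cache is over 30 days
old. A minimal self-contained sketch of the same pattern follows; the names
load_with_cache, parse_func and CACHE_MAX_AGE are illustrative, not from the
patch:

    import os
    import time
    import pickle

    CACHE_MAX_AGE = 30*24*60*60   # 30 days, as in the patch

    def load_with_cache(source_file, parse_func):
        """Return parse_func(source_file), using a '<source>.cache' pickle when fresh."""
        cache_file = source_file + ".cache"
        try:
            cache_t = os.path.getmtime(cache_file)
            file_t = os.path.getmtime(source_file)
            # Stale if the source is newer (2 s slack) or the cache is too old.
            if file_t - cache_t > 2 or time.time() - cache_t > CACHE_MAX_AGE:
                os.remove(cache_file)
                raise IOError("stale cache")
            with open(cache_file, "rb") as f:
                return pickle.load(f)
        except (OSError, IOError, EOFError, pickle.UnpicklingError):
            objects = parse_func(source_file)      # slow path: re-parse
            with open(cache_file, "wb") as f:
                pickle.dump(objects, f, 2)         # protocol 2 loads under py2 and py3
            return objects

Unlike this sketch, the patch replays the cached tuples through
EnterLogIntoDbase() on a cache hit, so the LogbookEntry and PersonTrip rows are
still created in the database; only the parsed tuples are pickled (note the
commented-out logentries.append(models.LogbookEntry) experiments).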