mirror of https://expo.survex.com/repositories/troggle/.git synced 2024-11-22 07:11:52 +00:00

New caching for parsed logbooks. All logbooks load in 75 seconds now.

Philip Sargent 2020-04-12 22:29:30 +01:00
parent 98fd314a62
commit ac9f3cf061
2 changed files with 84 additions and 30 deletions
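The substance of the change is in the second file: parsed logbook entries are collected as plain tuples in a module-level list and pickled to a .cache file alongside each logbook, so the next run can skip the slow parse step entirely (hence the 75-second full load). The cache is discarded if the logbook source is newer than the cache by more than 2 seconds, or if the cache itself is over 30 days old. A minimal sketch of that pattern, written as Python 3 (the committed code is from the Python 2 era) and factored into a hypothetical standalone helper; load_with_cache, parse_function and CACHE_MAX_AGE are illustrative names, not names from the commit:

import os
import pickle
import time

CACHE_MAX_AGE = 30 * 24 * 60 * 60  # 30 days, as in the commit

def load_with_cache(logbook_path, parse_function):
    """Parse logbook_path, or reuse a pickled .cache file beside it."""
    cache_path = logbook_path + ".cache"
    try:
        cache_t = os.path.getmtime(cache_path)  # OSError if no cache yet
        file_t = os.path.getmtime(logbook_path)
        # Stale if the source was edited after the cache was written
        # (2 s slack), or if the cache is more than 30 days old.
        if file_t - cache_t > 2 or time.time() - cache_t > CACHE_MAX_AGE:
            os.remove(cache_path)
        else:
            with open(cache_path, "rb") as f:
                return pickle.load(f)  # cache hit: skip the slow parse
    except (OSError, EOFError, pickle.UnpicklingError):
        pass  # missing, stale-deleted, or corrupt cache: fall through

    entries = parse_function(logbook_path)  # the slow path
    with open(cache_path, "wb") as f:
        pickle.dump(entries, f, 2)  # protocol 2, as in the diff
    return entries

Any failure while reading the cache simply falls through to a full re-parse and a fresh dump, which is the same recover-by-reparsing behaviour the bare except: blocks in the diff implement.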

View File

@@ -127,7 +127,7 @@ def import_auto_logbooks():
print(os.path.join(root, filename))
parsers.logbooks.parseAutoLogBookEntry(os.path.join(root, filename))
#Temporary function until definative source of data transfered.
#Temporary function until definitive source of data transfered.
from django.template.defaultfilters import slugify
from django.template import Context, loader
def dumplogbooks():
@@ -177,16 +177,16 @@ def usage():
caves - read in the caves
folklog - read in the people (folk) and then the logbooks
logbooks - read in just the logbooks
autologbooks - read in autologbooks
autologbooks - read in autologbooks (what are these?)
dumplogbooks - write out autologbooks (not working?)
people - read in the people from folk.csv
QMs - read in the QM files
resetend
scans - NOT the scanned surveynotes ?!
survex - read in the survex files
survexpos
survex - read in the survex files - all the survex blocks
survexpos - just the Pos out of the survex files
surveys - read in the scanned surveynotes
tunnel - read in the Tunnel files
tunnel - read in the Tunnel files - which scans the surveyscans too
""")
if __name__ == "__main__":
@@ -214,10 +214,7 @@ if __name__ == "__main__":
elif "resetend" in sys.argv:
#import_logbooks()
import_QMs()
try:
import_tunnelfiles()
except:
print("Tunnel files parser broken.")
import_tunnelfiles()
import_surveys()
import_descriptions()
parse_descriptions()

View File

@@ -12,8 +12,9 @@ from django.utils.timezone import make_aware
import csv
import re
import datetime
import datetime, time
import os
import pickle
from utils import save_carefully
@@ -78,10 +79,17 @@ def GetTripCave(place): #need to be fuzzier about matching here. Already a very
print("No cave found for place " , place)
return
logentries = [] # the entire logbook is a single object: a list of entries
noncaveplaces = [ "Journey", "Loser Plateau" ]
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"):
""" saves a logbook entry and related persontrips """
global logentries
entrytuple = (date, place, title, text,
trippeople, expedition, logtime_underground, entry_type)
logentries.append(entrytuple)
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
if not author:
print(" - Skipping logentry: " + title + " - no author for entry")
@@ -100,12 +108,14 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
lookupAttribs={'date':date, 'title':title}
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50], 'entry_type':entry_type}
lbo, created=save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)
#logentries.append(models.LogbookEntry)
for tripperson, time_underground in trippersons:
lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
#print nonLookupAttribs
save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)
#logentries.append(models.PersonTrip)
def ParseDate(tripdate, year):
@@ -189,7 +199,7 @@ def Parseloghtmltxt(year, expedition, txt):
trippeople=trippeople, expedition=expedition, logtime_underground=0,
entry_type="html")
if logbook_entry_count == 0:
print(" - No trip entrys found in logbook, check the syntax matches htmltxt format")
print(" - No trip entries found in logbook, check the syntax matches htmltxt format")
# main parser for 1991 - 2001. simpler because the data has been hacked so much to fit it
@@ -294,39 +304,86 @@ def SetDatesFromLogbookEntries(expedition):
def LoadLogbookForExpedition(expedition):
""" Parses all logbook entries for one expedition """
global logentries
expowebbase = os.path.join(settings.EXPOWEB, "years")
yearlinks = settings.LOGBOOK_PARSER_SETTINGS
logbook_parseable = False
logbook_cached = False
if expedition.year in yearlinks:
# print " - Valid logbook year: ", expedition.year
year_settings = yearlinks[expedition.year]
file_in = open(os.path.join(expowebbase, year_settings[0]))
txt = file_in.read().decode("latin1")
file_in.close()
parsefunc = year_settings[1]
logbook_parseable = True
print(" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1])
else:
try:
file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE))
bad_cache = False
cache_filename = os.path.join(expowebbase, year_settings[0])+".cache"
now = time.time()
cache_t = os.path.getmtime(cache_filename)
file_t = os.path.getmtime(os.path.join(expowebbase, year_settings[0]))
if file_t - cache_t > 2: # at least 2 secs later
#print " - Cache is stale."
bad_cache= True
if now - cache_t > 30*24*60*60:
#print " - Cache is more than 30 days old."
bad_cache= True
if bad_cache:
print " - Cache is either stale or more than 30 days old. Deleting it."
os.remove(cache_filename)
logentries=[]
raise
print(" - Reading cache: " + cache_filename )
try:
with open(cache_filename, "rb") as f:
logentries = pickle.load(f)
print " - Loaded ", len(logentries), " objects"
logbook_cached = True
except:
print " - Failed to load corrupt cache. Deleting it.\n"
os.remove(cache_filename)
logentries=[]
except:
print(" - Opening logbook: ")
file_in = open(os.path.join(expowebbase, year_settings[0]))
txt = file_in.read().decode("latin1")
file_in.close()
parsefunc = year_settings[1]
logbook_parseable = True
print("No set parser found using default")
parsefunc = settings.DEFAULT_LOGBOOK_PARSER
except (IOError):
logbook_parseable = False
print("Couldn't open default logbook file and nothing in settings for expo " + expedition.year)
print(" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1])
else:
try:
file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE))
txt = file_in.read().decode("latin1")
file_in.close()
logbook_parseable = True
print("No set parser found using default")
parsefunc = settings.DEFAULT_LOGBOOK_PARSER
except (IOError):
logbook_parseable = False
print("Couldn't open default logbook file and nothing in settings for expo " + expedition.year)
if logbook_parseable:
parser = globals()[parsefunc]
parser(expedition.year, expedition, txt)
SetDatesFromLogbookEntries(expedition)
# and this has also stored all the objects in logentries[]
print " - Storing " , len(logentries), " log entries"
with open(cache_filename, "wb") as f:
pickle.dump(logentries, f, 2)
logentries=[] # flush for next year
if logbook_cached:
i=0
for entrytuple in range(len(logentries)):
date, place, title, text, trippeople, expedition, logtime_underground, \
entry_type = logentries[i]
#print " - - obj ", i, date, title
EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground,\
entry_type)
i +=1
#return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database. """
@@ -372,7 +429,7 @@ def parseAutoLogBookEntry(filename):
except models.Expedition.DoesNotExist:
errors.append("Expedition not in database")
else:
errors.append("Expediton Year could not be parsed")
errors.append("Expedition Year could not be parsed")
titleMatch = titleRegex.search(contents)
if titleMatch: