dbReset now loads into memory first (fast err checking), then into db

2026-06-22 06:44:43 +01:00 · 2020-04-30 23:15:57 +01:00
parent 76a6b501f3
commit 39c622d5bf
4 changed files with 123 additions and 37 deletions
@@ -33,3 +33,4 @@ ignored-files.log
 tunnel-import.log
 posnotfound
 troggle.sqlite-journal
+loadsurvexblks.log
@@ -55,8 +55,9 @@ def controlPanel(request):
    
            #importlist is mostly here so that things happen in the correct order.
            #http post data seems to come in an unpredictable order, so we do it this way.
-            importlist=['reload_db', 'import_people', 'import_cavetab', 'import_logbooks', 'import_surveys', 'import_QMs']
-            databaseReset.make_dirs()
+            importlist=['reinit_db', 'import_people', 'import_caves', 'import_logbooks',
+            'import_survexblks', 'import_QMs', 'import_survexpos', 'import_surveyscans', 'import_tunnelfiles']
+            databaseReset.dirsredirect()
            for item in importlist:
                if item in request.POST:
                    print("running"+ " databaseReset."+item+"()")
@@ -5,7 +5,7 @@ import settings
 os.environ['PYTHONPATH'] = settings.PYTHON_PATH
 os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings')
 from django.core import management
-from django.db import connection
+from django.db import connection, close_old_connections
 from django.contrib.auth.models import User
 from django.http import HttpResponse
 from django.core.urlresolvers import reverse
@@ -13,6 +13,9 @@ from troggle.core.models import Cave, Entrance
 import troggle.flatpages.models
 import json

+# NOTE databaseReset.py is *imported* by views_other.py as it is used in the control panel
+# presented there.
+
 databasename=settings.DATABASES['default']['NAME']
 expouser=settings.EXPOUSER
 expouserpass=settings.EXPOUSERPASS
@@ -22,17 +25,18 @@ def reinit_db():
    """Rebuild database from scratch. Deletes the file first if sqlite is used,
    otherwise it drops the database and creates it.
    """
+    currentdbname = settings.DATABASES['default']['NAME']
    if settings.DATABASES['default']['ENGINE'] == 'django.db.backends.sqlite3':
        try:
-            os.remove(databasename)
+            os.remove(currentdbname)
        except OSError:
            pass
    else:
        cursor = connection.cursor()
-        cursor.execute("DROP DATABASE %s" % databasename)
-        cursor.execute("CREATE DATABASE %s" % databasename)
-        cursor.execute("ALTER DATABASE %s CHARACTER SET=utf8" % databasename)
-        cursor.execute("USE %s" % databasename)
+        cursor.execute("DROP DATABASE %s" % currentdbname)
+        cursor.execute("CREATE DATABASE %s" % currentdbname)
+        cursor.execute("ALTER DATABASE %s CHARACTER SET=utf8" % currentdbname)
+        cursor.execute("USE %s" % currentdbname)
    syncuser()

 def syncuser():
@@ -73,7 +77,7 @@ def import_logbooks():
 def import_QMs():
    print("Importing QMs (old caves)")
    import parsers.QMs
-    # import process itself runs on qm.csv in only 3 caves, not 264!
+    # import process itself runs on qm.csv in only 3 old caves, not the modern ones!
    
 def import_survexblks():
    import parsers.survex
@@ -159,7 +163,7 @@ def dumplogbooks():
 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 

 class JobQueue():
-    """A list of import operations to run. Always reports times
+    """A list of import operations to run. Always reports profile times
    in the same order. 
    """
    def __init__(self,run):
@@ -173,7 +177,7 @@ class JobQueue():
        for k in self.results_order:
            self.results[k]=[]
        self.tfile = "import_profile.json"
-        self.htmlfile = "profile.html"
+        self.htmlfile = "profile.html" # for HTML results table. Not yet done.

    #Adding elements to queue - enqueue
    def enq(self,label,func):
@@ -186,7 +190,9 @@ class JobQueue():
    #         return self.queue.pop()
    #     return ("Queue Empty!")

-    def run(self):
+    def loadprofiles(self):
+        """Load timings for previous runs from file
+        """
        if os.path.isfile(self.tfile):
            try:
                f = open(self.tfile, "r")
@@ -197,9 +203,26 @@ class JobQueue():
                print "FAILURE parsing JSON file %s" % (self.tfile)
                # Python bug: https://github.com/ShinNoNoir/twitterwebsearch/issues/12
            f.close()
-        
        for j in self.results_order:
            self.results[j].append(None) # append a placeholder
+        return True
+    
+    def saveprofiles(self):
+        with open(self.tfile, 'w') as f:
+            json.dump(self.results, f)     
+        return True
+    
+    def memdumpsql(self):
+        djconn = django.db.connection
+        from dump import _iterdump
+        with open('memdump.sql', 'w') as f:
+            for line in _iterdump(djconn):
+                f.write('%s\n' % line.encode("utf8"))
+        return True
+
+    def runqonce(self):
+        """Run all the jobs in the queue provided once
+        """
        
        print "** Running job ", self.runlabel
        jobstart = time.time()
@@ -216,26 +239,68 @@ class JobQueue():
            self.results[i[0]].pop()  # the null item
            self.results[i[0]].append(duration)
               
-        with open(self.tfile, 'w') as f:
-            json.dump(self.results, f)     

        jobend = time.time()
        jobduration = jobend-jobstart
-        print "** Ended all jobs. %.1f seconds" % jobduration
+        print "** Ended job   %s  -  %.1f seconds total." % (self.runlabel,jobduration)
+        
+        return True
+   
+    
+    def run(self):
+        self.loadprofiles()

-        # currently uses django db whatever it was. CHANGE this to explicitly use
-        # a new sqlite3 db and then import the sql dump of that into the troggle db
-        # instead of loading directly into the troggle sqlite db.
-        # in-memory ":memory:" sqlite is ~ 7x faster and all of troggle can be
-        # loaded in 6 minutes that way
-        djconn = django.db.connection
-        from dump import _iterdump
-        with open('memdump.sql', 'w') as f:
-            for line in _iterdump(djconn):
-                f.write('%s\n' % line.encode("utf8"))
+        dbengine = settings.DATABASES['default']['ENGINE']
+        dbname = settings.DATABASES['default']['NAME']

-        # now import the memory image sql into   
-        ####(to do)   
+        if dbname ==":memory:":
+            # just run, and save the sql file
+            print "--  ", settings.DATABASES['default']['NAME'], settings.DATABASES['default']['ENGINE']
+            self.runqonce()
+            self.memdumpsql()
+            self.saveprofiles()
+        else:
+            # run all the imports through :memory: first
+            settings.DATABASES['default']['ENGINE'] = 'django.db.backends.sqlite3'
+            settings.DATABASES['default']['NAME'] = ":memory:"
+            print "--  ", settings.DATABASES['default']['NAME'], settings.DATABASES['default']['ENGINE']
+
+            # But because the user may be expecting to add this to a db with lots of tables already there,
+            # the job queue may not start from scratch, so we need to initialise the db properly first.
+            # Initialising twice crashes, so be sure to do it only once.
+            if ("reinit",reinit_db) not in self.queue:
+                reinit_db()
+            if ("dirsredirect",dirsredirect) not in self.queue:
+                dirsredirect()
+            if ("caves",import_caves) not in self.queue:
+                import_caves()
+            if ("people",import_people) not in self.queue:
+                import_people()
+                
+            django.db.close_old_connections() # maybe not needed here
+            
+            self.runqonce() 
+            self.memdumpsql()
+            self.showprofile() 
+            
+            # restore the original db and import again
+            # if we wanted to, we could re-import the SQL generated in the first pass to be
+            # blazing fast. But for the present just re-import the lot.
+            settings.DATABASES['default']['ENGINE'] = dbengine
+            settings.DATABASES['default']['NAME'] = dbname
+            print "--  ", settings.DATABASES['default']['NAME'], settings.DATABASES['default']['ENGINE']
+            
+            for j in self.results_order:
+                self.results[j].pop() # throw away results from :memory: run
+                self.results[j].append(None) # append a placeholder
+
+            django.db.close_old_connections() # magic rune. works. found by looking in django.db__init__.py
+            #django.setup()  # should this be needed?
+
+            
+            self.runqonce() # crashes because it thinks it has no migrations to apply, when it does.
+            self.saveprofiles()
+    
        return True

    def showprofile(self):
@@ -277,9 +342,10 @@ class JobQueue():
                        percen = 100* (r[i] - r[i-1])/r[i-1]
                        if abs(percen) >0.1:
                            print '%8.1f%%' % percen,
-                        else:
-                            print "      -  ",
+                else:
+                    print "      - ",
            print ""
+        print "\n"
        return True


@@ -333,8 +399,8 @@ if __name__ == "__main__":
        jq.enq("reinit",reinit_db)
        jq.enq("dirsredirect",dirsredirect)
        jq.enq("caves",import_caves)
-        jq.enq("survexblks",import_survexblks)
-        jq.enq("survexpos",import_survexpos)
+        jq.enq("people",import_people)
+        jq.enq("scans",import_surveyscans)
    elif "caves" in sys.argv:
        jq.enq("caves",import_caves)
    elif "logbooks" in sys.argv:
@@ -10,7 +10,9 @@ from django.utils.timezone import make_aware

 import re
 import os
+import time
 from datetime import datetime, timedelta
+import sys

 line_leg_regex = re.compile(r"[\d\-+.]+$")

@@ -179,7 +181,7 @@ def RecursiveLoad(survexblock, survexfile, fin, textlines):
            # print('QM res station %s' % qm_resolve_station)
            # print('QM notes %s' % qm_notes)

-            # If the QM isn't resolved (has a resolving station) thn load it
+            # If the QM isn't resolved (has a resolving station) then load it
            if not qm_resolve_section or qm_resolve_section is not '-' or qm_resolve_section is not 'None':
                from_section = models.SurvexBlock.objects.filter(name=qm_from_section)
                # If we can find a section (survex note chunck, named)
@@ -364,6 +366,11 @@ def LoadAllSurvexBlocks():

    print(" - Data flushed")
    print(' - Loading All Survex Blocks...')
+    
+    print('  - redirecting stdout to loadsurvexblks.log ...')
+    stdout_orig = sys.stdout
+    # Redirect sys.stdout to the file
+    sys.stdout = open('loadsurvexblks.log', 'w')

    survexfile = models.SurvexFile(path=settings.SURVEX_TOPNAME, cave=None)
    survexfile.save()
@@ -379,6 +386,11 @@ def LoadAllSurvexBlocks():
    fin.close()
    survexblockroot.text = "".join(textlines)
    survexblockroot.save()
+    
+    # Close the file
+    sys.stdout.close()
+    # Restore sys.stdout to our old saved file handler
+    sys.stdout = stdout_orig
    print(' - Loaded All Survex Blocks.')


@@ -399,13 +411,18 @@ def LoadPos():
    # but without cave import being run before,
    # then *everything* may be in the fresh  'not found' cache file. 
    
-    cachefile = settings.SURVEX_DATA + "posnotfound"
+    cachefile = settings.SURVEX_DATA + "posnotfound.cache"
    notfoundbefore = {}
    if os.path.isfile(cachefile):
        updtsvx = os.path.getmtime(topdata + ".svx")
        updtcache = os.path.getmtime(cachefile)
        age = updtcache - updtsvx
-        print('   svx: %s    cache: %s    cache age: %s' % (updtsvx, updtcache, str(timedelta(seconds=age) )))
+        print('   svx: %s    cache: %s    not-found cache is fresher by: %s' % (updtsvx, updtcache, str(timedelta(seconds=age) )))
+        
+        now = time.time()
+        if now - updtcache > 30*24*60*60:
+            print "   cache is more than 30 days old. Deleting."
+            os.remove(cachefile)
        if age < 0 :
            print "   cache is stale."
            os.remove(cachefile)
@@ -432,6 +449,8 @@ def LoadPos():
    # cavern defaults to using same cwd as supplied input file
    call([settings.CAVERN, "--output=%s.3d" % (topdata), "%s.svx" % (topdata)])
    call([settings.THREEDTOPOS, '%s.3d' % (topdata)], cwd = settings.SURVEX_DATA)
+    print "  - This next bit takes a while. Matching ~32,000 survey positions. Be patient..."
+
    posfile = open("%s.pos" % (topdata))
    posfile.readline() #Drop header
    for line in posfile.readlines():
@@ -449,9 +468,8 @@ def LoadPos():
                    ss.save()
                    found += 1
                except:
-                    #print "%s in %s.pos not found in lookup of SurvexStation.objects" % (name, settings.SURVEX_TOPNAME)
                    notfoundnow.append(name)
-    print " - %s stations NOT found in lookup of SurvexStation.objects. %s found. %s skipped." % (len(notfoundnow),found, len(skip))
+    print " - %s stations not found in lookup of SurvexStation.objects. %s found. %s skipped." % (len(notfoundnow),found, len(skip))

    if found > 10: # i.e. a previous cave import has been done
        try: