From 39c622d5bfab0ddd4c75b9d643ecbe10d724e022 Mon Sep 17 00:00:00 2001
From: Philip Sargent
Date: Thu, 30 Apr 2020 23:15:57 +0100
Subject: [PATCH] dbReset now loads into memory first (fast err checking), then into db

---
 .gitignore          |   1 +
 core/views_other.py |   5 +-
 databaseReset.py    | 126 +++++++++++++++++++++++++++++++++-----------
 parsers/survex.py   |  28 ++++++++--
 4 files changed, 123 insertions(+), 37 deletions(-)

diff --git a/.gitignore b/.gitignore
index 39d4835..ea7063e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -33,3 +33,4 @@ ignored-files.log
 tunnel-import.log
 posnotfound
 troggle.sqlite-journal
+loadsurvexblks.log
diff --git a/core/views_other.py b/core/views_other.py
index 1297e7f..cc9782b 100644
--- a/core/views_other.py
+++ b/core/views_other.py
@@ -55,8 +55,9 @@ def controlPanel(request):
         #importlist is mostly here so that things happen in the correct order.
         #http post data seems to come in an unpredictable order, so we do it this way.
-        importlist=['reload_db', 'import_people', 'import_cavetab', 'import_logbooks', 'import_surveys', 'import_QMs']
-        databaseReset.make_dirs()
+        importlist=['reinit_db', 'import_people', 'import_caves', 'import_logbooks',
+                    'import_survexblks', 'import_QMs', 'import_survexpos', 'import_surveyscans', 'import_tunnelfiles']
+        databaseReset.dirsredirect()
         for item in importlist:
             if item in request.POST:
                 print("running"+ " databaseReset."+item+"()")
diff --git a/databaseReset.py b/databaseReset.py
index 6c03509..2387a44 100644
--- a/databaseReset.py
+++ b/databaseReset.py
@@ -5,7 +5,7 @@ import settings
 os.environ['PYTHONPATH'] = settings.PYTHON_PATH
 os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings')
 from django.core import management
-from django.db import connection
+from django.db import connection, close_old_connections
 from django.contrib.auth.models import User
 from django.http import HttpResponse
 from django.core.urlresolvers import reverse
@@ -13,6 +13,9 @@ from troggle.core.models import Cave, Entrance
 import troggle.flatpages.models
 import json
+# NOTE databaseReset.py is *imported* by views_other.py as it is used in the control panel
+# presented there.
+
 databasename=settings.DATABASES['default']['NAME']
 expouser=settings.EXPOUSER
 expouserpass=settings.EXPOUSERPASS
@@ -22,17 +25,18 @@ def reinit_db():
     """Rebuild database from scratch. Deletes the file first if sqlite is used,
     otherwise it drops the database and creates it.
     """
+    currentdbname = settings.DATABASES['default']['NAME']
     if settings.DATABASES['default']['ENGINE'] == 'django.db.backends.sqlite3':
         try:
-            os.remove(databasename)
+            os.remove(currentdbname)
         except OSError:
             pass
     else:
         cursor = connection.cursor()
-        cursor.execute("DROP DATABASE %s" % databasename)
-        cursor.execute("CREATE DATABASE %s" % databasename)
-        cursor.execute("ALTER DATABASE %s CHARACTER SET=utf8" % databasename)
-        cursor.execute("USE %s" % databasename)
+        cursor.execute("DROP DATABASE %s" % currentdbname)
+        cursor.execute("CREATE DATABASE %s" % currentdbname)
+        cursor.execute("ALTER DATABASE %s CHARACTER SET=utf8" % currentdbname)
+        cursor.execute("USE %s" % currentdbname)
     syncuser()
 def syncuser():
@@ -73,7 +77,7 @@ def import_logbooks():
 def import_QMs():
     print("Importing QMs (old caves)")
     import parsers.QMs
-    # import process itself runs on qm.csv in only 3 caves, not 264!
+    # import process itself runs on qm.csv in only 3 old caves, not the modern ones!
 def import_survexblks():
     import parsers.survex
@@ -159,7 +163,7 @@ def dumplogbooks():
 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 class JobQueue():
-    """A list of import operations to run. Always reports times
+    """A list of import operations to run. Always reports profile times
     in the same order.
     """
     def __init__(self,run):
@@ -173,7 +177,7 @@ class JobQueue():
         for k in self.results_order:
             self.results[k]=[]
         self.tfile = "import_profile.json"
-        self.htmlfile = "profile.html"
+        self.htmlfile = "profile.html" # for HTML results table. Not yet done.
     #Adding elements to queue - enqueue
     def enq(self,label,func):
@@ -186,7 +190,9 @@ class JobQueue():
     #    return self.queue.pop()
     #    return ("Queue Empty!")
-    def run(self):
+    def loadprofiles(self):
+        """Load timings for previous runs from file
+        """
         if os.path.isfile(self.tfile):
             try:
                 f = open(self.tfile, "r")
@@ -197,9 +203,26 @@ class JobQueue():
                 print "FAILURE parsing JSON file %s" % (self.tfile)
                 # Python bug: https://github.com/ShinNoNoir/twitterwebsearch/issues/12
             f.close()
-        for j in self.results_order: self.results[j].append(None) # append a placeholder
+        return True
+
+    def saveprofiles(self):
+        with open(self.tfile, 'w') as f:
+            json.dump(self.results, f)
+        return True
+
+    def memdumpsql(self):
+        djconn = django.db.connection
+        from dump import _iterdump
+        with open('memdump.sql', 'w') as f:
+            for line in _iterdump(djconn):
+                f.write('%s\n' % line.encode("utf8"))
+        return True
+
+    def runqonce(self):
+        """Run all the jobs in the queue provided once
+        """
         print "** Running job ", self.runlabel
         jobstart = time.time()
@@ -216,26 +239,68 @@ class JobQueue():
             self.results[i[0]].pop() # the null item
             self.results[i[0]].append(duration)
-        with open(self.tfile, 'w') as f:
-            json.dump(self.results, f)
         jobend = time.time()
         jobduration = jobend-jobstart
-        print "** Ended all jobs. %.1f seconds" % jobduration
+        print "** Ended job %s - %.1f seconds total." % (self.runlabel,jobduration)
+
+        return True
+
+
+    def run(self):
+        self.loadprofiles()
-        # currently uses django db whatever it was. CHANGE this to explicitly use
-        # a new sqlite3 db and then import the sql dump of that into the troggle db
-        # instead of loading directly into the troggle sqlite db.
-        # in-memory ":memory:" sqlite is ~ 7x faster and all of troggle can be
-        # loaded in 6 minutes that way
-        djconn = django.db.connection
-        from dump import _iterdump
-        with open('memdump.sql', 'w') as f:
-            for line in _iterdump(djconn):
-                f.write('%s\n' % line.encode("utf8"))
+        dbengine = settings.DATABASES['default']['ENGINE']
+        dbname = settings.DATABASES['default']['NAME']
-        # now import the memory image sql into
-        ####(to do)
+        if dbname ==":memory:":
+            # just run, and save the sql file
+            print "-- ", settings.DATABASES['default']['NAME'], settings.DATABASES['default']['ENGINE']
+            self.runqonce()
+            self.memdumpsql()
+            self.saveprofiles()
+        else:
+            # run all the imports through :memory: first
+            settings.DATABASES['default']['ENGINE'] = 'django.db.backends.sqlite3'
+            settings.DATABASES['default']['NAME'] = ":memory:"
+            print "-- ", settings.DATABASES['default']['NAME'], settings.DATABASES['default']['ENGINE']
+
+            # but because the user may be expecting to add this to a db with lots of tables already there,
+            # the job queue may not start from scratch, so we need to initialise the db properly first.
+            # But initialising twice crashes, so be sure to do it once only.
+ if ("reinit",reinit_db) not in self.queue: + reinit_db() + if ("dirsredirect",dirsredirect) not in self.queue: + dirsredirect() + if ("caves",import_caves) not in self.queue: + import_caves() + if ("people",import_people) not in self.queue: + import_people() + + django.db.close_old_connections() # maybe not needed here + + self.runqonce() + self.memdumpsql() + self.showprofile() + + # restore the original db and import again + # if we wanted to, we could re-import the SQL generated in the first pass to be + # blazing fast. But for the present just re-import the lot. + settings.DATABASES['default']['ENGINE'] = dbengine + settings.DATABASES['default']['NAME'] = dbname + print "-- ", settings.DATABASES['default']['NAME'], settings.DATABASES['default']['ENGINE'] + + for j in self.results_order: + self.results[j].pop() # throw away results from :memory: run + self.results[j].append(None) # append a placeholder + + django.db.close_old_connections() # magic rune. works. found by looking in django.db__init__.py + #django.setup() # should this be needed? + + + self.runqonce() # crashes because it thinks it has no migrations to apply, when it does. + self.saveprofiles() + return True def showprofile(self): @@ -277,9 +342,10 @@ class JobQueue(): percen = 100* (r[i] - r[i-1])/r[i-1] if abs(percen) >0.1: print '%8.1f%%' % percen, - else: - print " - ", + else: + print " - ", print "" + print "\n" return True @@ -333,8 +399,8 @@ if __name__ == "__main__": jq.enq("reinit",reinit_db) jq.enq("dirsredirect",dirsredirect) jq.enq("caves",import_caves) - jq.enq("survexblks",import_survexblks) - jq.enq("survexpos",import_survexpos) + jq.enq("people",import_people) + jq.enq("scans",import_surveyscans) elif "caves" in sys.argv: jq.enq("caves",import_caves) elif "logbooks" in sys.argv: diff --git a/parsers/survex.py b/parsers/survex.py index 5720b11..6fb7c62 100644 --- a/parsers/survex.py +++ b/parsers/survex.py @@ -10,7 +10,9 @@ from django.utils.timezone import make_aware import re import os +import time from datetime import datetime, timedelta +import sys line_leg_regex = re.compile(r"[\d\-+.]+$") @@ -179,7 +181,7 @@ def RecursiveLoad(survexblock, survexfile, fin, textlines): # print('QM res station %s' % qm_resolve_station) # print('QM notes %s' % qm_notes) - # If the QM isn't resolved (has a resolving station) thn load it + # If the QM isn't resolved (has a resolving station) then load it if not qm_resolve_section or qm_resolve_section is not '-' or qm_resolve_section is not 'None': from_section = models.SurvexBlock.objects.filter(name=qm_from_section) # If we can find a section (survex note chunck, named) @@ -364,6 +366,11 @@ def LoadAllSurvexBlocks(): print(" - Data flushed") print(' - Loading All Survex Blocks...') + + print(' - redirecting stdout to loadsurvexblks.log ...') + stdout_orig = sys.stdout + # Redirect sys.stdout to the file + sys.stdout = open('loadsurvexblks.log', 'w') survexfile = models.SurvexFile(path=settings.SURVEX_TOPNAME, cave=None) survexfile.save() @@ -379,6 +386,11 @@ def LoadAllSurvexBlocks(): fin.close() survexblockroot.text = "".join(textlines) survexblockroot.save() + + # Close the file + sys.stdout.close() + # Restore sys.stdout to our old saved file handler + sys.stdout = stdout_orig print(' - Loaded All Survex Blocks.') @@ -399,13 +411,18 @@ def LoadPos(): # but without cave import being run before, # then *everything* may be in the fresh 'not found' cache file. 
-    cachefile = settings.SURVEX_DATA + "posnotfound"
+    cachefile = settings.SURVEX_DATA + "posnotfound.cache"
     notfoundbefore = {}
     if os.path.isfile(cachefile):
         updtsvx = os.path.getmtime(topdata + ".svx")
         updtcache = os.path.getmtime(cachefile)
         age = updtcache - updtsvx
-        print(' svx: %s cache: %s cache age: %s' % (updtsvx, updtcache, str(timedelta(seconds=age) )))
+        print(' svx: %s cache: %s not-found cache is fresher by: %s' % (updtsvx, updtcache, str(timedelta(seconds=age) )))
+
+        now = time.time()
+        if now - updtcache > 30*24*60*60:
+            print " cache is more than 30 days old. Deleting."
+            os.remove(cachefile)
         if age < 0 :
             print " cache is stale."
             os.remove(cachefile)
@@ -432,6 +449,8 @@ def LoadPos():
     # cavern defaults to using same cwd as supplied input file
     call([settings.CAVERN, "--output=%s.3d" % (topdata), "%s.svx" % (topdata)])
     call([settings.THREEDTOPOS, '%s.3d' % (topdata)], cwd = settings.SURVEX_DATA)
+    print " - This next bit takes a while. Matching ~32,000 survey positions. Be patient..."
+
     posfile = open("%s.pos" % (topdata))
     posfile.readline() #Drop header
     for line in posfile.readlines():
@@ -449,9 +468,8 @@ def LoadPos():
                 ss.save()
                 found += 1
             except:
-                #print "%s in %s.pos not found in lookup of SurvexStation.objects" % (name, settings.SURVEX_TOPNAME)
                 notfoundnow.append(name)
-    print " - %s stations NOT found in lookup of SurvexStation.objects. %s found. %s skipped." % (len(notfoundnow),found, len(skip))
+    print " - %s stations not found in lookup of SurvexStation.objects. %s found. %s skipped." % (len(notfoundnow),found, len(skip))
     if found > 10: # i.e. a previous cave import has been done
         try:
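
A possible follow-up, sketched here for illustration only and not part of this patch: instead of re-running every parser against the real database in the second pass, JobQueue.run() could replay the memdump.sql file written by memdumpsql() into the freshly re-initialised database, as the "re-import the SQL generated in the first pass" comment suggests. The helper name restore_from_memdump, and the assumption that the target is an on-disk sqlite file, are illustrative.

import sqlite3

def restore_from_memdump(dbpath, dumpfile='memdump.sql'):
    # Hypothetical helper, not in the patch: replay the SQL dumped from the
    # :memory: run into the on-disk sqlite database at dbpath.
    # Assumes the target database is empty (e.g. a freshly created file), so the
    # CREATE TABLE statements in the dump do not collide with existing tables.
    with open(dumpfile) as f:
        sql = f.read()
    conn = sqlite3.connect(dbpath)
    try:
        # the dump produced by _iterdump normally carries its own BEGIN/COMMIT
        conn.executescript(sql)
        conn.commit()
    finally:
        conn.close()

# example call, assuming sqlite: restore_from_memdump(settings.DATABASES['default']['NAME'])

For now the patch simply re-runs the full import against the real database, which is slower but sidesteps the migration and connection-state problems noted in the comments inside run().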