dbReset now loads into memory first (fast err checking), then into db

Philip Sargent 2020-04-30 23:15:57 +01:00
parent 76a6b501f3
commit 39c622d5bf
4 changed files with 123 additions and 37 deletions
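The idea of the commit in one place: run the whole import against an in-memory sqlite database first, where any parse or integrity error aborts cheaply, and keep an SQL dump of the result; only then load the real database. A minimal sketch of that pattern (illustrative only, not troggle code: load_everything and memdump.sql are made-up names, and troggle uses its own _iterdump copy rather than conn.iterdump()):

    import sqlite3

    def load_into_memory_first(load_everything):
        conn = sqlite3.connect(":memory:")   # throwaway, very fast database
        load_everything(conn)                # any error aborts before the real db is touched
        with open("memdump.sql", "w") as f:
            for line in conn.iterdump():     # stdlib SQL dump of the whole database
                f.write("%s\n" % line)
        conn.close()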

.gitignore

@@ -33,3 +33,4 @@ ignored-files.log
 tunnel-import.log
 posnotfound
 troggle.sqlite-journal
+loadsurvexblks.log

core/views_other.py

@@ -55,8 +55,9 @@ def controlPanel(request):
             #importlist is mostly here so that things happen in the correct order.
             #http post data seems to come in an unpredictable order, so we do it this way.
-            importlist=['reload_db', 'import_people', 'import_cavetab', 'import_logbooks', 'import_surveys', 'import_QMs']
-            databaseReset.make_dirs()
+            importlist=['reinit_db', 'import_people', 'import_caves', 'import_logbooks',
+                        'import_survexblks', 'import_QMs', 'import_survexpos', 'import_surveyscans', 'import_tunnelfiles']
+            databaseReset.dirsredirect()
             for item in importlist:
                 if item in request.POST:
                     print("running"+ " databaseReset."+item+"()")

databaseReset.py

@@ -5,7 +5,7 @@ import settings
 os.environ['PYTHONPATH'] = settings.PYTHON_PATH
 os.environ.setdefault('DJANGO_SETTINGS_MODULE', 'settings')
 from django.core import management
-from django.db import connection
+from django.db import connection, close_old_connections
 from django.contrib.auth.models import User
 from django.http import HttpResponse
 from django.core.urlresolvers import reverse
@@ -13,6 +13,9 @@ from troggle.core.models import Cave, Entrance
 import troggle.flatpages.models
 import json
 
+# NOTE databaseReset.py is *imported* by views_other.py as it is used in the control panel
+# presented there.
+
 databasename=settings.DATABASES['default']['NAME']
 expouser=settings.EXPOUSER
 expouserpass=settings.EXPOUSERPASS
@@ -22,17 +25,18 @@ def reinit_db():
     """Rebuild database from scratch. Deletes the file first if sqlite is used,
     otherwise it drops the database and creates it.
     """
+    currentdbname = settings.DATABASES['default']['NAME']
     if settings.DATABASES['default']['ENGINE'] == 'django.db.backends.sqlite3':
         try:
-            os.remove(databasename)
+            os.remove(currentdbname)
         except OSError:
             pass
     else:
         cursor = connection.cursor()
-        cursor.execute("DROP DATABASE %s" % databasename)
-        cursor.execute("CREATE DATABASE %s" % databasename)
-        cursor.execute("ALTER DATABASE %s CHARACTER SET=utf8" % databasename)
-        cursor.execute("USE %s" % databasename)
+        cursor.execute("DROP DATABASE %s" % currentdbname)
+        cursor.execute("CREATE DATABASE %s" % currentdbname)
+        cursor.execute("ALTER DATABASE %s CHARACTER SET=utf8" % currentdbname)
+        cursor.execute("USE %s" % currentdbname)
     syncuser()
 
 def syncuser():
@@ -73,7 +77,7 @@ def import_logbooks():
 def import_QMs():
     print("Importing QMs (old caves)")
     import parsers.QMs
-    # import process itself runs on qm.csv in only 3 caves, not 264!
+    # import process itself runs on qm.csv in only 3 old caves, not the modern ones!
 
 def import_survexblks():
     import parsers.survex
@@ -159,7 +163,7 @@ def dumplogbooks():
 # - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 class JobQueue():
-    """A list of import operations to run. Always reports times
+    """A list of import operations to run. Always reports profile times
     in the same order.
     """
     def __init__(self,run):
@@ -173,7 +177,7 @@ class JobQueue():
         for k in self.results_order:
             self.results[k]=[]
         self.tfile = "import_profile.json"
-        self.htmlfile = "profile.html"
+        self.htmlfile = "profile.html" # for HTML results table. Not yet done.
 
     #Adding elements to queue - enqueue
     def enq(self,label,func):
@@ -186,7 +190,9 @@ class JobQueue():
     #    return self.queue.pop()
     #    return ("Queue Empty!")
 
-    def run(self):
+    def loadprofiles(self):
+        """Load timings for previous runs from file
+        """
         if os.path.isfile(self.tfile):
             try:
                 f = open(self.tfile, "r")
@@ -197,9 +203,26 @@ class JobQueue():
                 print "FAILURE parsing JSON file %s" % (self.tfile)
                 # Python bug: https://github.com/ShinNoNoir/twitterwebsearch/issues/12
             f.close()
         for j in self.results_order:
             self.results[j].append(None) # append a placeholder
+        return True
+
+    def saveprofiles(self):
+        with open(self.tfile, 'w') as f:
+            json.dump(self.results, f)
+        return True
+
+    def memdumpsql(self):
+        djconn = django.db.connection
+        from dump import _iterdump
+        with open('memdump.sql', 'w') as f:
+            for line in _iterdump(djconn):
+                f.write('%s\n' % line.encode("utf8"))
+        return True
+
+    def runqonce(self):
+        """Run all the jobs in the queue provided once
+        """
 
         print "** Running job ", self.runlabel
         jobstart = time.time()
@@ -216,26 +239,68 @@ class JobQueue():
             self.results[i[0]].pop() # the null item
             self.results[i[0]].append(duration)
 
-        with open(self.tfile, 'w') as f:
-            json.dump(self.results, f)
-
         jobend = time.time()
         jobduration = jobend-jobstart
-        print "** Ended all jobs. %.1f seconds" % jobduration
+        print "** Ended job %s - %.1f seconds total." % (self.runlabel,jobduration)
+        return True
 
-        # currently uses django db whatever it was. CHANGE this to explicitly use
-        # a new sqlite3 db and then import the sql dump of that into the troggle db
-        # instead of loading directly into the troggle sqlite db.
-        # in-memory ":memory:" sqlite is ~ 7x faster and all of troggle can be
-        # loaded in 6 minutes that way
-        djconn = django.db.connection
-        from dump import _iterdump
-        with open('memdump.sql', 'w') as f:
-            for line in _iterdump(djconn):
-                f.write('%s\n' % line.encode("utf8"))
-        # now import the memory image sql into
-        ####(to do)
+    def run(self):
+        self.loadprofiles()
+        dbengine = settings.DATABASES['default']['ENGINE']
+        dbname = settings.DATABASES['default']['NAME']
+
+        if dbname == ":memory:":
+            # just run, and save the sql file
+            print "-- ", settings.DATABASES['default']['NAME'], settings.DATABASES['default']['ENGINE']
+            self.runqonce()
+            self.memdumpsql()
+            self.saveprofiles()
+        else:
+            # run all the imports through :memory: first
+            settings.DATABASES['default']['ENGINE'] = 'django.db.backends.sqlite3'
+            settings.DATABASES['default']['NAME'] = ":memory:"
+            print "-- ", settings.DATABASES['default']['NAME'], settings.DATABASES['default']['ENGINE']
+
+            # but because the user may be expecting to add this to a db with lots of tables already there,
+            # the job queue may not start from scratch, so we need to initialise the db properly first.
+            # But initialising twice crashes, so be sure to do it once only.
+            if ("reinit",reinit_db) not in self.queue:
+                reinit_db()
+            if ("dirsredirect",dirsredirect) not in self.queue:
+                dirsredirect()
+            if ("caves",import_caves) not in self.queue:
+                import_caves()
+            if ("people",import_people) not in self.queue:
+                import_people()
+            django.db.close_old_connections() # maybe not needed here
+
+            self.runqonce()
+            self.memdumpsql()
+            self.showprofile()
+
+            # restore the original db and import again
+            # if we wanted to, we could re-import the SQL generated in the first pass to be
+            # blazing fast. But for the present just re-import the lot.
+            settings.DATABASES['default']['ENGINE'] = dbengine
+            settings.DATABASES['default']['NAME'] = dbname
+            print "-- ", settings.DATABASES['default']['NAME'], settings.DATABASES['default']['ENGINE']
+
+            for j in self.results_order:
+                self.results[j].pop() # throw away results from :memory: run
+                self.results[j].append(None) # append a placeholder
+
+            django.db.close_old_connections() # magic rune: works; found by looking in django/db/__init__.py
+            #django.setup() # should this be needed?
+
+            self.runqonce() # crashes because it thinks it has no migrations to apply, when it does
+            self.saveprofiles()
 
         return True
 
     def showprofile(self):
@@ -277,9 +342,10 @@ class JobQueue():
                     percen = 100* (r[i] - r[i-1])/r[i-1]
                     if abs(percen) >0.1:
                         print '%8.1f%%' % percen,
                 else:
                     print " - ",
             print ""
+        print "\n"
 
         return True
@@ -333,8 +399,8 @@ if __name__ == "__main__":
         jq.enq("reinit",reinit_db)
         jq.enq("dirsredirect",dirsredirect)
         jq.enq("caves",import_caves)
-        jq.enq("survexblks",import_survexblks)
-        jq.enq("survexpos",import_survexpos)
+        jq.enq("people",import_people)
+        jq.enq("scans",import_surveyscans)
     elif "caves" in sys.argv:
         jq.enq("caves",import_caves)
     elif "logbooks" in sys.argv:

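Why the "magic rune" in run() works: Django caches one open connection per database alias, created from the settings as they stood at first use, so mutating settings.DATABASES alone leaves queries running against the old connection. Closing the cached connection forces the next query to reconnect with the new ENGINE/NAME. A minimal sketch of that swap step, under the same assumption the commit makes (runtime mutation of settings is not officially supported by Django):

    import django.db
    from django.conf import settings

    def swap_default_db(engine, name):
        # Point the 'default' alias somewhere else...
        settings.DATABASES['default']['ENGINE'] = engine
        settings.DATABASES['default']['NAME'] = name
        # ...then drop the cached connection so the next query reconnects.
        django.db.close_old_connections()

On newer Django versions, django.db.connections.close_all() would be the more explicit call for the same effect.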
parsers/survex.py

@@ -10,7 +10,9 @@ from django.utils.timezone import make_aware
 import re
 import os
+import time
 from datetime import datetime, timedelta
+import sys
 
 line_leg_regex = re.compile(r"[\d\-+.]+$")
@@ -179,7 +181,7 @@ def RecursiveLoad(survexblock, survexfile, fin, textlines):
             # print('QM res station %s' % qm_resolve_station)
             # print('QM notes %s' % qm_notes)
 
-            # If the QM isn't resolved (has a resolving station) thn load it
+            # If the QM isn't resolved (has a resolving station) then load it
             if not qm_resolve_section or qm_resolve_section is not '-' or qm_resolve_section is not 'None':
                 from_section = models.SurvexBlock.objects.filter(name=qm_from_section)
                 # If we can find a section (survex note chunck, named)
@@ -364,6 +366,11 @@ def LoadAllSurvexBlocks():
     print(" - Data flushed")
 
     print(' - Loading All Survex Blocks...')
+    print(' - redirecting stdout to loadsurvexblks.log ...')
+
+    stdout_orig = sys.stdout
+    # Redirect sys.stdout to the file
+    sys.stdout = open('loadsurvexblks.log', 'w')
 
     survexfile = models.SurvexFile(path=settings.SURVEX_TOPNAME, cave=None)
     survexfile.save()
@@ -379,6 +386,11 @@ def LoadAllSurvexBlocks():
     fin.close()
     survexblockroot.text = "".join(textlines)
     survexblockroot.save()
+
+    # Close the file
+    sys.stdout.close()
+    # Restore sys.stdout to our old saved file handler
+    sys.stdout = stdout_orig
 
     print(' - Loaded All Survex Blocks.')
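One caveat with the redirection as committed: if the load raises partway through, sys.stdout stays pointed at the log file and the file is never closed. A try/finally wrapper (a sketch of the same pattern, not the committed code) keeps the redirect exception-safe and works on both Python 2 and 3:

    import sys

    stdout_orig = sys.stdout
    sys.stdout = open('loadsurvexblks.log', 'w')
    try:
        pass  # ... load all survex blocks; print output goes to the log ...
    finally:
        sys.stdout.close()        # always close the log file
        sys.stdout = stdout_orig  # always restore the real stdout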
@@ -399,13 +411,18 @@ def LoadPos():
     # but without cave import being run before,
     # then *everything* may be in the fresh 'not found' cache file.
-    cachefile = settings.SURVEX_DATA + "posnotfound"
+    cachefile = settings.SURVEX_DATA + "posnotfound.cache"
     notfoundbefore = {}
     if os.path.isfile(cachefile):
         updtsvx = os.path.getmtime(topdata + ".svx")
         updtcache = os.path.getmtime(cachefile)
         age = updtcache - updtsvx
-        print(' svx: %s cache: %s cache age: %s' % (updtsvx, updtcache, str(timedelta(seconds=age) )))
+        print(' svx: %s cache: %s not-found cache is fresher by: %s' % (updtsvx, updtcache, str(timedelta(seconds=age) )))
+        now = time.time()
+        if now - updtcache > 30*24*60*60:
+            print " cache is more than 30 days old. Deleting."
+            os.remove(cachefile)
         if age < 0 :
             print " cache is stale."
             os.remove(cachefile)
@@ -432,6 +449,8 @@ def LoadPos():
     # cavern defaults to using same cwd as supplied input file
     call([settings.CAVERN, "--output=%s.3d" % (topdata), "%s.svx" % (topdata)])
     call([settings.THREEDTOPOS, '%s.3d' % (topdata)], cwd = settings.SURVEX_DATA)
+    print " - This next bit takes a while. Matching ~32,000 survey positions. Be patient..."
+
     posfile = open("%s.pos" % (topdata))
     posfile.readline() #Drop header
     for line in posfile.readlines():
@@ -449,9 +468,8 @@ def LoadPos():
                 ss.save()
                 found += 1
         except:
-            #print "%s in %s.pos not found in lookup of SurvexStation.objects" % (name, settings.SURVEX_TOPNAME)
             notfoundnow.append(name)
 
-    print " - %s stations NOT found in lookup of SurvexStation.objects. %s found. %s skipped." % (len(notfoundnow),found, len(skip))
+    print " - %s stations not found in lookup of SurvexStation.objects. %s found. %s skipped." % (len(notfoundnow),found, len(skip))
 
     if found > 10: # i.e. a previous cave import has been done
         try: