SurvexBlocks now importing in detail

2020-07-01 22:49:38 +01:00
parent 8cc768e5b6
commit df434cd399
8 changed files with 605 additions and 53 deletions
--- a/parsers/caves.py
+++ b/parsers/caves.py
@@ -192,6 +192,8 @@ def readcave(filename):
                         url = url[0],
                         filename = filename)
            except:
+                # this slow db query happens on every cave, but on import we have all this in memory
+                # and don't need to do a db query. Fix this to speed it up!
                # need to cope with duplicates
                print(" ! FAILED to get only one CAVE when updating using: "+filename)
                kaves = models_caves.Cave.objects.all().filter(kataster_number=kataster_number[0])
@@ -206,6 +208,8 @@ def readcave(filename):
                        c = k
                
            for area_slug in areas:
+                # this slow db query happens on every cave, but on import we have all this in memory
+                # and don't need to do a db query. Fix this to speed it up!
                area = models_caves.Area.objects.filter(short_name = area_slug)
                if area:
                    newArea = area[0]
@@ -216,6 +220,8 @@ def readcave(filename):
            primary = True
            for slug in slugs:
                try:
+                    # this slow db query happens on every cave, but on import we have all this in memory
+                    # and don't need to do a db query. Fix this to speed it up!
                    cs = models_caves.CaveSlug.objects.update_or_create(cave = c,
                              slug = slug,
                              primary = primary)
@@ -225,10 +231,13 @@ def readcave(filename):
                    print(message)
                    
                primary = False
+
            for entrance in entrances:
                slug = getXML(entrance, "entranceslug", maxItems = 1, context = context)[0]
                letter = getXML(entrance, "letter", maxItems = 1, context = context)[0]
                try:
+                    # this slow db query happens on every entrance, but on import we have all this in memory
+                    # and don't need to do a db query. Fix this to speed it up!
                    entrance = models_caves.Entrance.objects.get(entranceslug__slug = slug)
                    ce = models_caves.CaveAndEntrance.objects.update_or_create(cave = c, entrance_letter = letter, entrance = entrance)
                except:
--- a/parsers/imports.py
+++ b/parsers/imports.py
@@ -16,21 +16,21 @@ import troggle.parsers.logbooks
 import troggle.parsers.QMs

 def import_caves():
-    print("Importing Caves to ",end="")
+    print("-- Importing Caves to ",end="")
    print(django.db.connections.databases['default']['NAME'])
    troggle.parsers.caves.readcaves()

 def import_people():
-    print("Importing People (folk.csv) to ",end="")
+    print("-- Importing People (folk.csv) to ",end="")
    print(django.db.connections.databases['default']['NAME'])
    troggle.parsers.people.LoadPersonsExpos()

 def import_surveyscans():
-    print("Importing Survey Scans")
+    print("-- Importing Survey Scans")
    troggle.parsers.surveys.LoadListScans()

 def import_logbooks():
-    print("Importing Logbooks")
+    print("-- Importing Logbooks")
    troggle.parsers.logbooks.LoadLogbooks()

 def import_QMs():
@@ -40,7 +40,7 @@ def import_QMs():
 def import_survex():
    # when this import is moved to the top with the rest it all crashes horribly
    import troggle.parsers.survex 
-    print("Importing Survex Blocks")
+    print("-- Importing Survex Blocks")
    print(" - Survex Blocks")
    troggle.parsers.survex.LoadSurvexBlocks()
    print(" - Survex entrances x/y/z Positions")
@@ -53,6 +53,6 @@ def import_loadpos():
    troggle.parsers.survex.LoadPos()

 def import_drawingsfiles():
-    print("Importing Drawings files")
+    print("-- Importing Drawings files")
    troggle.parsers.surveys.LoadDrawingFiles()

--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -114,6 +114,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
    expeditionday = expedition.get_expedition_day(date)
    lookupAttribs={'date':date, 'title':title}
    # 'cave' is converted to a string doing this, which renders as the cave slug.
+    # but it is a db query which we should try to avoid - rewrite this
    nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug':slugify(title)[:50], 'entry_type':entry_type}
    lbo, created=save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)

@@ -356,6 +357,8 @@ def SetDatesFromLogbookEntries(expedition):
    Sets the date_from and date_to field for an expedition based on persontrips.
    Then sets the expedition date_from and date_to based on the personexpeditions.
    """
+    # There is probably a faster way to do this: it uses a lot of db queries,
+    # but we already have all of this data in memory.
    for personexpedition in expedition.personexpedition_set.all():
        persontrips = personexpedition.persontrip_set.order_by('logbook_entry__date')
        # sequencing is difficult to do
--- a/parsers/survex.py
+++ b/parsers/survex.py
@@ -324,7 +324,8 @@ class LoadingSurvex():
                    return self.caveslist[g]
            print('    ! Failed to find cave for {}'.format(cavepath.lower()))
        else:
-            print('    ! No regex cave match for %s' % cavepath.lower())
+            # not a cave, but that is fine.
+            # print('    ! No regex(standard identifier) cave match for %s' % cavepath.lower())
            return None

    def GetSurvexDirectory(self, headpath):
@@ -353,17 +354,17 @@ class LoadingSurvex():
        print("\n"+message,file=sys.stderr)
        models.DataIssue.objects.create(parser='survex', message=message)
        
-    def LoadSurvexFile(self, includelabel):
+    def LoadSurvexFile(self, svxid):
        """Creates SurvexFile in the database, and SurvexDirectory if needed
        with links to 'cave'
-        Creates a new current survexblock with valid .survexfile and valid .survexdirectory
+        Creates a new current survexfile and valid .survexdirectory
        The survexblock passed-in is not necessarily the parent. FIX THIS.
        """
        depth = " " * self.depthbegin
-        print("{:2}{}   - NEW survexfile:'{}'".format(self.depthbegin, depth, includelabel))
-        headpath, tail = os.path.split(includelabel)
+        print("{:2}{}   - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid))
+        headpath = os.path.dirname(svxid)

-        newfile = models_survex.SurvexFile(path=includelabel)
+        newfile = models_survex.SurvexFile(path=svxid)
        newfile.save() # until we do this there is no internal id so no foreign key works
        self.currentsurvexfile = newfile 
        newdirectory = self.GetSurvexDirectory(headpath)
@@ -383,10 +384,10 @@ class LoadingSurvex():
            newfile.cave   = cave
            #print("\n"+str(newdirectory.cave),file=sys.stderr)
        else:
-            self.ReportNonCaveIncludes(headpath, includelabel)
+            self.ReportNonCaveIncludes(headpath, svxid)

        if not newfile.survexdirectory:
-            message = " ! SurvexDirectory NOT SET in new SurvexFile {} ".format(includelabel)
+            message = " ! SurvexDirectory NOT SET in new SurvexFile {} ".format(svxid)
            print(message)
            print(message,file=sys.stderr)
            models.DataIssue.objects.create(parser='survex', message=message)
@@ -401,7 +402,7 @@ class LoadingSurvex():
    def ProcessIncludeLine(self, included):
        svxid = included.groups()[0]
        #depth = " " * self.depthbegin
-        #print("{:2}{}   - Include survexfile:'{}'".format(self.depthbegin, depth,  svxid))
+        #print("{:2}{}   - Include survexfile:'{}' {}".format(self.depthbegin, depth,  svxid, included))
        self.LoadSurvexFile(svxid)
        self.stacksvxfiles.append(self.currentsurvexfile)

@@ -426,8 +427,10 @@ class LoadingSurvex():
            self.LoadSurvexQM(survexblock, qmline)
            
        included = self.rx_comminc.match(comment)
-        # ;*include means we have been included; not 'proceed to include' which *include means
+        # ;*include means 'we have been included'; whereas *include means 'proceed to include' 
        if included:
+            #depth = " " * self.depthbegin
+            #print("{:2}{}   - Include comment:'{}' {}".format(self.depthbegin, depth,  comment, included))
            self.ProcessIncludeLine(included)

        edulcni = self.rx_commcni.match(comment)
@@ -457,7 +460,7 @@ class LoadingSurvex():

    def LinearLoad(self, survexblock, path, svxlines):
        """Loads a single survex file. Usually used to import all the survex files which have been collated
-        into a single file. Loads the begin/end blocks recursively.
+        into a single file. Loads the begin/end blocks using a stack for labels.
        """
        self.relativefilename = path
        cave = self.IdentifyCave(path) # this will produce null for survex files which are geographic collections
@@ -466,19 +469,25 @@ class LoadingSurvex():
        self.currentsurvexfile.save() # django insists on this although it is already saved !?
        
        blockcount = 0
+        lineno = 0
        def tickle():
            nonlocal blockcount
            blockcount +=1
            if blockcount % 10 ==0 :
                print(".", file=sys.stderr,end='')
-            if blockcount % 500 ==0 :
+            if blockcount % 200 ==0 :
                print("\n", file=sys.stderr,end='')
-            sys.stderr.flush();
+                print(" - MEM:{:7.3f} MB in use".format(models.get_process_memory()),file=sys.stderr)
+            sys.stderr.flush()

        for svxline in svxlines:
-            sline, comment = self.rx_comment.match(svxline.strip()).groups()
+            lineno += 1
+            sline, comment = self.rx_comment.match(svxline).groups()
            if comment:
+                depth = " " * self.depthbegin
+                print("{:4} {:2}{}   - Include comment:'{}' {}".format(lineno, self.depthbegin, depth,  comment, sline))
                self.LoadSurvexComment(survexblock, comment) # this catches the ;*include and ;*edulcni lines too
+
            if not sline:
                continue # skip blank lines

@@ -503,10 +512,10 @@ class LoadingSurvex():
                            pathlist += "." + id
                    newsurvexblock = models_survex.SurvexBlock(name=blockid, parent=survexblock, 
                            survexpath=pathlist, 
-                            title = survexblock.title, # copy parent inititally
                            cave=self.currentcave, survexfile=self.currentsurvexfile, 
                            legsall=0, legssplay=0, legssurfc=0, totalleglength=0.0)
                    newsurvexblock.save()
+                    newsurvexblock.title = "("+survexblock.title+")" # copy parent initially
                    survexblock = newsurvexblock
                    # survexblock.survexfile.save() 
                    survexblock.save() # django insists on this , but we want to save at the end !
@@ -564,7 +573,7 @@ class LoadingSurvex():
                else:
                    pass # ignore all other sorts of data

-    def RecursiveScan(self, survexblock, survexfile, fin, flinear, fcollate):
+    def RecursiveScan(self, survexblock, path, fin, flinear, fcollate):
        """Follows the *include links in all the survex files from the root file 1623.svx
        and reads only the *include and *begin and *end statements. It produces a linearised
        list of the include tree
@@ -577,27 +586,27 @@ class LoadingSurvex():
        if self.callcount % 500 ==0 :
            print("\n", file=sys.stderr,end='')

-        if survexfile in self.svxfileslist:
-            message = " * Warning. Survex file already seen: {}".format(survexfile.path)
+        if path in self.svxfileslist:
+            message = " * Warning. Duplicate in *include list at:{} depth:{} file:{}".format(self.callcount, self.depthinclude, path)
            print(message)
            print(message,file=flinear)
-            print(message,file=sys.stderr)
+            print("\n"+message,file=sys.stderr)
            models.DataIssue.objects.create(parser='survex', message=message)
-            if self.svxfileslist.count(survexfile) > 20:
-                message = " ! ERROR. Survex file already seen 20x. Probably an infinite loop so fix your *include statements that include this. Aborting. {}".format(survexfile.path)
+            if self.svxfileslist.count(path) > 20:
+                message = " ! ERROR. Survex file already seen 20x. Probably an infinite loop so fix your *include statements that include this. Aborting. {}".format(path)
                print(message)
                print(message,file=flinear)
                print(message,file=sys.stderr)
                models.DataIssue.objects.create(parser='survex', message=message)
                return
-        self.svxfileslist.append(survexfile)
+        self.svxfileslist.append(path)
        
        svxlines = fin.read().splitlines()
        for svxline in svxlines:
            self.lineno += 1
            includestmt =self.rx_include.match(svxline)
            if not includestmt:
-                fcollate.write("{}\n".format(svxline))
+                fcollate.write("{}\n".format(svxline.strip()))

            sline, comment = self.rx_comment.match(svxline.strip()).groups()
            mstar = self.rx_star.match(sline)
@@ -605,40 +614,35 @@ class LoadingSurvex():
                cmd, args = mstar.groups()
                cmd = cmd.lower()
                if re.match("(?i)include$", cmd):
-                    includepath = os.path.normpath(os.path.join(os.path.split(survexfile.path)[0], re.sub(r"\.svx$", "", args)))
-                    path_match = re.search(r"caves-(\d\d\d\d)/(\d+|\d\d\d\d-?\w+-\d+)/", includepath)
+                    includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args)))
+                    #path_match = re.search(r"caves-(\d\d\d\d)/(\d+|\d\d\d\d-?\w+-\d+)/", includepath)

-                    includesurvexfile = models_survex.SurvexFile(path=includepath)
-                    includesurvexfile.save()
-
-                    if includesurvexfile.exists():
-                        # do not create SurvexFile in DB here by doing includesurvexfile.save(). Do it when reading data.
+                    fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx")
+                    if os.path.isfile(fullpath):
                        #--------------------------------------------------------
                        self.depthinclude += 1
-                        fininclude = includesurvexfile.OpenFile()
-                        fcollate.write(";*include {}\n".format(includesurvexfile.path))
-                        flinear.write("{:2} {} *include {}\n".format(self.depthinclude, indent, includesurvexfile.path))
-                        push = includesurvexfile.path.lower()
+                        fininclude = open(fullpath,'r')
+                        fcollate.write(";*include {}\n".format(includepath))
+                        flinear.write("{:2} {} *include {}\n".format(self.depthinclude, indent, includepath))
+                        push = includepath.lower()
                        self.stackinclude.append(push)
                        #-----------------
-                        self.RecursiveScan(survexblock, includesurvexfile, fininclude, flinear, fcollate)
+                        self.RecursiveScan(survexblock, includepath, fininclude, flinear, fcollate)
                        #-----------------
                        pop = self.stackinclude.pop()
                        if pop != push:
-                            message = "!!!!!!!    ERROR pop != push {} != {} {}".format(pop, push, self.stackinclude)
+                            message = "!! ERROR mismatch *include pop!=push  {}".format(pop, push, self.stackinclude)
                            print(message)
                            print(message,file=flinear)
                            print(message,file=sys.stderr)
                            models.DataIssue.objects.create(parser='survex', message=message)
-                        includesurvexfile.path += "-TEMP"
-                        includesurvexfile = None
                        flinear.write("{:2} {} *edulcni {}\n".format(self.depthinclude, indent, pop))
                        fcollate.write(";*edulcni {}\n".format(pop))
                        fininclude.close()
                        self.depthinclude -= 1
                        #--------------------------------------------------------
                    else:
-                        message = "    ! ERROR *include file not found for [{}]:'{}'".format(includesurvexfile, includepath)
+                        message = "    ! ERROR *include file not found for:'{}'".format(includepath)
                        print(message)
                        print(message,file=sys.stderr)
                        models.DataIssue.objects.create(parser='survex', message=message)
@@ -659,7 +663,7 @@ class LoadingSurvex():
                        args = " "
                    popargs = self.stackbegin.pop()
                    if popargs != args.lower():
-                        message = "!!!!!!!    ERROR BEGIN/END pop != push {} != {}\n{}".format(popargs, args, self. stackbegin)
+                        message = "!! ERROR mismatch in BEGIN/END labels pop!=push '{}'!='{}'\n{}".format(popargs, args, self. stackbegin)
                        print(message)
                        print(message,file=flinear)
                        print(message,file=sys.stderr)
@@ -701,7 +705,7 @@ def FindAndLoadSurvex(survexblockroot):
    fcollate.write(";*include {}\n".format(survexfileroot.path))
    flinear.write("{:2} {} *include {}\n".format(svx_scan.depthinclude, indent, survexfileroot.path))
    #----------------------------------------------------------------
-    svx_scan.RecursiveScan(survexblockroot, survexfileroot, finroot, flinear, fcollate)
+    svx_scan.RecursiveScan(survexblockroot, survexfileroot.path, finroot, flinear, fcollate)
    #----------------------------------------------------------------
    flinear.write("{:2} {} *edulcni {}\n".format(svx_scan.depthinclude, indent, survexfileroot.path))
    fcollate.write(";*edulcni {}\n".format(survexfileroot.path))
@@ -712,7 +716,7 @@ def FindAndLoadSurvex(survexblockroot):
    flinear.write("    - {:,} survex files in linear include list \n".format(len(svxfileslist)))
    flinear.close()
    fcollate.close()
-    svx_scan = None
+    svx_scan = None # Hmm. Does this actually delete all the instance variables if they are lists, dicts etc.?
    print("\n -  {:,} survex files in linear include list \n".format(len(svxfileslist)),file=sys.stderr)

    mem1 = models.get_process_memory()
@@ -724,7 +728,7 @@ def FindAndLoadSurvex(survexblockroot):
    # entrance locations currently loaded after this by LoadPos(), but could better be done before ?
    # look in MapLocations() for how we find the entrances
   
-    print('\n - Loading All Survex Blocks (LinearRecursive)',file=sys.stderr)
+    print('\n - Loading All Survex Blocks (LinearLoad)',file=sys.stderr)
    svx_load = LoadingSurvex()
    
    svx_load.svxdirs[""] = survexfileroot.survexdirectory