Parse all files, not just those in the *include tree

2026-02-08 13:18:15 +00:00 · 2022-10-05 21:11:18 +03:00
parent 9e5bdace2c
commit 7e47fe1f30
2 changed files with 156 additions and 57 deletions
--- a/parsers/caves.py
+++ b/parsers/caves.py
@@ -457,7 +457,7 @@ def readcaves():
        DataIssue.objects.filter(parser='caves ok').delete()
        DataIssue.objects.filter(parser='entrances').delete()
        
-        print(" - Creating Areas 1623, 1624 and 1626")
+        print(" - Creating Areas 1623, 1624, 1627 and 1626")
        # This crashes on the server with MariaDB even though a null parent is explicitly allowed.
        area_1623= Area.objects.create(short_name = "1623", super=None)
        print(" - Saving Area 1623")
@@ -468,6 +468,9 @@ def readcaves():
        area_1626= Area.objects.create(short_name = "1626", super=None)
        print(" - Saving Area 1626")
        area_1626.save()
+        area_1627= Area.objects.create(short_name = "1627", super=None)
+        print(" - Saving Area 1627")
+        area_1627.save()


    with transaction.atomic():
--- a/parsers/survex.py
+++ b/parsers/survex.py
@@ -6,7 +6,7 @@ import copy
 import subprocess

 from pathlib import Path
-from datetime import datetime, timedelta, date
+from datetime import datetime, timedelta, date, timezone

 from django.utils.timezone import get_current_timezone
 from django.utils.timezone import make_aware
@@ -39,9 +39,12 @@ todo = '''Also walk the entire tree in the :loser: repo looking for unconnected
        
 '''
 survexblockroot = None
+survexomitsroot = None
 ROOTBLOCK = "rootblock"
+OMITBLOCK = "omitblock"
 METRESINFEET = 3.28084

+stop_dup_warning = False
 debugprint = False # Turns on debug printout for just one *include file
 debugprinttrigger = "!"
 # debugprinttrigger = "caves-1623/40/old/EisSVH"
@@ -182,7 +185,7 @@ class LoadingSurvex():
    callcount = 0
    caverncount = 0
    ignoreprefix = ["surface", "kataster", "fixedpts", "gpx"]
-    ignorenoncave = ["caves-1623", "caves-1626", "caves-1623/2007-neu"]
+    ignorenoncave = ["caves-1623", "caves-1623/2007-NEU","caves-1626", "caves-1624", "caves-1627", "fixedpts/gps/gps00raw", ""]
    includedfilename =""
    currentsurvexblock = None
    currentsurvexfile = None
@@ -344,7 +347,7 @@ class LoadingSurvex():
        the rest is discarded after error-checking.
        Now skipping the error checking - returns as soon as the leg is not one we count.
        
-        REPLACE ALL THIS by reading the .log output of cavern for the file
+        REPLACE ALL THIS by reading the .log output of cavern for the file. But we need the lengths per Block, not by File.  Hmm.
        """
        invalid_clino = 180.0
        invalid_compass = 720.0
@@ -457,7 +460,7 @@ class LoadingSurvex():
            print(("! Clino misread in", survexblock.survexfile.path))
            print(("  datastar:", datastar))
            print(("  Line:", ls))
-            message = ' ! Value Error: Clino misread in line %s in %s' % (ls, survexblock.survexfile.path)
+            message = f' ! Value Error: Clino misread in line \'{sline.lower()}\' {datastar=} {self.datastar=} {ls=} in\n{survexblock}\n{survexblock.survexfile}\n{survexblock.survexfile.path}'
            DataIssue.objects.create(parser='survexleg', message=message, url=get_offending_filename(survexblock.survexfile.path))
            lclino = invalid_clino

@@ -668,17 +671,24 @@ class LoadingSurvex():
                        datastar["tape"] = i-1
                self.datastar = copy.deepcopy(datastar)
                return
-        elif ls[0] == "cartesian" or ls[0] == "nosurvey" or ls[0] == "diving" or ls[0] == "cylpolar" or ls[0] == "passage":
-            # message = " ! - *data {}  blocks ignored. {}|{}"   '{}' .format(ls[0].upper(), survexblock.name, survexblock.survexpath, args)
+        elif ls[0] == "passage" or ls[0] == "nosurvey" or ls[0] == "diving" or ls[0] == "cylpolar":
+            #message = " ! - *data {}  blocks ignored. {}|{}"   '{}' .format(ls[0].upper(), survexblock.name, survexblock.survexpath, args)
            # print(message)
-            # print(message,file=sys.stderr)
-            # DataIssue.objects.create(parser='survex', message=message)
+            #print(message,file=sys.stderr)
+            #DataIssue.objects.create(parser='survex', message=message)
+            self.datastar["type"] = ls[0]
+        elif ls[0] == "cartesian": # We should not ignore this ?! Default for Germans ?
+            #message = " ! - *data {}  blocks ignored. {}|{}"   '{}' .format(ls[0].upper(), survexblock.name, survexblock.survexpath, args)
+            # print(message)
+            #print(message,file=sys.stderr)
+            #DataIssue.objects.create(parser='survex', message=message)
            self.datastar["type"] = ls[0]
        else:
            message = " ! - Unrecognised *data statement '{}' {}|{}".format(args, survexblock.name, survexblock.survexpath)
            print(message)
            print(message,file=sys.stderr)
            DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+            self.datastar["type"] = ls[0]

    def LoadSurvexFlags(self, args):
        # Valid flags are DUPLICATE, SPLAY, and SURFACE, and a flag may be preceded with NOT to turn it off.
@@ -779,11 +789,11 @@ class LoadingSurvex():
           # Yes we didn't find this cave, but we know it is a pending one. So not an error.
           # print(f'! ALREADY PENDING {caveid}',file=sys.stderr)
           return
-            
-        message = f" ! Error: {caveid} not a cave nor ignorable. headpath:'{headpath}' while parsing '{includelabel=}.svx' at depth:[{len(depth)}].  ignore prefix list:'{self.ignoreprefix}'"
+        
+        message = f" ! Warning: cave identifier '{caveid}' (guessed from file path) is not a known cave.  Need to add to expoweb/cave_data/pending.txt ?  In '{includelabel}.svx' at depth:[{len(depth)}]."
        print("\n"+message)
        print("\n"+message,file=sys.stderr)
-        DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(headpath))
+        DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(includelabel))
        print(f' # datastack in  LoadSurvexFile:{includelabel} type:', end="",file=sys.stderr)
        for dict in self.datastack:
            print(f'<{dict["type"].upper()}   >', end="",file=sys.stderr)
@@ -1190,6 +1200,7 @@ class LoadingSurvex():
        and reads only the *include and *begin and *end statements. It produces a linearised
        list of the include tree and detects blocks included more than once.
        """
+        global stop_dup_warning
        thissvxline = 0
        indent = " " * self.depthinclude
        sys.stderr.flush();
@@ -1202,18 +1213,23 @@ class LoadingSurvex():

        if path in self.svxfileslist:
            # We have already used os.normpath() so this is OK. "/../" and "//" have been simplified already.
-            message = f" * Warning. Duplicate detected. We have already seen this *include '{path}' from another survex file. Detected at callcount:{self.callcount} depth:{self.depthinclude}"
-            print(message)
-            print(message,file=flinear)
-            print("\n"+message,file=sys.stderr)
-            DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(path))
-            if self.svxfileslist.count(path) > 20:
-                message = " ! ERROR. Survex file already *included 20x. Probably an infinite loop so fix your *include statements that include this. Aborting. {}".format(path)
+            if stop_dup_warning:
+                #print("D",end="", file=sys.stderr)
+                pass
+            else:
+                message = f" * Warning. Duplicate detected. We have already seen this *include '{path}' from another survex file. Detected at callcount:{self.callcount} depth:{self.depthinclude}"
                print(message)
                print(message,file=flinear)
-                print(message,file=sys.stderr)
+                #print(message,file=sys.stderr)
+                DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(path))
+            if self.svxfileslist.count(path) > 2:
+                message = " ! ERROR. Should have been caught before this. Survex file already *included 2x. Probably an infinite loop so fix your *include statements that include this. Aborting. {}".format(path)
+                print(message)
+                print(message,file=flinear)
+                #print(message,file=sys.stderr)
                DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(path))
                return
+            return
        self.svxfileslist.append(path)

        try:
@@ -1254,7 +1270,7 @@ class LoadingSurvex():
                    includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args)))

                    fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx")
-                    self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath))
+                    self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path)
                    self.checkUniqueness(os.path.join(settings.SURVEX_DATA, includepath))
                    if os.path.isfile(fullpath):
                        #--------------------------------------------------------
@@ -1280,7 +1296,7 @@ class LoadingSurvex():
                        self.depthinclude -= 1
                        #--------------------------------------------------------
                    else:
-                        message = "    ! ERROR *include file not found for:'{}'".format(includepath)
+                        message = f"    ! ERROR *include file '{includepath}' not found, listed in '{fin.name}'"
                        print(message)
                        print(message,file=sys.stderr)
                        DataIssue.objects.create(parser='survex', message=message,  url=get_offending_filename(path))
@@ -1329,7 +1345,7 @@ class LoadingSurvex():
            print(message)

    
-    def RunSurvexIfNeeded(self,fullpath):
+    def RunSurvexIfNeeded(self,fullpath, calledpath):
        now = time.time()
        cav_t = now - 365*24*3600
        log_t = now - 365*24*3600
@@ -1368,7 +1384,7 @@ class LoadingSurvex():
        outputdir = Path(svxpath).parent

        if not svxpath.is_file(): 
-            message = f' ! BAD survex file "{fullpath}" specified in *include (somewhere).. ' 
+            message = f' ! BAD survex file "{fullpath}" specified in *include in {calledpath} ' 
            DataIssue.objects.create(parser='entrances', message=message)
            print(message)
            return
@@ -1410,6 +1426,7 @@ class LoadingSurvex():
 def FindAndLoadSurvex(survexblockroot):
    """Follows the *include links successively to find files in the whole include tree
    """
+    global stop_dup_warning
    print('  - redirecting stdout to svxblks.log...')
    stdout_orig = sys.stdout
    # Redirect sys.stdout to the file
@@ -1419,22 +1436,21 @@ def FindAndLoadSurvex(survexblockroot):
    survexfileroot = survexblockroot.survexfile # i.e. SURVEX_TOPNAME only
    collatefilename = "_" + survexfileroot.path + ".svx"

+
    svx_scan = LoadingSurvex()
    svx_scan.callcount = 0
    svx_scan.depthinclude = 0
    fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, survexfileroot.path)
    
-    # Rather than do this check for the presence of the .log and .3d files synchronously here,
-    # we should instead run this in a separate thread asynchronously.
    print("  - RunSurvexIfNeeded cavern on '{}'".format(fullpathtotop), file=sys.stderr)
-    svx_scan.RunSurvexIfNeeded(fullpathtotop)
+    svx_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
    svx_scan.checkUniqueness(fullpathtotop)
    
    indent=""
    fcollate = open(collatefilename, 'w')

    mem0 = get_process_memory()
-    print(" - MEM:{:7.2f} MB START".format(mem0),file=sys.stderr)
+    print("  - MEM:{:7.2f} MB START".format(mem0),file=sys.stderr)
    flinear = open('svxlinear.log', 'w')
    flinear.write("    - MEM:{:7.2f} MB START {}\n".format(mem0,survexfileroot.path))
    print("    ", file=sys.stderr,end='')
@@ -1447,6 +1463,7 @@ def FindAndLoadSurvex(survexblockroot):
    from pstats import SortKey
    pr = cProfile.Profile()
    pr.enable()
+    #print(f"###{survexblockroot=} {survexfileroot.path=}",file=sys.stderr)
    #----------------------------------------------------------------
    svx_scan.PushdownStackScan(survexblockroot, survexfileroot.path, finroot, flinear, fcollate)
    #----------------------------------------------------------------
@@ -1455,47 +1472,113 @@ def FindAndLoadSurvex(survexblockroot):
        ps = pstats.Stats(pr, stream=f)
        ps.sort_stats(SortKey.CUMULATIVE)
        ps.print_stats()
-        
+    
    flinear.write("{:2} {} *edulcni {}\n".format(svx_scan.depthinclude, indent, survexfileroot.path))
    fcollate.write(";*edulcni {}\n".format(survexfileroot.path))
    mem1 = get_process_memory()
    flinear.write("\n    - MEM:{:.2f} MB STOP {}\n".format(mem1,survexfileroot.path))
-    flinear.write("    - MEM:{:.3f} MB USED\n".format(mem1-mem0))
-    svxfileslist = svx_scan.svxfileslist
-    flinear.write("    - {:,} survex files in linear include list \n".format(len(svxfileslist)))
-    flinear.close()
-    fcollate.close()
-    
-    print("\n -  {:,} runs of survex 'cavern' refreshing .3d files \n".format(svx_scan.caverncount),file=sys.stderr)
-    
-    svx_scan = None # Hmm. Does this actually delete all the instance variables if they are lists, dicts etc.? 
-    print("\n -  {:,} survex files in linear include list \n".format(len(svxfileslist)),file=sys.stderr)
+    flinear.write("    - MEM:{:.3f} MB ADDITIONALLY USED\n".format(mem1-mem0))
+    flinear.write("    - {:,} survex files in linear include list \n".format(len(svx_scan.svxfileslist)))
+     
+    print("  -  {:,} runs of survex 'cavern' refreshing .3d files".format(svx_scan.caverncount),file=sys.stderr)
+    print("  -  {:,} survex files from tree in linear include list".format(len(svx_scan.svxfileslist)),file=sys.stderr)
       
    mem1 = get_process_memory()
-    print(" - MEM:{:7.2f} MB END ".format(mem0),file=sys.stderr)
-    print(" - MEM:{:7.3f} MB USED".format(mem1-mem0),file=sys.stderr)
+    print("  - MEM:{:7.2f} MB END ".format(mem1),file=sys.stderr)
+    print("  - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)
    
-    a = []
+    #
+    # Find every .svx file under SURVEX_DATA (:loser:) that the *include scan did not reach, minus excluded prefixes
+    #
+    unseens = set()
    b=[]
    
    for p in Path(settings.SURVEX_DATA).rglob('*.svx'):
        if p.is_file():
            po = p.relative_to(Path(settings.SURVEX_DATA))
            pox = po.with_suffix('')
-            if str(pox) not in svxfileslist:
-                print(f"[{pox}]", file=sys.stderr)
-                a.append(pox)
+            if str(pox) not in svx_scan.svxfileslist:
+                # print(f"[{pox}]", file=sys.stderr)
+                unseens.add(pox)
            else:
-                print("'", end=" ", file=sys.stderr)
                b.append(pox)
+    
+    if len(b) != len(svx_scan.svxfileslist):
+        print(f" ! Mismatch. {len(b)} survex files found which should be {len(svx_scan.svxfileslist)} in main tree)", file=sys.stderr)
+     
+    excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", "_unseens"]
+    removals = []
+    for x in unseens:
+        for o in excpts:
+            if  str(x).strip().startswith(o):
+                removals.append(x)
+    for x in removals:
+        unseens.remove(x)
+    print(f"  - {len(unseens)} survex files found which were not included in main tree. ({len(svx_scan.svxfileslist)} in main tree)", file=sys.stderr)
+    print(f" -- Now loading the previously-omitted survex files.", file=sys.stderr)
+    
+    with open(Path(settings.SURVEX_DATA, '_unseens.svx'), 'w') as u: 
+        u.write(f"; {len(unseens):,} survex files not *included by {settings.SURVEX_TOPNAME} (which are {len(svx_scan.svxfileslist):,} files)\n")
+        u.write(f"; autogenerated  by parser/survex.py from databasereset.py on '{datetime.now(timezone.utc)}'\n")
+        u.write(f"; omitting any file beginning with {excpts}\n\n")
+        u.write(f"*begin unseens\n")
+        for x in sorted(unseens):
+            u.write(f"    *include {x}\n")
+        u.write(f"*end unseens\n")
 
-    print("=>", len(a), len(b), len(svxfileslist), file=sys.stderr)
+    survexfileroot = survexblockroot.survexfile # i.e. SURVEX_TOPNAME only
+
+    omit_scan = LoadingSurvex()
+    omit_scan.callcount = 0
+    omit_scan.depthinclude = 0
+    fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, '_unseens.svx')
    
-    for i in [0,1,2,3,4,5]:
-        print(f"==> [{svxfileslist[i]}]", file=sys.stderr)
+    # copy the list to prime the next pass through the files
+    omit_scan.svxfileslist = svx_scan.svxfileslist[:]
+    svx_scan.svxfileslist = [] # free memory
+    svx_scan = None # Hmm. Does this actually delete all the instance variables if they are lists, dicts etc.? 
  
+    print("  - RunSurvexIfNeeded cavern on '{}'".format(fullpathtotop), file=sys.stderr)
+    omit_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
+    omit_scan.checkUniqueness(fullpathtotop)
+ 
+    mem0 = get_process_memory()
+    print("  - MEM:{:7.2f} MB START '_unseens'".format(mem0),file=sys.stderr)
+    #flinear = open('svxlinear.log', 'w')
+    flinear.write(f"    - MEM:{mem0:7.2f} MB START '_unseens'\n")
+    print("    ", file=sys.stderr,end='')
+
+    finroot = open(fullpathtotop)
+    fcollate.write(";*include {}\n".format('_unseens.svx'))
+    flinear.write("{:2} {} *include {}\n".format(omit_scan.depthinclude, indent, '_unseens'))
+    stop_dup_warning = True
+    #----------------------------------------------------------------
+    omit_scan.PushdownStackScan(survexblockroot, '_unseens', finroot, flinear, fcollate)
+    #----------------------------------------------------------------
+    stop_dup_warning = False
+
+    flinear.write("{:2} {} *edulcni {}\n".format(omit_scan.depthinclude, indent, '_unseens'))
+    fcollate.write(";*edulcni {}\n".format('_unseens.svx'))
+    mem1 = get_process_memory()
+    flinear.write("\n    - MEM:{:.2f} MB STOP {} OMIT\n".format(mem1,'_unseens.svx'))
+    flinear.write("    - MEM:{:.3f} MB ADDITIONALLY USED OMIT\n".format(mem1-mem0))
+    flinear.write("    - {:,} survex files in linear include list OMIT \n".format(len(omit_scan.svxfileslist)))
    
-    svxfileslist = [] # free memory
+    flinear.close()
+    fcollate.close()
+    
+    print("\n -  {:,} runs of survex 'cavern' refreshing .3d files in the unseen list \n".format(omit_scan.caverncount),file=sys.stderr)
+    
+    print("\n -  {:,} survex files in linear include list including previously unseen ones \n".format(len(omit_scan.svxfileslist)),file=sys.stderr)
+    omit_scan = None # Hmm. Does this actually delete all the instance variables if they are lists, dicts etc.? 
+      
+    mem1 = get_process_memory()
+    print("  - MEM:{:7.2f} MB END ".format(mem1),file=sys.stderr)
+    print("  - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)
+
+ 
+ 
+ 
 
    # Before doing this, it would be good to identify the *equate and *entrance we need that are relevant to the
    # entrance locations currently loaded after this by LoadPos(), but could better be done before ?
@@ -1503,6 +1586,8 @@ def FindAndLoadSurvex(survexblockroot):
   
    print('\n  - Loading All Survex Blocks (LinearLoad)',file=sys.stderr)
    svx_load = LoadingSurvex()
+    mem1 = get_process_memory()
+    print("  - MEM:{:7.2f} MB after creating empty loading object.".format(mem1),file=sys.stderr)

    svx_load.survexdict[survexfileroot.survexdirectory] = []
    svx_load.survexdict[survexfileroot.survexdirectory].append(survexfileroot)
@@ -1525,7 +1610,7 @@ def FindAndLoadSurvex(survexblockroot):
        # ps.print_stats()

    print("\n - MEM:{:7.2f} MB STOP".format(mem1),file=sys.stderr)
-    print(" - MEM:{:7.3f} MB USED".format(mem1-mem0),file=sys.stderr)
+    print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)

    # Close the logging file, Restore sys.stdout to our old saved file handle
    sys.stdout.close()
@@ -1536,12 +1621,12 @@ def FindAndLoadSurvex(survexblockroot):
    legsnumber = svx_load.legsnumber
    mem1 = get_process_memory()

-    print("  - Number of SurvexDirectories: {}".format(len(svx_load.survexdict)))
+    print("  - Number of SurvexDirectories: {:,}".format(len(svx_load.survexdict)))
    tf=0
    for d in svx_load.survexdict:
        tf += len(svx_load.survexdict[d])
-    print("  - Number of SurvexFiles: {}".format(tf))
-    print(f"  - Number of Survex legs: {legsnumber}")
+    print(f"  - Number of SurvexFiles: {tf:,}")
+    print(f"  - Number of Survex legs: {legsnumber:,}")
    svx_load = None

    return legsnumber
@@ -1562,6 +1647,14 @@ def MakeSurvexFileRoot():
    fileroot.survexdirectory = directoryroot # i.e. SURVEX_DATA/SURVEX_TOPNAME
    fileroot.save() # mutually dependent objects need a double-save like this
    return fileroot
+    
+def MakeOmitFileRoot(fn):
+    """Create and save a SurvexFile with path=fn (e.g. "_unseens.svx") and cave=None, attached to the
+    SurvexDirectory looked up (not created here) by path=settings.SURVEX_DATA; return that SurvexFile."""
+    fileroot = SurvexFile(path=fn, cave=None)
+    fileroot.survexdirectory = SurvexDirectory.objects.get(path=settings.SURVEX_DATA)
+    fileroot.save() 
+    return fileroot

 def LoadSurvexBlocks():

@@ -1571,7 +1664,7 @@ def LoadSurvexBlocks():
    SurvexDirectory.objects.all().delete()
    SurvexPersonRole.objects.all().delete()
    SurvexStation.objects.all().delete()
-    print(" - survex Data Issues flushed")
+    print("  - survex Data Issues flushed")
    DataIssue.objects.filter(parser='survex').delete()
    DataIssue.objects.filter(parser='svxdate').delete()
    DataIssue.objects.filter(parser='survexleg').delete()
@@ -1588,9 +1681,12 @@ def LoadSurvexBlocks():
    # fix by restarting db on server
    # sudo service mariadb stop
    # sudo service mariadb start
-
-
    survexblockroot.save()
+    
+    omitsfileroot = MakeOmitFileRoot("_unseens.svx")
+    survexomitsroot = SurvexBlock(name=OMITBLOCK, survexpath="", cave=None, survexfile=omitsfileroot, 
+        legsall=0, legslength=0.0)
+    survexomitsroot.save()  

    print(' - Loading Survex Blocks...')
    memstart = get_process_memory()