From 7e47fe1f30e68bff5a31caee8cdf356d0f802888 Mon Sep 17 00:00:00 2001
From: Philip Sargent <philip.sargent@klebos.com>
Date: Wed, 5 Oct 2022 21:11:18 +0300
Subject: [PATCH] Parse all files, not just those in the *include tree

---
 parsers/caves.py  |   5 +-
 parsers/survex.py | 208 +++++++++++++++++++++++++++++++++-------------
 2 files changed, 156 insertions(+), 57 deletions(-)

diff --git a/parsers/caves.py b/parsers/caves.py
index 634434c..9458d7a 100644
--- a/parsers/caves.py
+++ b/parsers/caves.py
@@ -457,7 +457,7 @@ def readcaves():
         DataIssue.objects.filter(parser='caves ok').delete()
         DataIssue.objects.filter(parser='entrances').delete()
         
-        print(" - Creating Areas 1623, 1624 and 1626")
+        print(" - Creating Areas 1623, 1624, 1627 and 1626")
         # This crashes on the server with MariaDB even though a null parent is explicitly allowed.
         area_1623= Area.objects.create(short_name = "1623", super=None)
         print(" - Saving Area 1623")
@@ -468,6 +468,9 @@ def readcaves():
         area_1626= Area.objects.create(short_name = "1626", super=None)
         print(" - Saving Area 1626")
         area_1626.save()
+        area_1627= Area.objects.create(short_name = "1627", super=None)
+        print(" - Saving Area 1627")
+        area_1627.save()
 
 
     with transaction.atomic():
diff --git a/parsers/survex.py b/parsers/survex.py
index d37fea7..2740213 100644
--- a/parsers/survex.py
+++ b/parsers/survex.py
@@ -6,7 +6,7 @@ import copy
 import subprocess
 
 from pathlib import Path
-from datetime import datetime, timedelta, date
+from datetime import datetime, timedelta, date, timezone
 
 from django.utils.timezone import get_current_timezone
 from django.utils.timezone import make_aware
@@ -39,9 +39,12 @@ todo = '''Also walk the entire tree in the :loser: repo looking for unconnected
         
 '''
 survexblockroot = None
+survexomitsroot = None
 ROOTBLOCK = "rootblock"
+OMITBLOCK = "omitblock"
 METRESINFEET = 3.28084
 
+stop_dup_warning = False
 debugprint = False # Turns on debug printout for just one *include file
 debugprinttrigger = "!"
 # debugprinttrigger = "caves-1623/40/old/EisSVH"
@@ -182,7 +185,7 @@ class LoadingSurvex():
     callcount = 0
     caverncount = 0
     ignoreprefix = ["surface", "kataster", "fixedpts", "gpx"]
-    ignorenoncave = ["caves-1623", "caves-1626", "caves-1623/2007-neu"]
+    ignorenoncave = ["caves-1623", "caves-1623/2007-NEU","caves-1626", "caves-1624", "caves-1627", "fixedpts/gps/gps00raw", ""]
     includedfilename =""
     currentsurvexblock = None
     currentsurvexfile = None
@@ -344,7 +347,7 @@ class LoadingSurvex():
         the rest is discarded after error-checking.
         Now skipping the error checking - returns as soon as the leg is not one we count.
         
-        REPLACE ALL THIS by reading the .log output of cavern for the file
+        REPLACE ALL THIS by reading the .log output of cavern for the file. But we need the lengths per Block, not by File.  Hmm.
         """
         invalid_clino = 180.0
         invalid_compass = 720.0
@@ -457,7 +460,7 @@ class LoadingSurvex():
             print(("! Clino misread in", survexblock.survexfile.path))
             print(("  datastar:", datastar))
             print(("  Line:", ls))
-            message = ' ! Value Error: Clino misread in line %s in %s' % (ls, survexblock.survexfile.path)
+            message = f' ! Value Error: Clino misread in line \'{sline.lower()}\' {datastar=} {self.datastar=} {ls=} in\n{survexblock}\n{survexblock.survexfile}\n{survexblock.survexfile.path}'
             DataIssue.objects.create(parser='survexleg', message=message, url=get_offending_filename(survexblock.survexfile.path))
             lclino = invalid_clino
 
@@ -668,17 +671,24 @@ class LoadingSurvex():
                         datastar["tape"] = i-1
                 self.datastar = copy.deepcopy(datastar)
                 return
-        elif ls[0] == "cartesian" or ls[0] == "nosurvey" or ls[0] == "diving" or ls[0] == "cylpolar" or ls[0] == "passage":
-            # message = " ! - *data {}  blocks ignored. {}|{}"   '{}' .format(ls[0].upper(), survexblock.name, survexblock.survexpath, args)
+        elif ls[0] == "passage" or ls[0] == "nosurvey" or ls[0] == "diving" or ls[0] == "cylpolar":
+            #message = " ! - *data {}  blocks ignored. {}|{}"   '{}' .format(ls[0].upper(), survexblock.name, survexblock.survexpath, args)
             # print(message)
-            # print(message,file=sys.stderr)
-            # DataIssue.objects.create(parser='survex', message=message)
+            #print(message,file=sys.stderr)
+            #DataIssue.objects.create(parser='survex', message=message)
+            self.datastar["type"] = ls[0]
+        elif ls[0] == "cartesian": # We should not ignore this ?! Default for Germans ?
+            #message = " ! - *data {}  blocks ignored. {}|{}"   '{}' .format(ls[0].upper(), survexblock.name, survexblock.survexpath, args)
+            # print(message)
+            #print(message,file=sys.stderr)
+            #DataIssue.objects.create(parser='survex', message=message)
             self.datastar["type"] = ls[0]
         else:
             message = " ! - Unrecognised *data statement '{}' {}|{}".format(args, survexblock.name, survexblock.survexpath)
             print(message)
             print(message,file=sys.stderr)
             DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+            self.datastar["type"] = ls[0]
 
     def LoadSurvexFlags(self, args):
         # Valid flags are DUPLICATE, SPLAY, and SURFACE, and a flag may be preceded with NOT to turn it off.
@@ -779,11 +789,11 @@ class LoadingSurvex():
            # Yes we didn't find this cave, but we know it is a pending one. So not an error.
            # print(f'! ALREADY PENDING {caveid}',file=sys.stderr)
            return
-            
-        message = f" ! Error: {caveid} not a cave nor ignorable. headpath:'{headpath}' while parsing '{includelabel=}.svx' at depth:[{len(depth)}].  ignore prefix list:'{self.ignoreprefix}'"
+        
+        message = f" ! Warning: cave identifier '{caveid}' (guessed from file path) is not a known cave.  Need to add to expoweb/cave_data/pending.txt ?  In '{includelabel}.svx' at depth:[{len(depth)}]."
         print("\n"+message)
         print("\n"+message,file=sys.stderr)
-        DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(headpath))
+        DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(includelabel))
         print(f' # datastack in  LoadSurvexFile:{includelabel} type:', end="",file=sys.stderr)
         for dict in self.datastack:
             print(f'<{dict["type"].upper()}   >', end="",file=sys.stderr)
@@ -1190,6 +1200,7 @@ class LoadingSurvex():
         and reads only the *include and *begin and *end statements. It produces a linearised
         list of the include tree and detects blocks included more than once.
         """
+        global stop_dup_warning
         thissvxline = 0
         indent = " " * self.depthinclude
         sys.stderr.flush();
@@ -1202,18 +1213,23 @@ class LoadingSurvex():
 
         if path in self.svxfileslist:
             # We have already used os.normpath() so this is OK. "/../" and "//" have been simplified already.
-            message = f" * Warning. Duplicate detected. We have already seen this *include '{path}' from another survex file. Detected at callcount:{self.callcount} depth:{self.depthinclude}"
-            print(message)
-            print(message,file=flinear)
-            print("\n"+message,file=sys.stderr)
-            DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(path))
-            if self.svxfileslist.count(path) > 20:
-                message = " ! ERROR. Survex file already *included 20x. Probably an infinite loop so fix your *include statements that include this. Aborting. {}".format(path)
+            if stop_dup_warning:
+                #print("D",end="", file=sys.stderr)
+                pass
+            else:
+                message = f" * Warning. Duplicate detected. We have already seen this *include '{path}' from another survex file. Detected at callcount:{self.callcount} depth:{self.depthinclude}"
                 print(message)
                 print(message,file=flinear)
-                print(message,file=sys.stderr)
+                #print(message,file=sys.stderr)
+                DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(path))
+            if self.svxfileslist.count(path) > 2:
+                message = " ! ERROR. Should have been caught before this. Survex file already *included 2x. Probably an infinite loop so fix your *include statements that include this. Aborting. {}".format(path)
+                print(message)
+                print(message,file=flinear)
+                #print(message,file=sys.stderr)
                 DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(path))
                 return
+            return
         self.svxfileslist.append(path)
 
         try:
@@ -1254,7 +1270,7 @@ class LoadingSurvex():
                     includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args)))
 
                     fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx")
-                    self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath))
+                    self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path)
                     self.checkUniqueness(os.path.join(settings.SURVEX_DATA, includepath))
                     if os.path.isfile(fullpath):
                         #--------------------------------------------------------
@@ -1280,7 +1296,7 @@ class LoadingSurvex():
                         self.depthinclude -= 1
                         #--------------------------------------------------------
                     else:
-                        message = "    ! ERROR *include file not found for:'{}'".format(includepath)
+                        message = f"    ! ERROR *include file '{includepath}' not found, listed in '{fin.name}'"
                         print(message)
                         print(message,file=sys.stderr)
                         DataIssue.objects.create(parser='survex', message=message,  url=get_offending_filename(path))
@@ -1329,7 +1345,7 @@ class LoadingSurvex():
             print(message)
 
     
-    def RunSurvexIfNeeded(self,fullpath):
+    def RunSurvexIfNeeded(self,fullpath, calledpath):
         now = time.time()
         cav_t = now - 365*24*3600
         log_t = now - 365*24*3600
@@ -1368,7 +1384,7 @@ class LoadingSurvex():
         outputdir = Path(svxpath).parent
 
         if not svxpath.is_file(): 
-            message = f' ! BAD survex file "{fullpath}" specified in *include (somewhere).. ' 
+            message = f' ! BAD survex file "{fullpath}" specified in *include in {calledpath} ' 
             DataIssue.objects.create(parser='entrances', message=message)
             print(message)
             return
@@ -1410,6 +1426,7 @@ class LoadingSurvex():
 def FindAndLoadSurvex(survexblockroot):
     """Follows the *include links successively to find files in the whole include tree
     """
+    global stop_dup_warning
     print('  - redirecting stdout to svxblks.log...')
     stdout_orig = sys.stdout
     # Redirect sys.stdout to the file
@@ -1419,22 +1436,21 @@ def FindAndLoadSurvex(survexblockroot):
     survexfileroot = survexblockroot.survexfile # i.e. SURVEX_TOPNAME only
     collatefilename = "_" + survexfileroot.path + ".svx"
 
+
     svx_scan = LoadingSurvex()
     svx_scan.callcount = 0
     svx_scan.depthinclude = 0
     fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, survexfileroot.path)
     
-    # Rather than do this check for the presence of the .log and .3d files synchronously here,
-    # we should instead run this in a separate thread asynchronously.
     print("  - RunSurvexIfNeeded cavern on '{}'".format(fullpathtotop), file=sys.stderr)
-    svx_scan.RunSurvexIfNeeded(fullpathtotop)
+    svx_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
     svx_scan.checkUniqueness(fullpathtotop)
     
     indent=""
     fcollate = open(collatefilename, 'w')
 
     mem0 = get_process_memory()
-    print(" - MEM:{:7.2f} MB START".format(mem0),file=sys.stderr)
+    print("  - MEM:{:7.2f} MB START".format(mem0),file=sys.stderr)
     flinear = open('svxlinear.log', 'w')
     flinear.write("    - MEM:{:7.2f} MB START {}\n".format(mem0,survexfileroot.path))
     print("    ", file=sys.stderr,end='')
@@ -1447,6 +1463,7 @@ def FindAndLoadSurvex(survexblockroot):
     from pstats import SortKey
     pr = cProfile.Profile()
     pr.enable()
+    #print(f"###{survexblockroot=} {survexfileroot.path=}",file=sys.stderr)
     #----------------------------------------------------------------
     svx_scan.PushdownStackScan(survexblockroot, survexfileroot.path, finroot, flinear, fcollate)
     #----------------------------------------------------------------
@@ -1455,47 +1472,113 @@ def FindAndLoadSurvex(survexblockroot):
         ps = pstats.Stats(pr, stream=f)
         ps.sort_stats(SortKey.CUMULATIVE)
         ps.print_stats()
-        
+    
     flinear.write("{:2} {} *edulcni {}\n".format(svx_scan.depthinclude, indent, survexfileroot.path))
     fcollate.write(";*edulcni {}\n".format(survexfileroot.path))
     mem1 = get_process_memory()
     flinear.write("\n    - MEM:{:.2f} MB STOP {}\n".format(mem1,survexfileroot.path))
-    flinear.write("    - MEM:{:.3f} MB USED\n".format(mem1-mem0))
-    svxfileslist = svx_scan.svxfileslist
-    flinear.write("    - {:,} survex files in linear include list \n".format(len(svxfileslist)))
-    flinear.close()
-    fcollate.close()
-    
-    print("\n -  {:,} runs of survex 'cavern' refreshing .3d files \n".format(svx_scan.caverncount),file=sys.stderr)
-    
-    svx_scan = None # Hmm. Does this actually delete all the instance variables if they are lists, dicts etc.? 
-    print("\n -  {:,} survex files in linear include list \n".format(len(svxfileslist)),file=sys.stderr)
+    flinear.write("    - MEM:{:.3f} MB ADDITIONALLY USED\n".format(mem1-mem0))
+    flinear.write("    - {:,} survex files in linear include list \n".format(len(svx_scan.svxfileslist)))
+     
+    print("  -  {:,} runs of survex 'cavern' refreshing .3d files".format(svx_scan.caverncount),file=sys.stderr)
+    print("  -  {:,} survex files from tree in linear include list".format(len(svx_scan.svxfileslist)),file=sys.stderr)
        
     mem1 = get_process_memory()
-    print(" - MEM:{:7.2f} MB END ".format(mem0),file=sys.stderr)
-    print(" - MEM:{:7.3f} MB USED".format(mem1-mem0),file=sys.stderr)
+    print("  - MEM:{:7.2f} MB END ".format(mem1),file=sys.stderr)
+    print("  - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)
     
-    a = []
+    #
+    # Process all the omitted files in :loser: with some exceptions
+    #
+    unseens = set()
     b=[]
     
     for p in Path(settings.SURVEX_DATA).rglob('*.svx'):
         if p.is_file():
             po = p.relative_to(Path(settings.SURVEX_DATA))
             pox = po.with_suffix('')
-            if str(pox) not in svxfileslist:
-                print(f"[{pox}]", file=sys.stderr)
-                a.append(pox)
+            if str(pox) not in svx_scan.svxfileslist:
+                # print(f"[{pox}]", file=sys.stderr)
+                unseens.add(pox)
             else:
-                print("'", end=" ", file=sys.stderr)
                 b.append(pox)
+    
+    if len(b) != len(svx_scan.svxfileslist):
+        print(f" ! Mismatch. {len(b)} survex files found which should be {len(svx_scan.svxfileslist)} in main tree)", file=sys.stderr)
+     
+    excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", "_unseens"]
+    removals = []
+    for x in unseens:
+        for o in excpts:
+            if  str(x).strip().startswith(o):
+                removals.append(x)
+    for x in removals:
+        unseens.remove(x)
+    print(f"  - {len(unseens)} survex files found which were not included in main tree. ({len(svx_scan.svxfileslist)} in main tree)", file=sys.stderr)
+    print(f" -- Now loading the previously-omitted survex files.", file=sys.stderr)
+    
+    with open(Path(settings.SURVEX_DATA, '_unseens.svx'), 'w') as u: 
+        u.write(f"; {len(unseens):,} survex files not *included by {settings.SURVEX_TOPNAME} (which are {len(svx_scan.svxfileslist):,} files)\n")
+        u.write(f"; autogenerated  by parser/survex.py from databasereset.py on '{datetime.now(timezone.utc)}'\n")
+        u.write(f"; omitting any file beginning with {excpts}\n\n")
+        u.write(f"*begin unseens\n")
+        for x in sorted(unseens):
+            u.write(f"    *include {x}\n")
+        u.write(f"*end unseens\n")
  
-    print("=>", len(a), len(b), len(svxfileslist), file=sys.stderr)
+    survexfileroot = survexblockroot.survexfile # i.e. SURVEX_TOPNAME only
+
+    omit_scan = LoadingSurvex()
+    omit_scan.callcount = 0
+    omit_scan.depthinclude = 0
+    fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, '_unseens.svx')
     
-    for i in [0,1,2,3,4,5]:
-        print(f"==> [{svxfileslist[i]}]", file=sys.stderr)
+    # copy the list to prime the next pass through the files
+    omit_scan.svxfileslist = svx_scan.svxfileslist[:]
+    svx_scan.svxfileslist = [] # free memory
+    svx_scan = None # Hmm. Does this actually delete all the instance variables if they are lists, dicts etc.? 
   
+    print("  - RunSurvexIfNeeded cavern on '{}'".format(fullpathtotop), file=sys.stderr)
+    omit_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
+    omit_scan.checkUniqueness(fullpathtotop)
+ 
+    mem0 = get_process_memory()
+    print("  - MEM:{:7.2f} MB START '_unseens'".format(mem0),file=sys.stderr)
+    #flinear = open('svxlinear.log', 'w')
+    flinear.write(f"    - MEM:{mem0:7.2f} MB START '_unseens'\n")
+    print("    ", file=sys.stderr,end='')
+
+    finroot = open(fullpathtotop)
+    fcollate.write(";*include {}\n".format('_unseens.svx'))
+    flinear.write("{:2} {} *include {}\n".format(omit_scan.depthinclude, indent, '_unseens'))
+    stop_dup_warning = True
+    #----------------------------------------------------------------
+    omit_scan.PushdownStackScan(survexblockroot, '_unseens', finroot, flinear, fcollate)
+    #----------------------------------------------------------------
+    stop_dup_warning = False
+
+    flinear.write("{:2} {} *edulcni {}\n".format(omit_scan.depthinclude, indent, '_unseens'))
+    fcollate.write(";*edulcni {}\n".format('_unseens.svx'))
+    mem1 = get_process_memory()
+    flinear.write("\n    - MEM:{:.2f} MB STOP {} OMIT\n".format(mem1,'_unseens.svx'))
+    flinear.write("    - MEM:{:.3f} MB ADDITIONALLY USED OMIT\n".format(mem1-mem0))
+    flinear.write("    - {:,} survex files in linear include list OMIT \n".format(len(omit_scan.svxfileslist)))
     
-    svxfileslist = [] # free memory
+    flinear.close()
+    fcollate.close()
+    
+    print("\n -  {:,} runs of survex 'cavern' refreshing .3d files in the unseen list \n".format(omit_scan.caverncount),file=sys.stderr)
+    
+    print("\n -  {:,} survex files in linear include list including previously unseen ones \n".format(len(omit_scan.svxfileslist)),file=sys.stderr)
+    omit_scan = None # Hmm. Does this actually delete all the instance variables if they are lists, dicts etc.? 
+      
+    mem1 = get_process_memory()
+    print("  - MEM:{:7.2f} MB END ".format(mem1),file=sys.stderr)
+    print("  - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)
+
+ 
+ 
+ 
  
     # Before doing this, it would be good to identify the *equate and *entrance we need that are relevant to the
     # entrance locations currently loaded after this by LoadPos(), but could better be done before ?
@@ -1503,6 +1586,8 @@ def FindAndLoadSurvex(survexblockroot):
    
     print('\n  - Loading All Survex Blocks (LinearLoad)',file=sys.stderr)
     svx_load = LoadingSurvex()
+    mem1 = get_process_memory()
+    print("  - MEM:{:7.2f} MB after creating empty loading object.".format(mem1),file=sys.stderr)
 
     svx_load.survexdict[survexfileroot.survexdirectory] = []
     svx_load.survexdict[survexfileroot.survexdirectory].append(survexfileroot)
@@ -1525,7 +1610,7 @@ def FindAndLoadSurvex(survexblockroot):
         # ps.print_stats()
 
     print("\n - MEM:{:7.2f} MB STOP".format(mem1),file=sys.stderr)
-    print(" - MEM:{:7.3f} MB USED".format(mem1-mem0),file=sys.stderr)
+    print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)
 
     # Close the logging file, Restore sys.stdout to our old saved file handle
     sys.stdout.close()
@@ -1536,12 +1621,12 @@ def FindAndLoadSurvex(survexblockroot):
     legsnumber = svx_load.legsnumber
     mem1 = get_process_memory()
 
-    print("  - Number of SurvexDirectories: {}".format(len(svx_load.survexdict)))
+    print("  - Number of SurvexDirectories: {:,}".format(len(svx_load.survexdict)))
     tf=0
     for d in svx_load.survexdict:
         tf += len(svx_load.survexdict[d])
-    print("  - Number of SurvexFiles: {}".format(tf))
-    print(f"  - Number of Survex legs: {legsnumber}")
+    print(f"  - Number of SurvexFiles: {tf:,}")
+    print(f"  - Number of Survex legs: {legsnumber:,}")
     svx_load = None
 
     return legsnumber
@@ -1562,6 +1647,14 @@ def MakeSurvexFileRoot():
     fileroot.survexdirectory = directoryroot # i.e. SURVEX_DATA/SURVEX_TOPNAME
     fileroot.save() # mutually dependent objects need a double-save like this
     return fileroot
+    
+def MakeOmitFileRoot(fn):
+    """Create, save and return a SurvexFile with path=fn and cave=None, attached to the
+    SurvexDirectory whose path is settings.SURVEX_DATA (looked up, not created, here)."""
+    fileroot = SurvexFile(path=fn, cave=None)
+    fileroot.survexdirectory = SurvexDirectory.objects.get(path=settings.SURVEX_DATA)  # NOTE(review): assumes this directory row already exists - confirm
+    fileroot.save() 
+    return fileroot
 
 def LoadSurvexBlocks():
 
@@ -1571,7 +1664,7 @@ def LoadSurvexBlocks():
     SurvexDirectory.objects.all().delete()
     SurvexPersonRole.objects.all().delete()
     SurvexStation.objects.all().delete()
-    print(" - survex Data Issues flushed")
+    print("  - survex Data Issues flushed")
     DataIssue.objects.filter(parser='survex').delete()
     DataIssue.objects.filter(parser='svxdate').delete()
     DataIssue.objects.filter(parser='survexleg').delete()
@@ -1588,9 +1681,12 @@ def LoadSurvexBlocks():
     # fix by restarting db on server
     # sudo service mariadb stop
     # sudo service mariadb start
-
-
     survexblockroot.save()
+    
+    omitsfileroot = MakeOmitFileRoot("_unseens.svx")
+    survexomitsroot = SurvexBlock(name=OMITBLOCK, survexpath="", cave=None, survexfile=omitsfileroot, 
+        legsall=0, legslength=0.0)
+    survexomitsroot.save()  
 
     print(' - Loading Survex Blocks...')
     memstart = get_process_memory()