clean up de-duplication code

2026-02-08 12:27:35 +00:00 · 2023-02-28 16:18:29 +00:00
parent 5067ef2c8c
commit dc03016dbe
1 changed files with 140 additions and 75 deletions
--- a/parsers/survex.py
+++ b/parsers/survex.py
@@ -46,8 +46,10 @@ survexomitsroot = None
 ROOTBLOCK = "rootblock"
 OMITBLOCK = "omitblock"
 METRESINFEET = 3.28084
+UNSEENS = "_unseens.svx"

 stop_dup_warning = False
+dup_includes = 1
 debugprint = False  # Turns on debug printout for just one *include file
 debugprinttrigger = "!"

@@ -260,8 +262,8 @@ class LoadingSurvex:

    rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)")
    rx_comment = re.compile(r"([^;]*?)\s*(?:;\s*(.*))?\n?$")
-    rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$")  # inserted by linear collate ;*include
-    rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$")  # inserted by linear collate ;*edulcni
+    rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$")  # inserted by linear collate ;|*include
+    rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$")  # inserted by linear collate ;|*edulcni
    rx_include = re.compile(r"(?i)^\s*(\*include[\s].*)$")
    rx_include2 = re.compile("(?i)include$")
    rx_commref = re.compile(r"(?i)^\s*ref(?:erence)?[\s.:]*(\d+)\s*#\s*(X)?\s*(\d+)")
@@ -300,7 +302,7 @@ class LoadingSurvex:
    stacksvxfiles = []
    svxfileslist = []
    svxdirs = {}
-    uniquename = {}
+    uniquefile = {}
    expos = {}
    survexdict = {}  # each key is a directory, and its value is a list of files
    lineno = 0
@@ -1163,9 +1165,16 @@ class LoadingSurvex:
        """Creates SurvexFile in the database, and SurvexDirectory if needed
        with links to 'cave'
        Creates a new current survexfile and valid .survexdirectory
-        Inspects the parent folder of the survexfile and uses that to decide if this is a cave we know
+        Inspects the parent folder of the survexfile and uses that to decide if this is 
+        a cave we know.
+        
+        If we see a duplicate cave, this is too late. It has already been included into the
+        long linear file. This needs to be prevented when the long linear file is created.
+        
        The survexblock passed-in is not necessarily the parent. FIX THIS.
        """
+        global dup_includes
+        
        if debugprint:
            print(f" # datastack in  LoadSurvexFile:{svxid} 'type':", end="")
            for dict in self.datastack:
@@ -1173,10 +1182,20 @@ class LoadingSurvex:
            print("")

        depth = " " * self.depthbegin
-        # print("{:2}{}   - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid))
+        print("{:2}{}   - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid))
        headpath = os.path.dirname(svxid)

-        newfile = SurvexFile(path=svxid)
+        newfile, created = SurvexFile.objects.update_or_create(path=svxid)
+        if not created:
+            dup_includes += 1
+            message = f" ! DUP SurvexFile '{svxid}' create attempt in LoadSurvexFile()"
+            print(message)
+            # print(message, file=sys.stderr)
+            stash_data_issue(parser="survex", message=message, url=f"/survexfile/{svxid}")
+            
+            self.currentsurvexfile = newfile
+            return # abort as everything already done for object creation 
+        
        newfile.save()  # until we do this there is no internal id so no foreign key works
        self.currentsurvexfile = newfile
        newdirectory = self.GetSurvexDirectory(headpath)
@@ -1217,7 +1236,11 @@ class LoadingSurvex:
                print(f"'{dict['type'].upper()}'   ", end="")
            print("")

+
    def ProcessIncludeLine(self, included):
+        """As we read the long linear file, we come across lines telling us that the
+        content from this point on is from a particular included file
+        """
        global debugprint
        svxid = included.groups()[0]
        if svxid.lower() == debugprinttrigger.lower():
@@ -1226,7 +1249,9 @@ class LoadingSurvex:
        self.stacksvxfiles.append(self.currentsurvexfile)

    def ProcessEdulcniLine(self, edulcni):
-        """Saves the current survexfile in the db"""
+        """As we read the long linear file, we come across lines telling us that
+        we are about to pop back out of the contents of an included file.
+        Saves the current survexfile object in the db to include the data parsed from it"""
        global debugprint
        svxid = edulcni.groups()[0]
        if debugprint:
@@ -1277,8 +1302,8 @@ class LoadingSurvex:
                    )

        included = self.rx_comminc.match(comment)
-        # ;*include means 'we have been included'; whereas *include means 'proceed to include'
-        # bug, If the original survex file contians the line ;*include then we pick it up ! So fix our special code to be ;|*include
+        # ;|*include means 'we have been included'; whereas *include means 'proceed to include'
+        # No test here to check that this file has not already been included. Ouch.
        if included:
            self.ProcessIncludeLine(included)

@@ -1553,7 +1578,7 @@ class LoadingSurvex:
                self.lineno += 1
                sline, comment = self.rx_comment.match(svxline).groups()
                if comment:
-                    # this catches the ;*include NEWFILE and ;*edulcni ENDOFFILE lines too
+                    # this catches the ;|*include NEWFILE and ;|*edulcni ENDOFFILE lines too
                    self.LoadSurvexComment(survexblock, comment)

                if not sline:
@@ -1616,11 +1641,11 @@ class LoadingSurvex:
                if self.rx_include2.match(cmd):
                # rx_include2 = re.compile("(?i)include$")
                # if re.match("(?i)include$", cmd):
-                    includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args)))
-
+                    includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args))) # normalises path syntax
+                    if self.never_seen(includepath, path):
                        fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx")
                        self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path)
-                    self.checkUniqueness(os.path.join(settings.SURVEX_DATA, includepath))
+                        self.check_unique_name(os.path.join(settings.SURVEX_DATA, includepath))
                        if os.path.isfile(fullpath):
                            # --------------------------------------------------------
                            self.depthinclude += 1
@@ -1733,20 +1758,39 @@ class LoadingSurvex:
            print(message)
            print(message, file=sys.stderr)
            stash_data_issue(parser="survex", message=message, url=None, sb=(path))
+            raise
            return  # skip this survex file and all things *included in it

-    def checkUniqueness(self, fullpath):
-        fn = Path(fullpath).name
-        if fn not in self.uniquename:
-            self.uniquename[fn] = [fullpath]
-        else:
-            self.uniquename[fn].append(fullpath)
-            # This is not an error now that we are moving .3d files to the :loser: directory tree
+    def never_seen(self, incpath, parent):
+        """The _unseen files may include survex files we have already seen, and we do not
+        want to process them again. For the _unseens this is not an error, but for the main
+        *include tree it is an error.
+        """
+ 
+        if incpath in self.uniquefile:
+            self.uniquefile[incpath].append(parent)
+            
            message = (
-                f" NOTE:  non-unique survex filename, '{fn}' - '{self.uniquename[fn]}' #{len(self.uniquename[fn])}"
+                f" DUP:  non-unique survex filepath, '{incpath}' -  #{len(self.uniquefile[incpath])} '{self.uniquefile[incpath]}'"
            )
-            # print(message)
+            print(message)
            # stash_data_issue(parser='survex', message=message)
+            for p in self.uniquefile[incpath]:
+                if p in self.uniquefile:
+                    print(f"{p} <- {self.uniquefile[p]}")
+            return False
+        else:
+            self.uniquefile[incpath] = [parent]
+            return True
+            
+    def check_unique_name(self, fullpath):
+        """This only checks whether the last bit of the name of the survex file is unique,
+        e.g. "bigpitch", not whether the whole path of the survexfile has been seen before.
+        
+        We don't care about this any more.
+        """
+        return
+        

    def RunSurvexIfNeeded(self, fullpath, calledpath):
        now = time.time()
@@ -1843,7 +1887,13 @@ class LoadingSurvex:


 def FindAndLoadSurvex(survexblockroot):
-    """Follows the *include links successively to find files in the whole include tree"""
+    """Follows the *include links successively to find survex files
+    This proceeds in 3 phases:
+    1. The root survex file is read and all the *include files are found, using PushdownStackScan()
+    2. All the other survex files in the :loser: repo are found, and their *includes found,
+       using another PushdownStackScan() [duplicates omitted]
+    3. The combined expanded file containing all the survex data is parsed as a single file,
+       using LinearLoad()"""
    global stop_dup_warning
    print("  - redirecting stdout to svxblks.log...")
    stdout_orig = sys.stdout
@@ -1861,15 +1911,16 @@ def FindAndLoadSurvex(survexblockroot):

    print(f"  - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr)
    svx_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
-    svx_scan.checkUniqueness(fullpathtotop)
+    svx_scan.check_unique_name(fullpathtotop)
+    svx_scan.uniquefile[str(survexfileroot)] = ["0"]

    indent = ""
    fcollate = open(collatefilename, "w")

    mem0 = get_process_memory()
-    print(f"  - MEM:{mem0:7.2f} MB START", file=sys.stderr)
+    print(f"  - MEM:{mem0:7.2f} MB START '{survexfileroot}'", file=sys.stderr)
    flinear = open("svxlinear.log", "w")
-    flinear.write(f"    - MEM:{mem0:7.2f} MB START {survexfileroot.path}\n")
+    flinear.write(f"    - MEM:{mem0:7.2f} MB START '{survexfileroot.path}'\n")
    print("    ", file=sys.stderr, end="")

    finrootname = Path(settings.SURVEX_DATA, survexfileroot.path + ".svx")
@@ -1897,16 +1948,24 @@ def FindAndLoadSurvex(survexblockroot):
    flinear.write(f"\n    - MEM:{mem1:.2f} MB STOP {survexfileroot.path}\n")
    flinear.write(f"    - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED\n")
    flinear.write(f"    - {len(svx_scan.svxfileslist):,} survex files in linear include list \n")
+    flinear.write(f"    - {len(svx_scan.uniquefile):,} unique survex files in linear include list \n")
+    for j in svx_scan.svxfileslist:
+        if j not in svx_scan.uniquefile:
+            flinear.write(f"    - '{j}' {type(j)} not in unique list  \n")            
+    for f in svx_scan.uniquefile:
+        # flinear.write(f"    - '{f}'  {type(f)} {svx_scan.uniquefile[f]}   \n") 
+        if len(svx_scan.uniquefile[f]) > 1:
+            flinear.write(f"    - '{f}' {type(f)} {svx_scan.uniquefile[f]} dup survex files  \n")
           
    print(f"\n  -  {svx_scan.caverncount:,} runs of survex 'cavern' refreshing .3d files", file=sys.stderr)
    print(f"  -  {len(svx_scan.svxfileslist):,} survex files from tree in linear include list", file=sys.stderr)
-
+    print(f"  -  {len(svx_scan.uniquefile):,} unique survex files from tree in linear include list", file=sys.stderr)
    mem1 = get_process_memory()
    print(f"  - MEM:{mem1:7.2f} MB END ", file=sys.stderr)
    print(f"  - MEM:{mem1 - mem0:7.3f} MB ADDITIONALLY USED", file=sys.stderr)
+    
    #
    # Process all the omitted files in :loser: with some exceptions
-    #
    unseens = set()
    b = []

@@ -1926,13 +1985,14 @@ def FindAndLoadSurvex(survexblockroot):
            file=sys.stderr,
        )

-    excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", "_unseens"]
+    unseensroot = re.sub(r"\.svx$", "", UNSEENS)
+    excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", unseensroot]
    removals = []
    for x in unseens:
        for o in excpts:
            if str(x).strip().startswith(o):
                removals.append(x)
-    # special fix for file not actually in survex format
+    # special fix for .svx file not actually in survex format
    unseens.remove(Path("fixedpts/gps/gps00raw"))

    for x in removals:
@@ -1944,7 +2004,7 @@ def FindAndLoadSurvex(survexblockroot):
    check_team_cache()
    print(" -- Now loading the previously-omitted survex files.", file=sys.stderr)

-    with open(Path(settings.SURVEX_DATA, "_unseens.svx"), "w") as u:
+    with open(Path(settings.SURVEX_DATA, UNSEENS), "w") as u:
        u.write(
            f"; {len(unseens):,} survex files not *included by {settings.SURVEX_TOPNAME} (which are {len(svx_scan.svxfileslist):,} files)\n"
        )
@@ -1960,7 +2020,7 @@ def FindAndLoadSurvex(survexblockroot):
    omit_scan = LoadingSurvex()
    omit_scan.callcount = 0
    omit_scan.depthinclude = 0
-    fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, "_unseens.svx")
+    fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, UNSEENS)

    # copy the list to prime the next pass through the files
    omit_scan.svxfileslist = svx_scan.svxfileslist[:]
@@ -1969,32 +2029,35 @@ def FindAndLoadSurvex(survexblockroot):

    print(f"  - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr)
    omit_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
-    omit_scan.checkUniqueness(fullpathtotop)
+    omit_scan.check_unique_name(fullpathtotop)
+    omit_scan.uniquefile[unseensroot] = ["0"]

    mem0 = get_process_memory()
-    print(f"  - MEM:{mem0:7.2f} MB START '_unseens'", file=sys.stderr)
+    print(f"  - MEM:{mem0:7.2f} MB START '{unseensroot}'", file=sys.stderr)
    # flinear = open('svxlinear.log', 'w')
-    flinear.write(f"    - MEM:{mem0:7.2f} MB START '_unseens'\n")
+    flinear.write(f"    - MEM:{mem0:7.2f} MB START '{unseensroot}'\n")
    print("    ", file=sys.stderr, end="")

+    # this is a bit tricky as some unseen files will *include files we have already seen, which 
+    # we should not process again.
    finrootname = fullpathtotop
-    fcollate.write(";*include _unseens.svx\n")
-    flinear.write(f"{omit_scan.depthinclude:2} {indent} *include _unseens\n")
-    stop_dup_warning = True
+    fcollate.write(f";*include {UNSEENS}\n")
+    flinear.write(f"{omit_scan.depthinclude:2} {indent} *include {unseensroot}\n")
+    # stop_dup_warning = True
    # ----------------------------------------------------------------
-    omit_scan.PushdownStackScan(survexblockroot, "_unseens", finrootname, flinear, fcollate)
+    omit_scan.PushdownStackScan(survexblockroot, unseensroot, finrootname, flinear, fcollate)
    # ----------------------------------------------------------------
-    stop_dup_warning = False
+    # stop_dup_warning = False

-    flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni _unseens\n")
-    fcollate.write(";*edulcni _unseens.svx\n")
+    flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni {unseensroot}\n")
+    fcollate.write(f";*edulcni {UNSEENS}\n")
    
    check_team_cache()

    mem1 = get_process_memory()
-    flinear.write(f"\n    - MEM:{mem1:.2f} MB STOP _unseens.svx OMIT\n")
-    flinear.write(f"    - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED OMIT\n")
-    flinear.write(f"    - {len(omit_scan.svxfileslist):,} survex files in linear include list OMIT \n")
+    flinear.write(f"\n    - MEM:{mem1:.2f} MB STOP {UNSEENS} Unseen Oddments\n")
+    flinear.write(f"    - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED Unseen Oddments\n")
+    flinear.write(f"    - {len(omit_scan.svxfileslist):,} survex files in linear include list Unseen Oddments \n")

    flinear.close()
    fcollate.close()
@@ -2085,6 +2148,7 @@ def MakeOmitFileRoot(fn):


 def LoadSurvexBlocks():
+    global dup_includes
    mem1 = get_process_memory()
    print(f"  - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
    start = time.time()
@@ -2129,7 +2193,7 @@ def LoadSurvexBlocks():
    # sudo service mariadb start
    survexblockroot.save()

-    omitsfileroot = MakeOmitFileRoot("_unseens.svx")
+    omitsfileroot = MakeOmitFileRoot(UNSEENS)
    survexomitsroot = SurvexBlock(
        name=OMITBLOCK, survexpath="", survexfile=omitsfileroot, legsall=0, legslength=0.0
    )
@@ -2157,5 +2221,6 @@ def LoadSurvexBlocks():
    store_data_issues()
    # duration = time.time() - start
    # print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
+    print(f" - Duplicate *includes = {dup_includes}")
    print(" - Loaded All Survex Blocks.")