refactoring cave id stuff in survex parser - working

2025-12-16 14:27:03 +00:00 · 2023-09-08 18:35:47 +03:00
parent 28d1092956
commit 7779544c0c
1 changed files with 70 additions and 68 deletions
--- a/parsers/survex.py
+++ b/parsers/survex.py
@@ -22,19 +22,18 @@ It also scans the Loser repo for all the svx files, which it loads individually
 """

 todo = """
+- Obscure bug in the *team inheritance and rootblock initialization needs tracking down,
+  probably in the team cache which should NOT be global, but should be an instance variable of 
+  LoadingSurvex
+  
 - Lots to do to cut down on unnecessary .save() calls to avoid hitting the db so much. Should
  speed it up noticably. 
  
- Obscure bug in the *team inheritance and rootblock initialization needs tracking down
-  
 - Learn to use Django .select_related() and .prefetch_related() to speed things up
  https://zerotobyte.com/how-to-use-django-select-related-and-prefetch-related/
        
 - LoadSurvexFile() Creates a new current survexfile 
-        The survexblock passed-in is not necessarily the parent. FIX THIS.
-        
- Finish writing the parse_one_file() function for survexfiles edited online. Perhaps
-  easier if this is a completely new file rather than an existing file.. nasty.
+        The survexblock passed-in is not necessarily the survex parent. FIX THIS.
                
 - When Olly implements LEG in the  'dump3d --legs' utility, then we can use that to get the length of
  all the legs in a survex block instead of adding them up oursleves. Which means that we can
@@ -62,27 +61,10 @@ class SurvexLeg:
    compass = 0.0
    clino = 0.0
    
-def IdentifyCave(cavepath):
-    """Given a file path for a survex file, or a survex-block path,
-    return the cave object
-        """
-    caveslist = GetCaveLookup()
-    if cavepath.lower() in caveslist:
-        return caveslist[cavepath.lower()]
-    # TO DO - this predates the big revision to Gcavelookup so look at this again carefully
-    path_match = LoadingSurvex.rx_cave.search(cavepath) # use as Class method
-    if path_match:
-        sluggy = f"{path_match.group(1)}-{path_match.group(2)}"
-        guesses = [sluggy.lower(), path_match.group(2).lower()]
-        for g in guesses:
-            if g in caveslist:
-                caveslist[cavepath] = caveslist[g]
-                return caveslist[g]
-        print(f"    ! Failed to find cave for {cavepath.lower()}")
-    else:
-        # not a cave, but that is fine.
-        # print(f'    ! No regex(standard identifier) cave match for {cavepath.lower()}')
-        return None
+        
+
+
+

 def datewallet(w, earliest):
    """Gets the date of the youngest survexblock associated with the wallet
@@ -141,7 +123,8 @@ def get_offending_filename(path):
    """
    return "/survexfile/" + path + ".svx"

-trip_people_cache = {} # per survexblock, so robust wrt PUSH/POP begin/end
+# THIS SHOULD NOT BE GLOBAL ! Should be per instance of file loader.
+trip_people_cache = {}   # indexed by  survexblock, so never needs cleaning out
 def get_team_on_trip(survexblock):
    """Uses a cache to avoid a database query if it doesn't need to.
    Only used for complete team."""
@@ -165,8 +148,9 @@ def get_people_on_trip(survexblock):
    
    return list(set(people))

-trip_person_record = {}  # per survexblock, so robust wrt PUSH/POP begin/end
-trip_team_cache = {}  # per survexblock, so robust wrt PUSH/POP begin/end
+# THIS SHOULD NOT BE GLOBAL ! Should be per instance of file loader
+trip_person_record = {}  # indexed by (survexblock, personexpedition) - so never needs cleaning out
+trip_team_cache = {}  #  indexed by  survexblock, so never needs cleaning out
 def put_person_on_trip(survexblock, personexpedition, tm):
    """Uses a cache to avoid a database query if it doesn't need to.
    Only used for a single person"""
@@ -206,18 +190,17 @@ def confirm_team_on_trip(survexblock):
    SurvexPersonRole.objects.bulk_create(trip_team_cache[survexblock])
    trip_team_cache[survexblock] = [] # in database now, so empty cache    
    
-def check_team_cache():
+def check_team_cache(label=None):
    global trip_team_cache
-
    message = f"! check_team_cache() called.. "
    print(message)
-
+    print(message, file=sys.stderr)
    for block in trip_team_cache:
-        message = f"! *team CACHEFAIL, already created {block.survexfile.path} ({block})  "
+        message = f"! *team CACHEFAIL, trip_team_cache {block.survexfile.path} ({block}). label:{label}"
        print(message)
+        print(message, file=sys.stderr)
    
-    
-person_pending_cache = {}  # per survexblock, so robust wrt PUSH/POP begin/end
+person_pending_cache = {}  # indexed per survexblock, so robust wrt PUSH/POP begin/end
 def add_to_pending(survexblock, tm):
    """Collects team names before we have a date so cannot validate against 
    expo attendance yet"""
@@ -1180,7 +1163,45 @@ class LoadingSurvex:
            self.svxprim[headpath.lower()] = primary
        return self.svxprim[headpath.lower()]

-    def ReportNonCaveIncludes(self, headpath, includelabel, depth):
+    def IdentifyCave(self, cavepath, svxid, depth):
+        """Return the cave for a survex file path (e.g. /1626/107/107.svx) or block path.
+
+        cavepath is looked up in GetCaveLookup(), then via self.rx_cave as a slug e.g. 1626-107.
+        svxid and depth are only used when the path matches no cave identifier.
+        Returns None when no cave is found, including after a bespoke cave has been created.
+        """
+        caveslist = GetCaveLookup()
+        if cavepath.lower() in caveslist: # will only work after we load in full paths as indexes, see below
+            return caveslist[cavepath.lower()]
+        #     rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)")
+        path_match = self.rx_cave.search(cavepath) # use as Class method. 
+        if path_match:
+            sluggy = f"{path_match.group(1)}-{path_match.group(2)}"
+            # guesses = [sluggy.lower(), path_match.group(2).lower()] # this looks for JUST "107" and ignores 1626..
+            guesses = [sluggy.lower()] # full 1626-107 search, don't use short-forms
+            for g in guesses:
+                if g in caveslist:
+                    caveslist[cavepath] = caveslist[g] # set "caves-1626/107/107.svx" as index to cave 1626-107
+                    return caveslist[g]
+            print(f"    ! Failed to find cave for {cavepath.lower()}", file=sys.stderr)
+        else:
+            # not a cave, but that is fine.
+            if not self.is_it_already_pending(cavepath, svxid, depth):
+                # It is too late to add it to the pending caves list here, they were already 
+                # processed in parsers/caves.py So we have to do a bespoke creation.
+                cave = create_new_cave(svxid)  # NOTE(review): result unused, None is still returned
+                # Same path-based guess as is_it_already_pending(); its locals are not visible here.
+                caveid = f"{cavepath[6:10]}-{cavepath[11:]}".upper()
+                shortid = caveid[5:]
+                message = f" ! Warning: cave identifier '{caveid}'or {shortid} (guessed from file path)  is not a known cave.  Need to add to expoweb/cave_data/pendingcaves.txt ?  In '{svxid}.svx' at depth:[{len(depth)}]."
+                print("\n" + message)
+                print("\n" + message, file=sys.stderr)
+                print(f"{self.pending}", end="", file=sys.stderr)
+                stash_data_issue(parser="survex", message=message, url=None, sb=(svxid))
+                print(f'    ! No regex (standard identifier) cave match for {cavepath.lower()}', file=sys.stderr)
+            return None
+            
+    def is_it_already_pending(self, headpath, includelabel, depth):
        """Ignore surface, kataser and gpx *include survex files"""
        if not self.pending:
            self.pending = set()
@@ -1199,7 +1220,7 @@ class LoadingSurvex:
            message = f" - {headpath} is <ignorenoncave> (while creating '{includelabel}' sfile & sdirectory)"
            # print("\n"+message)
            # print("\n"+message,file=sys.stderr)
-            return
+            return True
        for i in self.ignoreprefix:
            if headpath.startswith(i):
                message = (
@@ -1207,27 +1228,16 @@ class LoadingSurvex:
                )
                # print("\n"+message)
                # print("\n"+message,file=sys.stderr)
-                return
+                return True
        caveid = f"{headpath[6:10]}-{headpath[11:]}".upper()
        if caveid in self.pending:
            # Yes we didn't find this cave, but we know it is a pending one. So not an error.
-            # print(f'! ALREADY PENDING {caveid}',file=sys.stderr)
-            return
+            print(f'! ALREADY PENDING caveid {caveid}',file=sys.stderr)
+            return True
        id = caveid[5:]
        if id in self.pending:
-            print(f"! ALREADY PENDING {id}", file=sys.stderr)
-            return
-
-        # It is too late to add it to the pending caves list here, they were already 
-        # processed in parsers/caves.py So we have to do a bespoke creation.
-        svxpath= includelabel
-        cave = create_new_cave(svxpath)
-        
-        message = f" ! Warning: cave identifier '{caveid}'or {id} (guessed from file path)  is not a known cave.  Need to add to expoweb/cave_data/pendingcaves.txt ?  In '{includelabel}.svx' at depth:[{len(depth)}]."
-        print("\n" + message)
-        print("\n" + message, file=sys.stderr)
-        print(f"{self.pending}", end="", file=sys.stderr)
-        stash_data_issue(parser="survex", message=message, url=None, sb=(includelabel))
+            print(f"! ALREADY PENDING id {id}", file=sys.stderr)
+            return True

    def LoadSurvexFile(self, svxid):
        """Creates SurvexFile in the database, and SurvexDirectory if needed
@@ -1268,14 +1278,8 @@ class LoadingSurvex:
        self.currentsurvexfile = newfile
        newfile.primary = self.set_primary(headpath)
        
-        # REPLACE all this IdentifyCave() stuff with GCaveLookup ?
-        cave = IdentifyCave(headpath)  # cave already exists in db
-        if not cave:
-            # probably a surface survey, or a cave in a new area 
-            # e.g. 1624 not previously managed, and not in the pending list
-            self.ReportNonCaveIncludes(headpath, svxid, depth)
-        #try again
-        cave = IdentifyCave(headpath)  
+        # refactor this ! 
+        cave = self.IdentifyCave(headpath, svxid, depth)  # cave already exists in db?
        if cave:
            newfile.cave = cave
            # print(f"\n - New directory '{newdirectory}' for cave '{cave}'",file=sys.stderr)
@@ -1530,7 +1534,7 @@ class LoadingSurvex:
        slengthtotal = 0.0
        nlegstotal = 0
        self.relativefilename = path
-        IdentifyCave(path)  # this will produce null for survex files which are geographic collections
+        #self.IdentifyCave(path, svxid, depth)  # this will produce null for survex files which are geographic collections

        self.currentsurvexfile = survexblock.survexfile
        self.currentsurvexfile.save()  # django insists on this although it is already saved !?
@@ -2198,7 +2202,6 @@ def FindAndLoadSurvex():
    )
    print(f" -- (but ignoring {len(removals)} of them)", file=sys.stderr)

-    check_team_cache()
    s_date = date.today().isoformat().replace('-','.')
    print(f" -- Now loading the previously-omitted survex files as {UNSEENS} *date {s_date}", file=sys.stderr)
    print(f"  - (except: {excpts})", file=sys.stderr)
@@ -2251,8 +2254,6 @@ def FindAndLoadSurvex():
    flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni {unseensroot}\n")
    fcollate.write(f";*edulcni {UNSEENS}\n")

-    check_team_cache()
-
    mem1 = get_process_memory()
    flinear.write(f"\n    - MEM:{mem1:.2f} MB STOP {UNSEENS} Unseen Oddments\n")
    flinear.write(f"    - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED Unseen Oddments\n")
@@ -2294,6 +2295,7 @@ def FindAndLoadSurvex():
    # ps = pstats.Stats(pr2, stream=f)
    # ps.sort_stats(SortKey.CUMULATIVE)
    # ps.print_stats()
+
    mem1 = get_process_memory()
    print(f"\n - MEM:{mem1:7.2f} MB STOP", file=sys.stderr)
    print(f" - MEM:{mem1 - mem0:7.3f} MB ADDITIONALLY USED", file=sys.stderr)