
refactoring cave id stuff in survex parser - working

Philip Sargent 2023-09-08 18:35:47 +03:00
parent 28d1092956
commit 7779544c0c


@@ -22,20 +22,19 @@ It also scans the Loser repo for all the svx files, which it loads individually
 """
 todo = """
+- Obscure bug in the *team inheritance and rootblock initialization needs tracking down,
+  probably in the team cache which should NOT be global, but should be an instance variable of
+  LoadingSurvex
 - Lots to do to cut down on unnecessary .save() calls to avoid hitting the db so much. Should
   speed it up noticably.
-- Obscure bug in the *team inheritance and rootblock initialization needs tracking down
 - Learn to use Django .select_related() and .prefetch_related() to speed things up
   https://zerotobyte.com/how-to-use-django-select-related-and-prefetch-related/
 - LoadSurvexFile() Creates a new current survexfile
-  The survexblock passed-in is not necessarily the parent. FIX THIS.
+  The survexblock passed-in is not necessarily the survex parent. FIX THIS.
-- Finish writing the parse_one_file() function for survexfiles edited online. Perhaps
-  easier if this is a completely new file rather than an existing file.. nasty.
 - When Olly implements LEG in the 'dump3d --legs' utility, then we can use that to get the length of
   all the legs in a survex block instead of adding them up oursleves. Which means that we can
   ignore all the Units and offset stuff, that troggle will work with survex files with backsights,
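Aside on the .select_related()/.prefetch_related() todo item above: a minimal sketch, not part of this commit, of the kind of query reduction it has in mind. It assumes the troggle SurvexBlock model and its survexfile foreign key, which this parser already uses; the import path is a guess.

# Sketch only, not in this commit; import path assumed.
from troggle.core.models.survex import SurvexBlock

# Naive loop: one extra query per block to follow block.survexfile
for block in SurvexBlock.objects.all():
    print(block.survexfile.path)

# select_related() fetches the related SurvexFile rows in the same query (a SQL join),
# so the loop costs one query instead of N+1.
for block in SurvexBlock.objects.select_related("survexfile"):
    print(block.survexfile.path)

# prefetch_related() does the analogous batching for reverse and many-to-many
# relations (e.g. the people attached to each block), in one extra query.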
@@ -62,27 +61,10 @@ class SurvexLeg:
     compass = 0.0
     clino = 0.0
-def IdentifyCave(cavepath):
-    """Given a file path for a survex file, or a survex-block path,
-    return the cave object
-    """
-    caveslist = GetCaveLookup()
-    if cavepath.lower() in caveslist:
-        return caveslist[cavepath.lower()]
-    # TO DO - this predates the big revision to Gcavelookup so look at this again carefully
-    path_match = LoadingSurvex.rx_cave.search(cavepath) # use as Class method
-    if path_match:
-        sluggy = f"{path_match.group(1)}-{path_match.group(2)}"
-        guesses = [sluggy.lower(), path_match.group(2).lower()]
-        for g in guesses:
-            if g in caveslist:
-                caveslist[cavepath] = caveslist[g]
-                return caveslist[g]
-        print(f" ! Failed to find cave for {cavepath.lower()}")
-    else:
-        # not a cave, but that is fine.
-        # print(f' ! No regex(standard identifier) cave match for {cavepath.lower()}')
-        return None
 def datewallet(w, earliest):
     """Gets the date of the youngest survexblock associated with the wallet
@@ -141,7 +123,8 @@ def get_offending_filename(path):
     """
     return "/survexfile/" + path + ".svx"
-trip_people_cache = {} # per survexblock, so robust wrt PUSH/POP begin/end
+# THIS SHOULD NOT BE GLOBAL ! SHould be per instance of file loader..
+trip_people_cache = {} # indexed by survexblock, so never needs cleaning out
 def get_team_on_trip(survexblock):
     """Uses a cache to avoid a database query if it doesn't need to.
     Only used for complete team."""
@@ -165,8 +148,9 @@ def get_people_on_trip(survexblock):
     return list(set(people))
-trip_person_record = {} # per survexblock, so robust wrt PUSH/POP begin/end
-trip_team_cache = {} # per survexblock, so robust wrt PUSH/POP begin/end
+# THIS SHOULD NOT BE GLOBAL ! SHould be per instance of file loader
+trip_person_record = {} # indexed by (survexblock, personexpedition) - so never needs cleaning out
+trip_team_cache = {} # indexed by survexblock, so never needs cleaning out
 def put_person_on_trip(survexblock, personexpedition, tm):
     """Uses a cache to avoid a database query if it doesn't need to.
     Only used for a single person"""
@@ -206,18 +190,17 @@ def confirm_team_on_trip(survexblock):
         SurvexPersonRole.objects.bulk_create(trip_team_cache[survexblock])
         trip_team_cache[survexblock] = [] # in database now, so empty cache
-def check_team_cache():
+def check_team_cache(label=None):
     global trip_team_cache
     message = f"! check_team_cache() called.. "
     print(message)
+    print(message, file=sys.stderr)
     for block in trip_team_cache:
-        message = f"! *team CACHEFAIL, already created {block.survexfile.path} ({block}) "
+        message = f"! *team CACHEFAIL, trip_team_cache {block.survexfile.path} ({block}). label:{label}"
         print(message)
+        print(message, file=sys.stderr)
-person_pending_cache = {} # per survexblock, so robust wrt PUSH/POP begin/end
+person_pending_cache = {} # indexed per survexblock, so robust wrt PUSH/POP begin/end
 def add_to_pending(survexblock, tm):
     """Collects team names before we have a date so cannot validate against
     expo attendance yet"""
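The comments above say these module-level caches should not be global but should live on the parser instance. A rough sketch, not part of this commit and not the author's design, of what that refactor could look like using the names from this file:

# Sketch only: caches as instance attributes of LoadingSurvex, so each
# import run starts with clean state instead of sharing module globals.
class LoadingSurvex:
    def __init__(self):
        self.trip_people_cache = {}   # indexed by survexblock
        self.trip_person_record = {}  # indexed by (survexblock, personexpedition)
        self.trip_team_cache = {}     # indexed by survexblock

    def confirm_team_on_trip(self, survexblock):
        """Flush the cached SurvexPersonRole objects for one block to the database."""
        if survexblock not in self.trip_team_cache:
            return
        SurvexPersonRole.objects.bulk_create(self.trip_team_cache[survexblock])
        self.trip_team_cache[survexblock] = []  # in database now, so empty the cache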
@@ -1180,7 +1163,45 @@ class LoadingSurvex:
         self.svxprim[headpath.lower()] = primary
         return self.svxprim[headpath.lower()]
-    def ReportNonCaveIncludes(self, headpath, includelabel, depth):
+    def IdentifyCave(self, cavepath, svxid, depth):
+        """Given a file path for a survex file, e.g. /1626/107/107.svx, or a survex-block path,
+        return the cave object
+        REWRITE ALL THIS and make a methoid on the class
+        """
+        caveslist = GetCaveLookup()
+        if cavepath.lower() in caveslist: # will only work after we load in full paths as indexes, see below
+            return caveslist[cavepath.lower()]
+        # rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)")
+        path_match = self.rx_cave.search(cavepath) # use as Class method.
+        if path_match:
+            sluggy = f"{path_match.group(1)}-{path_match.group(2)}"
+            # guesses = [sluggy.lower(), path_match.group(2).lower()] # this looks for JUST "107" and ignores 1626..
+            guesses = [sluggy.lower()] # full 1626-107 search, don;t use short-forms
+            for g in guesses:
+                if g in caveslist:
+                    caveslist[cavepath] = caveslist[g] # set "caves-1626/107/107.svx" as index to cave 1626-107
+                    return caveslist[g]
+            print(f" ! Failed to find cave for {cavepath.lower()}", file=sys.stderr)
+        else:
+            # not a cave, but that is fine.
+            if self.is_it_already_pending(cavepath, svxid, depth):
+                pass
+            else:
+                # It is too late to add it to the pending caves list here, they were already
+                # processed in parsers/caves.py So we have to do a bespoke creation.
+                cave = create_new_cave(svxid)
+                message = f" ! Warning: cave identifier '{caveid}'or {id} (guessed from file path) is not a known cave. Need to add to expoweb/cave_data/pendingcaves.txt ? In '{includelabel}.svx' at depth:[{len(depth)}]."
+                print("\n" + message)
+                print("\n" + message, file=sys.stderr)
+                print(f"{self.pending}", end="", file=sys.stderr)
+                stash_data_issue(parser="survex", message=message, url=None, sb=(includelabel))
+            print(f' ! No regex (standard identifier) cave match for {cavepath.lower()}', file=sys.stderr)
+        return None
+    def is_it_already_pending(self, headpath, includelabel, depth):
         """Ignore surface, kataser and gpx *include survex files"""
         if not self.pending:
             self.pending = set()
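For reference: the cave-identifier guess in the new IdentifyCave() comes from the rx_cave pattern quoted in the comment above. A standalone illustration, with a hypothetical include path, of the slug it produces and then tries against GetCaveLookup():

import re

# The pattern quoted in the comment inside IdentifyCave() above.
rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)")

m = rx_cave.search("caves-1626/107/107.svx")  # hypothetical include path
if m:
    sluggy = f"{m.group(1)}-{m.group(2)}"     # "1626-107"
    print(sluggy.lower())                     # the key tried against the GetCaveLookup() dict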
@@ -1199,7 +1220,7 @@ class LoadingSurvex:
             message = f" - {headpath} is <ignorenoncave> (while creating '{includelabel}' sfile & sdirectory)"
             # print("\n"+message)
             # print("\n"+message,file=sys.stderr)
-            return
+            return True
         for i in self.ignoreprefix:
             if headpath.startswith(i):
                 message = (
@@ -1207,28 +1228,17 @@ class LoadingSurvex:
                 )
                 # print("\n"+message)
                 # print("\n"+message,file=sys.stderr)
-                return
+                return True
         caveid = f"{headpath[6:10]}-{headpath[11:]}".upper()
         if caveid in self.pending:
             # Yes we didn't find this cave, but we know it is a pending one. So not an error.
-            # print(f'! ALREADY PENDING {caveid}',file=sys.stderr)
+            print(f'! ALREADY PENDING caveid {caveid}',file=sys.stderr)
-            return
+            return True
         id = caveid[5:]
         if id in self.pending:
-            print(f"! ALREADY PENDING {id}", file=sys.stderr)
+            print(f"! ALREADY PENDING id {id}", file=sys.stderr)
-            return
+            return True
-        # It is too late to add it to the pending caves list here, they were already
-        # processed in parsers/caves.py So we have to do a bespoke creation.
-        svxpath= includelabel
-        cave = create_new_cave(svxpath)
-        message = f" ! Warning: cave identifier '{caveid}'or {id} (guessed from file path) is not a known cave. Need to add to expoweb/cave_data/pendingcaves.txt ? In '{includelabel}.svx' at depth:[{len(depth)}]."
-        print("\n" + message)
-        print("\n" + message, file=sys.stderr)
-        print(f"{self.pending}", end="", file=sys.stderr)
-        stash_data_issue(parser="survex", message=message, url=None, sb=(includelabel))
     def LoadSurvexFile(self, svxid):
         """Creates SurvexFile in the database, and SurvexDirectory if needed
         Creates a new current survexfile and valid .survexdirectory
@@ -1267,15 +1277,9 @@ class LoadingSurvex:
         newfile.save() # until we do this there is no internal id so no foreign key works
         self.currentsurvexfile = newfile
         newfile.primary = self.set_primary(headpath)
-        # REPLACE all this IdentifyCave() stuff with GCaveLookup ?
+        # refactor this !
-        cave = IdentifyCave(headpath) # cave already exists in db
+        cave = self.IdentifyCave(headpath, svxid, depth) # cave already exists in db?
-        if not cave:
-            # probably a surface survey, or a cave in a new area
-            # e.g. 1624 not previously managed, and not in the pending list
-            self.ReportNonCaveIncludes(headpath, svxid, depth)
-            #try again
-            cave = IdentifyCave(headpath)
         if cave:
             newfile.cave = cave
             # print(f"\n - New directory '{newdirectory}' for cave '{cave}'",file=sys.stderr)
@@ -1530,7 +1534,7 @@ class LoadingSurvex:
         slengthtotal = 0.0
         nlegstotal = 0
         self.relativefilename = path
-        IdentifyCave(path) # this will produce null for survex files which are geographic collections
+        #self.IdentifyCave(path, svxid, depth) # this will produce null for survex files which are geographic collections
         self.currentsurvexfile = survexblock.survexfile
         self.currentsurvexfile.save() # django insists on this although it is already saved !?
@@ -2198,7 +2202,6 @@ def FindAndLoadSurvex():
     )
     print(f" -- (but ignoring {len(removals)} of them)", file=sys.stderr)
-    check_team_cache()
     s_date = date.today().isoformat().replace('-','.')
     print(f" -- Now loading the previously-omitted survex files as {UNSEENS} *date {s_date}", file=sys.stderr)
     print(f" - (except: {excpts})", file=sys.stderr)
@@ -2250,8 +2253,6 @@ def FindAndLoadSurvex():
     flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni {unseensroot}\n")
     fcollate.write(f";*edulcni {UNSEENS}\n")
-    check_team_cache()
     mem1 = get_process_memory()
     flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {UNSEENS} Unseen Oddments\n")
@@ -2294,6 +2295,7 @@ def FindAndLoadSurvex():
     # ps = pstats.Stats(pr2, stream=f)
     # ps.sort_stats(SortKey.CUMULATIVE)
     # ps.print_stats()
     mem1 = get_process_memory()
     print(f"\n - MEM:{mem1:7.2f} MB STOP", file=sys.stderr)
     print(f" - MEM:{mem1 - mem0:7.3f} MB ADDITIONALLY USED", file=sys.stderr)