From 7779544c0ca3bc5329ccc010ff3f31edbb2f037e Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Fri, 8 Sep 2023 18:35:47 +0300 Subject: [PATCH] refactoring cave id stuff in survex parser - working --- parsers/survex.py | 138 +++++++++++++++++++++++----------------------- 1 file changed, 70 insertions(+), 68 deletions(-) diff --git a/parsers/survex.py b/parsers/survex.py index d712394..681dc48 100644 --- a/parsers/survex.py +++ b/parsers/survex.py @@ -22,20 +22,19 @@ It also scans the Loser repo for all the svx files, which it loads individually """ todo = """ +- Obscure bug in the *team inheritance and rootblock initialization needs tracking down, + probably in the team cache which should NOT be global, but should be an instance variable of + LoadingSurvex + - Lots to do to cut down on unnecessary .save() calls to avoid hitting the db so much. Should speed it up noticably. - -- Obscure bug in the *team inheritance and rootblock initialization needs tracking down - Learn to use Django .select_related() and .prefetch_related() to speed things up https://zerotobyte.com/how-to-use-django-select-related-and-prefetch-related/ - LoadSurvexFile() Creates a new current survexfile - The survexblock passed-in is not necessarily the parent. FIX THIS. - -- Finish writing the parse_one_file() function for survexfiles edited online. Perhaps - easier if this is a completely new file rather than an existing file.. nasty. - + The survexblock passed-in is not necessarily the survex parent. FIX THIS. + - When Olly implements LEG in the 'dump3d --legs' utility, then we can use that to get the length of all the legs in a survex block instead of adding them up oursleves. Which means that we can ignore all the Units and offset stuff, that troggle will work with survex files with backsights, @@ -62,27 +61,10 @@ class SurvexLeg: compass = 0.0 clino = 0.0 -def IdentifyCave(cavepath): - """Given a file path for a survex file, or a survex-block path, - return the cave object - """ - caveslist = GetCaveLookup() - if cavepath.lower() in caveslist: - return caveslist[cavepath.lower()] - # TO DO - this predates the big revision to Gcavelookup so look at this again carefully - path_match = LoadingSurvex.rx_cave.search(cavepath) # use as Class method - if path_match: - sluggy = f"{path_match.group(1)}-{path_match.group(2)}" - guesses = [sluggy.lower(), path_match.group(2).lower()] - for g in guesses: - if g in caveslist: - caveslist[cavepath] = caveslist[g] - return caveslist[g] - print(f" ! Failed to find cave for {cavepath.lower()}") - else: - # not a cave, but that is fine. - # print(f' ! No regex(standard identifier) cave match for {cavepath.lower()}') - return None + + + + def datewallet(w, earliest): """Gets the date of the youngest survexblock associated with the wallet @@ -141,7 +123,8 @@ def get_offending_filename(path): """ return "/survexfile/" + path + ".svx" -trip_people_cache = {} # per survexblock, so robust wrt PUSH/POP begin/end +# THIS SHOULD NOT BE GLOBAL ! SHould be per instance of file loader.. +trip_people_cache = {} # indexed by survexblock, so never needs cleaning out def get_team_on_trip(survexblock): """Uses a cache to avoid a database query if it doesn't need to. Only used for complete team.""" @@ -165,8 +148,9 @@ def get_people_on_trip(survexblock): return list(set(people)) -trip_person_record = {} # per survexblock, so robust wrt PUSH/POP begin/end -trip_team_cache = {} # per survexblock, so robust wrt PUSH/POP begin/end +# THIS SHOULD NOT BE GLOBAL ! SHould be per instance of file loader +trip_person_record = {} # indexed by (survexblock, personexpedition) - so never needs cleaning out +trip_team_cache = {} # indexed by survexblock, so never needs cleaning out def put_person_on_trip(survexblock, personexpedition, tm): """Uses a cache to avoid a database query if it doesn't need to. Only used for a single person""" @@ -206,18 +190,17 @@ def confirm_team_on_trip(survexblock): SurvexPersonRole.objects.bulk_create(trip_team_cache[survexblock]) trip_team_cache[survexblock] = [] # in database now, so empty cache -def check_team_cache(): +def check_team_cache(label=None): global trip_team_cache - message = f"! check_team_cache() called.. " print(message) - + print(message, file=sys.stderr) for block in trip_team_cache: - message = f"! *team CACHEFAIL, already created {block.survexfile.path} ({block}) " + message = f"! *team CACHEFAIL, trip_team_cache {block.survexfile.path} ({block}). label:{label}" print(message) - + print(message, file=sys.stderr) -person_pending_cache = {} # per survexblock, so robust wrt PUSH/POP begin/end +person_pending_cache = {} # indexed per survexblock, so robust wrt PUSH/POP begin/end def add_to_pending(survexblock, tm): """Collects team names before we have a date so cannot validate against expo attendance yet""" @@ -1180,7 +1163,45 @@ class LoadingSurvex: self.svxprim[headpath.lower()] = primary return self.svxprim[headpath.lower()] - def ReportNonCaveIncludes(self, headpath, includelabel, depth): + def IdentifyCave(self, cavepath, svxid, depth): + """Given a file path for a survex file, e.g. /1626/107/107.svx, or a survex-block path, + return the cave object + + REWRITE ALL THIS and make a methoid on the class + """ + caveslist = GetCaveLookup() + if cavepath.lower() in caveslist: # will only work after we load in full paths as indexes, see below + return caveslist[cavepath.lower()] + # rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)") + path_match = self.rx_cave.search(cavepath) # use as Class method. + if path_match: + sluggy = f"{path_match.group(1)}-{path_match.group(2)}" + # guesses = [sluggy.lower(), path_match.group(2).lower()] # this looks for JUST "107" and ignores 1626.. + guesses = [sluggy.lower()] # full 1626-107 search, don;t use short-forms + for g in guesses: + if g in caveslist: + caveslist[cavepath] = caveslist[g] # set "caves-1626/107/107.svx" as index to cave 1626-107 + return caveslist[g] + print(f" ! Failed to find cave for {cavepath.lower()}", file=sys.stderr) + else: + # not a cave, but that is fine. + if self.is_it_already_pending(cavepath, svxid, depth): + pass + else: + # It is too late to add it to the pending caves list here, they were already + # processed in parsers/caves.py So we have to do a bespoke creation. + cave = create_new_cave(svxid) + + message = f" ! Warning: cave identifier '{caveid}'or {id} (guessed from file path) is not a known cave. Need to add to expoweb/cave_data/pendingcaves.txt ? In '{includelabel}.svx' at depth:[{len(depth)}]." + print("\n" + message) + print("\n" + message, file=sys.stderr) + print(f"{self.pending}", end="", file=sys.stderr) + stash_data_issue(parser="survex", message=message, url=None, sb=(includelabel)) + + print(f' ! No regex (standard identifier) cave match for {cavepath.lower()}', file=sys.stderr) + return None + + def is_it_already_pending(self, headpath, includelabel, depth): """Ignore surface, kataser and gpx *include survex files""" if not self.pending: self.pending = set() @@ -1199,7 +1220,7 @@ class LoadingSurvex: message = f" - {headpath} is (while creating '{includelabel}' sfile & sdirectory)" # print("\n"+message) # print("\n"+message,file=sys.stderr) - return + return True for i in self.ignoreprefix: if headpath.startswith(i): message = ( @@ -1207,28 +1228,17 @@ class LoadingSurvex: ) # print("\n"+message) # print("\n"+message,file=sys.stderr) - return + return True caveid = f"{headpath[6:10]}-{headpath[11:]}".upper() if caveid in self.pending: # Yes we didn't find this cave, but we know it is a pending one. So not an error. - # print(f'! ALREADY PENDING {caveid}',file=sys.stderr) - return + print(f'! ALREADY PENDING caveid {caveid}',file=sys.stderr) + return True id = caveid[5:] if id in self.pending: - print(f"! ALREADY PENDING {id}", file=sys.stderr) - return + print(f"! ALREADY PENDING id {id}", file=sys.stderr) + return True - # It is too late to add it to the pending caves list here, they were already - # processed in parsers/caves.py So we have to do a bespoke creation. - svxpath= includelabel - cave = create_new_cave(svxpath) - - message = f" ! Warning: cave identifier '{caveid}'or {id} (guessed from file path) is not a known cave. Need to add to expoweb/cave_data/pendingcaves.txt ? In '{includelabel}.svx' at depth:[{len(depth)}]." - print("\n" + message) - print("\n" + message, file=sys.stderr) - print(f"{self.pending}", end="", file=sys.stderr) - stash_data_issue(parser="survex", message=message, url=None, sb=(includelabel)) - def LoadSurvexFile(self, svxid): """Creates SurvexFile in the database, and SurvexDirectory if needed Creates a new current survexfile and valid .survexdirectory @@ -1267,15 +1277,9 @@ class LoadingSurvex: newfile.save() # until we do this there is no internal id so no foreign key works self.currentsurvexfile = newfile newfile.primary = self.set_primary(headpath) - - # REPLACE all this IdentifyCave() stuff with GCaveLookup ? - cave = IdentifyCave(headpath) # cave already exists in db - if not cave: - # probably a surface survey, or a cave in a new area - # e.g. 1624 not previously managed, and not in the pending list - self.ReportNonCaveIncludes(headpath, svxid, depth) - #try again - cave = IdentifyCave(headpath) + + # refactor this ! + cave = self.IdentifyCave(headpath, svxid, depth) # cave already exists in db? if cave: newfile.cave = cave # print(f"\n - New directory '{newdirectory}' for cave '{cave}'",file=sys.stderr) @@ -1530,7 +1534,7 @@ class LoadingSurvex: slengthtotal = 0.0 nlegstotal = 0 self.relativefilename = path - IdentifyCave(path) # this will produce null for survex files which are geographic collections + #self.IdentifyCave(path, svxid, depth) # this will produce null for survex files which are geographic collections self.currentsurvexfile = survexblock.survexfile self.currentsurvexfile.save() # django insists on this although it is already saved !? @@ -2198,7 +2202,6 @@ def FindAndLoadSurvex(): ) print(f" -- (but ignoring {len(removals)} of them)", file=sys.stderr) - check_team_cache() s_date = date.today().isoformat().replace('-','.') print(f" -- Now loading the previously-omitted survex files as {UNSEENS} *date {s_date}", file=sys.stderr) print(f" - (except: {excpts})", file=sys.stderr) @@ -2250,8 +2253,6 @@ def FindAndLoadSurvex(): flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni {unseensroot}\n") fcollate.write(f";*edulcni {UNSEENS}\n") - - check_team_cache() mem1 = get_process_memory() flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {UNSEENS} Unseen Oddments\n") @@ -2294,6 +2295,7 @@ def FindAndLoadSurvex(): # ps = pstats.Stats(pr2, stream=f) # ps.sort_stats(SortKey.CUMULATIVE) # ps.print_stats() + mem1 = get_process_memory() print(f"\n - MEM:{mem1:7.2f} MB STOP", file=sys.stderr) print(f" - MEM:{mem1 - mem0:7.3f} MB ADDITIONALLY USED", file=sys.stderr)