
refactoring cave id stuff in survex parser - working

Philip Sargent 2023-09-08 18:35:47 +03:00
parent 28d1092956
commit 7779544c0c


@@ -22,20 +22,19 @@ It also scans the Loser repo for all the svx files, which it loads individually
 """
 todo = """
+- Obscure bug in the *team inheritance and rootblock initialization needs tracking down,
+  probably in the team cache which should NOT be global, but should be an instance variable of
+  LoadingSurvex
 - Lots to do to cut down on unnecessary .save() calls to avoid hitting the db so much. Should
   speed it up noticably.
-- Obscure bug in the *team inheritance and rootblock initialization needs tracking down
 - Learn to use Django .select_related() and .prefetch_related() to speed things up
   https://zerotobyte.com/how-to-use-django-select-related-and-prefetch-related/
 - LoadSurvexFile() Creates a new current survexfile
-  The survexblock passed-in is not necessarily the parent. FIX THIS.
+  The survexblock passed-in is not necessarily the survex parent. FIX THIS.
-- Finish writing the parse_one_file() function for survexfiles edited online. Perhaps
-  easier if this is a completely new file rather than an existing file.. nasty.
 - When Olly implements LEG in the 'dump3d --legs' utility, then we can use that to get the length of
   all the legs in a survex block instead of adding them up oursleves. Which means that we can
   ignore all the Units and offset stuff, that troggle will work with survex files with backsights,
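Aside on the .select_related()/.prefetch_related() todo item above: a minimal sketch, not part of this commit, of the kind of query reduction it has in mind. It assumes the troggle SurvexBlock model and its survexfile foreign key, which this parser already uses; the import path is a guess.

# Sketch only, not in this commit; import path assumed.
from troggle.core.models.survex import SurvexBlock

# Naive loop: one extra query per block to follow block.survexfile
for block in SurvexBlock.objects.all():
    print(block.survexfile.path)

# select_related() fetches the related SurvexFile rows in the same query (a SQL join),
# so the loop costs one query instead of N+1.
for block in SurvexBlock.objects.select_related("survexfile"):
    print(block.survexfile.path)

# prefetch_related() does the analogous batching for reverse and many-to-many
# relations (e.g. the people attached to each block), in one extra query.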
@@ -62,27 +61,10 @@ class SurvexLeg:
     compass = 0.0
     clino = 0.0
-def IdentifyCave(cavepath):
-    """Given a file path for a survex file, or a survex-block path,
-    return the cave object
-    """
-    caveslist = GetCaveLookup()
-    if cavepath.lower() in caveslist:
-        return caveslist[cavepath.lower()]
-    # TO DO - this predates the big revision to Gcavelookup so look at this again carefully
-    path_match = LoadingSurvex.rx_cave.search(cavepath) # use as Class method
-    if path_match:
-        sluggy = f"{path_match.group(1)}-{path_match.group(2)}"
-        guesses = [sluggy.lower(), path_match.group(2).lower()]
-        for g in guesses:
-            if g in caveslist:
-                caveslist[cavepath] = caveslist[g]
-                return caveslist[g]
-        print(f" ! Failed to find cave for {cavepath.lower()}")
-    else:
-        # not a cave, but that is fine.
-        # print(f' ! No regex(standard identifier) cave match for {cavepath.lower()}')
-        return None
 def datewallet(w, earliest):
     """Gets the date of the youngest survexblock associated with the wallet
@@ -141,7 +123,8 @@ def get_offending_filename(path):
     """
     return "/survexfile/" + path + ".svx"
-trip_people_cache = {} # per survexblock, so robust wrt PUSH/POP begin/end
+# THIS SHOULD NOT BE GLOBAL ! SHould be per instance of file loader..
+trip_people_cache = {} # indexed by survexblock, so never needs cleaning out
 def get_team_on_trip(survexblock):
     """Uses a cache to avoid a database query if it doesn't need to.
     Only used for complete team."""
@@ -165,8 +148,9 @@ def get_people_on_trip(survexblock):
     return list(set(people))
-trip_person_record = {} # per survexblock, so robust wrt PUSH/POP begin/end
-trip_team_cache = {} # per survexblock, so robust wrt PUSH/POP begin/end
+# THIS SHOULD NOT BE GLOBAL ! SHould be per instance of file loader
+trip_person_record = {} # indexed by (survexblock, personexpedition) - so never needs cleaning out
+trip_team_cache = {} # indexed by survexblock, so never needs cleaning out
 def put_person_on_trip(survexblock, personexpedition, tm):
     """Uses a cache to avoid a database query if it doesn't need to.
     Only used for a single person"""
@@ -206,18 +190,17 @@ def confirm_team_on_trip(survexblock):
         SurvexPersonRole.objects.bulk_create(trip_team_cache[survexblock])
         trip_team_cache[survexblock] = [] # in database now, so empty cache
-def check_team_cache():
+def check_team_cache(label=None):
     global trip_team_cache
     message = f"! check_team_cache() called.. "
     print(message)
+    print(message, file=sys.stderr)
     for block in trip_team_cache:
-        message = f"! *team CACHEFAIL, already created {block.survexfile.path} ({block}) "
+        message = f"! *team CACHEFAIL, trip_team_cache {block.survexfile.path} ({block}). label:{label}"
         print(message)
+        print(message, file=sys.stderr)
-person_pending_cache = {} # per survexblock, so robust wrt PUSH/POP begin/end
+person_pending_cache = {} # indexed per survexblock, so robust wrt PUSH/POP begin/end
 def add_to_pending(survexblock, tm):
     """Collects team names before we have a date so cannot validate against
     expo attendance yet"""
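The comments above say these module-level caches should not be global but should live on the parser instance. A rough sketch, not part of this commit and not the author's design, of what that refactor could look like using the names from this file:

# Sketch only: caches as instance attributes of LoadingSurvex, so each
# import run starts with clean state instead of sharing module globals.
class LoadingSurvex:
    def __init__(self):
        self.trip_people_cache = {}   # indexed by survexblock
        self.trip_person_record = {}  # indexed by (survexblock, personexpedition)
        self.trip_team_cache = {}     # indexed by survexblock

    def confirm_team_on_trip(self, survexblock):
        """Flush the cached SurvexPersonRole objects for one block to the database."""
        if survexblock not in self.trip_team_cache:
            return
        SurvexPersonRole.objects.bulk_create(self.trip_team_cache[survexblock])
        self.trip_team_cache[survexblock] = []  # in database now, so empty the cache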
@@ -1180,7 +1163,45 @@ class LoadingSurvex:
         self.svxprim[headpath.lower()] = primary
         return self.svxprim[headpath.lower()]
-    def ReportNonCaveIncludes(self, headpath, includelabel, depth):
+    def IdentifyCave(self, cavepath, svxid, depth):
+        """Given a file path for a survex file, e.g. /1626/107/107.svx, or a survex-block path,
+        return the cave object
+        REWRITE ALL THIS and make a methoid on the class
+        """
+        caveslist = GetCaveLookup()
+        if cavepath.lower() in caveslist: # will only work after we load in full paths as indexes, see below
+            return caveslist[cavepath.lower()]
+        # rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)")
+        path_match = self.rx_cave.search(cavepath) # use as Class method.
+        if path_match:
+            sluggy = f"{path_match.group(1)}-{path_match.group(2)}"
+            # guesses = [sluggy.lower(), path_match.group(2).lower()] # this looks for JUST "107" and ignores 1626..
+            guesses = [sluggy.lower()] # full 1626-107 search, don;t use short-forms
+            for g in guesses:
+                if g in caveslist:
+                    caveslist[cavepath] = caveslist[g] # set "caves-1626/107/107.svx" as index to cave 1626-107
+                    return caveslist[g]
+            print(f" ! Failed to find cave for {cavepath.lower()}", file=sys.stderr)
+        else:
+            # not a cave, but that is fine.
+            if self.is_it_already_pending(cavepath, svxid, depth):
+                pass
+            else:
+                # It is too late to add it to the pending caves list here, they were already
+                # processed in parsers/caves.py So we have to do a bespoke creation.
+                cave = create_new_cave(svxid)
+                message = f" ! Warning: cave identifier '{caveid}'or {id} (guessed from file path) is not a known cave. Need to add to expoweb/cave_data/pendingcaves.txt ? In '{includelabel}.svx' at depth:[{len(depth)}]."
+                print("\n" + message)
+                print("\n" + message, file=sys.stderr)
+                print(f"{self.pending}", end="", file=sys.stderr)
+                stash_data_issue(parser="survex", message=message, url=None, sb=(includelabel))
+            print(f' ! No regex (standard identifier) cave match for {cavepath.lower()}', file=sys.stderr)
+        return None
+    def is_it_already_pending(self, headpath, includelabel, depth):
         """Ignore surface, kataser and gpx *include survex files"""
         if not self.pending:
             self.pending = set()
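For reference: the cave-identifier guess in the new IdentifyCave() comes from the rx_cave pattern quoted in the comment above. A standalone illustration, with a hypothetical include path, of the slug it produces and then tries against GetCaveLookup():

import re

# The pattern quoted in the comment inside IdentifyCave() above.
rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)")

m = rx_cave.search("caves-1626/107/107.svx")  # hypothetical include path
if m:
    sluggy = f"{m.group(1)}-{m.group(2)}"     # "1626-107"
    print(sluggy.lower())                     # the key tried against the GetCaveLookup() dict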
@@ -1199,7 +1220,7 @@ class LoadingSurvex:
             message = f" - {headpath} is <ignorenoncave> (while creating '{includelabel}' sfile & sdirectory)"
             # print("\n"+message)
             # print("\n"+message,file=sys.stderr)
-            return
+            return True
         for i in self.ignoreprefix:
             if headpath.startswith(i):
                 message = (
@@ -1207,28 +1228,17 @@ class LoadingSurvex:
                 )
                 # print("\n"+message)
                 # print("\n"+message,file=sys.stderr)
-                return
+                return True
         caveid = f"{headpath[6:10]}-{headpath[11:]}".upper()
         if caveid in self.pending:
             # Yes we didn't find this cave, but we know it is a pending one. So not an error.
-            # print(f'! ALREADY PENDING {caveid}',file=sys.stderr)
+            print(f'! ALREADY PENDING caveid {caveid}',file=sys.stderr)
-            return
+            return True
         id = caveid[5:]
         if id in self.pending:
-            print(f"! ALREADY PENDING {id}", file=sys.stderr)
+            print(f"! ALREADY PENDING id {id}", file=sys.stderr)
-            return
+            return True
-        # It is too late to add it to the pending caves list here, they were already
-        # processed in parsers/caves.py So we have to do a bespoke creation.
-        svxpath= includelabel
-        cave = create_new_cave(svxpath)
-        message = f" ! Warning: cave identifier '{caveid}'or {id} (guessed from file path) is not a known cave. Need to add to expoweb/cave_data/pendingcaves.txt ? In '{includelabel}.svx' at depth:[{len(depth)}]."
-        print("\n" + message)
-        print("\n" + message, file=sys.stderr)
-        print(f"{self.pending}", end="", file=sys.stderr)
-        stash_data_issue(parser="survex", message=message, url=None, sb=(includelabel))
     def LoadSurvexFile(self, svxid):
         """Creates SurvexFile in the database, and SurvexDirectory if needed
         Creates a new current survexfile and valid .survexdirectory
@@ -1267,15 +1277,9 @@ class LoadingSurvex:
         newfile.save() # until we do this there is no internal id so no foreign key works
         self.currentsurvexfile = newfile
         newfile.primary = self.set_primary(headpath)
-        # REPLACE all this IdentifyCave() stuff with GCaveLookup ?
+        # refactor this !
-        cave = IdentifyCave(headpath) # cave already exists in db
+        cave = self.IdentifyCave(headpath, svxid, depth) # cave already exists in db?
-        if not cave:
-            # probably a surface survey, or a cave in a new area
-            # e.g. 1624 not previously managed, and not in the pending list
-            self.ReportNonCaveIncludes(headpath, svxid, depth)
-            #try again
-            cave = IdentifyCave(headpath)
         if cave:
             newfile.cave = cave
             # print(f"\n - New directory '{newdirectory}' for cave '{cave}'",file=sys.stderr)
@@ -1530,7 +1534,7 @@ class LoadingSurvex:
         slengthtotal = 0.0
         nlegstotal = 0
         self.relativefilename = path
-        IdentifyCave(path) # this will produce null for survex files which are geographic collections
+        #self.IdentifyCave(path, svxid, depth) # this will produce null for survex files which are geographic collections
         self.currentsurvexfile = survexblock.survexfile
         self.currentsurvexfile.save() # django insists on this although it is already saved !?
@@ -2198,7 +2202,6 @@ def FindAndLoadSurvex():
     )
     print(f" -- (but ignoring {len(removals)} of them)", file=sys.stderr)
-    check_team_cache()
     s_date = date.today().isoformat().replace('-','.')
     print(f" -- Now loading the previously-omitted survex files as {UNSEENS} *date {s_date}", file=sys.stderr)
     print(f" - (except: {excpts})", file=sys.stderr)
@@ -2250,8 +2253,6 @@ def FindAndLoadSurvex():
     flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni {unseensroot}\n")
     fcollate.write(f";*edulcni {UNSEENS}\n")
-    check_team_cache()
     mem1 = get_process_memory()
     flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {UNSEENS} Unseen Oddments\n")
@@ -2294,6 +2295,7 @@ def FindAndLoadSurvex():
     # ps = pstats.Stats(pr2, stream=f)
     # ps.sort_stats(SortKey.CUMULATIVE)
     # ps.print_stats()
     mem1 = get_process_memory()
     print(f"\n - MEM:{mem1:7.2f} MB STOP", file=sys.stderr)
     print(f" - MEM:{mem1 - mem0:7.3f} MB ADDITIONALLY USED", file=sys.stderr)