From 7e47fe1f30e68bff5a31caee8cdf356d0f802888 Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Wed, 5 Oct 2022 21:11:18 +0300 Subject: [PATCH] Parse all files, not just those in the *include tree --- parsers/caves.py | 5 +- parsers/survex.py | 208 +++++++++++++++++++++++++++++++++------------- 2 files changed, 156 insertions(+), 57 deletions(-) diff --git a/parsers/caves.py b/parsers/caves.py index 634434c..9458d7a 100644 --- a/parsers/caves.py +++ b/parsers/caves.py @@ -457,7 +457,7 @@ def readcaves(): DataIssue.objects.filter(parser='caves ok').delete() DataIssue.objects.filter(parser='entrances').delete() - print(" - Creating Areas 1623, 1624 and 1626") + print(" - Creating Areas 1623, 1624, 1627 and 1626") # This crashes on the server with MariaDB even though a null parent is explicitly allowed. area_1623= Area.objects.create(short_name = "1623", super=None) print(" - Saving Area 1623") @@ -468,6 +468,9 @@ def readcaves(): area_1626= Area.objects.create(short_name = "1626", super=None) print(" - Saving Area 1626") area_1626.save() + area_1627= Area.objects.create(short_name = "1627", super=None) + print(" - Saving Area 1627") + area_1627.save() with transaction.atomic(): diff --git a/parsers/survex.py b/parsers/survex.py index d37fea7..2740213 100644 --- a/parsers/survex.py +++ b/parsers/survex.py @@ -6,7 +6,7 @@ import copy import subprocess from pathlib import Path -from datetime import datetime, timedelta, date +from datetime import datetime, timedelta, date, timezone from django.utils.timezone import get_current_timezone from django.utils.timezone import make_aware @@ -39,9 +39,12 @@ todo = '''Also walk the entire tree in the :loser: repo looking for unconnected ''' survexblockroot = None +survexomitsroot = None ROOTBLOCK = "rootblock" +OMITBLOCK = "omitblock" METRESINFEET = 3.28084 +stop_dup_warning = False debugprint = False # Turns on debug printout for just one *include file debugprinttrigger = "!" # debugprinttrigger = "caves-1623/40/old/EisSVH" @@ -182,7 +185,7 @@ class LoadingSurvex(): callcount = 0 caverncount = 0 ignoreprefix = ["surface", "kataster", "fixedpts", "gpx"] - ignorenoncave = ["caves-1623", "caves-1626", "caves-1623/2007-neu"] + ignorenoncave = ["caves-1623", "caves-1623/2007-NEU","caves-1626", "caves-1624", "caves-1627", "fixedpts/gps/gps00raw", ""] includedfilename ="" currentsurvexblock = None currentsurvexfile = None @@ -344,7 +347,7 @@ class LoadingSurvex(): the rest is discarded after error-checking. Now skipping the error checking - returns as soon as the leg is not one we count. - REPLACE ALL THIS by reading the .log output of cavern for the file + REPLACE ALL THIS by reading the .log output of cavern for the file. But we need the lengths per Block, not by File. Hmm. """ invalid_clino = 180.0 invalid_compass = 720.0 @@ -457,7 +460,7 @@ class LoadingSurvex(): print(("! Clino misread in", survexblock.survexfile.path)) print((" datastar:", datastar)) print((" Line:", ls)) - message = ' ! Value Error: Clino misread in line %s in %s' % (ls, survexblock.survexfile.path) + message = f' ! Value Error: Clino misread in line \'{sline.lower()}\' {datastar=} {self.datastar=} {ls=} in\n{survexblock}\n{survexblock.survexfile}\n{survexblock.survexfile.path}' DataIssue.objects.create(parser='survexleg', message=message, url=get_offending_filename(survexblock.survexfile.path)) lclino = invalid_clino @@ -668,17 +671,24 @@ class LoadingSurvex(): datastar["tape"] = i-1 self.datastar = copy.deepcopy(datastar) return - elif ls[0] == "cartesian" or ls[0] == "nosurvey" or ls[0] == "diving" or ls[0] == "cylpolar" or ls[0] == "passage": - # message = " ! - *data {} blocks ignored. {}|{}" '{}' .format(ls[0].upper(), survexblock.name, survexblock.survexpath, args) + elif ls[0] == "passage" or ls[0] == "nosurvey" or ls[0] == "diving" or ls[0] == "cylpolar": + #message = " ! - *data {} blocks ignored. {}|{}" '{}' .format(ls[0].upper(), survexblock.name, survexblock.survexpath, args) # print(message) - # print(message,file=sys.stderr) - # DataIssue.objects.create(parser='survex', message=message) + #print(message,file=sys.stderr) + #DataIssue.objects.create(parser='survex', message=message) + self.datastar["type"] = ls[0] + elif ls[0] == "cartesian": # We should not ignore this ?! Default for Germans ? + #message = " ! - *data {} blocks ignored. {}|{}" '{}' .format(ls[0].upper(), survexblock.name, survexblock.survexpath, args) + # print(message) + #print(message,file=sys.stderr) + #DataIssue.objects.create(parser='survex', message=message) self.datastar["type"] = ls[0] else: message = " ! - Unrecognised *data statement '{}' {}|{}".format(args, survexblock.name, survexblock.survexpath) print(message) print(message,file=sys.stderr) DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path)) + self.datastar["type"] = ls[0] def LoadSurvexFlags(self, args): # Valid flags are DUPLICATE, SPLAY, and SURFACE, and a flag may be preceded with NOT to turn it off. @@ -779,11 +789,11 @@ class LoadingSurvex(): # Yes we didn't find this cave, but we know it is a pending one. So not an error. # print(f'! ALREADY PENDING {caveid}',file=sys.stderr) return - - message = f" ! Error: {caveid} not a cave nor ignorable. headpath:'{headpath}' while parsing '{includelabel=}.svx' at depth:[{len(depth)}]. ignore prefix list:'{self.ignoreprefix}'" + + message = f" ! Warning: cave identifier '{caveid}' (guessed from file path) is not a known cave. Need to add to expoweb/cave_data/pending.txt ? In '{includelabel}.svx' at depth:[{len(depth)}]." print("\n"+message) print("\n"+message,file=sys.stderr) - DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(headpath)) + DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(includelabel)) print(f' # datastack in LoadSurvexFile:{includelabel} type:', end="",file=sys.stderr) for dict in self.datastack: print(f'<{dict["type"].upper()} >', end="",file=sys.stderr) @@ -1190,6 +1200,7 @@ class LoadingSurvex(): and reads only the *include and *begin and *end statements. It produces a linearised list of the include tree and detects blocks included more than once. """ + global stop_dup_warning thissvxline = 0 indent = " " * self.depthinclude sys.stderr.flush(); @@ -1202,18 +1213,23 @@ class LoadingSurvex(): if path in self.svxfileslist: # We have already used os.normpath() so this is OK. "/../" and "//" have been simplified already. - message = f" * Warning. Duplicate detected. We have already seen this *include '{path}' from another survex file. Detected at callcount:{self.callcount} depth:{self.depthinclude}" - print(message) - print(message,file=flinear) - print("\n"+message,file=sys.stderr) - DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(path)) - if self.svxfileslist.count(path) > 20: - message = " ! ERROR. Survex file already *included 20x. Probably an infinite loop so fix your *include statements that include this. Aborting. {}".format(path) + if stop_dup_warning: + #print("D",end="", file=sys.stderr) + pass + else: + message = f" * Warning. Duplicate detected. We have already seen this *include '{path}' from another survex file. Detected at callcount:{self.callcount} depth:{self.depthinclude}" print(message) print(message,file=flinear) - print(message,file=sys.stderr) + #print(message,file=sys.stderr) + DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(path)) + if self.svxfileslist.count(path) > 2: + message = " ! ERROR. Should have been caught before this. Survex file already *included 2x. Probably an infinite loop so fix your *include statements that include this. Aborting. {}".format(path) + print(message) + print(message,file=flinear) + #print(message,file=sys.stderr) DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(path)) return + return self.svxfileslist.append(path) try: @@ -1254,7 +1270,7 @@ class LoadingSurvex(): includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args))) fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx") - self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath)) + self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path) self.checkUniqueness(os.path.join(settings.SURVEX_DATA, includepath)) if os.path.isfile(fullpath): #-------------------------------------------------------- @@ -1280,7 +1296,7 @@ class LoadingSurvex(): self.depthinclude -= 1 #-------------------------------------------------------- else: - message = " ! ERROR *include file not found for:'{}'".format(includepath) + message = f" ! ERROR *include file '{includepath}' not found, listed in '{fin.name}'" print(message) print(message,file=sys.stderr) DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(path)) @@ -1329,7 +1345,7 @@ class LoadingSurvex(): print(message) - def RunSurvexIfNeeded(self,fullpath): + def RunSurvexIfNeeded(self,fullpath, calledpath): now = time.time() cav_t = now - 365*24*3600 log_t = now - 365*24*3600 @@ -1368,7 +1384,7 @@ class LoadingSurvex(): outputdir = Path(svxpath).parent if not svxpath.is_file(): - message = f' ! BAD survex file "{fullpath}" specified in *include (somewhere).. ' + message = f' ! BAD survex file "{fullpath}" specified in *include in {calledpath} ' DataIssue.objects.create(parser='entrances', message=message) print(message) return @@ -1410,6 +1426,7 @@ class LoadingSurvex(): def FindAndLoadSurvex(survexblockroot): """Follows the *include links successively to find files in the whole include tree """ + global stop_dup_warning print(' - redirecting stdout to svxblks.log...') stdout_orig = sys.stdout # Redirect sys.stdout to the file @@ -1419,22 +1436,21 @@ def FindAndLoadSurvex(survexblockroot): survexfileroot = survexblockroot.survexfile # i.e. SURVEX_TOPNAME only collatefilename = "_" + survexfileroot.path + ".svx" + svx_scan = LoadingSurvex() svx_scan.callcount = 0 svx_scan.depthinclude = 0 fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, survexfileroot.path) - # Rather than do this check for the presence of the .log and .3d files synchronously here, - # we should instead run this in a separate thread asynchronously. print(" - RunSurvexIfNeeded cavern on '{}'".format(fullpathtotop), file=sys.stderr) - svx_scan.RunSurvexIfNeeded(fullpathtotop) + svx_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop) svx_scan.checkUniqueness(fullpathtotop) indent="" fcollate = open(collatefilename, 'w') mem0 = get_process_memory() - print(" - MEM:{:7.2f} MB START".format(mem0),file=sys.stderr) + print(" - MEM:{:7.2f} MB START".format(mem0),file=sys.stderr) flinear = open('svxlinear.log', 'w') flinear.write(" - MEM:{:7.2f} MB START {}\n".format(mem0,survexfileroot.path)) print(" ", file=sys.stderr,end='') @@ -1447,6 +1463,7 @@ def FindAndLoadSurvex(survexblockroot): from pstats import SortKey pr = cProfile.Profile() pr.enable() + #print(f"###{survexblockroot=} {survexfileroot.path=}",file=sys.stderr) #---------------------------------------------------------------- svx_scan.PushdownStackScan(survexblockroot, survexfileroot.path, finroot, flinear, fcollate) #---------------------------------------------------------------- @@ -1455,47 +1472,113 @@ def FindAndLoadSurvex(survexblockroot): ps = pstats.Stats(pr, stream=f) ps.sort_stats(SortKey.CUMULATIVE) ps.print_stats() - + flinear.write("{:2} {} *edulcni {}\n".format(svx_scan.depthinclude, indent, survexfileroot.path)) fcollate.write(";*edulcni {}\n".format(survexfileroot.path)) mem1 = get_process_memory() flinear.write("\n - MEM:{:.2f} MB STOP {}\n".format(mem1,survexfileroot.path)) - flinear.write(" - MEM:{:.3f} MB USED\n".format(mem1-mem0)) - svxfileslist = svx_scan.svxfileslist - flinear.write(" - {:,} survex files in linear include list \n".format(len(svxfileslist))) - flinear.close() - fcollate.close() - - print("\n - {:,} runs of survex 'cavern' refreshing .3d files \n".format(svx_scan.caverncount),file=sys.stderr) - - svx_scan = None # Hmm. Does this actually delete all the instance variables if they are lists, dicts etc.? - print("\n - {:,} survex files in linear include list \n".format(len(svxfileslist)),file=sys.stderr) + flinear.write(" - MEM:{:.3f} MB ADDITIONALLY USED\n".format(mem1-mem0)) + flinear.write(" - {:,} survex files in linear include list \n".format(len(svx_scan.svxfileslist))) + + print(" - {:,} runs of survex 'cavern' refreshing .3d files".format(svx_scan.caverncount),file=sys.stderr) + print(" - {:,} survex files from tree in linear include list".format(len(svx_scan.svxfileslist)),file=sys.stderr) mem1 = get_process_memory() - print(" - MEM:{:7.2f} MB END ".format(mem0),file=sys.stderr) - print(" - MEM:{:7.3f} MB USED".format(mem1-mem0),file=sys.stderr) + print(" - MEM:{:7.2f} MB END ".format(mem1),file=sys.stderr) + print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr) - a = [] + # + # Process all the omitted files in :loser: with some exceptions + # + unseens = set() b=[] for p in Path(settings.SURVEX_DATA).rglob('*.svx'): if p.is_file(): po = p.relative_to(Path(settings.SURVEX_DATA)) pox = po.with_suffix('') - if str(pox) not in svxfileslist: - print(f"[{pox}]", file=sys.stderr) - a.append(pox) + if str(pox) not in svx_scan.svxfileslist: + # print(f"[{pox}]", file=sys.stderr) + unseens.add(pox) else: - print("'", end=" ", file=sys.stderr) b.append(pox) + + if len(b) != len(svx_scan.svxfileslist): + print(f" ! Mismatch. {len(b)} survex files found which should be {len(svx_scan.svxfileslist)} in main tree)", file=sys.stderr) + + excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", "_unseens"] + removals = [] + for x in unseens: + for o in excpts: + if str(x).strip().startswith(o): + removals.append(x) + for x in removals: + unseens.remove(x) + print(f" - {len(unseens)} survex files found which were not included in main tree. ({len(svx_scan.svxfileslist)} in main tree)", file=sys.stderr) + print(f" -- Now loading the previously-omitted survex files.", file=sys.stderr) + + with open(Path(settings.SURVEX_DATA, '_unseens.svx'), 'w') as u: + u.write(f"; {len(unseens):,} survex files not *included by {settings.SURVEX_TOPNAME} (which are {len(svx_scan.svxfileslist):,} files)\n") + u.write(f"; autogenerated by parser/survex.py from databasereset.py on '{datetime.now(timezone.utc)}'\n") + u.write(f"; omitting any file beginning with {excpts}\n\n") + u.write(f"*begin unseens\n") + for x in sorted(unseens): + u.write(f" *include {x}\n") + u.write(f"*end unseens\n") - print("=>", len(a), len(b), len(svxfileslist), file=sys.stderr) + survexfileroot = survexblockroot.survexfile # i.e. SURVEX_TOPNAME only + + omit_scan = LoadingSurvex() + omit_scan.callcount = 0 + omit_scan.depthinclude = 0 + fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, '_unseens.svx') - for i in [0,1,2,3,4,5]: - print(f"==> [{svxfileslist[i]}]", file=sys.stderr) + # copy the list to prime the next pass through the files + omit_scan.svxfileslist = svx_scan.svxfileslist[:] + svx_scan.svxfileslist = [] # free memory + svx_scan = None # Hmm. Does this actually delete all the instance variables if they are lists, dicts etc.? + print(" - RunSurvexIfNeeded cavern on '{}'".format(fullpathtotop), file=sys.stderr) + omit_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop) + omit_scan.checkUniqueness(fullpathtotop) + + mem0 = get_process_memory() + print(" - MEM:{:7.2f} MB START '_unseens'".format(mem0),file=sys.stderr) + #flinear = open('svxlinear.log', 'w') + flinear.write(f" - MEM:{mem0:7.2f} MB START '_unseens'\n") + print(" ", file=sys.stderr,end='') + + finroot = open(fullpathtotop) + fcollate.write(";*include {}\n".format('_unseens.svx')) + flinear.write("{:2} {} *include {}\n".format(omit_scan.depthinclude, indent, '_unseens')) + stop_dup_warning = True + #---------------------------------------------------------------- + omit_scan.PushdownStackScan(survexblockroot, '_unseens', finroot, flinear, fcollate) + #---------------------------------------------------------------- + stop_dup_warning = False + + flinear.write("{:2} {} *edulcni {}\n".format(omit_scan.depthinclude, indent, '_unseens')) + fcollate.write(";*edulcni {}\n".format('_unseens.svx')) + mem1 = get_process_memory() + flinear.write("\n - MEM:{:.2f} MB STOP {} OMIT\n".format(mem1,'_unseens.svx')) + flinear.write(" - MEM:{:.3f} MB ADDITIONALLY USED OMIT\n".format(mem1-mem0)) + flinear.write(" - {:,} survex files in linear include list OMIT \n".format(len(omit_scan.svxfileslist))) - svxfileslist = [] # free memory + flinear.close() + fcollate.close() + + print("\n - {:,} runs of survex 'cavern' refreshing .3d files in the unseen list \n".format(omit_scan.caverncount),file=sys.stderr) + + print("\n - {:,} survex files in linear include list including previously unseen ones \n".format(len(omit_scan.svxfileslist)),file=sys.stderr) + omit_scan = None # Hmm. Does this actually delete all the instance variables if they are lists, dicts etc.? + + mem1 = get_process_memory() + print(" - MEM:{:7.2f} MB END ".format(mem1),file=sys.stderr) + print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr) + + + + # Before doing this, it would be good to identify the *equate and *entrance we need that are relevant to the # entrance locations currently loaded after this by LoadPos(), but could better be done before ? @@ -1503,6 +1586,8 @@ def FindAndLoadSurvex(survexblockroot): print('\n - Loading All Survex Blocks (LinearLoad)',file=sys.stderr) svx_load = LoadingSurvex() + mem1 = get_process_memory() + print(" - MEM:{:7.2f} MB after creating empty loading object.".format(mem1),file=sys.stderr) svx_load.survexdict[survexfileroot.survexdirectory] = [] svx_load.survexdict[survexfileroot.survexdirectory].append(survexfileroot) @@ -1525,7 +1610,7 @@ def FindAndLoadSurvex(survexblockroot): # ps.print_stats() print("\n - MEM:{:7.2f} MB STOP".format(mem1),file=sys.stderr) - print(" - MEM:{:7.3f} MB USED".format(mem1-mem0),file=sys.stderr) + print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr) # Close the logging file, Restore sys.stdout to our old saved file handle sys.stdout.close() @@ -1536,12 +1621,12 @@ def FindAndLoadSurvex(survexblockroot): legsnumber = svx_load.legsnumber mem1 = get_process_memory() - print(" - Number of SurvexDirectories: {}".format(len(svx_load.survexdict))) + print(" - Number of SurvexDirectories: {:,}".format(len(svx_load.survexdict))) tf=0 for d in svx_load.survexdict: tf += len(svx_load.survexdict[d]) - print(" - Number of SurvexFiles: {}".format(tf)) - print(f" - Number of Survex legs: {legsnumber}") + print(f" - Number of SurvexFiles: {tf:,}") + print(f" - Number of Survex legs: {legsnumber:,}") svx_load = None return legsnumber @@ -1562,6 +1647,14 @@ def MakeSurvexFileRoot(): fileroot.survexdirectory = directoryroot # i.e. SURVEX_DATA/SURVEX_TOPNAME fileroot.save() # mutually dependent objects need a double-save like this return fileroot + +def MakeOmitFileRoot(fn): + """Returns a file_object.path = _unseens.svx associated with directory_object.path = SURVEX_DATA + """ + fileroot = SurvexFile(path=fn, cave=None) + fileroot.survexdirectory = SurvexDirectory.objects.get(path=settings.SURVEX_DATA) + fileroot.save() + return fileroot def LoadSurvexBlocks(): @@ -1571,7 +1664,7 @@ def LoadSurvexBlocks(): SurvexDirectory.objects.all().delete() SurvexPersonRole.objects.all().delete() SurvexStation.objects.all().delete() - print(" - survex Data Issues flushed") + print(" - survex Data Issues flushed") DataIssue.objects.filter(parser='survex').delete() DataIssue.objects.filter(parser='svxdate').delete() DataIssue.objects.filter(parser='survexleg').delete() @@ -1588,9 +1681,12 @@ def LoadSurvexBlocks(): # fix by restarting db on server # sudo service mariadb stop # sudo service mariadb start - - survexblockroot.save() + + omitsfileroot = MakeOmitFileRoot("_unseens.svx") + survexomitsroot = SurvexBlock(name=OMITBLOCK, survexpath="", cave=None, survexfile=omitsfileroot, + legsall=0, legslength=0.0) + survexomitsroot.save() print(' - Loading Survex Blocks...') memstart = get_process_memory()