diff --git a/parsers/survex.py b/parsers/survex.py index 3cf3168..5ebf555 100644 --- a/parsers/survex.py +++ b/parsers/survex.py @@ -46,8 +46,10 @@ survexomitsroot = None ROOTBLOCK = "rootblock" OMITBLOCK = "omitblock" METRESINFEET = 3.28084 +UNSEENS = "_unseens.svx" stop_dup_warning = False +dup_includes = 1 debugprint = False # Turns on debug printout for just one *include file debugprinttrigger = "!" @@ -260,8 +262,8 @@ class LoadingSurvex: rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)") rx_comment = re.compile(r"([^;]*?)\s*(?:;\s*(.*))?\n?$") - rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$") # inserted by linear collate ;*include - rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$") # inserted by linear collate ;*edulcni + rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$") # inserted by linear collate ;|*include + rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$") # inserted by linear collate ;|*edulcni rx_include = re.compile(r"(?i)^\s*(\*include[\s].*)$") rx_include2 = re.compile("(?i)include$") rx_commref = re.compile(r"(?i)^\s*ref(?:erence)?[\s.:]*(\d+)\s*#\s*(X)?\s*(\d+)") @@ -300,7 +302,7 @@ class LoadingSurvex: stacksvxfiles = [] svxfileslist = [] svxdirs = {} - uniquename = {} + uniquefile = {} expos = {} survexdict = {} # each key is a directory, and its value is a list of files lineno = 0 @@ -1163,9 +1165,16 @@ class LoadingSurvex: """Creates SurvexFile in the database, and SurvexDirectory if needed with links to 'cave' Creates a new current survexfile and valid .survexdirectory - Inspects the parent folder of the survexfile and uses that to decide if this is a cave we know + Inspects the parent folder of the survexfile and uses that to decide if this is + a cave we know. + + If we see a duplicate cave, this is too late. It has already been included into the + long linear file. This needs to be prevented when the long linear file is created. + The survexblock passed-in is not necessarily the parent. FIX THIS. """ + global dup_includes + if debugprint: print(f" # datastack in LoadSurvexFile:{svxid} 'type':", end="") for dict in self.datastack: @@ -1173,10 +1182,20 @@ class LoadingSurvex: print("") depth = " " * self.depthbegin - # print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid)) + print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid)) headpath = os.path.dirname(svxid) - newfile = SurvexFile(path=svxid) + newfile, created = SurvexFile.objects.update_or_create(path=svxid) + if not created: + dup_includes += 1 + message = f" ! 
DUP SurvexFile '{svxid}' create attempt in LoadSurvexFile()" + print(message) + # print(message, file=sys.stderr) + stash_data_issue(parser="survex", message=message, url=f"/survexfile/{svxid}") + + self.currentsurvexfile = newfile + return # abort as everything already done for object creation + newfile.save() # until we do this there is no internal id so no foreign key works self.currentsurvexfile = newfile newdirectory = self.GetSurvexDirectory(headpath) @@ -1217,7 +1236,11 @@ class LoadingSurvex: print(f"'{dict['type'].upper()}' ", end="") print("") + def ProcessIncludeLine(self, included): + """As we read the long linear file, we come across lines telling us that the + content from this point on is from a particular included file + """ global debugprint svxid = included.groups()[0] if svxid.lower() == debugprinttrigger.lower(): @@ -1226,7 +1249,9 @@ class LoadingSurvex: self.stacksvxfiles.append(self.currentsurvexfile) def ProcessEdulcniLine(self, edulcni): - """Saves the current survexfile in the db""" + """As we read the long linear file, we come across lines telling us that the + we are about to pop back out of the contents of an included file + Saves the current survexfile object in the db to include the data parsed from it""" global debugprint svxid = edulcni.groups()[0] if debugprint: @@ -1277,8 +1302,8 @@ class LoadingSurvex: ) included = self.rx_comminc.match(comment) - # ;*include means 'we have been included'; whereas *include means 'proceed to include' - # bug, If the original survex file contians the line ;*include then we pick it up ! So fix our special code to be ;|*include + # ;|*include means 'we have been included'; whereas *include means 'proceed to include' + # No test here to check that this file has not already been included. Ouch. if included: self.ProcessIncludeLine(included) @@ -1553,7 +1578,7 @@ class LoadingSurvex: self.lineno += 1 sline, comment = self.rx_comment.match(svxline).groups() if comment: - # this catches the ;*include NEWFILE and ;*edulcni ENDOFFILE lines too + # this catches the ;|*include NEWFILE and ;|*edulcni ENDOFFILE lines too self.LoadSurvexComment(survexblock, comment) if not sline: @@ -1616,40 +1641,40 @@ class LoadingSurvex: if self.rx_include2.match(cmd): # rx_include2 = re.compile("(?i)include$") # if re.match("(?i)include$", cmd): - includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args))) - - fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx") - self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path) - self.checkUniqueness(os.path.join(settings.SURVEX_DATA, includepath)) - if os.path.isfile(fullpath): - # -------------------------------------------------------- - self.depthinclude += 1 - # fininclude = open(fullpath,'r') - finincludename = fullpath - fcollate.write(f";|*include {includepath}\n") - flinear.write(f"{self.depthinclude:2} {indent} *include {includepath}\n") - push = includepath.lower() - self.includestack.append(push) - # ----------------- - self.PushdownStackScan(survexblock, includepath, finincludename, flinear, fcollate) - # ----------------- - pop = self.includestack.pop() - if pop != push: - message = "!! 
ERROR mismatch *include pop!=push {}".format(pop, push, self.includestack) + includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args))) # normalises path syntax + if self.never_seen(includepath, path): + fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx") + self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path) + self.check_unique_name(os.path.join(settings.SURVEX_DATA, includepath)) + if os.path.isfile(fullpath): + # -------------------------------------------------------- + self.depthinclude += 1 + # fininclude = open(fullpath,'r') + finincludename = fullpath + fcollate.write(f";|*include {includepath}\n") + flinear.write(f"{self.depthinclude:2} {indent} *include {includepath}\n") + push = includepath.lower() + self.includestack.append(push) + # ----------------- + self.PushdownStackScan(survexblock, includepath, finincludename, flinear, fcollate) + # ----------------- + pop = self.includestack.pop() + if pop != push: + message = "!! ERROR mismatch *include pop!=push {}".format(pop, push, self.includestack) + print(message) + print(message, file=flinear) + print(message, file=sys.stderr) + stash_data_issue(parser="survex", message=message, url=None, sb=(path)) + flinear.write(f"{self.depthinclude:2} {indent} *edulcni {pop}\n") + fcollate.write(f";|*edulcni {pop}\n") + # fininclude.close() + self.depthinclude -= 1 + # -------------------------------------------------------- + else: + message = f" ! ERROR *include file '{includepath}' not found, listed in '{fin.name}'" print(message) - print(message, file=flinear) print(message, file=sys.stderr) stash_data_issue(parser="survex", message=message, url=None, sb=(path)) - flinear.write(f"{self.depthinclude:2} {indent} *edulcni {pop}\n") - fcollate.write(f";|*edulcni {pop}\n") - # fininclude.close() - self.depthinclude -= 1 - # -------------------------------------------------------- - else: - message = f" ! ERROR *include file '{includepath}' not found, listed in '{fin.name}'" - print(message) - print(message, file=sys.stderr) - stash_data_issue(parser="survex", message=message, url=None, sb=(path)) elif self.rx_begin2.match(cmd): #elif re.match("(?i)begin$", cmd): self.depthbegin += 1 @@ -1733,20 +1758,39 @@ class LoadingSurvex: print(message) print(message, file=sys.stderr) stash_data_issue(parser="survex", message=message, url=None, sb=(path)) + raise return # skip this survex file and all things *included in it - def checkUniqueness(self, fullpath): - fn = Path(fullpath).name - if fn not in self.uniquename: - self.uniquename[fn] = [fullpath] - else: - self.uniquename[fn].append(fullpath) - # This is not an error now that we are moving .3d files to the :loser: directory tree + def never_seen(self, incpath, parent): + """The _unseen files may include survex files we have already seen, and we do not + want to process them again. For the _unseens this is not an error, but for the main + *include tree it is an error. 
+ """ + + if incpath in self.uniquefile: + self.uniquefile[incpath].append(parent) + message = ( - f" NOTE: non-unique survex filename, '{fn}' - '{self.uniquename[fn]}' #{len(self.uniquename[fn])}" + f" DUP: non-unique survex filepath, '{incpath}' - #{len(self.uniquefile[incpath])} '{self.uniquefile[incpath]}'" ) - # print(message) + print(message) # stash_data_issue(parser='survex', message=message) + for p in self.uniquefile[incpath]: + if p in self.uniquefile: + print(f"{p} <- {self.uniquefile[p]}") + return False + else: + self.uniquefile[incpath] = [parent] + return True + + def check_unique_name(self, fullpath): + """This only checks whether the last bit of the name of the survex file is unique, + e.g. "bigpitch", not whether the whole path of the survexfile has been seen before. + + We don't care about this any more. + """ + return + def RunSurvexIfNeeded(self, fullpath, calledpath): now = time.time() @@ -1843,7 +1887,13 @@ class LoadingSurvex: def FindAndLoadSurvex(survexblockroot): - """Follows the *include links successively to find files in the whole include tree""" + """Follows the *include links successively to find survex files + This proceeds in 3 phases: + 1. The root survex file is read and all the *include files are found, using PushdownStackScan() + 2. All the other survex files in the :loser: repo are found, and their *includes found, + using another PushdownStackScan() [duplicates omitted] + 3. The combined expanded file containing all the survex data is parsed as a single file, + using LinearLoad()""" global stop_dup_warning print(" - redirecting stdout to svxblks.log...") stdout_orig = sys.stdout @@ -1861,15 +1911,16 @@ def FindAndLoadSurvex(survexblockroot): print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr) svx_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop) - svx_scan.checkUniqueness(fullpathtotop) + svx_scan.check_unique_name(fullpathtotop) + svx_scan.uniquefile[str(survexfileroot)] = ["0"] indent = "" fcollate = open(collatefilename, "w") mem0 = get_process_memory() - print(f" - MEM:{mem0:7.2f} MB START", file=sys.stderr) + print(f" - MEM:{mem0:7.2f} MB START '{survexfileroot}'", file=sys.stderr) flinear = open("svxlinear.log", "w") - flinear.write(f" - MEM:{mem0:7.2f} MB START {survexfileroot.path}\n") + flinear.write(f" - MEM:{mem0:7.2f} MB START '{survexfileroot.path}'\n") print(" ", file=sys.stderr, end="") finrootname = Path(settings.SURVEX_DATA, survexfileroot.path + ".svx") @@ -1897,16 +1948,24 @@ def FindAndLoadSurvex(survexblockroot): flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {survexfileroot.path}\n") flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED\n") flinear.write(f" - {len(svx_scan.svxfileslist):,} survex files in linear include list \n") - + flinear.write(f" - {len(svx_scan.uniquefile):,} unique survex files in linear include list \n") + for j in svx_scan.svxfileslist: + if j not in svx_scan.uniquefile: + flinear.write(f" - '{j}' {type(j)} not in unique list \n") + for f in svx_scan.uniquefile: + # flinear.write(f" - '{f}' {type(f)} {svx_scan.uniquefile[f]} \n") + if len(svx_scan.uniquefile[f]) > 1: + flinear.write(f" - '{f}' {type(f)} {svx_scan.uniquefile[f]} dup survex files \n") + print(f"\n - {svx_scan.caverncount:,} runs of survex 'cavern' refreshing .3d files", file=sys.stderr) print(f" - {len(svx_scan.svxfileslist):,} survex files from tree in linear include list", file=sys.stderr) - + print(f" - {len(svx_scan.uniquefile):,} unique survex files from tree in linear include list", file=sys.stderr) 
mem1 = get_process_memory() print(f" - MEM:{mem1:7.2f} MB END ", file=sys.stderr) print(f" - MEM:{mem1 - mem0:7.3f} MB ADDITIONALLY USED", file=sys.stderr) + # # Process all the omitted files in :loser: with some exceptions - # unseens = set() b = [] @@ -1926,13 +1985,14 @@ def FindAndLoadSurvex(survexblockroot): file=sys.stderr, ) - excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", "_unseens"] + unseensroot = re.sub(r"\.svx$", "", UNSEENS) + excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", unseensroot] removals = [] for x in unseens: for o in excpts: if str(x).strip().startswith(o): removals.append(x) - # special fix for file not actually in survex format + # special fix for .svx file not actually in survex format unseens.remove(Path("fixedpts/gps/gps00raw")) for x in removals: @@ -1944,7 +2004,7 @@ def FindAndLoadSurvex(survexblockroot): check_team_cache() print(" -- Now loading the previously-omitted survex files.", file=sys.stderr) - with open(Path(settings.SURVEX_DATA, "_unseens.svx"), "w") as u: + with open(Path(settings.SURVEX_DATA, UNSEENS), "w") as u: u.write( f"; {len(unseens):,} survex files not *included by {settings.SURVEX_TOPNAME} (which are {len(svx_scan.svxfileslist):,} files)\n" ) @@ -1960,7 +2020,7 @@ def FindAndLoadSurvex(survexblockroot): omit_scan = LoadingSurvex() omit_scan.callcount = 0 omit_scan.depthinclude = 0 - fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, "_unseens.svx") + fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, UNSEENS) # copy the list to prime the next pass through the files omit_scan.svxfileslist = svx_scan.svxfileslist[:] @@ -1969,32 +2029,35 @@ def FindAndLoadSurvex(survexblockroot): print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr) omit_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop) - omit_scan.checkUniqueness(fullpathtotop) + omit_scan.check_unique_name(fullpathtotop) + omit_scan.uniquefile[unseensroot] = ["0"] mem0 = get_process_memory() - print(f" - MEM:{mem0:7.2f} MB START '_unseens'", file=sys.stderr) + print(f" - MEM:{mem0:7.2f} MB START '{unseensroot}'", file=sys.stderr) # flinear = open('svxlinear.log', 'w') - flinear.write(f" - MEM:{mem0:7.2f} MB START '_unseens'\n") + flinear.write(f" - MEM:{mem0:7.2f} MB START '{unseensroot}'\n") print(" ", file=sys.stderr, end="") + # this is a bit tricky as some unseen files will *include files we have already seen, which + # we should not process again. 
finrootname = fullpathtotop - fcollate.write(";*include _unseens.svx\n") - flinear.write(f"{omit_scan.depthinclude:2} {indent} *include _unseens\n") - stop_dup_warning = True + fcollate.write(f";*include {UNSEENS}\n") + flinear.write(f"{omit_scan.depthinclude:2} {indent} *include {unseensroot}\n") + # stop_dup_warning = True # ---------------------------------------------------------------- - omit_scan.PushdownStackScan(survexblockroot, "_unseens", finrootname, flinear, fcollate) + omit_scan.PushdownStackScan(survexblockroot, unseensroot, finrootname, flinear, fcollate) # ---------------------------------------------------------------- - stop_dup_warning = False + # stop_dup_warning = False - flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni _unseens\n") - fcollate.write(";*edulcni _unseens.svx\n") + flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni {unseensroot}\n") + fcollate.write(f";*edulcni {UNSEENS}\n") check_team_cache() mem1 = get_process_memory() - flinear.write(f"\n - MEM:{mem1:.2f} MB STOP _unseens.svx OMIT\n") - flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED OMIT\n") - flinear.write(f" - {len(omit_scan.svxfileslist):,} survex files in linear include list OMIT \n") + flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {UNSEENS} Unseen Oddments\n") + flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED Unseen Oddments\n") + flinear.write(f" - {len(omit_scan.svxfileslist):,} survex files in linear include list Unseen Oddments \n") flinear.close() fcollate.close() @@ -2085,6 +2148,7 @@ def MakeOmitFileRoot(fn): def LoadSurvexBlocks(): + global dup_includes mem1 = get_process_memory() print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr) start = time.time() @@ -2129,7 +2193,7 @@ def LoadSurvexBlocks(): # sudo service mariadb start survexblockroot.save() - omitsfileroot = MakeOmitFileRoot("_unseens.svx") + omitsfileroot = MakeOmitFileRoot(UNSEENS) survexomitsroot = SurvexBlock( name=OMITBLOCK, survexpath="", survexfile=omitsfileroot, legsall=0, legslength=0.0 ) @@ -2157,5 +2221,6 @@ def LoadSurvexBlocks(): store_data_issues() # duration = time.time() - start # print(f" - TIME: {duration:7.2f} s", file=sys.stderr) + print(f" - Duplicate *includes = {dup_includes}") print(" - Loaded All Survex Blocks.")
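
Note on the collation markers (illustration only, not part of the patch): the collation pass writes
";|*include <path>" and ";|*edulcni <path>" comments into the long linear file, and the later parsing
pass recognises them with rx_comminc and rx_commcni. The leading "|" is what stops an ordinary
";*include" comment in a contributor's survex file from being mistaken for a collation marker.
A minimal standalone sketch of that matching, with a hypothetical classify() helper and made-up
sample paths:

import re

# Same regexes as in the patch; the real parser has already stripped the leading ';'
# from the comment text before these are applied.
rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$")
rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$")

def classify(comment):
    """Return ('include', path) or ('edulcni', path) for a collation marker, else None."""
    if m := rx_comminc.match(comment):
        return ("include", m.group(1))
    if m := rx_commcni.match(comment):
        return ("edulcni", m.group(1))
    return None

if __name__ == "__main__":
    print(classify("|*include caves-1623/204/204"))   # ('include', 'caves-1623/204/204')
    print(classify("|*edulcni caves-1623/204/204"))   # ('edulcni', 'caves-1623/204/204')
    print(classify("*include caves-1623/204/204"))    # None: an ordinary comment is not a marker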
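
Note on the duplicate-include bookkeeping (illustration only): never_seen() keeps self.uniquefile,
a dict mapping each included path to the list of parent files that *include it. The first sighting
records the parent and returns True so the file is collated; any later sighting appends the extra
parent, reports the duplication and returns False so the file is skipped. (LoadSurvexFile() catches
the same situation again, too late, via SurvexFile.objects.update_or_create() and the dup_includes
counter.) A standalone sketch of the tracking idea, using a hypothetical IncludeTracker class and
demo data rather than the real LoadingSurvex instance:

class IncludeTracker:
    """Stand-alone version of the uniquefile/never_seen bookkeeping in the patch."""

    def __init__(self):
        self.uniquefile = {}  # include path -> list of parents that *include it

    def never_seen(self, incpath, parent):
        if incpath in self.uniquefile:
            # Seen before: remember the extra parent and tell the caller to skip it.
            self.uniquefile[incpath].append(parent)
            print(f" DUP: non-unique survex filepath, '{incpath}' - "
                  f"#{len(self.uniquefile[incpath])} {self.uniquefile[incpath]}")
            return False
        # First sighting: record the parent and let the caller collate the file.
        self.uniquefile[incpath] = [parent]
        return True

if __name__ == "__main__":
    tracker = IncludeTracker()
    for parent, child in [("1623.svx", "caves-1623/204/204"),
                          ("_unseens.svx", "caves-1623/204/204")]:
        if tracker.never_seen(child, parent):
            print(f"collating '{child}' (first seen via '{parent}')")
        else:
            print(f"skipping '{child}' (already collated via {tracker.uniquefile[child][:-1]})")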