forked from expo/troggle
clean up de-duplication code
This commit is contained in:
parent
5067ef2c8c
commit
dc03016dbe
@ -46,8 +46,10 @@ survexomitsroot = None
|
||||
ROOTBLOCK = "rootblock"
|
||||
OMITBLOCK = "omitblock"
|
||||
METRESINFEET = 3.28084
|
||||
UNSEENS = "_unseens.svx"
|
||||
|
||||
stop_dup_warning = False
|
||||
dup_includes = 1
|
||||
debugprint = False # Turns on debug printout for just one *include file
|
||||
debugprinttrigger = "!"
|
||||
|
||||
@ -260,8 +262,8 @@ class LoadingSurvex:
|
||||
|
||||
rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)")
|
||||
rx_comment = re.compile(r"([^;]*?)\s*(?:;\s*(.*))?\n?$")
|
||||
rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$") # inserted by linear collate ;*include
|
||||
rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$") # inserted by linear collate ;*edulcni
|
||||
rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$") # inserted by linear collate ;|*include
|
||||
rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$") # inserted by linear collate ;|*edulcni
|
||||
rx_include = re.compile(r"(?i)^\s*(\*include[\s].*)$")
|
||||
rx_include2 = re.compile("(?i)include$")
|
||||
rx_commref = re.compile(r"(?i)^\s*ref(?:erence)?[\s.:]*(\d+)\s*#\s*(X)?\s*(\d+)")
|
||||
@ -300,7 +302,7 @@ class LoadingSurvex:
|
||||
stacksvxfiles = []
|
||||
svxfileslist = []
|
||||
svxdirs = {}
|
||||
uniquename = {}
|
||||
uniquefile = {}
|
||||
expos = {}
|
||||
survexdict = {} # each key is a directory, and its value is a list of files
|
||||
lineno = 0
|
||||
@ -1163,9 +1165,16 @@ class LoadingSurvex:
|
||||
"""Creates SurvexFile in the database, and SurvexDirectory if needed
|
||||
with links to 'cave'
|
||||
Creates a new current survexfile and valid .survexdirectory
|
||||
Inspects the parent folder of the survexfile and uses that to decide if this is a cave we know
|
||||
Inspects the parent folder of the survexfile and uses that to decide if this is
|
||||
a cave we know.
|
||||
|
||||
If we see a duplicate cave, this is too late. It has already been included into the
|
||||
long linear file. This needs to be prevented when the long linear file is created.
|
||||
|
||||
The survexblock passed-in is not necessarily the parent. FIX THIS.
|
||||
"""
|
||||
global dup_includes
|
||||
|
||||
if debugprint:
|
||||
print(f" # datastack in LoadSurvexFile:{svxid} 'type':", end="")
|
||||
for dict in self.datastack:
|
||||
@ -1173,10 +1182,20 @@ class LoadingSurvex:
|
||||
print("")
|
||||
|
||||
depth = " " * self.depthbegin
|
||||
# print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid))
|
||||
print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid))
|
||||
headpath = os.path.dirname(svxid)
|
||||
|
||||
newfile = SurvexFile(path=svxid)
|
||||
newfile, created = SurvexFile.objects.update_or_create(path=svxid)
|
||||
if not created:
|
||||
dup_includes += 1
|
||||
message = f" ! DUP SurvexFile '{svxid}' create attempt in LoadSurvexFile()"
|
||||
print(message)
|
||||
# print(message, file=sys.stderr)
|
||||
stash_data_issue(parser="survex", message=message, url=f"/survexfile/{svxid}")
|
||||
|
||||
self.currentsurvexfile = newfile
|
||||
return # abort as everything already done for object creation
|
||||
|
||||
newfile.save() # until we do this there is no internal id so no foreign key works
|
||||
self.currentsurvexfile = newfile
|
||||
newdirectory = self.GetSurvexDirectory(headpath)
|
||||
@ -1217,7 +1236,11 @@ class LoadingSurvex:
|
||||
print(f"'{dict['type'].upper()}' ", end="")
|
||||
print("")
|
||||
|
||||
|
||||
def ProcessIncludeLine(self, included):
|
||||
"""As we read the long linear file, we come across lines telling us that the
|
||||
content from this point on is from a particular included file
|
||||
"""
|
||||
global debugprint
|
||||
svxid = included.groups()[0]
|
||||
if svxid.lower() == debugprinttrigger.lower():
|
||||
@ -1226,7 +1249,9 @@ class LoadingSurvex:
|
||||
self.stacksvxfiles.append(self.currentsurvexfile)
|
||||
|
||||
def ProcessEdulcniLine(self, edulcni):
|
||||
"""Saves the current survexfile in the db"""
|
||||
"""As we read the long linear file, we come across lines telling us that the
|
||||
parser is about to pop back out of the contents of an included file.
|
||||
Saves the current survexfile object in the db to include the data parsed from it"""
|
||||
global debugprint
|
||||
svxid = edulcni.groups()[0]
|
||||
if debugprint:
|
||||
@ -1277,8 +1302,8 @@ class LoadingSurvex:
|
||||
)
|
||||
|
||||
included = self.rx_comminc.match(comment)
|
||||
# ;*include means 'we have been included'; whereas *include means 'proceed to include'
|
||||
# Bug: if the original survex file contains the line ;*include then we pick it up! So our special marker is now ;|*include
|
||||
# ;|*include means 'we have been included'; whereas *include means 'proceed to include'
|
||||
# No test here to check that this file has not already been included. Ouch.
|
||||
if included:
|
||||
self.ProcessIncludeLine(included)
|
||||
|
||||
@ -1553,7 +1578,7 @@ class LoadingSurvex:
|
||||
self.lineno += 1
|
||||
sline, comment = self.rx_comment.match(svxline).groups()
|
||||
if comment:
|
||||
# this catches the ;*include NEWFILE and ;*edulcni ENDOFFILE lines too
|
||||
# this catches the ;|*include NEWFILE and ;|*edulcni ENDOFFILE lines too
|
||||
self.LoadSurvexComment(survexblock, comment)
|
||||
|
||||
if not sline:
|
||||
@ -1616,40 +1641,40 @@ class LoadingSurvex:
|
||||
if self.rx_include2.match(cmd):
|
||||
# rx_include2 = re.compile("(?i)include$")
|
||||
# if re.match("(?i)include$", cmd):
|
||||
includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args)))
|
||||
|
||||
fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx")
|
||||
self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path)
|
||||
self.checkUniqueness(os.path.join(settings.SURVEX_DATA, includepath))
|
||||
if os.path.isfile(fullpath):
|
||||
# --------------------------------------------------------
|
||||
self.depthinclude += 1
|
||||
# fininclude = open(fullpath,'r')
|
||||
finincludename = fullpath
|
||||
fcollate.write(f";|*include {includepath}\n")
|
||||
flinear.write(f"{self.depthinclude:2} {indent} *include {includepath}\n")
|
||||
push = includepath.lower()
|
||||
self.includestack.append(push)
|
||||
# -----------------
|
||||
self.PushdownStackScan(survexblock, includepath, finincludename, flinear, fcollate)
|
||||
# -----------------
|
||||
pop = self.includestack.pop()
|
||||
if pop != push:
|
||||
message = "!! ERROR mismatch *include pop!=push {}".format(pop, push, self.includestack)
|
||||
includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args))) # normalises path syntax
|
||||
if self.never_seen(includepath, path):
|
||||
fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx")
|
||||
self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path)
|
||||
self.check_unique_name(os.path.join(settings.SURVEX_DATA, includepath))
|
||||
if os.path.isfile(fullpath):
|
||||
# --------------------------------------------------------
|
||||
self.depthinclude += 1
|
||||
# fininclude = open(fullpath,'r')
|
||||
finincludename = fullpath
|
||||
fcollate.write(f";|*include {includepath}\n")
|
||||
flinear.write(f"{self.depthinclude:2} {indent} *include {includepath}\n")
|
||||
push = includepath.lower()
|
||||
self.includestack.append(push)
|
||||
# -----------------
|
||||
self.PushdownStackScan(survexblock, includepath, finincludename, flinear, fcollate)
|
||||
# -----------------
|
||||
pop = self.includestack.pop()
|
||||
if pop != push:
|
||||
message = "!! ERROR mismatch *include pop!=push {}".format(pop, push, self.includestack)
|
||||
print(message)
|
||||
print(message, file=flinear)
|
||||
print(message, file=sys.stderr)
|
||||
stash_data_issue(parser="survex", message=message, url=None, sb=(path))
|
||||
flinear.write(f"{self.depthinclude:2} {indent} *edulcni {pop}\n")
|
||||
fcollate.write(f";|*edulcni {pop}\n")
|
||||
# fininclude.close()
|
||||
self.depthinclude -= 1
|
||||
# --------------------------------------------------------
|
||||
else:
|
||||
message = f" ! ERROR *include file '{includepath}' not found, listed in '{fin.name}'"
|
||||
print(message)
|
||||
print(message, file=flinear)
|
||||
print(message, file=sys.stderr)
|
||||
stash_data_issue(parser="survex", message=message, url=None, sb=(path))
|
||||
flinear.write(f"{self.depthinclude:2} {indent} *edulcni {pop}\n")
|
||||
fcollate.write(f";|*edulcni {pop}\n")
|
||||
# fininclude.close()
|
||||
self.depthinclude -= 1
|
||||
# --------------------------------------------------------
|
||||
else:
|
||||
message = f" ! ERROR *include file '{includepath}' not found, listed in '{fin.name}'"
|
||||
print(message)
|
||||
print(message, file=sys.stderr)
|
||||
stash_data_issue(parser="survex", message=message, url=None, sb=(path))
|
||||
elif self.rx_begin2.match(cmd):
|
||||
#elif re.match("(?i)begin$", cmd):
|
||||
self.depthbegin += 1
|
||||
@ -1733,20 +1758,39 @@ class LoadingSurvex:
|
||||
print(message)
|
||||
print(message, file=sys.stderr)
|
||||
stash_data_issue(parser="survex", message=message, url=None, sb=(path))
|
||||
raise
|
||||
return # skip this survex file and all things *included in it
|
||||
|
||||
def checkUniqueness(self, fullpath):
    """Index `fullpath` under the final component of its path.

    self.uniquename maps each bare file name (the last path component of
    `fullpath`) to the list of every full path registered under that name,
    in the order they were registered.

    Args:
        fullpath: path of a survex file; only its last component is used
            as the dictionary key.
    """
    basename = Path(fullpath).name
    # A repeated name is not an error now that we are moving .3d files to
    # the :loser: directory tree, so it is simply recorded.
    self.uniquename.setdefault(basename, []).append(fullpath)
|
||||
def never_seen(self, incpath, parent):
    """Register an *include of `incpath` by `parent`; say whether it is new.

    The _unseens files may include survex files we have already seen, and
    we do not want to process them again. For the _unseens this is not an
    error, but for the main *include tree it is an error.

    self.uniquefile maps each include path to the list of parents that
    have *included it, in the order encountered.

    Args:
        incpath: the include path used as the key in self.uniquefile.
        parent: the file containing the *include; appended to the list
            of includers for `incpath`.

    Returns:
        True if `incpath` had not been registered before (it is now
        registered with `parent` as its only includer); False if it was
        already registered, in which case `parent` is appended and a DUP
        report is printed.
    """
    if incpath not in self.uniquefile:
        self.uniquefile[incpath] = [parent]
        return True

    includers = self.uniquefile[incpath]
    includers.append(parent)

    # Only the full include path matters here. The older per-filename
    # report depended on a name that does not exist in this method.
    message = f" DUP: non-unique survex filepath, '{incpath}' - #{len(includers)} '{includers}'"
    print(message)

    # For each includer that was itself *included, show who included it,
    # which helps trace how the duplicate was reached.
    for p in includers:
        if p in self.uniquefile:
            print(f"{p} <- {self.uniquefile[p]}")
    return False
|
||||
|
||||
def check_unique_name(self, fullpath):
    """Do nothing: uniqueness of the bare survex file name is no longer checked.

    This would only concern whether the last part of the survex file name
    (e.g. "bigpitch") is unique, not whether the whole path of the survex
    file has been seen before. We don't care about that any more.

    Args:
        fullpath: path of the survex file; ignored.

    Returns:
        None, always.
    """
    return None
|
||||
|
||||
|
||||
def RunSurvexIfNeeded(self, fullpath, calledpath):
|
||||
now = time.time()
|
||||
@ -1843,7 +1887,13 @@ class LoadingSurvex:
|
||||
|
||||
|
||||
def FindAndLoadSurvex(survexblockroot):
|
||||
"""Follows the *include links successively to find files in the whole include tree"""
|
||||
"""Follows the *include links successively to find survex files
|
||||
This proceeds in 3 phases:
|
||||
1. The root survex file is read and all the *include files are found, using PushdownStackScan()
|
||||
2. All the other survex files in the :loser: repo are found, and their *includes found,
|
||||
using another PushdownStackScan() [duplicates omitted]
|
||||
3. The combined expanded file containing all the survex data is parsed as a single file,
|
||||
using LinearLoad()"""
|
||||
global stop_dup_warning
|
||||
print(" - redirecting stdout to svxblks.log...")
|
||||
stdout_orig = sys.stdout
|
||||
@ -1861,15 +1911,16 @@ def FindAndLoadSurvex(survexblockroot):
|
||||
|
||||
print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr)
|
||||
svx_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
|
||||
svx_scan.checkUniqueness(fullpathtotop)
|
||||
svx_scan.check_unique_name(fullpathtotop)
|
||||
svx_scan.uniquefile[str(survexfileroot)] = ["0"]
|
||||
|
||||
indent = ""
|
||||
fcollate = open(collatefilename, "w")
|
||||
|
||||
mem0 = get_process_memory()
|
||||
print(f" - MEM:{mem0:7.2f} MB START", file=sys.stderr)
|
||||
print(f" - MEM:{mem0:7.2f} MB START '{survexfileroot}'", file=sys.stderr)
|
||||
flinear = open("svxlinear.log", "w")
|
||||
flinear.write(f" - MEM:{mem0:7.2f} MB START {survexfileroot.path}\n")
|
||||
flinear.write(f" - MEM:{mem0:7.2f} MB START '{survexfileroot.path}'\n")
|
||||
print(" ", file=sys.stderr, end="")
|
||||
|
||||
finrootname = Path(settings.SURVEX_DATA, survexfileroot.path + ".svx")
|
||||
@ -1897,16 +1948,24 @@ def FindAndLoadSurvex(survexblockroot):
|
||||
flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {survexfileroot.path}\n")
|
||||
flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED\n")
|
||||
flinear.write(f" - {len(svx_scan.svxfileslist):,} survex files in linear include list \n")
|
||||
|
||||
flinear.write(f" - {len(svx_scan.uniquefile):,} unique survex files in linear include list \n")
|
||||
for j in svx_scan.svxfileslist:
|
||||
if j not in svx_scan.uniquefile:
|
||||
flinear.write(f" - '{j}' {type(j)} not in unique list \n")
|
||||
for f in svx_scan.uniquefile:
|
||||
# flinear.write(f" - '{f}' {type(f)} {svx_scan.uniquefile[f]} \n")
|
||||
if len(svx_scan.uniquefile[f]) > 1:
|
||||
flinear.write(f" - '{f}' {type(f)} {svx_scan.uniquefile[f]} dup survex files \n")
|
||||
|
||||
print(f"\n - {svx_scan.caverncount:,} runs of survex 'cavern' refreshing .3d files", file=sys.stderr)
|
||||
print(f" - {len(svx_scan.svxfileslist):,} survex files from tree in linear include list", file=sys.stderr)
|
||||
|
||||
print(f" - {len(svx_scan.uniquefile):,} unique survex files from tree in linear include list", file=sys.stderr)
|
||||
mem1 = get_process_memory()
|
||||
print(f" - MEM:{mem1:7.2f} MB END ", file=sys.stderr)
|
||||
print(f" - MEM:{mem1 - mem0:7.3f} MB ADDITIONALLY USED", file=sys.stderr)
|
||||
|
||||
#
|
||||
# Process all the omitted files in :loser: with some exceptions
|
||||
#
|
||||
unseens = set()
|
||||
b = []
|
||||
|
||||
@ -1926,13 +1985,14 @@ def FindAndLoadSurvex(survexblockroot):
|
||||
file=sys.stderr,
|
||||
)
|
||||
|
||||
excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", "_unseens"]
|
||||
unseensroot = re.sub(r"\.svx$", "", UNSEENS)
|
||||
excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", unseensroot]
|
||||
removals = []
|
||||
for x in unseens:
|
||||
for o in excpts:
|
||||
if str(x).strip().startswith(o):
|
||||
removals.append(x)
|
||||
# special fix for file not actually in survex format
|
||||
# special fix for .svx file not actually in survex format
|
||||
unseens.remove(Path("fixedpts/gps/gps00raw"))
|
||||
|
||||
for x in removals:
|
||||
@ -1944,7 +2004,7 @@ def FindAndLoadSurvex(survexblockroot):
|
||||
check_team_cache()
|
||||
print(" -- Now loading the previously-omitted survex files.", file=sys.stderr)
|
||||
|
||||
with open(Path(settings.SURVEX_DATA, "_unseens.svx"), "w") as u:
|
||||
with open(Path(settings.SURVEX_DATA, UNSEENS), "w") as u:
|
||||
u.write(
|
||||
f"; {len(unseens):,} survex files not *included by {settings.SURVEX_TOPNAME} (which are {len(svx_scan.svxfileslist):,} files)\n"
|
||||
)
|
||||
@ -1960,7 +2020,7 @@ def FindAndLoadSurvex(survexblockroot):
|
||||
omit_scan = LoadingSurvex()
|
||||
omit_scan.callcount = 0
|
||||
omit_scan.depthinclude = 0
|
||||
fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, "_unseens.svx")
|
||||
fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, UNSEENS)
|
||||
|
||||
# copy the list to prime the next pass through the files
|
||||
omit_scan.svxfileslist = svx_scan.svxfileslist[:]
|
||||
@ -1969,32 +2029,35 @@ def FindAndLoadSurvex(survexblockroot):
|
||||
|
||||
print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr)
|
||||
omit_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
|
||||
omit_scan.checkUniqueness(fullpathtotop)
|
||||
omit_scan.check_unique_name(fullpathtotop)
|
||||
omit_scan.uniquefile[unseensroot] = ["0"]
|
||||
|
||||
mem0 = get_process_memory()
|
||||
print(f" - MEM:{mem0:7.2f} MB START '_unseens'", file=sys.stderr)
|
||||
print(f" - MEM:{mem0:7.2f} MB START '{unseensroot}'", file=sys.stderr)
|
||||
# flinear = open('svxlinear.log', 'w')
|
||||
flinear.write(f" - MEM:{mem0:7.2f} MB START '_unseens'\n")
|
||||
flinear.write(f" - MEM:{mem0:7.2f} MB START '{unseensroot}'\n")
|
||||
print(" ", file=sys.stderr, end="")
|
||||
|
||||
# this is a bit tricky as some unseen files will *include files we have already seen, which
|
||||
# we should not process again.
|
||||
finrootname = fullpathtotop
|
||||
fcollate.write(";*include _unseens.svx\n")
|
||||
flinear.write(f"{omit_scan.depthinclude:2} {indent} *include _unseens\n")
|
||||
stop_dup_warning = True
|
||||
fcollate.write(f";*include {UNSEENS}\n")
|
||||
flinear.write(f"{omit_scan.depthinclude:2} {indent} *include {unseensroot}\n")
|
||||
# stop_dup_warning = True
|
||||
# ----------------------------------------------------------------
|
||||
omit_scan.PushdownStackScan(survexblockroot, "_unseens", finrootname, flinear, fcollate)
|
||||
omit_scan.PushdownStackScan(survexblockroot, unseensroot, finrootname, flinear, fcollate)
|
||||
# ----------------------------------------------------------------
|
||||
stop_dup_warning = False
|
||||
# stop_dup_warning = False
|
||||
|
||||
flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni _unseens\n")
|
||||
fcollate.write(";*edulcni _unseens.svx\n")
|
||||
flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni {unseensroot}\n")
|
||||
fcollate.write(f";*edulcni {UNSEENS}\n")
|
||||
|
||||
check_team_cache()
|
||||
|
||||
mem1 = get_process_memory()
|
||||
flinear.write(f"\n - MEM:{mem1:.2f} MB STOP _unseens.svx OMIT\n")
|
||||
flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED OMIT\n")
|
||||
flinear.write(f" - {len(omit_scan.svxfileslist):,} survex files in linear include list OMIT \n")
|
||||
flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {UNSEENS} Unseen Oddments\n")
|
||||
flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED Unseen Oddments\n")
|
||||
flinear.write(f" - {len(omit_scan.svxfileslist):,} survex files in linear include list Unseen Oddments \n")
|
||||
|
||||
flinear.close()
|
||||
fcollate.close()
|
||||
@ -2085,6 +2148,7 @@ def MakeOmitFileRoot(fn):
|
||||
|
||||
|
||||
def LoadSurvexBlocks():
|
||||
global dup_includes
|
||||
mem1 = get_process_memory()
|
||||
print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
|
||||
start = time.time()
|
||||
@ -2129,7 +2193,7 @@ def LoadSurvexBlocks():
|
||||
# sudo service mariadb start
|
||||
survexblockroot.save()
|
||||
|
||||
omitsfileroot = MakeOmitFileRoot("_unseens.svx")
|
||||
omitsfileroot = MakeOmitFileRoot(UNSEENS)
|
||||
survexomitsroot = SurvexBlock(
|
||||
name=OMITBLOCK, survexpath="", survexfile=omitsfileroot, legsall=0, legslength=0.0
|
||||
)
|
||||
@ -2157,5 +2221,6 @@ def LoadSurvexBlocks():
|
||||
store_data_issues()
|
||||
# duration = time.time() - start
|
||||
# print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
|
||||
print(f" - Duplicate *includes = {dup_includes}")
|
||||
print(" - Loaded All Survex Blocks.")
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user