clean up de-duplication code

Philip Sargent 2023-02-28 16:18:29 +00:00
parent 5067ef2c8c
commit dc03016dbe


@@ -46,8 +46,10 @@ survexomitsroot = None
ROOTBLOCK = "rootblock"
OMITBLOCK = "omitblock"
METRESINFEET = 3.28084
UNSEENS = "_unseens.svx"
stop_dup_warning = False
dup_includes = 1
debugprint = False # Turns on debug printout for just one *include file
debugprinttrigger = "!"
@@ -260,8 +262,8 @@ class LoadingSurvex:
rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)")
rx_comment = re.compile(r"([^;]*?)\s*(?:;\s*(.*))?\n?$")
rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$") # inserted by linear collate ;*include
rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$") # inserted by linear collate ;*edulcni
rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$") # inserted by linear collate ;|*include
rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$") # inserted by linear collate ;|*edulcni
rx_include = re.compile(r"(?i)^\s*(\*include[\s].*)$")
rx_include2 = re.compile("(?i)include$")
rx_commref = re.compile(r"(?i)^\s*ref(?:erence)?[\s.:]*(\d+)\s*#\s*(X)?\s*(\d+)")
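As an aside on how those collate markers are consumed downstream: rx_comment first splits a raw line into survey data and comment text, and only then do rx_comminc and rx_commcni recognise the ;|*include / ;|*edulcni markers written by the linear-collate step. A minimal sketch with a made-up path:

import re

rx_comment = re.compile(r"([^;]*?)\s*(?:;\s*(.*))?\n?$")
rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$")
rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$")

line = ";|*include caves-1623/204/204\n"          # illustrative path only
sline, comment = rx_comment.match(line).groups()  # sline == "", comment == "|*include caves-1623/204/204"
assert rx_comminc.match(comment).group(1) == "caves-1623/204/204"
assert rx_commcni.match("|*edulcni caves-1623/204/204").group(1) == "caves-1623/204/204"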
@@ -300,7 +302,7 @@ class LoadingSurvex:
stacksvxfiles = []
svxfileslist = []
svxdirs = {}
uniquename = {}
uniquefile = {}
expos = {}
survexdict = {} # each key is a directory, and its value is a list of files
lineno = 0
@@ -1163,9 +1165,16 @@ class LoadingSurvex:
"""Creates SurvexFile in the database, and SurvexDirectory if needed
with links to 'cave'
Creates a new current survexfile and valid .survexdirectory
Inspects the parent folder of the survexfile and uses that to decide if this is a cave we know
Inspects the parent folder of the survexfile and uses that to decide if this is
a cave we know.
If we see a duplicate cave here, it is too late: it has already been included in the
long linear file. This needs to be prevented when the long linear file is created.
The survexblock passed in is not necessarily the parent. FIX THIS.
"""
global dup_includes
if debugprint:
print(f" # datastack in LoadSurvexFile:{svxid} 'type':", end="")
for dict in self.datastack:
@@ -1173,10 +1182,20 @@ class LoadingSurvex:
print("")
depth = " " * self.depthbegin
# print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid))
print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid))
headpath = os.path.dirname(svxid)
newfile = SurvexFile(path=svxid)
newfile, created = SurvexFile.objects.update_or_create(path=svxid)
if not created:
dup_includes += 1
message = f" ! DUP SurvexFile '{svxid}' create attempt in LoadSurvexFile()"
print(message)
# print(message, file=sys.stderr)
stash_data_issue(parser="survex", message=message, url=f"/survexfile/{svxid}")
self.currentsurvexfile = newfile
return # abort as everything already done for object creation
newfile.save() # until we do this there is no internal id so no foreign key works
self.currentsurvexfile = newfile
newdirectory = self.GetSurvexDirectory(headpath)
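The duplicate detection above leans on Django's update_or_create(), which returns the object together with a created flag, so a second call for the same path comes back with created == False. A hedged sketch of that idiom (it assumes the troggle SurvexFile model is importable as in this module; the duplicate handling is simplified):

# Sketch only: assumes SurvexFile is imported as in this parser module.
def load_survexfile_once(svxid, duplicates):
    newfile, created = SurvexFile.objects.update_or_create(path=svxid)
    if not created:
        duplicates.append(svxid)  # path already in the db: record it and reuse the existing row
    return newfile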
@@ -1217,7 +1236,11 @@ class LoadingSurvex:
print(f"'{dict['type'].upper()}' ", end="")
print("")
def ProcessIncludeLine(self, included):
"""As we read the long linear file, we come across lines telling us that the
content from this point on is from a particular included file
"""
global debugprint
svxid = included.groups()[0]
if svxid.lower() == debugprinttrigger.lower():
@@ -1226,7 +1249,9 @@ class LoadingSurvex:
self.stacksvxfiles.append(self.currentsurvexfile)
def ProcessEdulcniLine(self, edulcni):
"""Saves the current survexfile in the db"""
"""As we read the long linear file, we come across lines telling us that the
we are about to pop back out of the contents of an included file
Saves the current survexfile object in the db to include the data parsed from it"""
global debugprint
svxid = edulcni.groups()[0]
if debugprint:
@@ -1277,8 +1302,8 @@ class LoadingSurvex:
)
included = self.rx_comminc.match(comment)
# ;*include means 'we have been included'; whereas *include means 'proceed to include'
# bug: if the original survex file contains the line ;*include then we pick it up! So fix our special code to be ;|*include
# ;|*include means 'we have been included'; whereas *include means 'proceed to include'
# No test here to check that this file has not already been included. Ouch.
if included:
self.ProcessIncludeLine(included)
@@ -1553,7 +1578,7 @@ class LoadingSurvex:
self.lineno += 1
sline, comment = self.rx_comment.match(svxline).groups()
if comment:
# this catches the ;*include NEWFILE and ;*edulcni ENDOFFILE lines too
# this catches the ;|*include NEWFILE and ;|*edulcni ENDOFFILE lines too
self.LoadSurvexComment(survexblock, comment)
if not sline:
@@ -1616,40 +1641,40 @@ class LoadingSurvex:
if self.rx_include2.match(cmd):
# rx_include2 = re.compile("(?i)include$")
# if re.match("(?i)include$", cmd):
includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args)))
fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx")
self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path)
self.checkUniqueness(os.path.join(settings.SURVEX_DATA, includepath))
if os.path.isfile(fullpath):
# --------------------------------------------------------
self.depthinclude += 1
# fininclude = open(fullpath,'r')
finincludename = fullpath
fcollate.write(f";|*include {includepath}\n")
flinear.write(f"{self.depthinclude:2} {indent} *include {includepath}\n")
push = includepath.lower()
self.includestack.append(push)
# -----------------
self.PushdownStackScan(survexblock, includepath, finincludename, flinear, fcollate)
# -----------------
pop = self.includestack.pop()
if pop != push:
message = "!! ERROR mismatch *include pop!=push {}".format(pop, push, self.includestack)
includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args))) # normalises path syntax
if self.never_seen(includepath, path):
fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx")
self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path)
self.check_unique_name(os.path.join(settings.SURVEX_DATA, includepath))
if os.path.isfile(fullpath):
# --------------------------------------------------------
self.depthinclude += 1
# fininclude = open(fullpath,'r')
finincludename = fullpath
fcollate.write(f";|*include {includepath}\n")
flinear.write(f"{self.depthinclude:2} {indent} *include {includepath}\n")
push = includepath.lower()
self.includestack.append(push)
# -----------------
self.PushdownStackScan(survexblock, includepath, finincludename, flinear, fcollate)
# -----------------
pop = self.includestack.pop()
if pop != push:
message = "!! ERROR mismatch *include pop!=push {}".format(pop, push, self.includestack)
print(message)
print(message, file=flinear)
print(message, file=sys.stderr)
stash_data_issue(parser="survex", message=message, url=None, sb=(path))
flinear.write(f"{self.depthinclude:2} {indent} *edulcni {pop}\n")
fcollate.write(f";|*edulcni {pop}\n")
# fininclude.close()
self.depthinclude -= 1
# --------------------------------------------------------
else:
message = f" ! ERROR *include file '{includepath}' not found, listed in '{fin.name}'"
print(message)
print(message, file=flinear)
print(message, file=sys.stderr)
stash_data_issue(parser="survex", message=message, url=None, sb=(path))
flinear.write(f"{self.depthinclude:2} {indent} *edulcni {pop}\n")
fcollate.write(f";|*edulcni {pop}\n")
# fininclude.close()
self.depthinclude -= 1
# --------------------------------------------------------
else:
message = f" ! ERROR *include file '{includepath}' not found, listed in '{fin.name}'"
print(message)
print(message, file=sys.stderr)
stash_data_issue(parser="survex", message=message, url=None, sb=(path))
elif self.rx_begin2.match(cmd):
#elif re.match("(?i)begin$", cmd):
self.depthbegin += 1
@@ -1733,20 +1758,39 @@ class LoadingSurvex:
print(message)
print(message, file=sys.stderr)
stash_data_issue(parser="survex", message=message, url=None, sb=(path))
raise
return # skip this survex file and all things *included in it
def checkUniqueness(self, fullpath):
fn = Path(fullpath).name
if fn not in self.uniquename:
self.uniquename[fn] = [fullpath]
else:
self.uniquename[fn].append(fullpath)
# This is not an error now that we are moving .3d files to the :loser: directory tree
def never_seen(self, incpath, parent):
"""The _unseen files may include survex files we have already seen, and we do not
want to process them again. For the _unseens this is not an error, but for the main
*include tree it is an error.
"""
if incpath in self.uniquefile:
self.uniquefile[incpath].append(parent)
message = (
f" NOTE: non-unique survex filename, '{fn}' - '{self.uniquename[fn]}' #{len(self.uniquename[fn])}"
f" DUP: non-unique survex filepath, '{incpath}' - #{len(self.uniquefile[incpath])} '{self.uniquefile[incpath]}'"
)
# print(message)
print(message)
# stash_data_issue(parser='survex', message=message)
for p in self.uniquefile[incpath]:
if p in self.uniquefile:
print(f"{p} <- {self.uniquefile[p]}")
return False
else:
self.uniquefile[incpath] = [parent]
return True
def check_unique_name(self, fullpath):
"""This only checks whether the last bit of the name of the survex file is unique,
e.g. "bigpitch", not whether the whole path of the survexfile has been seen before.
We don't care about this any more.
"""
return
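A stand-alone sketch of the bookkeeping never_seen() performs, with made-up paths, showing that only the first sighting of an include path is processed and later sightings just record the extra parent:

uniquefile = {}  # include path -> list of parents that tried to *include it

def never_seen(incpath, parent):
    if incpath in uniquefile:
        uniquefile[incpath].append(parent)  # duplicate: remember who re-included it
        return False
    uniquefile[incpath] = [parent]          # first sighting: caller should process it
    return True

assert never_seen("caves-1623/204/204", "1623") is True
assert never_seen("caves-1623/204/204", "_unseens") is False
assert uniquefile["caves-1623/204/204"] == ["1623", "_unseens"]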
def RunSurvexIfNeeded(self, fullpath, calledpath):
now = time.time()
@@ -1843,7 +1887,13 @@
def FindAndLoadSurvex(survexblockroot):
"""Follows the *include links successively to find files in the whole include tree"""
"""Follows the *include links successively to find survex files
This proceeds in 3 phases:
1. The root survex file is read and all the *include files are found, using PushdownStackScan()
2. All the other survex files in the :loser: repo are found, and their *includes found,
using another PushdownStackScan() [duplicates omitted]
3. The combined expanded file containing all the survex data is parsed as a single file,
using LinearLoad()"""
global stop_dup_warning
print(" - redirecting stdout to svxblks.log...")
stdout_orig = sys.stdout
@@ -1861,15 +1911,16 @@ def FindAndLoadSurvex(survexblockroot):
print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr)
svx_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
svx_scan.checkUniqueness(fullpathtotop)
svx_scan.check_unique_name(fullpathtotop)
svx_scan.uniquefile[str(survexfileroot)] = ["0"]
indent = ""
fcollate = open(collatefilename, "w")
mem0 = get_process_memory()
print(f" - MEM:{mem0:7.2f} MB START", file=sys.stderr)
print(f" - MEM:{mem0:7.2f} MB START '{survexfileroot}'", file=sys.stderr)
flinear = open("svxlinear.log", "w")
flinear.write(f" - MEM:{mem0:7.2f} MB START {survexfileroot.path}\n")
flinear.write(f" - MEM:{mem0:7.2f} MB START '{survexfileroot.path}'\n")
print(" ", file=sys.stderr, end="")
finrootname = Path(settings.SURVEX_DATA, survexfileroot.path + ".svx")
@@ -1897,16 +1948,24 @@ def FindAndLoadSurvex(survexblockroot):
flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {survexfileroot.path}\n")
flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED\n")
flinear.write(f" - {len(svx_scan.svxfileslist):,} survex files in linear include list \n")
flinear.write(f" - {len(svx_scan.uniquefile):,} unique survex files in linear include list \n")
for j in svx_scan.svxfileslist:
if j not in svx_scan.uniquefile:
flinear.write(f" - '{j}' {type(j)} not in unique list \n")
for f in svx_scan.uniquefile:
# flinear.write(f" - '{f}' {type(f)} {svx_scan.uniquefile[f]} \n")
if len(svx_scan.uniquefile[f]) > 1:
flinear.write(f" - '{f}' {type(f)} {svx_scan.uniquefile[f]} dup survex files \n")
print(f"\n - {svx_scan.caverncount:,} runs of survex 'cavern' refreshing .3d files", file=sys.stderr)
print(f" - {len(svx_scan.svxfileslist):,} survex files from tree in linear include list", file=sys.stderr)
print(f" - {len(svx_scan.uniquefile):,} unique survex files from tree in linear include list", file=sys.stderr)
mem1 = get_process_memory()
print(f" - MEM:{mem1:7.2f} MB END ", file=sys.stderr)
print(f" - MEM:{mem1 - mem0:7.3f} MB ADDITIONALLY USED", file=sys.stderr)
#
# Process all the omitted files in :loser: with some exceptions
#
unseens = set()
b = []
@@ -1926,13 +1985,14 @@ def FindAndLoadSurvex(survexblockroot):
file=sys.stderr,
)
excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", "_unseens"]
unseensroot = re.sub(r"\.svx$", "", UNSEENS)
excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", unseensroot]
removals = []
for x in unseens:
for o in excpts:
if str(x).strip().startswith(o):
removals.append(x)
# special fix for file not actually in survex format
# special fix for .svx file not actually in survex format
unseens.remove(Path("fixedpts/gps/gps00raw"))
for x in removals:
@@ -1944,7 +2004,7 @@ def FindAndLoadSurvex(survexblockroot):
check_team_cache()
print(" -- Now loading the previously-omitted survex files.", file=sys.stderr)
with open(Path(settings.SURVEX_DATA, "_unseens.svx"), "w") as u:
with open(Path(settings.SURVEX_DATA, UNSEENS), "w") as u:
u.write(
f"; {len(unseens):,} survex files not *included by {settings.SURVEX_TOPNAME} (which are {len(svx_scan.svxfileslist):,} files)\n"
)
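Putting the exclusion prefixes and the wrapper file together: a hedged sketch of the idea, not the code in this file (the directory walk and the exact lines written into the wrapper sit outside this hunk and are assumptions here):

# Hedged sketch only; the real troggle code differs in detail.
from pathlib import Path

def write_unseens_wrapper(survex_data, candidates, excpts, unseens_name="_unseens.svx"):
    unseens = {x for x in candidates
               if not any(str(x).strip().startswith(o) for o in excpts)}
    with open(Path(survex_data, unseens_name), "w") as u:
        u.write(f"; {len(unseens):,} survex files not *included by the top survex file\n")
        for x in sorted(str(p) for p in unseens):
            u.write(f"*include {x}\n")  # assumption: the wrapper simply *includes each omitted file
    return unseens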
@@ -1960,7 +2020,7 @@ def FindAndLoadSurvex(survexblockroot):
omit_scan = LoadingSurvex()
omit_scan.callcount = 0
omit_scan.depthinclude = 0
fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, "_unseens.svx")
fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, UNSEENS)
# copy the list to prime the next pass through the files
omit_scan.svxfileslist = svx_scan.svxfileslist[:]
@@ -1969,32 +2029,35 @@ def FindAndLoadSurvex(survexblockroot):
print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr)
omit_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
omit_scan.checkUniqueness(fullpathtotop)
omit_scan.check_unique_name(fullpathtotop)
omit_scan.uniquefile[unseensroot] = ["0"]
mem0 = get_process_memory()
print(f" - MEM:{mem0:7.2f} MB START '_unseens'", file=sys.stderr)
print(f" - MEM:{mem0:7.2f} MB START '{unseensroot}'", file=sys.stderr)
# flinear = open('svxlinear.log', 'w')
flinear.write(f" - MEM:{mem0:7.2f} MB START '_unseens'\n")
flinear.write(f" - MEM:{mem0:7.2f} MB START '{unseensroot}'\n")
print(" ", file=sys.stderr, end="")
# this is a bit tricky as some unseen files will *include files we have already seen, which
# we should not process again.
finrootname = fullpathtotop
fcollate.write(";*include _unseens.svx\n")
flinear.write(f"{omit_scan.depthinclude:2} {indent} *include _unseens\n")
stop_dup_warning = True
fcollate.write(f";*include {UNSEENS}\n")
flinear.write(f"{omit_scan.depthinclude:2} {indent} *include {unseensroot}\n")
# stop_dup_warning = True
# ----------------------------------------------------------------
omit_scan.PushdownStackScan(survexblockroot, "_unseens", finrootname, flinear, fcollate)
omit_scan.PushdownStackScan(survexblockroot, unseensroot, finrootname, flinear, fcollate)
# ----------------------------------------------------------------
stop_dup_warning = False
# stop_dup_warning = False
flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni _unseens\n")
fcollate.write(";*edulcni _unseens.svx\n")
flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni {unseensroot}\n")
fcollate.write(f";*edulcni {UNSEENS}\n")
check_team_cache()
mem1 = get_process_memory()
flinear.write(f"\n - MEM:{mem1:.2f} MB STOP _unseens.svx OMIT\n")
flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED OMIT\n")
flinear.write(f" - {len(omit_scan.svxfileslist):,} survex files in linear include list OMIT \n")
flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {UNSEENS} Unseen Oddments\n")
flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED Unseen Oddments\n")
flinear.write(f" - {len(omit_scan.svxfileslist):,} survex files in linear include list Unseen Oddments \n")
flinear.close()
fcollate.close()
@@ -2085,6 +2148,7 @@ def MakeOmitFileRoot(fn):
def LoadSurvexBlocks():
global dup_includes
mem1 = get_process_memory()
print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
start = time.time()
@@ -2129,7 +2193,7 @@ def LoadSurvexBlocks():
# sudo service mariadb start
survexblockroot.save()
omitsfileroot = MakeOmitFileRoot("_unseens.svx")
omitsfileroot = MakeOmitFileRoot(UNSEENS)
survexomitsroot = SurvexBlock(
name=OMITBLOCK, survexpath="", survexfile=omitsfileroot, legsall=0, legslength=0.0
)
@@ -2157,5 +2221,6 @@ def LoadSurvexBlocks():
store_data_issues()
# duration = time.time() - start
# print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
print(f" - Duplicate *includes = {dup_includes}")
print(" - Loaded All Survex Blocks.")