2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2024-11-25 16:51:54 +00:00

clean up de-duplication code

This commit is contained in:
Philip Sargent 2023-02-28 16:18:29 +00:00
parent 5067ef2c8c
commit dc03016dbe

View File

@ -46,8 +46,10 @@ survexomitsroot = None
ROOTBLOCK = "rootblock" ROOTBLOCK = "rootblock"
OMITBLOCK = "omitblock" OMITBLOCK = "omitblock"
METRESINFEET = 3.28084 METRESINFEET = 3.28084
UNSEENS = "_unseens.svx"
stop_dup_warning = False stop_dup_warning = False
dup_includes = 1
debugprint = False # Turns on debug printout for just one *include file debugprint = False # Turns on debug printout for just one *include file
debugprinttrigger = "!" debugprinttrigger = "!"
@ -260,8 +262,8 @@ class LoadingSurvex:
rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)") rx_cave = re.compile(r"(?i)caves-(\d\d\d\d)/([-\d\w]+|\d\d\d\d-?\w+-\d+)")
rx_comment = re.compile(r"([^;]*?)\s*(?:;\s*(.*))?\n?$") rx_comment = re.compile(r"([^;]*?)\s*(?:;\s*(.*))?\n?$")
rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$") # inserted by linear collate ;*include rx_comminc = re.compile(r"(?i)^\|\*include[\s]*([-\w/]*).*$") # inserted by linear collate ;|*include
rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$") # inserted by linear collate ;*edulcni rx_commcni = re.compile(r"(?i)^\|\*edulcni[\s]*([-\w/]*).*$") # inserted by linear collate ;|*edulcni
rx_include = re.compile(r"(?i)^\s*(\*include[\s].*)$") rx_include = re.compile(r"(?i)^\s*(\*include[\s].*)$")
rx_include2 = re.compile("(?i)include$") rx_include2 = re.compile("(?i)include$")
rx_commref = re.compile(r"(?i)^\s*ref(?:erence)?[\s.:]*(\d+)\s*#\s*(X)?\s*(\d+)") rx_commref = re.compile(r"(?i)^\s*ref(?:erence)?[\s.:]*(\d+)\s*#\s*(X)?\s*(\d+)")
@ -300,7 +302,7 @@ class LoadingSurvex:
stacksvxfiles = [] stacksvxfiles = []
svxfileslist = [] svxfileslist = []
svxdirs = {} svxdirs = {}
uniquename = {} uniquefile = {}
expos = {} expos = {}
survexdict = {} # each key is a directory, and its value is a list of files survexdict = {} # each key is a directory, and its value is a list of files
lineno = 0 lineno = 0
@ -1163,9 +1165,16 @@ class LoadingSurvex:
"""Creates SurvexFile in the database, and SurvexDirectory if needed """Creates SurvexFile in the database, and SurvexDirectory if needed
with links to 'cave' with links to 'cave'
Creates a new current survexfile and valid .survexdirectory Creates a new current survexfile and valid .survexdirectory
Inspects the parent folder of the survexfile and uses that to decide if this is a cave we know Inspects the parent folder of the survexfile and uses that to decide if this is
a cave we know.
If we see a duplicate cave, this is too late. It has already been included into the
long linear file. This needs to be prevented when the long linear file is created.
The survexblock passed-in is not necessarily the parent. FIX THIS. The survexblock passed-in is not necessarily the parent. FIX THIS.
""" """
global dup_includes
if debugprint: if debugprint:
print(f" # datastack in LoadSurvexFile:{svxid} 'type':", end="") print(f" # datastack in LoadSurvexFile:{svxid} 'type':", end="")
for dict in self.datastack: for dict in self.datastack:
@ -1173,10 +1182,20 @@ class LoadingSurvex:
print("") print("")
depth = " " * self.depthbegin depth = " " * self.depthbegin
# print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid)) print("{:2}{} - NEW survexfile:'{}'".format(self.depthbegin, depth, svxid))
headpath = os.path.dirname(svxid) headpath = os.path.dirname(svxid)
newfile = SurvexFile(path=svxid) newfile, created = SurvexFile.objects.update_or_create(path=svxid)
if not created:
dup_includes += 1
message = f" ! DUP SurvexFile '{svxid}' create attempt in LoadSurvexFile()"
print(message)
# print(message, file=sys.stderr)
stash_data_issue(parser="survex", message=message, url=f"/survexfile/{svxid}")
self.currentsurvexfile = newfile
return # abort as everything already done for object creation
newfile.save() # until we do this there is no internal id so no foreign key works newfile.save() # until we do this there is no internal id so no foreign key works
self.currentsurvexfile = newfile self.currentsurvexfile = newfile
newdirectory = self.GetSurvexDirectory(headpath) newdirectory = self.GetSurvexDirectory(headpath)
@ -1217,7 +1236,11 @@ class LoadingSurvex:
print(f"'{dict['type'].upper()}' ", end="") print(f"'{dict['type'].upper()}' ", end="")
print("") print("")
def ProcessIncludeLine(self, included): def ProcessIncludeLine(self, included):
"""As we read the long linear file, we come across lines telling us that the
content from this point on is from a particular included file
"""
global debugprint global debugprint
svxid = included.groups()[0] svxid = included.groups()[0]
if svxid.lower() == debugprinttrigger.lower(): if svxid.lower() == debugprinttrigger.lower():
@ -1226,7 +1249,9 @@ class LoadingSurvex:
self.stacksvxfiles.append(self.currentsurvexfile) self.stacksvxfiles.append(self.currentsurvexfile)
def ProcessEdulcniLine(self, edulcni): def ProcessEdulcniLine(self, edulcni):
"""Saves the current survexfile in the db""" """As we read the long linear file, we come across lines telling us that the
we are about to pop back out of the contents of an included file
Saves the current survexfile object in the db to include the data parsed from it"""
global debugprint global debugprint
svxid = edulcni.groups()[0] svxid = edulcni.groups()[0]
if debugprint: if debugprint:
@ -1277,8 +1302,8 @@ class LoadingSurvex:
) )
included = self.rx_comminc.match(comment) included = self.rx_comminc.match(comment)
# ;*include means 'we have been included'; whereas *include means 'proceed to include' # ;|*include means 'we have been included'; whereas *include means 'proceed to include'
            # bug, If the original survex file contains the line ;*include then we pick it up ! So fix our special code to be ;|*include # No test here to check that this file has not already been included. Ouch.
if included: if included:
self.ProcessIncludeLine(included) self.ProcessIncludeLine(included)
@ -1553,7 +1578,7 @@ class LoadingSurvex:
self.lineno += 1 self.lineno += 1
sline, comment = self.rx_comment.match(svxline).groups() sline, comment = self.rx_comment.match(svxline).groups()
if comment: if comment:
# this catches the ;*include NEWFILE and ;*edulcni ENDOFFILE lines too # this catches the ;|*include NEWFILE and ;|*edulcni ENDOFFILE lines too
self.LoadSurvexComment(survexblock, comment) self.LoadSurvexComment(survexblock, comment)
if not sline: if not sline:
@ -1616,11 +1641,11 @@ class LoadingSurvex:
if self.rx_include2.match(cmd): if self.rx_include2.match(cmd):
# rx_include2 = re.compile("(?i)include$") # rx_include2 = re.compile("(?i)include$")
# if re.match("(?i)include$", cmd): # if re.match("(?i)include$", cmd):
includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args))) includepath = os.path.normpath(os.path.join(os.path.split(path)[0], re.sub(r"\.svx$", "", args))) # normalises path syntax
if self.never_seen(includepath, path):
fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx") fullpath = os.path.join(settings.SURVEX_DATA, includepath + ".svx")
self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path) self.RunSurvexIfNeeded(os.path.join(settings.SURVEX_DATA, includepath), path)
self.checkUniqueness(os.path.join(settings.SURVEX_DATA, includepath)) self.check_unique_name(os.path.join(settings.SURVEX_DATA, includepath))
if os.path.isfile(fullpath): if os.path.isfile(fullpath):
# -------------------------------------------------------- # --------------------------------------------------------
self.depthinclude += 1 self.depthinclude += 1
@ -1733,20 +1758,39 @@ class LoadingSurvex:
print(message) print(message)
print(message, file=sys.stderr) print(message, file=sys.stderr)
stash_data_issue(parser="survex", message=message, url=None, sb=(path)) stash_data_issue(parser="survex", message=message, url=None, sb=(path))
raise
return # skip this survex file and all things *included in it return # skip this survex file and all things *included in it
def checkUniqueness(self, fullpath): def never_seen(self, incpath, parent):
fn = Path(fullpath).name """The _unseen files may include survex files we have already seen, and we do not
if fn not in self.uniquename: want to process them again. For the _unseens this is not an error, but for the main
self.uniquename[fn] = [fullpath] *include tree it is an error.
else: """
self.uniquename[fn].append(fullpath)
# This is not an error now that we are moving .3d files to the :loser: directory tree if incpath in self.uniquefile:
self.uniquefile[incpath].append(parent)
message = ( message = (
f" NOTE: non-unique survex filename, '{fn}' - '{self.uniquename[fn]}' #{len(self.uniquename[fn])}" f" DUP: non-unique survex filepath, '{incpath}' - #{len(self.uniquefile[incpath])} '{self.uniquefile[incpath]}'"
) )
# print(message) print(message)
# stash_data_issue(parser='survex', message=message) # stash_data_issue(parser='survex', message=message)
for p in self.uniquefile[incpath]:
if p in self.uniquefile:
print(f"{p} <- {self.uniquefile[p]}")
return False
else:
self.uniquefile[incpath] = [parent]
return True
def check_unique_name(self, fullpath):
"""This only checks whether the last bit of the name of the survex file is unique,
e.g. "bigpitch", not whether the whole path of the survexfile has been seen before.
We don't care about this any more.
"""
return
def RunSurvexIfNeeded(self, fullpath, calledpath): def RunSurvexIfNeeded(self, fullpath, calledpath):
now = time.time() now = time.time()
@ -1843,7 +1887,13 @@ class LoadingSurvex:
def FindAndLoadSurvex(survexblockroot): def FindAndLoadSurvex(survexblockroot):
"""Follows the *include links successively to find files in the whole include tree""" """Follows the *include links successively to find survex files
This proceeds in 3 phases:
1. The root survex file is read and all the *include files are found, using PushdownStackScan()
2. All the other survex files in the :loser: repo are found, and their *includes found,
using another PushdownStackScan() [duplicates omitted]
3. The combined expanded file containing all the survex data is parsed as a single file,
using LinearLoad()"""
global stop_dup_warning global stop_dup_warning
print(" - redirecting stdout to svxblks.log...") print(" - redirecting stdout to svxblks.log...")
stdout_orig = sys.stdout stdout_orig = sys.stdout
@ -1861,15 +1911,16 @@ def FindAndLoadSurvex(survexblockroot):
print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr) print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr)
svx_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop) svx_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
svx_scan.checkUniqueness(fullpathtotop) svx_scan.check_unique_name(fullpathtotop)
svx_scan.uniquefile[str(survexfileroot)] = ["0"]
indent = "" indent = ""
fcollate = open(collatefilename, "w") fcollate = open(collatefilename, "w")
mem0 = get_process_memory() mem0 = get_process_memory()
print(f" - MEM:{mem0:7.2f} MB START", file=sys.stderr) print(f" - MEM:{mem0:7.2f} MB START '{survexfileroot}'", file=sys.stderr)
flinear = open("svxlinear.log", "w") flinear = open("svxlinear.log", "w")
flinear.write(f" - MEM:{mem0:7.2f} MB START {survexfileroot.path}\n") flinear.write(f" - MEM:{mem0:7.2f} MB START '{survexfileroot.path}'\n")
print(" ", file=sys.stderr, end="") print(" ", file=sys.stderr, end="")
finrootname = Path(settings.SURVEX_DATA, survexfileroot.path + ".svx") finrootname = Path(settings.SURVEX_DATA, survexfileroot.path + ".svx")
@ -1897,16 +1948,24 @@ def FindAndLoadSurvex(survexblockroot):
flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {survexfileroot.path}\n") flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {survexfileroot.path}\n")
flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED\n") flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED\n")
flinear.write(f" - {len(svx_scan.svxfileslist):,} survex files in linear include list \n") flinear.write(f" - {len(svx_scan.svxfileslist):,} survex files in linear include list \n")
flinear.write(f" - {len(svx_scan.uniquefile):,} unique survex files in linear include list \n")
for j in svx_scan.svxfileslist:
if j not in svx_scan.uniquefile:
flinear.write(f" - '{j}' {type(j)} not in unique list \n")
for f in svx_scan.uniquefile:
# flinear.write(f" - '{f}' {type(f)} {svx_scan.uniquefile[f]} \n")
if len(svx_scan.uniquefile[f]) > 1:
flinear.write(f" - '{f}' {type(f)} {svx_scan.uniquefile[f]} dup survex files \n")
print(f"\n - {svx_scan.caverncount:,} runs of survex 'cavern' refreshing .3d files", file=sys.stderr) print(f"\n - {svx_scan.caverncount:,} runs of survex 'cavern' refreshing .3d files", file=sys.stderr)
print(f" - {len(svx_scan.svxfileslist):,} survex files from tree in linear include list", file=sys.stderr) print(f" - {len(svx_scan.svxfileslist):,} survex files from tree in linear include list", file=sys.stderr)
print(f" - {len(svx_scan.uniquefile):,} unique survex files from tree in linear include list", file=sys.stderr)
mem1 = get_process_memory() mem1 = get_process_memory()
print(f" - MEM:{mem1:7.2f} MB END ", file=sys.stderr) print(f" - MEM:{mem1:7.2f} MB END ", file=sys.stderr)
print(f" - MEM:{mem1 - mem0:7.3f} MB ADDITIONALLY USED", file=sys.stderr) print(f" - MEM:{mem1 - mem0:7.3f} MB ADDITIONALLY USED", file=sys.stderr)
# #
# Process all the omitted files in :loser: with some exceptions # Process all the omitted files in :loser: with some exceptions
#
unseens = set() unseens = set()
b = [] b = []
@ -1926,13 +1985,14 @@ def FindAndLoadSurvex(survexblockroot):
file=sys.stderr, file=sys.stderr,
) )
excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", "_unseens"] unseensroot = re.sub(r"\.svx$", "", UNSEENS)
excpts = ["surface/terrain", "kataster/kataster-boundaries", "template", "docs", unseensroot]
removals = [] removals = []
for x in unseens: for x in unseens:
for o in excpts: for o in excpts:
if str(x).strip().startswith(o): if str(x).strip().startswith(o):
removals.append(x) removals.append(x)
# special fix for file not actually in survex format # special fix for .svx file not actually in survex format
unseens.remove(Path("fixedpts/gps/gps00raw")) unseens.remove(Path("fixedpts/gps/gps00raw"))
for x in removals: for x in removals:
@ -1944,7 +2004,7 @@ def FindAndLoadSurvex(survexblockroot):
check_team_cache() check_team_cache()
print(" -- Now loading the previously-omitted survex files.", file=sys.stderr) print(" -- Now loading the previously-omitted survex files.", file=sys.stderr)
with open(Path(settings.SURVEX_DATA, "_unseens.svx"), "w") as u: with open(Path(settings.SURVEX_DATA, UNSEENS), "w") as u:
u.write( u.write(
f"; {len(unseens):,} survex files not *included by {settings.SURVEX_TOPNAME} (which are {len(svx_scan.svxfileslist):,} files)\n" f"; {len(unseens):,} survex files not *included by {settings.SURVEX_TOPNAME} (which are {len(svx_scan.svxfileslist):,} files)\n"
) )
@ -1960,7 +2020,7 @@ def FindAndLoadSurvex(survexblockroot):
omit_scan = LoadingSurvex() omit_scan = LoadingSurvex()
omit_scan.callcount = 0 omit_scan.callcount = 0
omit_scan.depthinclude = 0 omit_scan.depthinclude = 0
fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, "_unseens.svx") fullpathtotop = os.path.join(survexfileroot.survexdirectory.path, UNSEENS)
# copy the list to prime the next pass through the files # copy the list to prime the next pass through the files
omit_scan.svxfileslist = svx_scan.svxfileslist[:] omit_scan.svxfileslist = svx_scan.svxfileslist[:]
@ -1969,32 +2029,35 @@ def FindAndLoadSurvex(survexblockroot):
print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr) print(f" - RunSurvexIfNeeded cavern on '{fullpathtotop}'", file=sys.stderr)
omit_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop) omit_scan.RunSurvexIfNeeded(fullpathtotop, fullpathtotop)
omit_scan.checkUniqueness(fullpathtotop) omit_scan.check_unique_name(fullpathtotop)
omit_scan.uniquefile[unseensroot] = ["0"]
mem0 = get_process_memory() mem0 = get_process_memory()
print(f" - MEM:{mem0:7.2f} MB START '_unseens'", file=sys.stderr) print(f" - MEM:{mem0:7.2f} MB START '{unseensroot}'", file=sys.stderr)
# flinear = open('svxlinear.log', 'w') # flinear = open('svxlinear.log', 'w')
flinear.write(f" - MEM:{mem0:7.2f} MB START '_unseens'\n") flinear.write(f" - MEM:{mem0:7.2f} MB START '{unseensroot}'\n")
print(" ", file=sys.stderr, end="") print(" ", file=sys.stderr, end="")
# this is a bit tricky as some unseen files will *include files we have already seen, which
# we should not process again.
finrootname = fullpathtotop finrootname = fullpathtotop
fcollate.write(";*include _unseens.svx\n") fcollate.write(f";*include {UNSEENS}\n")
flinear.write(f"{omit_scan.depthinclude:2} {indent} *include _unseens\n") flinear.write(f"{omit_scan.depthinclude:2} {indent} *include {unseensroot}\n")
stop_dup_warning = True # stop_dup_warning = True
# ---------------------------------------------------------------- # ----------------------------------------------------------------
omit_scan.PushdownStackScan(survexblockroot, "_unseens", finrootname, flinear, fcollate) omit_scan.PushdownStackScan(survexblockroot, unseensroot, finrootname, flinear, fcollate)
# ---------------------------------------------------------------- # ----------------------------------------------------------------
stop_dup_warning = False # stop_dup_warning = False
flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni _unseens\n") flinear.write(f"{omit_scan.depthinclude:2} {indent} *edulcni {unseensroot}\n")
fcollate.write(";*edulcni _unseens.svx\n") fcollate.write(f";*edulcni {UNSEENS}\n")
check_team_cache() check_team_cache()
mem1 = get_process_memory() mem1 = get_process_memory()
flinear.write(f"\n - MEM:{mem1:.2f} MB STOP _unseens.svx OMIT\n") flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {UNSEENS} Unseen Oddments\n")
flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED OMIT\n") flinear.write(f" - MEM:{mem1 - mem0:.3f} MB ADDITIONALLY USED Unseen Oddments\n")
flinear.write(f" - {len(omit_scan.svxfileslist):,} survex files in linear include list OMIT \n") flinear.write(f" - {len(omit_scan.svxfileslist):,} survex files in linear include list Unseen Oddments \n")
flinear.close() flinear.close()
fcollate.close() fcollate.close()
@ -2085,6 +2148,7 @@ def MakeOmitFileRoot(fn):
def LoadSurvexBlocks(): def LoadSurvexBlocks():
global dup_includes
mem1 = get_process_memory() mem1 = get_process_memory()
print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr) print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
start = time.time() start = time.time()
@ -2129,7 +2193,7 @@ def LoadSurvexBlocks():
# sudo service mariadb start # sudo service mariadb start
survexblockroot.save() survexblockroot.save()
omitsfileroot = MakeOmitFileRoot("_unseens.svx") omitsfileroot = MakeOmitFileRoot(UNSEENS)
survexomitsroot = SurvexBlock( survexomitsroot = SurvexBlock(
name=OMITBLOCK, survexpath="", survexfile=omitsfileroot, legsall=0, legslength=0.0 name=OMITBLOCK, survexpath="", survexfile=omitsfileroot, legsall=0, legslength=0.0
) )
@ -2157,5 +2221,6 @@ def LoadSurvexBlocks():
store_data_issues() store_data_issues()
# duration = time.time() - start # duration = time.time() - start
# print(f" - TIME: {duration:7.2f} s", file=sys.stderr) # print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
print(f" - Duplicate *includes = {dup_includes}")
print(" - Loaded All Survex Blocks.") print(" - Loaded All Survex Blocks.")