2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2024-11-25 08:41:51 +00:00

big changes to cope with survexblock not yet dated, no *date yet

This commit is contained in:
Philip Sargent 2022-10-07 23:48:41 +03:00
parent bec262bb2d
commit 4e9680a3ad

View File

@ -19,18 +19,11 @@ from troggle.parsers.logbooks import GetCaveLookup
from troggle.core.models.troggle import DataIssue, Expedition from troggle.core.models.troggle import DataIssue, Expedition
from troggle.core.models.survex import SurvexPersonRole, Wallet, SurvexDirectory, SurvexFile, SurvexBlock, SurvexStation from troggle.core.models.survex import SurvexPersonRole, Wallet, SurvexDirectory, SurvexFile, SurvexBlock, SurvexStation
'''Imports the tree of survex files following form a defined root .svx file '''Imports the tree of survex files following from a defined root .svx file
It does also NOT scan the Loser repo for all the svx files - though it should ! It also scans the Loser repo for all the svx files, which it loads individually afterwards.
''' '''
todo = '''Also walk the entire tree in the :loser: repo looking for unconnected survex files todo = '''
- add them to the system so that they can be reported-on
- produce a parser report and create a troggle report page (some are OK, e.g. futility series replaced by ARGE survey in 115)
- If you look at e.g. http://expo.survex.com/survexfile/161#T_caves-1623/161/lhr/alllhr
you will see than have the team members are recognised by this parser, but not recognised by the
wider troggle system (the name is not a hyperlink) - apparently randomly.
GetPersonExpeditionNameLookup() needs to be fixed.
-#BUG, if *date comes after *team, the person's date is not set at all. It needs re-setting at the endof the block. -#BUG, if *date comes after *team, the person's date is not set at all. It needs re-setting at the endof the block.
@ -105,7 +98,8 @@ def get_people_on_trip(survexblock):
for p in qpeople: for p in qpeople:
people.append(f'{p.personname}') people.append(f'{p.personname}')
return list(set(people)) return list(set(people))
class LoadingSurvex(): class LoadingSurvex():
"""A 'survex block' is a *begin...*end set of cave data. """A 'survex block' is a *begin...*end set of cave data.
A survex file can contain many begin-end blocks, which can be nested, and which can *include A survex file can contain many begin-end blocks, which can be nested, and which can *include
@ -128,8 +122,10 @@ class LoadingSurvex():
rx_names = re.compile(r'(?i)names') rx_names = re.compile(r'(?i)names')
rx_flagsnot= re.compile(r"not\s") rx_flagsnot= re.compile(r"not\s")
rx_linelen = re.compile(r"[\d\-+.]+$") rx_linelen = re.compile(r"[\d\-+.]+$")
instruments = "(waiting_patiently|slacker|Useless|nagging|unknown|Inst|instrument|rig|rigger|rigging|helper|something| compass|comp|clino|Notes|sketch|book|Tape|Dog|Pics|photo|drawing|Helper|GPS|Disto|Distox|Distox2|topodroid|point|Consultant|nail|polish|nail_polish_bitch|nail_polish_monkey|varnish|nail_polish|nail_varnish|bitch|monkey|PowerDrill|drill)" instruments = "(bitch|bodger|bolt|bolter|bolting|book|clino|comp|compass|consultant|disto|distox|distox2|dog|dogsbody|drawing|drill|gps|helper|inst|instr|instrument|monkey|nagging|nail|nail_polish|nail_polish_bitch|nail_polish_monkey|nail_varnish|nail_varnish_bitch|note|paint|photo|pic|point|polish|powerdrill|rig|rigger|rigging|sketch|slacker|something|tape|topodroid|unknown|useless|varnish|waiting_patiently)"
rx_teammem = re.compile(r"(?i)"+instruments+"?(?:es|s)?\s+(.*)"+instruments+"?(?:es|s)?$") rx_teammem = re.compile(r"(?i)"+instruments+"?(?:es|s)?\s+(.*)$")
rx_teamold = re.compile(r"(?i)(.*)\s+"+instruments+"?(?:es|s)?$")
rx_teamabs = re.compile(r"(?i)^\s*("+instruments+")?(?:es|s)?\s*$")
rx_person = re.compile(r"(?i) and | / |, | & | \+ |^both$|^none$") rx_person = re.compile(r"(?i) and | / |, | & | \+ |^both$|^none$")
rx_qm = re.compile(r'(?i)^\s*QM(\d+)\s+?([a-dA-DxX])\s+([\w\-\_]+)\.([\w\.\-]+)\s+(([\w\-]+)\.([\w\.\-]+)|\-)\s+(.+)$') rx_qm = re.compile(r'(?i)^\s*QM(\d+)\s+?([a-dA-DxX])\s+([\w\-\_]+)\.([\w\.\-]+)\s+(([\w\-]+)\.([\w\.\-]+)|\-)\s+(.+)$')
# does not recognise non numeric suffix survey point ids # does not recognise non numeric suffix survey point ids
@ -228,22 +224,74 @@ class LoadingSurvex():
personrole is used to record that a person was on a trip, NOT the role they played. personrole is used to record that a person was on a trip, NOT the role they played.
(NB PersonTrip is a logbook thing) (NB PersonTrip is a logbook thing)
""" """
teammembers = [ ] def record_team_member(tm, survexblock):
mteammember = self.rx_teammem.match(line) tm = tm.strip('\"\'')
if mteammember: # Refactor. The dict GetPersonExpeditionNameLookup(expo) indexes by name and has values of personexpedition
# This is convoluted, the whole personexpedition concept is unnecessary.
# we need the current expedition, but if there has been no date yet in the survex file, we don't know which one it is.
# so we can't validate whether the person was on expo or not.
# we will have to attach them to the survexblock anyway, and then do a
# later check on whether they are valid when we get the date.
personrole, created = SurvexPersonRole.objects.update_or_create(survexblock=survexblock, personexpedition=personexpedition, personname=tm)
expo = survexblock.expedition # may be None if no *date yet
# this syntax was bizarre.. made more obvious
if expo:
if survexblock.expeditionday: # *date has been set
personrole.expeditionday = survexblock.expeditionday
else:
# should not happen
message = "! *team {} expo ok, expedition day not in *team {} ({}) created? '{}'".format(expo.year, survexblock.survexfile.path, survexblock, created )
print(self.insp+message)
DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
personexpedition = GetPersonExpeditionNameLookup(expo).get(tm.lower())
personrole.person=personexpedition.person
self.currentpersonexped.append(personexpedition)
if not personexpedition:
# we know the date and expo, but can't find the person
message = "! *team {} '{}' FAIL personexpedition lookup on *team {} ({}) in '{}' {} ".format(expo.year, tm, survexblock.survexfile.path, survexblock, created, line)
print(self.insp+message)
DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
else:
personexpedition = None
# don't know the date yet, so assume the person is valid. It will get picked up when the *date appears
personrole.save()
mteammember = self.rx_teammem.match(line) # matches the role at the beginning
if not mteammember:
moldstyle = self.rx_teamold.match(line) # matches the role at the end of the string
if moldstyle:
for tm in self.rx_person.split(moldstyle.group(1)):
if tm:
record_team_member(tm, survexblock)
# seems to be working
# msg = "! OLD tm='{}' line: '{}' ({}) {}".format(tm, line, survexblock, survexblock.survexfile.path)
# print(msg, file=sys.stderr)
else:
message = "! *team {} ({}) Weird '{}' oldstyle line: '{}'".format(survexblock.survexfile.path, survexblock, mteammember.group(1), line)
print(self.insp+message)
DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
else:
nullmember = self.rx_teamabs.match(line) # matches empty role line. Ignore these.
if not nullmember:
message = "! *team {} ({}) Bad line: '{}'".format(survexblock.survexfile.path, survexblock, line)
print(self.insp+message)
DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
else:
for tm in self.rx_person.split(mteammember.group(2)): for tm in self.rx_person.split(mteammember.group(2)):
if tm: if tm:
tm = tm.strip('\"\'') record_team_member(tm, survexblock)
personexpedition = survexblock.expedition and GetPersonExpeditionNameLookup(survexblock.expedition).get(tm.lower()) else:
if (personexpedition, tm) not in teammembers: if not mteammember.group(2).lower() in ('none', 'both'):
teammembers.append((personexpedition, tm)) message = "! Weird *team '{}' newstyle line: '{}' ({}) {}".format(mteammember.group(2), line, survexblock, survexblock.survexfile.path)
personrole = SurvexPersonRole(survexblock=survexblock, personexpedition=personexpedition, personname=tm) print(self.insp+message)
personrole.save() DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
personrole.expeditionday = survexblock.expeditionday #BUG, if *date comes after *team, this is NOT SET.
if personexpedition:
personrole.person=personexpedition.person
self.currentpersonexped.append(personexpedition)
personrole.save()
def LoadSurvexEntrance(self, survexblock, line): def LoadSurvexEntrance(self, survexblock, line):
# Not using this yet # Not using this yet
@ -284,29 +332,53 @@ class LoadingSurvex():
message = "! *UNITS in YARDS!? - not converted '{}' ({}) {}".format(line, survexblock, survexblock.survexfile.path) message = "! *UNITS in YARDS!? - not converted '{}' ({}) {}".format(line, survexblock, survexblock.survexfile.path)
print(self.insp+message) print(self.insp+message)
DataIssue.objects.create(parser='survexunits', message=message) DataIssue.objects.create(parser='survexunits', message=message)
def get_expo_from_year(self, year):
    """Return the Expedition for ``year``, caching it in ``self.expos``.

    The cache saves a database query on every survex block.

    If more than one expedition matches, a DataIssue is recorded and the
    first match is used. If none matches, a DataIssue is recorded and
    None is returned without caching anything.
    """
    # caching to save DB query on every block
    if year in self.expos:
        return self.expos[year]

    expeditions = list(Expedition.objects.filter(year=year))
    if len(expeditions) != 1:
        # Neither the survex line nor the survexblock is in scope in this
        # method, so the report can only name the year.
        if expeditions:
            message = f"! More than one expedition in year {year}"
        else:
            message = f"! No expedition found for year {year}"
        print(self.insp+message)
        DataIssue.objects.create(parser='survex', message=message)
        if not expeditions:
            return None

    expo = expeditions[0]
    self.expos[year] = expo
    return expo
def LoadSurvexDate(self, survexblock, line): def LoadSurvexDate(self, survexblock, line):
# we should make this a date RANGE for everything? # we should make this a date RANGE for everything?
def findexpedition(year):
return Expedition.objects.filter(year=year)
def setdate(year): def setdate_on_survexblock(year):
# cacheing to save DB query on every block expo = self.get_expo_from_year(year)
if year in self.expos:
expo = self.expos[year]
else:
expeditions = findexpedition(year)
if len(expeditions) != 1 :
message = f"! More than one expedition in year {year} '{line}' ({survexblock}) {survexblock.survexfile.path}"
print(self.insp+message)
DataIssue.objects.create(parser='survexunits', message=message)
expo= expeditions[0]
self.expos[year]= expo
survexblock.expedition = expo survexblock.expedition = expo
survexblock.expeditionday = survexblock.expedition.get_expedition_day(survexblock.date) survexblock.expeditionday = expo.get_expedition_day(survexblock.date)
survexblock.save() survexblock.save()
team = SurvexPersonRole.objects.filter(survexblock=survexblock)
for p in team:
if not p.expeditionday: # *date and *team in 'wrong' order. All working now.
p.expeditionday = survexblock.expeditionday
p.save()
if not p.personexpedition: # again, we didn't know the date until now
pe = GetPersonExpeditionNameLookup(expo).get(p.personname.lower())
if pe:
# message = "! {} ({}) Fixing undated personexpedition '{}'".format(survexblock.survexfile.path, survexblock, p.personname)
# print(self.insp+message)
# DataIssue.objects.create(parser='survex', message=message)
p.personexpedition = pe
p.person = p.personexpedition.person
p.save()
else:
message = "! *team {} '{}' FAIL personexpedition lookup on *date {} ({}) '{}'".format(year, p, survexblock.survexfile.path, survexblock, p.personname)
print(self.insp+message)
DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
oline = line oline = line
if len(line) > 10: if len(line) > 10:
@ -320,7 +392,7 @@ class LoadingSurvex():
# TO DO set to correct Austrian timezone Europe/Vienna ? # TO DO set to correct Austrian timezone Europe/Vienna ?
# %m and %d need leading zeros. Source svx files require them. # %m and %d need leading zeros. Source svx files require them.
survexblock.date = datetime.strptime(line.replace('.','-'), '%Y-%m-%d') survexblock.date = datetime.strptime(line.replace('.','-'), '%Y-%m-%d')
setdate(year) setdate_on_survexblock(year)
elif len(line) == 7: elif len(line) == 7:
year = line[:4] year = line[:4]
perps = get_people_on_trip(survexblock) # What, you don't know Judge Dredd slang ? perps = get_people_on_trip(survexblock) # What, you don't know Judge Dredd slang ?
@ -328,7 +400,7 @@ class LoadingSurvex():
print(self.insp+message) print(self.insp+message)
DataIssue.objects.create(parser='svxdate', message=message, url=get_offending_filename(survexblock.survexfile.path)) DataIssue.objects.create(parser='svxdate', message=message, url=get_offending_filename(survexblock.survexfile.path))
survexblock.date = datetime.strptime(line.replace('.','-'), '%Y-%m') # sets to first of month survexblock.date = datetime.strptime(line.replace('.','-'), '%Y-%m') # sets to first of month
setdate(year) setdate_on_survexblock(year)
elif len(line) == 4: elif len(line) == 4:
year = line[:4] year = line[:4]
perps = get_people_on_trip(survexblock) perps = get_people_on_trip(survexblock)
@ -336,13 +408,13 @@ class LoadingSurvex():
print(self.insp+message) print(self.insp+message)
DataIssue.objects.create(parser='svxdate', message=message, url=get_offending_filename(survexblock.survexfile.path)) DataIssue.objects.create(parser='svxdate', message=message, url=get_offending_filename(survexblock.survexfile.path))
survexblock.date = datetime.strptime(line, '%Y') # sets to January 1st survexblock.date = datetime.strptime(line, '%Y') # sets to January 1st
setdate(year) setdate_on_survexblock(year)
else: else:
# these errors are reporting the wrong survexblock, which is actually a SurvexFile (!) # these errors are reporting the wrong survexblock, which is actually a SurvexFile (!)
message = "! DATE Error unrecognised '{}' ({}) {}".format(oline, survexblock, survexblock.survexfile.path) message = "! DATE Error unrecognised '{}-{}' ({}) {}".format(oline, survexblock, type(survexblock), survexblock.survexfile.path)
print(self.insp+message) print(self.insp+message)
DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path)) DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
print(f" {survexblock.parent=}") # fails as SUrvexFile has no .parent ...ugh. print(f" {type(survexblock)=}") # survexblock.parent fails as a SurvexFile has no .parent ...ugh.
print(f" {survexblock.survexpath=}") print(f" {survexblock.survexpath=}")
print(f" {survexblock.survexfile=}") print(f" {survexblock.survexfile=}")
#raise #raise
@ -976,6 +1048,7 @@ class LoadingSurvex():
def LinearLoad(self, survexblock, path, collatefilename): def LinearLoad(self, survexblock, path, collatefilename):
"""Loads a single survex file. Usually used to import all the survex files which have been collated """Loads a single survex file. Usually used to import all the survex files which have been collated
into a single file. Loads the begin/end blocks using a stack for labels. into a single file. Loads the begin/end blocks using a stack for labels.
Uses the python generator idiom to avoid loading the whole file (21MB) into memory.
""" """
blkid = None blkid = None
pathlist = None pathlist = None
@ -1192,7 +1265,7 @@ class LoadingSurvex():
# this is a python generator idiom. # this is a python generator idiom.
# see https://realpython.com/introduction-to-python-generators/ # see https://realpython.com/introduction-to-python-generators/
# this is the first use of generators in troggle (Oct.2022) # this is the first use of generators in troggle (Oct.2022) and save 21 MB of memory
with open(collatefilename, "r") as fcollate: with open(collatefilename, "r") as fcollate:
for svxline in fcollate: for svxline in fcollate:
self.lineno += 1 self.lineno += 1
@ -1349,7 +1422,7 @@ class LoadingSurvex():
return return
return return
try: try:
# python generator idiom again # python generator idiom again. Not important here as these are small files
with open(finname, "r") as fin: with open(finname, "r") as fin:
for svxline in fin: for svxline in fin:
process_line(svxline) process_line(svxline)
@ -1475,7 +1548,6 @@ def FindAndLoadSurvex(survexblockroot):
survexfileroot = survexblockroot.survexfile # i.e. SURVEX_TOPNAME only survexfileroot = survexblockroot.survexfile # i.e. SURVEX_TOPNAME only
collatefilename = "_" + survexfileroot.path + ".svx" collatefilename = "_" + survexfileroot.path + ".svx"
svx_scan = LoadingSurvex() svx_scan = LoadingSurvex()
svx_scan.callcount = 0 svx_scan.callcount = 0
svx_scan.depthinclude = 0 svx_scan.depthinclude = 0
@ -1502,7 +1574,6 @@ def FindAndLoadSurvex(survexblockroot):
from pstats import SortKey from pstats import SortKey
pr = cProfile.Profile() pr = cProfile.Profile()
pr.enable() pr.enable()
#print(f"###{survexblockroot=} {survexfileroot.path=}",file=sys.stderr)
#---------------------------------------------------------------- #----------------------------------------------------------------
svx_scan.PushdownStackScan(survexblockroot, survexfileroot.path, finrootname, flinear, fcollate) svx_scan.PushdownStackScan(survexblockroot, survexfileroot.path, finrootname, flinear, fcollate)
#---------------------------------------------------------------- #----------------------------------------------------------------
@ -1524,8 +1595,7 @@ def FindAndLoadSurvex(survexblockroot):
mem1 = get_process_memory() mem1 = get_process_memory()
print(" - MEM:{:7.2f} MB END ".format(mem1),file=sys.stderr) print(" - MEM:{:7.2f} MB END ".format(mem1),file=sys.stderr)
print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr) print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)
# #
# Process all the omitted files in :loser: with some exceptions # Process all the omitted files in :loser: with some exceptions
# #
@ -1556,8 +1626,6 @@ def FindAndLoadSurvex(survexblockroot):
for x in removals: for x in removals:
unseens.remove(x) unseens.remove(x)
# for x in unseens:
# print(f"'{x}', ", end='', file=sys.stderr)
print(f"\n - {len(unseens)} survex files found which were not included in main tree. ({len(svx_scan.svxfileslist)} in main tree)", file=sys.stderr) print(f"\n - {len(unseens)} survex files found which were not included in main tree. ({len(svx_scan.svxfileslist)} in main tree)", file=sys.stderr)
print(f" -- Now loading the previously-omitted survex files.", file=sys.stderr) print(f" -- Now loading the previously-omitted survex files.", file=sys.stderr)
@ -1621,7 +1689,6 @@ def FindAndLoadSurvex(survexblockroot):
print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr) print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)
# Before doing this, it would be good to identify the *equate and *entrance we need that are relevant to the # Before doing this, it would be good to identify the *equate and *entrance we need that are relevant to the
# entrance locations currently loaded after this by LoadPos(), but could better be done before ? # entrance locations currently loaded after this by LoadPos(), but could better be done before ?
# look in MapLocations() for how we find the entrances # look in MapLocations() for how we find the entrances
@ -1635,18 +1702,15 @@ def FindAndLoadSurvex(survexblockroot):
#pr2 = cProfile.Profile() #pr2 = cProfile.Profile()
#pr2.enable() #pr2.enable()
mem1 = get_process_memory()
print(f" - MEM:{mem1:7.2f} MB NOT reading '{collatefilename}' into memory.",file=sys.stderr)
print(" ", file=sys.stderr,end='') print(" ", file=sys.stderr,end='')
#---------------------------------------------------------------- #----------------------------------------------------------------
svx_load.LinearLoad(survexblockroot,survexfileroot.path, collatefilename) svx_load.LinearLoad(survexblockroot, survexfileroot.path, collatefilename)
#---------------------------------------------------------------- #----------------------------------------------------------------
#pr2.disable() #pr2.disable()
# with open('LinearLoad.prof', 'w') as f: # with open('LinearLoad.prof', 'w') as f:
# ps = pstats.Stats(pr2, stream=f) # ps = pstats.Stats(pr2, stream=f)
# ps.sort_stats(SortKey.CUMULATIVE) # ps.sort_stats(SortKey.CUMULATIVE)
# ps.print_stats() # ps.print_stats()
svxlines = [] # empty 30MB of stashed file
mem1 = get_process_memory() mem1 = get_process_memory()
print("\n - MEM:{:7.2f} MB STOP".format(mem1),file=sys.stderr) print("\n - MEM:{:7.2f} MB STOP".format(mem1),file=sys.stderr)
print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr) print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)
@ -1696,21 +1760,33 @@ def MakeOmitFileRoot(fn):
return fileroot return fileroot
def LoadSurvexBlocks(): def LoadSurvexBlocks():
mem1 = get_process_memory()
print(" - MEM:{:7.2f} MB now ".format(mem1),file=sys.stderr)
print(' - Flushing All Survex Blocks...') print(' - Flushing All Survex Blocks...')
# why does this increase memory use by 20 MB ?!
# We have foreign keys, Django needs to load the related objects
# in order to resolve how the relation should handle the deletion:
# https://docs.djangoproject.com/en/3.2/ref/models/fields/#django.db.models.ForeignKey.on_delete
SurvexBlock.objects.all().delete() SurvexBlock.objects.all().delete()
SurvexFile.objects.all().delete() SurvexFile.objects.all().delete()
SurvexDirectory.objects.all().delete() SurvexDirectory.objects.all().delete()
SurvexPersonRole.objects.all().delete() SurvexPersonRole.objects.all().delete()
SurvexStation.objects.all().delete() SurvexStation.objects.all().delete()
print(" - survex Data Issues flushed") mem1 = get_process_memory()
print(" - MEM:{:7.2f} MB now. Foreign key objects loaded on deletion. ".format(mem1),file=sys.stderr)
print(" - Flushing survex Data Issues ")
DataIssue.objects.filter(parser='survex').delete() DataIssue.objects.filter(parser='survex').delete()
DataIssue.objects.filter(parser='svxdate').delete() DataIssue.objects.filter(parser='svxdate').delete()
DataIssue.objects.filter(parser='survexleg').delete() DataIssue.objects.filter(parser='survexleg').delete()
DataIssue.objects.filter(parser='survexunits').delete() DataIssue.objects.filter(parser='survexunits').delete()
DataIssue.objects.filter(parser='entrances').delete() DataIssue.objects.filter(parser='entrances').delete()
DataIssue.objects.filter(parser='xEntrances').delete() DataIssue.objects.filter(parser='xEntrances').delete()
print(" - survex Data Issues flushed")
mem1 = get_process_memory()
print(" - MEM:{:7.2f} MB now ".format(mem1),file=sys.stderr)
survexfileroot = MakeSurvexFileRoot() survexfileroot = MakeSurvexFileRoot()
# this next makes a block_object assciated with a file_object.path = SURVEX_TOPNAME # this next makes a block_object assciated with a file_object.path = SURVEX_TOPNAME
survexblockroot = SurvexBlock(name=ROOTBLOCK, survexpath="", cave=None, survexfile=survexfileroot, survexblockroot = SurvexBlock(name=ROOTBLOCK, survexpath="", cave=None, survexfile=survexfileroot,