diff --git a/parsers/survex.py b/parsers/survex.py index 1f97dd0..57ae52c 100644 --- a/parsers/survex.py +++ b/parsers/survex.py @@ -19,18 +19,11 @@ from troggle.parsers.logbooks import GetCaveLookup from troggle.core.models.troggle import DataIssue, Expedition from troggle.core.models.survex import SurvexPersonRole, Wallet, SurvexDirectory, SurvexFile, SurvexBlock, SurvexStation -'''Imports the tree of survex files following form a defined root .svx file -It does also NOT scan the Loser repo for all the svx files - though it should ! +'''Imports the tree of survex files following from a defined root .svx file +It also scans the Loser repo for all the svx files, which it loads individually afterwards. ''' -todo = '''Also walk the entire tree in the :loser: repo looking for unconnected survex files -- add them to the system so that they can be reported-on -- produce a parser report and create a troggle report page (some are OK, e.g. futility series replaced by ARGE survey in 115) - -- If you look at e.g. http://expo.survex.com/survexfile/161#T_caves-1623/161/lhr/alllhr - you will see than have the team members are recognised by this parser, but not recognised by the - wider troggle system (the name is not a hyperlink) - apparently randomly. - GetPersonExpeditionNameLookup() needs to be fixed. +todo = ''' -#BUG, if *date comes after *team, the person's date is not set at all. It needs re-setting at the endof the block. @@ -105,7 +98,8 @@ def get_people_on_trip(survexblock): for p in qpeople: people.append(f'{p.personname}') return list(set(people)) - + + class LoadingSurvex(): """A 'survex block' is a *begin...*end set of cave data. A survex file can contain many begin-end blocks, which can be nested, and which can *include @@ -128,8 +122,10 @@ class LoadingSurvex(): rx_names = re.compile(r'(?i)names') rx_flagsnot= re.compile(r"not\s") rx_linelen = re.compile(r"[\d\-+.]+$") - instruments = "(waiting_patiently|slacker|Useless|nagging|unknown|Inst|instrument|rig|rigger|rigging|helper|something| compass|comp|clino|Notes|sketch|book|Tape|Dog|Pics|photo|drawing|Helper|GPS|Disto|Distox|Distox2|topodroid|point|Consultant|nail|polish|nail_polish_bitch|nail_polish_monkey|varnish|nail_polish|nail_varnish|bitch|monkey|PowerDrill|drill)" - rx_teammem = re.compile(r"(?i)"+instruments+"?(?:es|s)?\s+(.*)"+instruments+"?(?:es|s)?$") + instruments = "(bitch|bodger|bolt|bolter|bolting|book|clino|comp|compass|consultant|disto|distox|distox2|dog|dogsbody|drawing|drill|gps|helper|inst|instr|instrument|monkey|nagging|nail|nail_polish|nail_polish_bitch|nail_polish_monkey|nail_varnish|nail_varnish_bitch|note|paint|photo|pic|point|polish|powerdrill|rig|rigger|rigging|sketch|slacker|something|tape|topodroid|unknown|useless|varnish|waiting_patiently)" + rx_teammem = re.compile(r"(?i)"+instruments+"?(?:es|s)?\s+(.*)$") + rx_teamold = re.compile(r"(?i)(.*)\s+"+instruments+"?(?:es|s)?$") + rx_teamabs = re.compile(r"(?i)^\s*("+instruments+")?(?:es|s)?\s*$") rx_person = re.compile(r"(?i) and | / |, | & | \+ |^both$|^none$") rx_qm = re.compile(r'(?i)^\s*QM(\d+)\s+?([a-dA-DxX])\s+([\w\-\_]+)\.([\w\.\-]+)\s+(([\w\-]+)\.([\w\.\-]+)|\-)\s+(.+)$') # does not recognise non numeric suffix survey point ids @@ -228,22 +224,74 @@ class LoadingSurvex(): personrole is used to record that a person was on a trip, NOT the role they played. (NB PersonTrip is a logbook thing) """ - teammembers = [ ] - mteammember = self.rx_teammem.match(line) - if mteammember: + def record_team_member(tm, survexblock): + tm = tm.strip('\"\'') + # Refactor. The dict GetPersonExpeditionNameLookup(expo) indexes by name and has values of personexpedition + # This is convoluted, the whole personexpedition concept is unnecessary. + + # we need the current expedition, but if there has been no date yet in the survex file, we don't know which one it is. + # so we can't validate whether the person was on expo or not. + # we will have to attach them to the survexblock anyway, and then do a + # later check on whether they are valid when we get the date. + + personrole, created = SurvexPersonRole.objects.update_or_create(survexblock=survexblock, personexpedition=personexpedition, personname=tm) + + expo = survexblock.expedition # may be None if no *date yet + # this syntax was bizarre.. made more obvious + if expo: + if survexblock.expeditionday: # *date has been set + personrole.expeditionday = survexblock.expeditionday + else: + # should not happen + message = "! *team {} expo ok, expedition day not in *team {} ({}) created? '{}'".format(expo.year, survexblock.survexfile.path, survexblock, created ) + print(self.insp+message) + DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path)) + + + personexpedition = GetPersonExpeditionNameLookup(expo).get(tm.lower()) + personrole.person=personexpedition.person + self.currentpersonexped.append(personexpedition) + + if not personexpedition: + # we know the date and expo, but can't find the person + message = "! *team {} '{}' FAIL personexpedition lookup on *team {} ({}) in '{}' {} ".format(expo.year, tm, survexblock.survexfile.path, survexblock, created, line) + print(self.insp+message) + DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path)) + else: + personexpedition = None + # don't know the date yet, assume the person is valid. It wull get picked up with the *date appears + + personrole.save() + + mteammember = self.rx_teammem.match(line) # matches the role at the beginning + if not mteammember: + moldstyle = self.rx_teamold.match(line) # matches the role at the the end of the string + if moldstyle: + for tm in self.rx_person.split(moldstyle.group(1)): + if tm: + record_team_member(tm, survexblock) + # seems to be working + # msg = "! OLD tm='{}' line: '{}' ({}) {}".format(tm, line, survexblock, survexblock.survexfile.path) + # print(msg, file=sys.stderr) + else: + message = "! *team {} ({}) Weird '{}' oldstyle line: '{}'".format(survexblock.survexfile.path, survexblock, mteammember.group(1), line) + print(self.insp+message) + DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path)) + else: + nullmember = self.rx_teamabs.match(line) # matches empty role line. Ignore these. + if not nullmember: + message = "! *team {} ({}) Bad line: '{}'".format(survexblock.survexfile.path, survexblock, line) + print(self.insp+message) + DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path)) + else: for tm in self.rx_person.split(mteammember.group(2)): if tm: - tm = tm.strip('\"\'') - personexpedition = survexblock.expedition and GetPersonExpeditionNameLookup(survexblock.expedition).get(tm.lower()) - if (personexpedition, tm) not in teammembers: - teammembers.append((personexpedition, tm)) - personrole = SurvexPersonRole(survexblock=survexblock, personexpedition=personexpedition, personname=tm) - personrole.save() - personrole.expeditionday = survexblock.expeditionday #BUG, if *date comes after *team, this is NOT SET. - if personexpedition: - personrole.person=personexpedition.person - self.currentpersonexped.append(personexpedition) - personrole.save() + record_team_member(tm, survexblock) + else: + if not mteammember.group(2).lower() in ('none', 'both'): + message = "! Weird *team '{}' newstyle line: '{}' ({}) {}".format(mteammember.group(2), line, survexblock, survexblock.survexfile.path) + print(self.insp+message) + DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path)) def LoadSurvexEntrance(self, survexblock, line): # Not using this yet @@ -284,29 +332,53 @@ class LoadingSurvex(): message = "! *UNITS in YARDS!? - not converted '{}' ({}) {}".format(line, survexblock, survexblock.survexfile.path) print(self.insp+message) DataIssue.objects.create(parser='survexunits', message=message) - + + def get_expo_from_year(self, year): + # cacheing to save DB query on every block + if year in self.expos: + expo = self.expos[year] + else: + expeditions = Expedition.objects.filter(year=year) + if len(expeditions) != 1 : + message = f"! More than one expedition in year {year} '{line}' ({survexblock}) {survexblock.survexfile.path}" + print(self.insp+message) + DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path)) + + expo= expeditions[0] + self.expos[year]= expo + return expo + def LoadSurvexDate(self, survexblock, line): # we should make this a date RANGE for everything? - def findexpedition(year): - return Expedition.objects.filter(year=year) - def setdate(year): - # cacheing to save DB query on every block - if year in self.expos: - expo = self.expos[year] - else: - expeditions = findexpedition(year) - if len(expeditions) != 1 : - message = f"! More than one expedition in year {year} '{line}' ({survexblock}) {survexblock.survexfile.path}" - print(self.insp+message) - DataIssue.objects.create(parser='survexunits', message=message) - - expo= expeditions[0] - self.expos[year]= expo - + def setdate_on_survexblock(year): + expo = self.get_expo_from_year(year) survexblock.expedition = expo - survexblock.expeditionday = survexblock.expedition.get_expedition_day(survexblock.date) + survexblock.expeditionday = expo.get_expedition_day(survexblock.date) survexblock.save() + + team = SurvexPersonRole.objects.filter(survexblock=survexblock) + for p in team: + if not p.expeditionday: # *date and *team in 'wrong' order. All working now. + + p.expeditionday = survexblock.expeditionday + p.save() + + if not p.personexpedition: # again, we didn't know the date until now + pe = GetPersonExpeditionNameLookup(expo).get(p.personname.lower()) + if pe: + # message = "! {} ({}) Fixing undated personexpedition '{}'".format(survexblock.survexfile.path, survexblock, p.personname) + # print(self.insp+message) + # DataIssue.objects.create(parser='survex', message=message) + p.personexpedition = pe + p.person = p.personexpedition.person + p.save() + else: + message = "! *team {} '{}' FAIL personexpedition lookup on *date {} ({}) '{}'".format(year, p, survexblock.survexfile.path, survexblock, p.personname) + print(self.insp+message) + DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path)) + + oline = line if len(line) > 10: @@ -320,7 +392,7 @@ class LoadingSurvex(): # TO DO set to correct Austrian timezone Europe/Vienna ? # %m and %d need leading zeros. Source svx files require them. survexblock.date = datetime.strptime(line.replace('.','-'), '%Y-%m-%d') - setdate(year) + setdate_on_survexblock(year) elif len(line) == 7: year = line[:4] perps = get_people_on_trip(survexblock) # What, you don't know Judge Dredd slang ? @@ -328,7 +400,7 @@ class LoadingSurvex(): print(self.insp+message) DataIssue.objects.create(parser='svxdate', message=message, url=get_offending_filename(survexblock.survexfile.path)) survexblock.date = datetime.strptime(line.replace('.','-'), '%Y-%m') # sets to first of month - setdate(year) + setdate_on_survexblock(year) elif len(line) == 4: year = line[:4] perps = get_people_on_trip(survexblock) @@ -336,13 +408,13 @@ class LoadingSurvex(): print(self.insp+message) DataIssue.objects.create(parser='svxdate', message=message, url=get_offending_filename(survexblock.survexfile.path)) survexblock.date = datetime.strptime(line, '%Y') # sets to January 1st - setdate(year) + setdate_on_survexblock(year) else: # these errors are reporting the wrong survexblock, which is actually a SurvexFile (!) - message = "! DATE Error unrecognised '{}' ({}) {}".format(oline, survexblock, survexblock.survexfile.path) + message = "! DATE Error unrecognised '{}-{}' ({}) {}".format(oline, survexblock, type(survexblock), survexblock.survexfile.path) print(self.insp+message) DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path)) - print(f" {survexblock.parent=}") # fails as SUrvexFile has no .parent ...ugh. + print(f" {type(survexblock)=}") # survexblock.parent fails as a SurvexFile has no .parent ...ugh. print(f" {survexblock.survexpath=}") print(f" {survexblock.survexfile=}") #raise @@ -976,6 +1048,7 @@ class LoadingSurvex(): def LinearLoad(self, survexblock, path, collatefilename): """Loads a single survex file. Usually used to import all the survex files which have been collated into a single file. Loads the begin/end blocks using a stack for labels. + Uses the python generator idiom to avoid loading the whole file (21MB) into memory. """ blkid = None pathlist = None @@ -1192,7 +1265,7 @@ class LoadingSurvex(): # this is a python generator idiom. # see https://realpython.com/introduction-to-python-generators/ - # this is the first use of generators in troggle (Oct.2022) + # this is the first use of generators in troggle (Oct.2022) and save 21 MB of memory with open(collatefilename, "r") as fcollate: for svxline in fcollate: self.lineno += 1 @@ -1349,7 +1422,7 @@ class LoadingSurvex(): return return try: - # python generator idiom again + # python generator idiom again. Not important here as these are small files with open(finname, "r") as fin: for svxline in fin: process_line(svxline) @@ -1475,7 +1548,6 @@ def FindAndLoadSurvex(survexblockroot): survexfileroot = survexblockroot.survexfile # i.e. SURVEX_TOPNAME only collatefilename = "_" + survexfileroot.path + ".svx" - svx_scan = LoadingSurvex() svx_scan.callcount = 0 svx_scan.depthinclude = 0 @@ -1502,7 +1574,6 @@ def FindAndLoadSurvex(survexblockroot): from pstats import SortKey pr = cProfile.Profile() pr.enable() - #print(f"###{survexblockroot=} {survexfileroot.path=}",file=sys.stderr) #---------------------------------------------------------------- svx_scan.PushdownStackScan(survexblockroot, survexfileroot.path, finrootname, flinear, fcollate) #---------------------------------------------------------------- @@ -1524,8 +1595,7 @@ def FindAndLoadSurvex(survexblockroot): mem1 = get_process_memory() print(" - MEM:{:7.2f} MB END ".format(mem1),file=sys.stderr) - print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr) - + print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr) # # Process all the omitted files in :loser: with some exceptions # @@ -1556,8 +1626,6 @@ def FindAndLoadSurvex(survexblockroot): for x in removals: unseens.remove(x) - # for x in unseens: - # print(f"'{x}', ", end='', file=sys.stderr) print(f"\n - {len(unseens)} survex files found which were not included in main tree. ({len(svx_scan.svxfileslist)} in main tree)", file=sys.stderr) print(f" -- Now loading the previously-omitted survex files.", file=sys.stderr) @@ -1621,7 +1689,6 @@ def FindAndLoadSurvex(survexblockroot): print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr) - # Before doing this, it would be good to identify the *equate and *entrance we need that are relevant to the # entrance locations currently loaded after this by LoadPos(), but could better be done before ? # look in MapLocations() for how we find the entrances @@ -1635,18 +1702,15 @@ def FindAndLoadSurvex(survexblockroot): #pr2 = cProfile.Profile() #pr2.enable() - mem1 = get_process_memory() - print(f" - MEM:{mem1:7.2f} MB NOT reading '{collatefilename}' into memory.",file=sys.stderr) print(" ", file=sys.stderr,end='') #---------------------------------------------------------------- - svx_load.LinearLoad(survexblockroot,survexfileroot.path, collatefilename) + svx_load.LinearLoad(survexblockroot, survexfileroot.path, collatefilename) #---------------------------------------------------------------- #pr2.disable() # with open('LinearLoad.prof', 'w') as f: # ps = pstats.Stats(pr2, stream=f) # ps.sort_stats(SortKey.CUMULATIVE) # ps.print_stats() - svxlines = [] # empty 30MB of stashed file mem1 = get_process_memory() print("\n - MEM:{:7.2f} MB STOP".format(mem1),file=sys.stderr) print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr) @@ -1696,21 +1760,33 @@ def MakeOmitFileRoot(fn): return fileroot def LoadSurvexBlocks(): + mem1 = get_process_memory() + print(" - MEM:{:7.2f} MB now ".format(mem1),file=sys.stderr) print(' - Flushing All Survex Blocks...') + # why does this increase memory use by 20 MB ?! + # We have foreign keys, Django needs to load the related objects + # in order to resolve how the relation should handle the deletion: + # https://docs.djangoproject.com/en/3.2/ref/models/fields/#django.db.models.ForeignKey.on_delete SurvexBlock.objects.all().delete() SurvexFile.objects.all().delete() SurvexDirectory.objects.all().delete() SurvexPersonRole.objects.all().delete() SurvexStation.objects.all().delete() - print(" - survex Data Issues flushed") + mem1 = get_process_memory() + print(" - MEM:{:7.2f} MB now. Foreign key objects loaded on deletion. ".format(mem1),file=sys.stderr) + + print(" - Flushing survex Data Issues ") DataIssue.objects.filter(parser='survex').delete() DataIssue.objects.filter(parser='svxdate').delete() DataIssue.objects.filter(parser='survexleg').delete() DataIssue.objects.filter(parser='survexunits').delete() DataIssue.objects.filter(parser='entrances').delete() DataIssue.objects.filter(parser='xEntrances').delete() - + print(" - survex Data Issues flushed") + mem1 = get_process_memory() + print(" - MEM:{:7.2f} MB now ".format(mem1),file=sys.stderr) + survexfileroot = MakeSurvexFileRoot() # this next makes a block_object assciated with a file_object.path = SURVEX_TOPNAME survexblockroot = SurvexBlock(name=ROOTBLOCK, survexpath="", cave=None, survexfile=survexfileroot,