big changes to cope with survexblock not yet dated, no *date yet

2026-02-08 13:10:05 +00:00 · 2022-10-07 23:48:41 +03:00
parent bec262bb2d
commit 4e9680a3ad
1 changed files with 141 additions and 65 deletions
--- a/parsers/survex.py
+++ b/parsers/survex.py
@@ -19,18 +19,11 @@ from troggle.parsers.logbooks import GetCaveLookup
 from troggle.core.models.troggle import DataIssue, Expedition
 from troggle.core.models.survex import SurvexPersonRole, Wallet, SurvexDirectory, SurvexFile, SurvexBlock, SurvexStation

-'''Imports the tree of survex files following form a defined root .svx file
-It does also NOT scan the Loser repo for all the svx files - though it should !
+'''Imports the tree of survex files following from a defined root .svx file
+It also scans the Loser repo for all the svx files, which it loads individually afterwards.
 '''

-todo = '''Also walk the entire tree in the :loser: repo looking for unconnected survex files
- add them to the system so that they can be reported-on
- produce a parser report and create a troggle report page (some are OK, e.g. futility series replaced by ARGE survey in 115)
-
-       If you look at e.g. http://expo.survex.com/survexfile/161#T_caves-1623/161/lhr/alllhr
-        you will see than have the team members are recognised by this parser, but not recognised by the
-        wider troggle system (the name is not a hyperlink) - apparently randomly. 
-        GetPersonExpeditionNameLookup() needs to be fixed.
+todo = '''
        
 -#BUG, if *date comes after *team, the person's date is not set at all. It needs re-setting at the endof the block.

@@ -106,6 +99,7 @@ def get_people_on_trip(survexblock):
        people.append(f'{p.personname}')
    return list(set(people))

+    
 class LoadingSurvex():
    """A 'survex block' is a *begin...*end set of cave data.
    A survex file can contain many begin-end blocks, which can be nested, and which can *include
@@ -128,8 +122,10 @@ class LoadingSurvex():
    rx_names   = re.compile(r'(?i)names')
    rx_flagsnot= re.compile(r"not\s")
    rx_linelen = re.compile(r"[\d\-+.]+$")
-    instruments = "(waiting_patiently|slacker|Useless|nagging|unknown|Inst|instrument|rig|rigger|rigging|helper|something| compass|comp|clino|Notes|sketch|book|Tape|Dog|Pics|photo|drawing|Helper|GPS|Disto|Distox|Distox2|topodroid|point|Consultant|nail|polish|nail_polish_bitch|nail_polish_monkey|varnish|nail_polish|nail_varnish|bitch|monkey|PowerDrill|drill)"
-    rx_teammem = re.compile(r"(?i)"+instruments+"?(?:es|s)?\s+(.*)"+instruments+"?(?:es|s)?$")
+    instruments = "(bitch|bodger|bolt|bolter|bolting|book|clino|comp|compass|consultant|disto|distox|distox2|dog|dogsbody|drawing|drill|gps|helper|inst|instr|instrument|monkey|nagging|nail|nail_polish|nail_polish_bitch|nail_polish_monkey|nail_varnish|nail_varnish_bitch|note|paint|photo|pic|point|polish|powerdrill|rig|rigger|rigging|sketch|slacker|something|tape|topodroid|unknown|useless|varnish|waiting_patiently)"
+    rx_teammem = re.compile(r"(?i)"+instruments+"?(?:es|s)?\s+(.*)$")
+    rx_teamold = re.compile(r"(?i)(.*)\s+"+instruments+"?(?:es|s)?$")
+    rx_teamabs = re.compile(r"(?i)^\s*("+instruments+")?(?:es|s)?\s*$")
    rx_person  = re.compile(r"(?i) and | / |, | & | \+ |^both$|^none$")
    rx_qm      = re.compile(r'(?i)^\s*QM(\d+)\s+?([a-dA-DxX])\s+([\w\-\_]+)\.([\w\.\-]+)\s+(([\w\-]+)\.([\w\.\-]+)|\-)\s+(.+)$')
    # does not recognise non numeric suffix survey point ids
@@ -228,23 +224,75 @@ class LoadingSurvex():
        personrole is used to record that a person was on a trip, NOT the role they played.
        (NB PersonTrip is a logbook thing)
        """
-        teammembers = [ ]
-        mteammember = self.rx_teammem.match(line)
-        if mteammember:
-            for tm in self.rx_person.split(mteammember.group(2)):
-                if tm:
+        def record_team_member(tm, survexblock):
            tm = tm.strip('\"\'')
-                    personexpedition = survexblock.expedition and GetPersonExpeditionNameLookup(survexblock.expedition).get(tm.lower())
-                    if (personexpedition, tm) not in teammembers:
-                        teammembers.append((personexpedition, tm))
-                        personrole = SurvexPersonRole(survexblock=survexblock, personexpedition=personexpedition, personname=tm)
-                        personrole.save()
-                        personrole.expeditionday = survexblock.expeditionday #BUG, if *date comes after *team, this is NOT SET.
-                        if personexpedition:
+            # Refactor. The dict GetPersonExpeditionNameLookup(expo) indexes by name and has values of personexpedition
+            # This is convoluted, the whole personexpedition concept is unnecessary.
+            
+            # we need the current expedition, but if there has been no date yet in the survex file, we don't know which one it is.
+            # so we can't validate whether the person was on expo or not.
+            # we will have to attach them to the survexblock anyway, and then do a 
+            # later check on whether they are valid when we get the date.
+            
+            personrole, created = SurvexPersonRole.objects.update_or_create(survexblock=survexblock, personexpedition=personexpedition, personname=tm)
+             
+            expo = survexblock.expedition # may be None if no *date yet
+            # this syntax was bizarre.. made more obvious
+            if expo:
+                if survexblock.expeditionday: # *date has been set
+                    personrole.expeditionday = survexblock.expeditionday 
+                else:
+                    # should not happen
+                    message = "! *team {} expo ok, expedition day not in  *team {} ({}) created? '{}'".format(expo.year, survexblock.survexfile.path, survexblock, created )
+                    print(self.insp+message)
+                    DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+
+
+                personexpedition =  GetPersonExpeditionNameLookup(expo).get(tm.lower())
                personrole.person=personexpedition.person
                self.currentpersonexped.append(personexpedition)
+                
+                if not personexpedition: 
+                    # we know the date and expo, but can't find the person
+                    message = "! *team {} '{}' FAIL personexpedition lookup on *team {} ({})  in '{}' {} ".format(expo.year, tm, survexblock.survexfile.path, survexblock, created, line)
+                    print(self.insp+message)
+                    DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+            else:
+                personexpedition = None
+                # don't know the date yet, assume the person is valid. It will get picked up when the *date appears
+
            personrole.save()       
    
+        mteammember = self.rx_teammem.match(line) # matches the role  at the beginning 
+        if not mteammember:
+            moldstyle = self.rx_teamold.match(line) # matches the role at the the end of the string
+            if moldstyle:
+                for tm in self.rx_person.split(moldstyle.group(1)):
+                    if tm:
+                        record_team_member(tm, survexblock)
+                        # seems to be working
+                        # msg = "! OLD tm='{}' line: '{}' ({}) {}".format(tm, line, survexblock, survexblock.survexfile.path)
+                        # print(msg,  file=sys.stderr)
+                    else:
+                        message = "! *team {} ({}) Weird '{}' oldstyle line: '{}'".format(survexblock.survexfile.path, survexblock, mteammember.group(1), line)
+                        print(self.insp+message)
+                        DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+            else:
+                nullmember = self.rx_teamabs.match(line) # matches empty role line. Ignore these.
+                if not nullmember:
+                    message = "! *team {} ({}) Bad line: '{}'".format(survexblock.survexfile.path, survexblock, line)
+                    print(self.insp+message)
+                    DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+        else:
+            for tm in self.rx_person.split(mteammember.group(2)):
+                if tm:
+                    record_team_member(tm, survexblock)
+                else:
+                    if not mteammember.group(2).lower() in ('none', 'both'):
+                        message = "! Weird *team '{}' newstyle line: '{}' ({}) {}".format(mteammember.group(2), line, survexblock, survexblock.survexfile.path)
+                        print(self.insp+message)
+                        DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+
    def LoadSurvexEntrance(self, survexblock, line):
        # Not using this yet
        pass
@@ -285,29 +333,53 @@ class LoadingSurvex():
            print(self.insp+message)
            DataIssue.objects.create(parser='survexunits', message=message)
    
-    def LoadSurvexDate(self, survexblock, line):
-        # we should make this a date RANGE for everything?
-        def findexpedition(year):
-            return Expedition.objects.filter(year=year)
-            
-        def setdate(year):
+    def get_expo_from_year(self, year):
        # cacheing to save DB query on every block 
        if year in self.expos:
            expo = self.expos[year]
        else:
-                expeditions = findexpedition(year)
+            expeditions = Expedition.objects.filter(year=year)
            if len(expeditions) != 1 :
                message = f"! More than one expedition in year {year} '{line}' ({survexblock}) {survexblock.survexfile.path}"
                print(self.insp+message)
-                    DataIssue.objects.create(parser='survexunits', message=message)
+                DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
               
            expo= expeditions[0]
            self.expos[year]= expo   
+        return expo       
    
+    def LoadSurvexDate(self, survexblock, line):
+        # we should make this a date RANGE for everything?
+            
+        def setdate_on_survexblock(year):
+            expo = self.get_expo_from_year(year)
            survexblock.expedition = expo
-            survexblock.expeditionday = survexblock.expedition.get_expedition_day(survexblock.date)
+            survexblock.expeditionday = expo.get_expedition_day(survexblock.date)
            survexblock.save()
            
+            team = SurvexPersonRole.objects.filter(survexblock=survexblock)
+            for p in team:
+                if not p.expeditionday: # *date and *team in 'wrong' order. All working now.
+                    
+                    p.expeditionday = survexblock.expeditionday 
+                    p.save()
+                    
+                    if not p.personexpedition: # again, we didn't know the date until now
+                        pe = GetPersonExpeditionNameLookup(expo).get(p.personname.lower())
+                        if pe:
+                            # message = "!  {} ({}) Fixing undated personexpedition '{}'".format(survexblock.survexfile.path, survexblock, p.personname)
+                            # print(self.insp+message)
+                            # DataIssue.objects.create(parser='survex', message=message)
+                            p.personexpedition = pe
+                            p.person = p.personexpedition.person
+                            p.save()
+                        else:
+                            message = "! *team {} '{}' FAIL personexpedition lookup on *date {} ({})  '{}'".format(year, p, survexblock.survexfile.path, survexblock, p.personname)
+                            print(self.insp+message)
+                            DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
+                       
+
+
        oline = line
        if len(line) > 10: 
            # message = "! DATE Warning LONG DATE '{}' ({}) {}".format(oline, survexblock, survexblock.survexfile.path)
@@ -320,7 +392,7 @@ class LoadingSurvex():
            # TO DO set to correct Austrian timezone Europe/Vienna ?
            # %m and %d need leading zeros. Source svx files require them.
            survexblock.date = datetime.strptime(line.replace('.','-'), '%Y-%m-%d')
-            setdate(year)
+            setdate_on_survexblock(year)
        elif len(line) == 7: 
            year = line[:4]
            perps = get_people_on_trip(survexblock) # What, you don't know Judge Dredd slang ?
@@ -328,7 +400,7 @@ class LoadingSurvex():
            print(self.insp+message)
            DataIssue.objects.create(parser='svxdate', message=message,  url=get_offending_filename(survexblock.survexfile.path))
            survexblock.date = datetime.strptime(line.replace('.','-'), '%Y-%m') # sets to first of month
-            setdate(year)
+            setdate_on_survexblock(year)
        elif len(line) == 4: 
            year = line[:4]
            perps = get_people_on_trip(survexblock)
@@ -336,13 +408,13 @@ class LoadingSurvex():
            print(self.insp+message)
            DataIssue.objects.create(parser='svxdate', message=message,  url=get_offending_filename(survexblock.survexfile.path))
            survexblock.date = datetime.strptime(line, '%Y') # sets to January 1st
-            setdate(year)
+            setdate_on_survexblock(year)
        else:
            # these errors are reporting the wrong survexblock, which is actually a SurvexFile (!)
-            message = "! DATE Error unrecognised '{}' ({}) {}".format(oline, survexblock, survexblock.survexfile.path)
+            message = "! DATE Error unrecognised '{}-{}' ({}) {}".format(oline, survexblock, type(survexblock), survexblock.survexfile.path)
            print(self.insp+message)
            DataIssue.objects.create(parser='survex', message=message,  url=get_offending_filename(survexblock.survexfile.path))
-            print(f"  {survexblock.parent=}") # fails as SUrvexFile has no .parent ...ugh.
+            print(f"  {type(survexblock)=}") # survexblock.parent fails as a SurvexFile has no .parent ...ugh.
            print(f"  {survexblock.survexpath=}")
            print(f"  {survexblock.survexfile=}")
            #raise
@@ -976,6 +1048,7 @@ class LoadingSurvex():
    def LinearLoad(self, survexblock, path, collatefilename):
        """Loads a single survex file. Usually used to import all the survex files which have been collated
        into a single file. Loads the begin/end blocks using a stack for labels.
+        Uses the python generator idiom to avoid loading the whole file (21MB) into memory.
        """
        blkid = None
        pathlist = None
@@ -1192,7 +1265,7 @@ class LoadingSurvex():

        # this is a python generator idiom. 
        # see https://realpython.com/introduction-to-python-generators/
-        # this is the first use of generators in troggle (Oct.2022)
+        # this is the first use of generators in troggle (Oct.2022) and saves 21 MB of memory
        with open(collatefilename, "r") as fcollate:
            for svxline in fcollate:
                self.lineno += 1
@@ -1349,7 +1422,7 @@ class LoadingSurvex():
                return
            return
        try:
-            # python generator idiom again
+            # python generator idiom again. Not important here as these are small files
            with open(finname, "r") as fin:
                for svxline in fin:
                    process_line(svxline)
@@ -1475,7 +1548,6 @@ def FindAndLoadSurvex(survexblockroot):
    survexfileroot = survexblockroot.survexfile # i.e. SURVEX_TOPNAME only
    collatefilename = "_" + survexfileroot.path + ".svx"

-
    svx_scan = LoadingSurvex()
    svx_scan.callcount = 0
    svx_scan.depthinclude = 0
@@ -1502,7 +1574,6 @@ def FindAndLoadSurvex(survexblockroot):
    from pstats import SortKey
    pr = cProfile.Profile()
    pr.enable()
-    #print(f"###{survexblockroot=} {survexfileroot.path=}",file=sys.stderr)
    #----------------------------------------------------------------
    svx_scan.PushdownStackScan(survexblockroot, survexfileroot.path, finrootname, flinear, fcollate)
    #----------------------------------------------------------------
@@ -1525,7 +1596,6 @@ def FindAndLoadSurvex(survexblockroot):
    mem1 = get_process_memory()
    print("  - MEM:{:7.2f} MB END ".format(mem1),file=sys.stderr)
    print("  - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)  
-    
    #
    # Process all the omitted files in :loser: with some exceptions
    #
@@ -1556,8 +1626,6 @@ def FindAndLoadSurvex(survexblockroot):
    
    for x in removals:
        unseens.remove(x)
-    # for x in unseens:
-        # print(f"'{x}', ", end='', file=sys.stderr)
    print(f"\n  - {len(unseens)} survex files found which were not included in main tree. ({len(svx_scan.svxfileslist)} in main tree)", file=sys.stderr)
    print(f" -- Now loading the previously-omitted survex files.", file=sys.stderr)
    
@@ -1621,7 +1689,6 @@ def FindAndLoadSurvex(survexblockroot):
    print("  - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)

 
- 
    # Before doing this, it would be good to identify the *equate and *entrance we need that are relevant to the
    # entrance locations currently loaded after this by LoadPos(), but could better be done before ?
    # look in MapLocations() for how we find the entrances
@@ -1635,18 +1702,15 @@ def FindAndLoadSurvex(survexblockroot):

    #pr2 = cProfile.Profile()
    #pr2.enable()
-    mem1 = get_process_memory()
-    print(f"  - MEM:{mem1:7.2f} MB NOT reading '{collatefilename}' into memory.",file=sys.stderr)
    print("    ", file=sys.stderr,end='')
    #----------------------------------------------------------------
-    svx_load.LinearLoad(survexblockroot,survexfileroot.path, collatefilename)
+    svx_load.LinearLoad(survexblockroot, survexfileroot.path, collatefilename)
    #----------------------------------------------------------------
    #pr2.disable()
    # with open('LinearLoad.prof', 'w') as f:
        # ps = pstats.Stats(pr2, stream=f)
        # ps.sort_stats(SortKey.CUMULATIVE)
        # ps.print_stats()
-    svxlines = [] # empty 30MB of stashed file
    mem1 = get_process_memory()
    print("\n - MEM:{:7.2f} MB STOP".format(mem1),file=sys.stderr)
    print(" - MEM:{:7.3f} MB ADDITIONALLY USED".format(mem1-mem0),file=sys.stderr)
@@ -1696,20 +1760,32 @@ def MakeOmitFileRoot(fn):
    return fileroot

 def LoadSurvexBlocks():
+    mem1 = get_process_memory()
+    print("  - MEM:{:7.2f} MB now ".format(mem1),file=sys.stderr)

    print(' - Flushing All Survex Blocks...')
+    # why does this increase memory use by 20 MB ?!
+    # We have foreign keys, Django needs to load the related objects 
+    # in order to resolve how the relation should handle the deletion: 
+    # https://docs.djangoproject.com/en/3.2/ref/models/fields/#django.db.models.ForeignKey.on_delete
    SurvexBlock.objects.all().delete()
    SurvexFile.objects.all().delete()
    SurvexDirectory.objects.all().delete()
    SurvexPersonRole.objects.all().delete()
    SurvexStation.objects.all().delete()
-    print("  - survex Data Issues flushed")
+    mem1 = get_process_memory()
+    print("  - MEM:{:7.2f} MB now. Foreign key objects loaded on deletion. ".format(mem1),file=sys.stderr)
+    
+    print("  - Flushing survex Data Issues ")
    DataIssue.objects.filter(parser='survex').delete()
    DataIssue.objects.filter(parser='svxdate').delete()
    DataIssue.objects.filter(parser='survexleg').delete()
    DataIssue.objects.filter(parser='survexunits').delete()
    DataIssue.objects.filter(parser='entrances').delete()
    DataIssue.objects.filter(parser='xEntrances').delete()
+    print("  - survex Data Issues flushed")
+    mem1 = get_process_memory()
+    print("  - MEM:{:7.2f} MB now ".format(mem1),file=sys.stderr)
    
    survexfileroot = MakeSurvexFileRoot()
    # this next makes a block_object associated with a file_object.path = SURVEX_TOPNAME