diff --git a/parsers/survex.py b/parsers/survex.py index 7e289ee..003760b 100644 --- a/parsers/survex.py +++ b/parsers/survex.py @@ -156,17 +156,22 @@ def store_data_issues(loadex = None): chnage to using Class not global stash """ global dataissues - print(f" - Storing {len(dataissues)} Data Issues into database") + if loadex: + id = loadex.name + else: + id = "global dataissues" + + print(f" - Storing {len(dataissues)} Data Issues into database (for '{id}' loadfile)") # make a list of objects, but don't commit to database yet di_list = [] for issue in dataissues: - if not loadex: + if not loadex: #catch-all at end parser, message, url, _ = issue else: parser, message, url, blkid = issue if blkid: - sb = loadex._pending_block_saves[blkid] + sb = loadex._pending_block_saves[blkid] # only works if DataIssues processed before SBs if url is None: if sb is not None: try: @@ -174,7 +179,7 @@ def store_data_issues(loadex = None): except Exception as e: print(f" ! store_data_issues() '{e}' '{sb=}' -- '{url=}'", file=sys.stderr) url = get_offending_filename(sb) # assumed to be text - di_list.append(DataIssue(parser=parser, message=message, url=url)) + di_list.append(DataIssue(parser=parser, message=message, url=url)) # Now commit to db DataIssue.objects.bulk_create(di_list) dataissues = set() @@ -229,9 +234,9 @@ class LoadingSurvex: other survex files. A 'scanswallet' is what we today call a "survey scans folder" or a "wallet". """ - # python regex flags (?i) means case-insentitive, (?s) means . matches newline too # see https://docs.python.org/3/library/re.html + # name = "3 word catchphrase here" rx_begin = re.compile(r"(?i)begin") rx_begin2 = re.compile("(?i)begin$") @@ -449,9 +454,9 @@ class LoadingSurvex: _pending_qm_saves = {} # a dict of lists indexed by survexblock UUID - def __init__(self): + def __init__(self, name): + self.name = name self.caveslist = GetCaveLookup() - pass def LoadSurvexFallThrough(self, survexblock, line, cmd): if cmd == "require": @@ -473,8 +478,7 @@ class LoadingSurvex: ) print(self.insp + message) stash_data_issue( - parser="survex", message=message, url=None, sb=(survexblock.survexfile.path) - ) + parser="survex", message=message, url=None, sb=survexblock) def save_survexblocks_to_db(self): """This saves the in-memory python objects into the database, at which point @@ -577,6 +581,8 @@ class LoadingSurvex: return final_chunks def sb_save_sqlite(): + # sb_save_mysql() + # return try: SurvexBlock.objects.bulk_create( chunk, @@ -613,6 +619,7 @@ class LoadingSurvex: # update the block if it changed got_obj._blockid = sb._blockid got_obj.save() + stash_data_issue(parser="survex", message=f"SB saved {sb}") if (nc + ns == len(chunk)): result = "OK" else: @@ -641,9 +648,9 @@ class LoadingSurvex: print(" - Success: Entire tree of survexblocks saved to db.", file=sys.stderr) except Exception as e: - print(f"Failed at chunk {i+1}: {e}", file=sys.stderr) - - return + print(f"Failed at chunk {i+1}: {e}", file=sys.stderr) + + self._pending_block_saves = {} # in database now, so empty cache def save_personroles_to_db(self): """This should be run only after all the survexblocks have @@ -656,6 +663,8 @@ class LoadingSurvex: """ def pr_save_sqlite(): + # pr_save_mysql() + # return try: SurvexPersonRole.objects.bulk_create(valid_list, update_fields = ['survexblock', 'personname', 'person', 'personexpedition'], @@ -674,6 +683,9 @@ class LoadingSurvex: nc = 0 ns = 0 for pr in valid_list: + pr.save() # simplest is best, if it works + continue + # This is not the complete set of fields we need: got_obj, created = SurvexPersonRole.objects.get_or_create( survexblock=pr.survexblock, personname=pr.personname, @@ -688,6 +700,7 @@ class LoadingSurvex: # update the block if it changed got_obj.survexblock = pr.survexblock got_obj.save() + stash_data_issue(parser="survex", message=f"PR saved {pr}") print(f" - {ns}/{nc} SurvexPersonRoles saved/created to db", file=sys.stderr) pr_list = [] @@ -714,13 +727,18 @@ class LoadingSurvex: else: pr_save_sqlite() - _pending_pr_saves = {} # in database now, so empty cache + self._pending_pr_saves = {} # in database now, so empty cache def save_qms_to_db(self): """This should be run only after all the survexblocks have been saved to the database and so have _blockid that can be used as a ForeignKey + + Actually we need to update all the fields, not just the co-unique ones, + so this is not working correctly at all. """ def qm_save_sqlite(): + qm_save_mysql() + return # MariaDB/MySQL do not support (and don't need) unique_fields by sqlite neeeds them bulk_kwargs = { "update_conflicts": True, @@ -743,6 +761,10 @@ class LoadingSurvex: nc = 0 ns = 0 for qm in qms_valid_list: + qm.save() # simplest is best, if it works + continue + + # This is not the complete set of fields we need: got_obj, created = QM.objects.get_or_create( cave_id=qm.cave_id, blockname=qm.blockname, @@ -757,34 +779,41 @@ class LoadingSurvex: ns += 1 got_obj.block = qm.block got_obj.save() + stash_data_issue(parser="survex", message=f"QM saved {qm}") + print(f"QM saved {qm}", file=sys.stderr) print(f" - {ns}/{nc} QMs saved/created to db", file=sys.stderr) qms = [] for blk in self._pending_qm_saves: - qms = qms + self._pending_qm_saves[blk] + qms += self._pending_qm_saves[blk] print(f" - Saving {len(qms)} QMs to db", file=sys.stderr) - qms_valid_list = [] - for qm in qms: - try: - qm.full_clean() - qms_valid_list.append(qm) - except ValidationError as e: - message = f" ! QM is invalid: '{e}' '{qm}'" - print(message, file=sys.stderr) - stash_data_issue( - parser="survex", - message=message - ) + qms_valid_list = self.validify(qms, "QM") if connection.vendor == 'mysql': qm_save_mysql() else: qm_save_sqlite() - - + self._pending_qm_saves = {} # in database now, so empty cache print(f" - QMs saved to db", file=sys.stderr) - + + def validify(self, obj_list, id_string): + valid_list = [] + for o in obj_list: + try: + o.full_clean() + valid_list.append(o) + except ValidationError as e: + message = f" ! {id_string} is invalid: '{e}' '{o}'" + print(message, file=sys.stderr) + stash_data_issue(parser="survex", message=message) + if not valid_list: + message = f" !! {id_string} no valid items." + print(message, file=sys.stderr) + message = f" - {id_string} there are {len(valid_list)} valid {id_string}s in the list" + print(message, file=sys.stderr) + return valid_list + def put_personrole_on_trip(self, survexblock, personexpedition, tm): """ Only used for a single person. @@ -1079,7 +1108,7 @@ class LoadingSurvex: So we have to recognise the '*fix' too Note that the cache self.fixes would simply use survexblock.id as a key, - but at this point int he parsing we have not yet saved survexblock to the db so + but at this point in the parsing we have not yet saved survexblock to the db so survexblock.id is not available. """ # *fix|36|reference|36359.40|82216.08|2000.00\n @@ -1577,7 +1606,7 @@ class LoadingSurvex: else: message = f" ! Wallet *REF {refscan} in {survexblock.survexfile.path} - Already a DIFFERENT wallet is set for this block '{survexblock.scanswallet.walletname}'" print(self.insp + message) - stash_data_issue(parser="ref", message=message, url=url) + stash_data_issue(parser="ref", message=message, url=url, sb=survexblock) return url = get_offending_filename(survexblock.survexfile.path) @@ -1596,7 +1625,7 @@ class LoadingSurvex: if len(args) < 4: message = f" ! Empty or BAD *REF statement '{args}' in '{survexblock.survexfile.path}'" print(self.insp + message) - stash_data_issue(parser="survex", message=message, url=url) + stash_data_issue(parser="survex", message=message, url=url, sb=survexblock) return argsgps = self.rx_argsref.match(args) @@ -1606,30 +1635,30 @@ class LoadingSurvex: perps = get_people_on_trip(survexblock) message = f" ! Wallet *REF bad in '{survexblock.survexfile.path}' malformed id '{args}' {perps}" print(self.insp + message) - stash_data_issue(parser="ref", message=message, url=url) + stash_data_issue(parser="ref", message=message, url=url, sb=survexblock) return if letterx: message = f" ! Wallet *REF has LETTER in '{survexblock.survexfile.path}' malformed id '{args}' {perps}" print(self.insp + message) - stash_data_issue(parser="ref", message=message, url=url) + stash_data_issue(parser="ref", message=message, url=url, sb=survexblock) if len(walletnum) < 2: walletnum = "0" + walletnum if not (int(yr) > 1960 and int(yr) < 2050): message = " ! Wallet year out of bounds {yr} '{refscan}' {survexblock.survexfile.path}" print(self.insp + message) - stash_data_issue(parser="ref", message=message, url=url) + stash_data_issue(parser="ref", message=message, url=url, sb=survexblock) refscan = f"{yr}#{walletnum}" try: if int(walletnum) > 99: message = f" ! Wallet *REF {refscan} - very big (more than 99) so probably wrong in '{survexblock.survexfile.path}'" print(self.insp + message) - stash_data_issue(parser="ref", message=message, url=url) + stash_data_issue(parser="ref", message=message, url=url, sb=survexblock) except: message = f" ! Wallet *REF {refscan} - not numeric in '{survexblock.survexfile.path}'" print(self.insp + message) - stash_data_issue(parser="ref", message=message, url=url) + stash_data_issue(parser="ref", message=message, url=url, sb=survexblock) # Look to see if we have a record of this wallet already - which would be unexpected global wallet_cache @@ -1645,7 +1674,7 @@ class LoadingSurvex: if len(manywallets) > 1: message = f" ! Wallet *REF {refscan} - more than one found {len(manywallets)} wallets in db with same id {survexblock.survexfile.path}" print(self.insp + message) - stash_data_issue(parser="ref", message=message, url=url) + stash_data_issue(parser="ref", message=message, url=url, sb=survexblock) if survexblock.scanswallet: if survexblock.scanswallet.walletname == refscan: @@ -1665,7 +1694,7 @@ class LoadingSurvex: perps = get_people_on_trip(survexblock) message = f" ! Wallet *REF bad in '{survexblock.survexfile.path}' '{refscan}' NOT in database i.e. wallet does not exist {perps}." print(self.insp + message) - stash_data_issue(parser="ref", message=message, url=url) + stash_data_issue(parser="ref", message=message, url=url, sb=survexblock) def LoadSurvexDataNormal(self, survexblock, args): """Sets the order for data elements in this and following blocks, e.g. @@ -1898,7 +1927,7 @@ class LoadingSurvex: self.currentsurvexfile = newfile return # abort as everything already done for object creation - newfile.save() # until we do this there is no internal .id so no foreign key works + newfile.save() # until we do this there is no internal .pk (.id) so no foreign key works self.currentsurvexfile = newfile newfile.primary = self.set_primary(headpath) @@ -1947,36 +1976,17 @@ class LoadingSurvex: debugprint = False self.currentsurvexfile.save() self.currentsurvexfile = self.stacksvxfiles.pop() - - # def TickSurvexQM(self, survexblock, qmtick): - # """Interpret the specially formatted comment which is a QM TICKED statement - # This is now not used, we have abandoned this experiment.""" - # # Now we need to find the correct QM object. It will be in the same block and have the same number. - - # try: - # # could try to search on blockname instead? - # # but the QMn TICK has to be in the same block anyway - # qm = QM.objects.filter(block=survexblock, number=int(qmtick.group(1))) - # except: - # # raise - # message = f' ! QM TICK find FAIL QM{qmtick.group(1)} date:"{qmtick.group(2)}" qmlist:"{qm}" in "{survexblock.survexfile.path}" + completion_description:"{qmtick.group(3)}" ' - # print(message) - # stash_data_issue( - # parser="survex", message=message, url=None, sb=survexblock - # ) - # if len(qm) > 1: - # message = f' ! QM TICK MULTIPLE found FAIL QM{qmtick.group(1)} date:"{qmtick.group(2)}" in "{survexblock.survexfile.path}" + completion_description:"{qmtick.group(3)}" ' - # print(message) - # stash_data_issue( - # parser="survex", message=message, url=None, sb=survexblock - # ) - # qm[0].ticked = True - # # qm[0].ticked_date = qmtick.group(2) # not in data model yet - # qm[0].completion_description = qmtick.group(3) - # qm[0].save() def LoadSurvexQM(self, survexblock, qmline): - """Interpret the specially formatted comment which is a QM definition""" + """Interpret the specially formatted comment which is a QM definition + + We need the year the QM was discovered in order to construct its troggle name, + which in principle we could get from the survexblock name for the survey + station at which the QM exists, and take the year from the date of the survex + block which sets that survey station. + But that's too hard, so we just take the year of the survexblock enclosing + the ;QM specially formatted comment. + """ insp = self.insp # create a short, hopefully-unique name for this block to be used in the QM id @@ -2016,17 +2026,16 @@ class LoadingSurvex: #;QM1 A B6 - see plan drawing there is definitely a QM - # NB none of the SurveyStations are in the DB now, so if we want to link to aSurvexStation + # NB none of the SurveyStations are in the DB now, so if we want to link to a SurvexStation # we would have to create one. But that is not obligatory and no QMs loaded from CSVs have one # Older troggle/CSV assumes a logbook entry 'found_by' for each QM, with a date. # We don't use this anymore. - if survexblock.survexfile.cave: survexblock.survexfile.cave.slug() - self.fix_undated(survexblock) # null-op if already set, inherits date if needed + self.fix_undated(survexblock) # null-op if already set, inherits date if needed. see function docm. comment above try: expoyear = str(survexblock.date.year) except Exception as e: @@ -2036,8 +2045,8 @@ class LoadingSurvex: print(self.insp + message) stash_data_issue( parser="survex", message=message, url=None, sb=survexblock) - # we could look at child blocks in the same survexfile and see if they have dates, - # and if all such things are int he same year, that would be unambiguous. + # we could look at all child blocks in the same survexfile and see if they have dates, + # and if all such things are in the same year, that would be unambiguous. # But better to flag it as a DataIssue to be fixed in the survex file. expoyear = settings.EPOCH.year # 1970 @@ -2045,7 +2054,6 @@ class LoadingSurvex: try: qm = QM( number=qm_no, - # nearest_station=a_survex_station_object, # can be null resolution_station_name=resolution_station_name, nearest_station_name=qm_nearest, ticked=qm_ticked, @@ -2056,7 +2064,6 @@ class LoadingSurvex: expoyear=expoyear, cave=survexblock.survexfile.cave, ) - # qm.save if survexblock._blockid not in self._pending_qm_saves: self._pending_qm_saves[survexblock._blockid] = [] self._pending_qm_saves[survexblock._blockid].append(qm) @@ -2433,17 +2440,22 @@ class LoadingSurvex: # Getting round MariaDB foibles: put these in different transactions with transaction.atomic(): + store_data_issues(self) # must happen before survexblocks : uses cached _blkid self.save_survexblocks_to_db() n = SurvexBlock.objects.all().count() print(f" + Now {n} SurvexBlocks in total", file=sys.stderr) + with transaction.atomic(): self.save_personroles_to_db() n = SurvexPersonRole.objects.all().count() print(f" + Now {n} SurvexPersonRoles in total", file=sys.stderr) + + n = QM.objects.all().count() + print(f" - {n} QMs already", file=sys.stderr) with transaction.atomic(): self.save_qms_to_db() n = QM.objects.all().count() - print(f" + Now {n} QMs in total", file=sys.stderr) + print(f" + {n} QMs now", file=sys.stderr) def PushdownStackScan(self, survexblock, path, finname, flinear, io_collate): """Follows the *include links in all the survex files from the root file (usually 1623.svx) @@ -2590,7 +2602,7 @@ class LoadingSurvex: message = f" ! ERROR *include file '{path}' in '{survexblock}' has unexpected error on opening or reading file. OMITTED!" print(message) print(message, file=sys.stderr) - stash_data_issue(parser="survex", message=message, url=None, sb=(path)) + stash_data_issue(parser="survex", message=message, url=path, ) raise return # skip this survex file and all things *included in it @@ -2626,7 +2638,7 @@ class LoadingSurvex: """regenerates the .3d file from the .svx if it is older than the svx file, or older than the software, or randomly using chaosmonkey() just to keep things ticking over. - This completely fails for e.g. troggle_import_root.svx as the survex file merely includes other files, + This completely fails for e.g. SURVEX_TOPNAME (troggle_import_root.svx) as the survex file merely includes other files, which may well have changed, e.g. the *fix statements in gps25.svx """ try: @@ -2645,8 +2657,8 @@ class LoadingSurvex: text=True, ) if sp.returncode != 0: - message = f" ! Error when running {settings.CAVERN}: {fullpath}" - url = f"/survexfile{fullpath}.svx".replace(str(settings.SURVEX_DATA), "") + message = f" ! Error when running {settings.CAVERN}: '{fullpath}'" + url = f"/survexfile{fullpath}.svx".replace(str(settings.SURVEX_DATA), "/") stash_data_issue(parser="survex", message=message, url=url) print(message) print( @@ -2663,8 +2675,9 @@ class LoadingSurvex: if errpath.stat().st_size == 0: errpath.unlink() # delete empty closure error file except: - message = f' ! FAIL running cavern on survex file "{fullpath}" specified in *include in {calledpath} ' - stash_data_issue(parser="survex", message=message) + message = f" ! FAIL running cavern on survex file '{fullpath}' specified in *include in {calledpath} " + url = f"/survexfile{fullpath}.svx".replace(str(settings.SURVEX_DATA), "/") + stash_data_issue(parser="survex", message=message, url=url) print(message) svxpath = Path(fullpath + ".svx") @@ -2747,13 +2760,14 @@ def FindAndLoadSurvex(): survexfileroot = survexblockroot.survexfile # i.e. SURVEX_TOPNAME only collatefilename = "_" + survexfileroot.path + ".svx" - svx_scan = LoadingSurvex() + svx_scan = LoadingSurvex(f"{settings.SURVEX_TOPNAME}.svx") svx_scan.callcount = 0 svx_scan.depthinclude = 0 fullpathtotop = str(Path(survexfileroot.path).parent / survexfileroot.path) # In fact we always want to run this, and the location stuff later needs the .pos file # so we should not be using the RunSurvexIfNeeded function. + # This runs cavern on the whole tree of > 442,000 survey legs print(f" - Running cavern on fullpathtotop : '{fullpathtotop}'", file=sys.stderr) logpath = Path(fullpathtotop + ".log") @@ -2771,8 +2785,8 @@ def FindAndLoadSurvex(): ) if sp.returncode != 0: - message = f" ! Error when running {settings.CAVERN}: {fullpathtotop}" - url = f"/survexfile{fullpathtotop}.svx".replace(str(settings.SURVEX_DATA), "") + message = f" ! Error when running ROOT {settings.CAVERN}: {fullpathtotop} in '{outputdir}'" + url = f"/survexfile/{fullpathtotop}.svx".replace(str(settings.SURVEX_DATA), "/") stash_data_issue(parser="survex", message=message, url=url) print(message) print( @@ -2789,7 +2803,8 @@ def FindAndLoadSurvex(): if errpath.stat().st_size == 0: errpath.unlink() # delete empty closure error file except: - message = f' ! FAIL running cavern on survex file "{fullpathtotop}"' + message = f" ! FAIL running cavern on ROOT survex file '{fullpathtotop}' in '{outputdir}'" + url = f"/survexfile/{fullpathtotop}.svx".replace(str(settings.SURVEX_DATA), "/") stash_data_issue(parser="survex", message=message) print(message) @@ -2830,7 +2845,6 @@ def FindAndLoadSurvex(): io_collate.write(f";*edulcni {survexfileroot.path}\n") svx_scan.check_cache_clean() - store_data_issues(svx_scan) mem1 = get_process_memory() flinear.write(f"\n - MEM:{mem1:.2f} MB STOP {survexfileroot.path}\n") @@ -2914,7 +2928,7 @@ def FindAndLoadSurvex(): survexfileroot = survexblockroot.survexfile # i.e. SURVEX_TOPNAME only - omit_scan = LoadingSurvex() + omit_scan = LoadingSurvex(f"Not included in {settings.SURVEX_TOPNAME}") omit_scan.callcount = 0 omit_scan.depthinclude = 0 fullpathtotop = str(Path(settings.SURVEX_DATA, UNSEENS)) @@ -2981,7 +2995,7 @@ def FindAndLoadSurvex(): # look in MapLocations() for how we find the entrances print("\n - Loading All Survex Blocks (LinearLoad)", file=sys.stderr) - svx_load = LoadingSurvex() + svx_load = LoadingSurvex("All Survex Blocks (LinearLoad)") # pr2 = cProfile.Profile() # pr2.enable() @@ -3164,10 +3178,10 @@ def parse_one_file(fpath): # --------------------------------------in progress-- survexfile=svxfileroot, legsall=0, legslength=0.0 ) - block_dummy.name=f"#{block_dummy.id}_{str(Path(str(svxfileroot)))}", + block_dummy.name=f"#{block_dummy.pk}_{str(Path(str(svxfileroot)))}", #svxfileroot.save() block_dummy.save() - print(f" - block_dummy now '{block_dummy}' {type(block_dummy)} id={block_dummy.id} f:{block_dummy.survexfile}\n -- {block_dummy.name=}") + print(f" - block_dummy now '{block_dummy}' {type(block_dummy)} id={block_dummy.pk} f:{block_dummy.survexfile}\n -- {block_dummy.name=}") # ---------------------------------------------------------------- svx_load.LoadSurvexFile(fpath) # otherwise only called for *include files @@ -3180,7 +3194,7 @@ def parse_one_file(fpath): # --------------------------------------in progress-- global svx_load print(f"\n - Loading One Survex file '{fpath}'", file=sys.stderr) - svx_load = LoadingSurvex() + svx_load = LoadingSurvex(f"One Survex file '{fpath}'") fname = Path(settings.SURVEX_DATA, (fpath + ".svx")) @@ -3201,7 +3215,7 @@ def parse_one_file(fpath): # --------------------------------------in progress-- parents =set() if sbs: for sb in sbs: - # print(f" - {sb.id} checking survex block {sb=}") + # print(f" - {sb.pk} checking survex block {sb=}") try: if sb.parent: parents.add(sb.parent) @@ -3210,10 +3224,10 @@ def parse_one_file(fpath): # --------------------------------------in progress-- print(f" ! FAILURE to access sb.parent {sb=}\n ! {sb.parent_id=} ")# \n{dir(sb)} # even though the parent_id exists.. hmm. for sb in sbs: - # print(f" - {sb.id} {sb.pk} {sb}") + # print(f" - {sb.pk} {sb}") sb_keep = sb if sb not in parents: - # print(f" - {sb.id} Deleting survex block {sb=}") + # print(f" - {sb.pk} Deleting survex block {sb=}") sb.delete() if parents: @@ -3238,8 +3252,8 @@ def parse_one_file(fpath): # --------------------------------------in progress-- if len(sbs)<1: print(f" ! No survex blocks found. Parser failure...") for sb in sbs: - print(f" - {sb.id} re-setting survex block parent {sb=}", file=sys.stderr) - print(f" - {sb.id} re-setting survex block parent {sb=}") + print(f" - {sb.pk} re-setting survex block parent {sb=}", file=sys.stderr) + print(f" - {sb.pk} re-setting survex block parent {sb=}") sb.parent = existingparent # should be all the same sb.save() @@ -3319,10 +3333,11 @@ def set_survexblocks(): def survexifywallets(): """Gets the caves from the list of survexblocks - We seem to hve a LOT of blocks with no atatched scnaswallet. Is this because we are + We seem to hve a LOT of blocks with no attached scanswallet. Is this because we are not inheriting *ref properly in the survexfile ? """ print(f" - Update wallets with survex data") + print(f" - Update wallets with survex data", file=sys.stderr) start = time.time() # if there is a wallet for a block, add the people to the wallet @@ -3453,6 +3468,7 @@ def LoadSurvexBlocks(): # ---------------------------------------------------------------- block_start = time.time() FindAndLoadSurvex() + store_data_issues() print(f" - FindAndLoadSurvex() took {time.time() - block_start:.2f}s", file=sys.stderr) # ---------------------------------------------------------------- memend = get_process_memory()