From 86ba377bd83ec3f0748d39b24e886f63b350cdf0 Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Wed, 28 Jan 2026 16:58:56 +0000 Subject: [PATCH] profiling --- parsers/survex.py | 97 ++++++++++++++++++++++++++++++----------------- 1 file changed, 63 insertions(+), 34 deletions(-) diff --git a/parsers/survex.py b/parsers/survex.py index b9c26ca..75d5e88 100644 --- a/parsers/survex.py +++ b/parsers/survex.py @@ -73,6 +73,12 @@ debugprinttrigger = "!" dataissues = set() +# Caches for ORM minimization +survexblock_cache = None # {scanswallet_id: [SurvexBlock, ...]} +personrole_cache = None # {survexblock_id: [SurvexPersonRole, ...]} +wallet_cache = None # {walletname: [Wallet, ...]} +trip_people_cache = {} # indexed by survexblock, so never needs cleaning out + class SurvexLeg: """No longer a models.Model subclass, so no longer a database table""" @@ -80,16 +86,18 @@ class SurvexLeg: compass = 0.0 clino = 0.0 - def datewallet(w, earliest): - """Gets the date of the youngest survexblock associated with the wallet - REFACTOR this to do the whole date-getting task - - Currently there is only one SurvexBlock, but this is in anticipation of - changing the schema to allow many. - """ + """Gets the date of the youngest survexblock associated with the wallet, using a cache.""" + global survexblock_cache + if survexblock_cache is None: + # Build cache: {scanswallet_id: [SurvexBlock, ...]} + survexblock_cache = {} + for b in SurvexBlock.objects.all().select_related("survexfile", "scanswallet"): + if b.scanswallet_id not in survexblock_cache: + survexblock_cache[b.scanswallet_id] = [] + survexblock_cache[b.scanswallet_id].append(b) first = earliest - blocks = SurvexBlock.objects.filter(scanswallet=w).select_related("survexfile", "scanswallet") + blocks = survexblock_cache.get(w.id, []) for b in blocks: if b.date: if b.date < first: @@ -142,18 +150,21 @@ def get_offending_filename(path): return "/survexfile/" + path + ".svx" -trip_people_cache = {} # indexed by survexblock, so never needs cleaning out def get_team_on_trip(survexblock): - """Uses a cache to avoid a database query if it doesn't need to. - Only used for complete team.""" - global trip_people_cache - + """Uses a cache to avoid a database query if it doesn't need to. Only used for complete team.""" + global trip_people_cache, personrole_cache + if personrole_cache is None: + # Build cache: {survexblock_id: [SurvexPersonRole, ...]} + personrole_cache = {} + for pr in SurvexPersonRole.objects.all().select_related("person", "personexpedition"): + if pr.survexblock_id not in personrole_cache: + personrole_cache[pr.survexblock_id] = [] + personrole_cache[pr.survexblock_id].append(pr) if survexblock in trip_people_cache: if len(trip_people_cache[survexblock]) > 0: return trip_people_cache[survexblock] - - qpeople = SurvexPersonRole.objects.filter(survexblock=survexblock).select_related("person", "personexpedition") - trip_people_cache[survexblock] = qpeople # this is a query list + qpeople = personrole_cache.get(survexblock.id, []) + trip_people_cache[survexblock] = qpeople return qpeople def get_people_on_trip(survexblock): @@ -849,10 +860,23 @@ class LoadingSurvex: if year in self._expedition_cache: expo = self._expedition_cache[year] else: - message = f"! DATE INCORRECT. There is no expedition for the year {year}. {survexblock.survexfile.path} ({survexblock}) - set to 1976." - print(self.insp + message) - stash_data_issue(parser='survex', message=message, url=None, sb=(survexblock.survexfile.path)) - expo = self._expedition_cache.get("1976") + expeditions = Expedition.objects.filter(year=year) + if len(expeditions) > 1: + message = ( + f"! More than one expedition in year {year} '{line}' ({survexblock}) {survexblock.survexfile.path}" + ) + print(self.insp + message) + stash_data_issue( + parser="survex", message=message, url=None, sb=(survexblock.survexfile.path) + ) + if expeditions: + expo = expeditions[0] + self.expos[year] = expo + else: + expo = Expedition.objects.get(year="1976") + message = f"! DATE INCORRECT. There is no expedition for the year {year}. {survexblock.survexfile.path} ({survexblock}) - set to 1976." + print(self.insp + message) + stash_data_issue(parser='survex', message=message, url=None, sb=(survexblock.survexfile.path)) return expo def LoadSurvexDate(self, survexblock, line): @@ -1275,9 +1299,15 @@ class LoadingSurvex: stash_data_issue(parser="ref", message=message, url=url) # Look to see if we have a record of this wallet already - which would be unexpected - manywallets = Wallet.objects.filter( - walletname=refscan - ) # assumes all wallets found in earlier pass of data import + global wallet_cache + if wallet_cache is None: + # Build cache: {walletname: [Wallet, ...]} + wallet_cache = {} + for w in Wallet.objects.all(): + if w.walletname not in wallet_cache: + wallet_cache[w.walletname] = [] + wallet_cache[w.walletname].append(w) + manywallets = wallet_cache.get(refscan, []) if manywallets: if len(manywallets) > 1: message = f" ! Wallet *REF {refscan} - more than one found {len(manywallets)} wallets in db with same id {survexblock.survexfile.path}" @@ -1290,7 +1320,6 @@ class LoadingSurvex: pass else: check_reused_wallet() - else: survexblock.scanswallet = manywallets[0] # this is a ForeignKey field # Only save if changed @@ -2427,9 +2456,9 @@ def FindAndLoadSurvex(): io_collate.write(f";*include {survexfileroot.path}\n") flinear.write(f"{svx_scan.depthinclude:2} {indent} *include {survexfileroot.path}\n") - # import cProfile - # import pstats - # from pstats import SortKey + import cProfile + import pstats + from pstats import SortKey # pr = cProfile.Profile() # pr.enable() @@ -2594,17 +2623,17 @@ def FindAndLoadSurvex(): print("\n - Loading All Survex Blocks (LinearLoad)", file=sys.stderr) svx_load = LoadingSurvex() - # pr2 = cProfile.Profile() - # pr2.enable() + pr2 = cProfile.Profile() + pr2.enable() print(" ", file=sys.stderr, end="") # ---------------------------------------------------------------- svx_load.LinearLoad(survexblockroot, survexfileroot.path, collatefilename) # ---------------------------------------------------------------- - # pr2.disable() - # with open('LinearLoad.prof', 'w') as f: - # ps = pstats.Stats(pr2, stream=f) - # ps.sort_stats(SortKey.CUMULATIVE) - # ps.print_stats() + pr2.disable() + with open('LinearLoad.prof', 'w') as f: + ps = pstats.Stats(pr2, stream=f) + ps.sort_stats(SortKey.CUMULATIVE) + ps.print_stats() mem1 = get_process_memory() print(f"\n - MEM:{mem1:7.2f} MB STOP", file=sys.stderr)