From f3b46856ee5e480b5f0dc83d3e29e7c9f2fa8fb6 Mon Sep 17 00:00:00 2001
From: Philip Sargent
Date: Mon, 15 Dec 2025 19:42:23 +0000
Subject: [PATCH] chunked helper to return DrawingFile objects

---
 core/TESTS/test_drawings.py | 19 +++++++++++++++++++
 parsers/drawings.py         | 32 +++++++++++++++++++++++++++-----
 2 files changed, 46 insertions(+), 5 deletions(-)

diff --git a/core/TESTS/test_drawings.py b/core/TESTS/test_drawings.py
index 4133707..60acca2 100644
--- a/core/TESTS/test_drawings.py
+++ b/core/TESTS/test_drawings.py
@@ -145,3 +145,22 @@ class DrawingsPathlibTests(TestCase):
         self.assertEqual(set(drawings.IMAGE_LIKE_EXTS), set(drawings.IMAGE_EXTS))
         self.assertIn('.th', drawings.SUPPORTED_EXTENSIONS)
         self.assertIn('.png', drawings.SUPPORTED_EXTENSIONS)
+
+    def test_fetch_drawingfiles_by_paths_chunks(self):
+        # Create more items than the typical SQLite parameter limit to ensure chunking
+        count = 1200
+        rel_paths = []
+        objs = []
+        for i in range(count):
+            rel = f'bigdir/file{i}.txt'
+            rel_paths.append(rel)
+            objs.append(DrawingFile(dwgpath=rel, dwgname=f'name{i}'))
+
+        # Bulk create them efficiently
+        DrawingFile.objects.bulk_create(objs)
+
+        mapping = drawings.fetch_drawingfiles_by_paths(rel_paths, chunk_size=500)
+        self.assertEqual(len(mapping), count)
+        # Spot-check a few entries
+        self.assertIn('bigdir/file0.txt', mapping)
+        self.assertIn(f'bigdir/file{count-1}.txt', mapping)
diff --git a/parsers/drawings.py b/parsers/drawings.py
index 3bfa310..453a9b1 100644
--- a/parsers/drawings.py
+++ b/parsers/drawings.py
@@ -54,6 +54,30 @@ def _is_image_suffix(suffix: str) -> bool:
         return False
     return suffix.lower() in IMAGE_EXTS
 
+
+def fetch_drawingfiles_by_paths(paths, chunk_size: int = 500):
+    """Fetch DrawingFile objects for the given iterable of paths in chunks.
+
+    This avoids building a very large SQL IN(...) clause which can exceed DB
+    parameter limits (SQLite defaults to 999 bound variables). A default
+    chunk_size of 500 is conservative and works well across backends.
+
+    Returns a dict mapping dwgpath -> DrawingFile (first match if duplicates).
+    """
+    mapping = {}
+    if not paths:
+        return mapping
+
+    # Ensure we iterate over a list to allow slicing in chunks
+    rel_paths = list(paths)
+    for i in range(0, len(rel_paths), chunk_size):
+        chunk = rel_paths[i : i + chunk_size]
+        for obj in DrawingFile.objects.filter(dwgpath__in=chunk):
+            # if duplicates exist, preserve the first one seen
+            mapping.setdefault(obj.dwgpath, obj)
+
+    return mapping
+
 rx_wallet = re.compile(r"""
 # r"(\d\d\d\d#X?\d+\w?|1995-96kh|92-94Surveybookkh|1991surveybook|smkhs)/(.*?(?:png|jpg|pdf|jpeg|gif|txt))$", path
 # This regex is designed to extract a specific directory prefix and a filename
@@ -330,8 +354,7 @@ def setdrwfileinfo(dwgfile):
 def load_drawings_files():
     """Breadth first search of drawings directory looking for sub-directories and *.xml filesize
-    This is brain-damaged very early code. Should be replaced with proper use of pathlib.
-
+    Why do we have all this detection of file types?! Why not use get_mime_types? What is it all for??
@@ -379,10 +402,9 @@ def load_drawings_files():
     for i in range(0, len(objs_to_create), chunk_size):
         DrawingFile.objects.bulk_create(objs_to_create[i : i + chunk_size])
 
-    # Re-fetch created objects and map by dwgpath
+    # Re-fetch created objects and map by dwgpath using a chunked fetch helper
     rel_paths = [rel for (_, rel, _, _) in files_meta]
-    created_objs = DrawingFile.objects.filter(dwgpath__in=rel_paths)
-    mapping = {obj.dwgpath: obj for obj in created_objs}
+    mapping = fetch_drawingfiles_by_paths(rel_paths, chunk_size=500)
 
     # Reconstruct all_xml using the created model instances
     for ext, rel, _, p in files_meta:
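
Usage note (not part of the patch): a minimal sketch of how the new helper is
expected to be called, assuming only the DrawingFile model and the dwgpath /
dwgname fields shown in the diff above; the path list below is hypothetical,
and the snippet assumes a configured Django environment (e.g. run from
./manage.py shell).

    from parsers.drawings import fetch_drawingfiles_by_paths

    # Hypothetical list of relative paths; large enough that a single
    # IN(...) query would exceed SQLite's default limit of 999 bound variables.
    rel_paths = [f"bigdir/file{i}.txt" for i in range(2000)]

    # The helper issues one query per chunk of at most 500 paths, so no single
    # IN(...) clause exceeds the backend's parameter limit, and it returns a
    # dict keyed by dwgpath.
    mapping = fetch_drawingfiles_by_paths(rel_paths, chunk_size=500)

    missing = [rel for rel in rel_paths if rel not in mapping]
    for rel in rel_paths[:5]:
        drawing = mapping.get(rel)  # None if no DrawingFile row exists for rel
        if drawing is not None:
            print(drawing.dwgpath, drawing.dwgname)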