diff --git a/parsers/drawings.py b/parsers/drawings.py index 309926e..b4afbd2 100644 --- a/parsers/drawings.py +++ b/parsers/drawings.py @@ -15,33 +15,64 @@ for tunnel and therion files todo = """ - Rename functions more consistently between tunnel and therion variants +- fix missed embedded survex files and their *ref wallet + - Refactor to use pathlib instead of whacky resetting of loop variable inside loop to scan sub-folders. This will definitely break at some point.. - Recode rx_valid_ext to use pathlib suffix() function -- Recode load_drawings_files() to use a list of suffices - not the huge if-else monstrosity - - implement: findimportinsert(therionfile, imp) Tries to link the scrap (Therion format) to the referenced therion scrap """ -rx_valid_ext = re.compile(r"(?i)\.(?:png|jpg|pdf|jpeg|gif|txt)$") +rx_valid_ext = re.compile(r"(?i)\.(?:png|jpg|pdf|jpeg|gif|txt|svg)$") +rx_wallet = re.compile(r""" + # r"(\d\d\d\d#X?\d+\w?|1995-96kh|92-94Surveybookkh|1991surveybook|smkhs)/(.*?(?:png|jpg|pdf|jpeg|gif|txt))$", path + # This regex is designed to extract a specific directory prefix and a filename + # from the end of a path string. + # --- Group 1: Directory or Survey Prefix --- + ( # Start of Capture Group 1 + \d{4}\#X?\d+\w? # Matches patterns like "2025#123", "2016#X04" or "1999#45a", NB # must be escaped in VERBOSE mode + | # OR + 1995-96kh # Matches the literal string "1995-96kh" + | # OR + 92-94Surveybookkh # Matches the literal string "92-94Surveybookkh" + | # OR + 1991surveybook # Matches the literal string "1991surveybook" + | # OR + smkhs # Matches the literal string "smkhs" + ) # End of Capture Group 1 -def find_dwg_file(dwgfile, path): + / # A literal forward slash separating the parts + + # --- Group 2: Filename --- + ( # Start of Capture Group 2 + .*? 
# Non-greedily match the filename stem (any character) + (?: # Start of a non-capturing group for the extension + png|jpg|pdf|jpeg|gif|txt|svg + ) # End of the extension group + ) # End of Capture Group 2 + + $ # Anchor, ensuring the match is at the end of the string + """, re.VERBOSE | re.IGNORECASE) + +def parse_tnl_file(dwgfile, path): """Is given a line of text 'path' which may or may not contain a recognisable name of a scanned file which we have already seen when we imported all the files we could find in the surveyscans direstories. - The purpose is to find cross-references between Tunnel drawing files. But this is not reported anywhere yet ? + The purpose is to find cross-references between Tunnel drawing files and wallets + AND to find the names of the scanfiles in that wallet - from reading the Tunnel file not from interrogating the wallet. + Note that this means that the list of scanfiles will be as it was when the drawing was created, not as it is now. Perhaps + we should not actually do it this way ? Or at least, label the table heading. - What is all this really for ?! Is this data used anywhere ?? + This is used to tie drawings to the wallet, and thus the original survey data. Tunnel files + contain a centreline which is an embedded survex file. """ - wallet, scansfile = None, None - mscansdir = re.search( - r"(\d\d\d\d#X?\d+\w?|1995-96kh|92-94Surveybookkh|1991surveybook|smkhs)/(.*?(?:png|jpg|pdf|jpeg|gif|txt))$", path - ) - if mscansdir: + wallet, scansfile = None, None + if mscansdir := rx_wallet.search(path): # walrus + # print(f"{path} -- {mscansdir.group(1)=} -- {mscansdir.group(2)=}") scanswalletl = Wallet.objects.filter(walletname=mscansdir.group(1)) # This should be changed to properly detect if a list of folders is returned and do something sensible, not just pick the first. 
if len(scanswalletl): @@ -89,8 +120,9 @@ def find_dwg_file(dwgfile, path): def findwalletimage(therionfile, foundpath): """Tries to link the drawing file (Therion format) to the referenced image (scan) file""" + wallet, scansfile = None, None foundpath = foundpath.strip("{}") - mscansdir = re.search(r"(\d\d\d\d#\d+\w?|1995-96kh|92-94Surveybookkh|1991surveybook|smkhs)", foundpath) + mscansdir = rx_wallet.search(foundpath) if mscansdir: scanswalletl = Wallet.objects.filter(walletname=mscansdir.group(1)) # This should be changed to properly detect if a list of folders is returned and do something sensible, not just pick the first. @@ -191,12 +223,18 @@ def settherionfileinfo(filetuple): rx_skpath = re.compile(rb" # - for path, style in rx_pcpath.findall(ttext): - find_dwg_file(dwgfile, path.decode()) + # sfsketch="surveyscans/2025/2025#41/plan_diddlypot.png" + for scanfile_path, style in rx_pcpath.findall(ttext): + parse_tnl_file(dwgfile, scanfile_path.decode()) + + # + # *file_begin "/home/expo/loser/caves-1623/2025-dw-01/trip1.svx" "trip1.svx" | *begin 1 | *export 1 25 | | ; Cave: 2025-dw-01 | ; Area in cave/QM: Entrance series | *title "2025-dw-01" | *date 2025.07.13 | *team "Dylan Wase" notes | *team "Daniel Gorst" dog | *instrument SAP "SAP6 Dylan" | *ref 2025#20 | + for refs in rx_pctext.findall(ttext): + try: + wallets = Wallet.objects.filter(walletname=refs.decode()) + if wallets: + for w in wallets: + dwgfile.dwgwallets.add(w) + except: + message = f" ! 
wallet not found referenced from {dwgfile} -- '{refs.decode()}' " + print(message) + DataIssue.objects.create(parser="Tunnel", message=message, url=f"/dwgdataraw/{dwgfile}") # should also scan and look for survex blocks that might have been included, and image scans # which would populate dwgfile.survexfile @@ -261,6 +313,8 @@ def load_drawings_files(): os.remove("therionrefs.log") drawingsdirs = [""] + supported_extensions = {".txt", ".xml", ".th", ".th2", ".pdf", ".png", ".svg", ".jpg"} # set + while drawingsdirs: drawdir = drawingsdirs.pop() for f in os.listdir(os.path.join(drawdatadir, drawdir)): @@ -272,64 +326,37 @@ def load_drawings_files(): drawingsdirs.append( lf ) # lunatic! adding to list in middle of list while loop! Replace with pathlib functions. - elif Path(f).suffix.lower() == ".txt": - # Always creates new - dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1]) - dwgfile.save() - all_xml.append(("txt", dwgfile)) - elif Path(f).suffix.lower() == ".xml": - # Always creates new - dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1]) - dwgfile.save() - all_xml.append(("xml", dwgfile)) - elif Path(f).suffix.lower() == ".th": - # Always creates new - dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1]) - dwgfile.save() - all_xml.append(("th", dwgfile)) - elif Path(f).suffix.lower() == ".th2": - # Always creates new - dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1]) - dwgfile.save() - all_xml.append(("th2", dwgfile)) - elif Path(f).suffix.lower() == ".pdf": - # Always creates new - dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1]) - dwgfile.save() - all_xml.append(("pdf", dwgfile)) - elif Path(f).suffix.lower() == ".png": - # Always creates new - dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1]) - dwgfile.save() - all_xml.append(("png", dwgfile)) - elif Path(f).suffix.lower() == ".svg": - # Always creates new - dwgfile = DrawingFile(dwgpath=lf, 
dwgname=os.path.split(f[:-4])[1]) - dwgfile.save() - all_xml.append(("svg", dwgfile)) - elif Path(f).suffix.lower() == ".jpg": - # Always creates new - dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1]) - dwgfile.save() - all_xml.append(("jpg", dwgfile)) - elif Path(f).suffix == "": - # therion file - dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f)[1]) - dwgfile.save() - all_xml.append(("", dwgfile)) + else: + file_path = Path(f) + suffix = file_path.suffix.lower() + + if suffix in supported_extensions: + dwgfile = DrawingFile(dwgpath=lf, dwgname=file_path.stem) + dwgfile.save() + # Get the extension without the dot for the tuple. + all_xml.append((suffix[1:], dwgfile)) + + elif suffix == "": + # This handles the special case for files with no extension. + dwgfile = DrawingFile(dwgpath=lf, dwgname=file_path.name) + dwgfile.save() + all_xml.append(("", dwgfile)) print(f" - {len(all_xml)} Drawings files found") for d in all_xml: - if d[0] in ["pdf", "txt", "svg", "jpg", "png", ""]: - setdrwfileinfo(d[1]) - if d[0] == "xml": - settnlfileinfo(d[1]) + extension, filename = d + if extension in {"pdf", "txt", "svg", "jpg", "png", ""}: # set + setdrwfileinfo(filename) + if extension == "xml": + settnlfileinfo(filename) # important to import .th2 files before .th so that we can assign them when found in .th files - if d[0] == "th2": + if extension == "th2": settherionfileinfo(d) - if d[0] == "th": + if extension == "th": settherionfileinfo(d) + print(f" - Drawings parsed") + # for drawfile in DrawingFile.objects.all(): # SetTunnelfileInfo(drawfile) diff --git a/templates/caveindex.html b/templates/caveindex.html index 945fc83..610fae1 100644 --- a/templates/caveindex.html +++ b/templates/caveindex.html @@ -23,7 +23,8 @@ {% include 'cave_red_star.html' %}

See All Caves for all the caves in areas 1623, 1626, 1624, 1627
-See Undropped Caves for all unexplored caves +See Undropped Caves for all unexplored caves
+See Lost Caves for caves we have mislaid.

New Cave
diff --git a/templates/cavesallindex.html b/templates/cavesallindex.html index 4a881f3..f792e0f 100644 --- a/templates/cavesallindex.html +++ b/templates/cavesallindex.html @@ -23,6 +23,7 @@ {% include 'cave_red_star.html' %}

See Undropped Caves for all unexplored caves
+See Lost Caves for caves we have mislaid.
See Recent Caves for a shorter list of recent caves.

diff --git a/templates/cavesundropped.html b/templates/cavesundropped.html index 322750f..bcc5f88 100644 --- a/templates/cavesundropped.html +++ b/templates/cavesundropped.html @@ -11,6 +11,7 @@

See All Caves for all the caves in areas 1623, 1626, 1624, 1627
See Recent Caves for a full list of recent caves.
+See Lost Caves for caves we have mislaid.
Download Undropped GPX file (This is only the subset which are fully located.)

diff --git a/templates/dwgfiles.html b/templates/dwgfiles.html index cfda968..23a307f 100644 --- a/templates/dwgfiles.html +++ b/templates/dwgfiles.html @@ -10,9 +10,11 @@ has been done after the most recent database reset.

All Tunnel and Therion files - linked to wallets, survey scans, frames and scraps

This is the list of drawings as at the most recent database reset. +

Note that the list of scanned files is taken from text inside the drawing file; it is not +necessarily what is currently in the wallet. - + {% for dwgfile in dwgfiles %}
Drawing (Tunnel or Therion) FileSizePathsWalletsScan files in the walletsFrames
Drawing (Tunnel or Therion) FileSizePathsWalletsScan files when the drawing was createdFrames