"""Searches through all the :drawings: repository looking for tunnel and therion files.

Every file found becomes a DrawingFile row; Tunnel (.xml) and Therion
(.th/.th2) files are additionally scanned with regular expressions to link
them to wallets, scanned images and other drawing files.
"""

import os
import re
from pathlib import Path

import settings
from troggle.core.models.survex import DrawingFile
from troggle.core.models.troggle import DataIssue
from troggle.core.models.wallets import Wallet

todo = """
- Rename functions more consistently between tunnel and therion variants

- Refactor to use pathlib instead of whacky resetting of loop variable inside loop
  to scan sub-folders. This will definitely break at some point..

- Recode rx_valid_ext to use pathlib suffix() function

- implement: findimportinsert(therionfile, imp)
  Tries to link the scrap (Therion format) to the referenced therion scrap

"""

rx_xth_me = re.compile(r"xth_me_image_insert.*{.*}$", re.MULTILINE)
rx_scrap = re.compile(r"^survey (\w*).*$", re.MULTILINE)
# NOTE: no capture group, so findall() yields the literal "input " per match.
rx_input = re.compile(r"^input ", re.MULTILINE)
rx_line = re.compile(r"^line ", re.MULTILINE)
rx_ref = re.compile(r"^#?\s?ref\s*\:?\s*([^\s\t]*)", re.MULTILINE)

# ---------------------------------------------------------------------------
# NOTE(review): everything from here down to (and including) the signature and
# first docstring line of _set_filesize_and_check() was LOST from the text this
# edit was prepared from (the span between a "<" and a ">" had been stripped).
# The definitions below are RECONSTRUCTED from how the rest of the module uses
# them - they are assumptions, not the original patterns. Restore the original
# definitions from version control before merging.
# ---------------------------------------------------------------------------
# Used only to count paths in a Tunnel file.
rx_skpath = re.compile(r"<skpath")
# Must yield (sketch path, style) pairs - see settnlfileinfo().
rx_pcpath = re.compile(r'<pcarea area_signal="frame".*?sfsketch="([^"]*)" sfstyle="([^"]*)"')
# Must yield a wallet name following a survex "*ref" embedded in a Tunnel file.
# presumably the whitespace is XML-mangled in the file - TODO confirm
rx_pctext = re.compile(r"\*ref(?:&space;|\s)+([^\s&<|]+)")
# group(1) must be the wallet name, group(2) the scan file name.
rx_wallet = re.compile(
    r"(\d\d\d\d#X?\d+\w?|1995-96kh|92-94Surveybookkh|1991surveybook|smkhs)"
    r"/(.*?(?:png|jpg|pdf|jpeg|gif|txt))$"
)
# Lower-case suffixes (with dot) that parse_tnl_file() treats as scans/images.
IMAGE_EXTS = {".png", ".jpg", ".jpeg", ".gif", ".pdf", ".txt"}
# ------------------------- end of reconstructed region ----------------------

# Scratch log of image references found in Therion files; recreated per import.
THERION_REFS_LOG = "therionrefs.log"


# NOTE(review): signature reconstructed - callers pass three positional
# arguments, so url_prefix must have a default; its value is an assumption.
def _set_filesize_and_check(fullpath, model_obj, parser_label, url_prefix="/dwgdataraw"):
    """Record the file size on model_obj and report unreadable or empty files.

    fullpath: location of the file on disc (str or Path).
    model_obj: object given a .filesize attribute and then saved.
    parser_label: value stored in DataIssue.parser for any problem reported.
    url_prefix: prepended to model_obj.dwgpath to build the DataIssue url.

    Return True if the file exists and its size is > 0, False otherwise.
    """
    issue_url = f"{url_prefix}/{getattr(model_obj, 'dwgpath', '')}"
    try:
        fullpath = Path(fullpath)
        size = fullpath.stat().st_size
    except (OSError, ValueError, TypeError) as e:
        message = f"! Unable to stat file {fullpath}: {e}"
        print(message)
        DataIssue.objects.create(parser=parser_label, message=message, url=issue_url)
        return False

    # The size is stored even when it is zero, so the row reflects reality.
    model_obj.filesize = size
    model_obj.save()

    if size <= 0:
        message = f"! Zero length {parser_label.lower()} file {fullpath}"
        print(message)
        DataIssue.objects.create(parser=parser_label, message=message, url=issue_url)
        return False
    return True


def _read_text_file(fullpath):
    """Read a text file robustly, returning a str.

    Undecodable bytes are replaced rather than raising, so decoding can never
    fail. If the file cannot be opened or read at all, a message is printed
    and the empty string is returned.
    """
    try:
        return Path(fullpath).read_text(encoding="utf-8", errors="replace")
    except (OSError, ValueError, TypeError) as e:
        print(f"! Unable to read file {fullpath}: {e}")
        return ""


def parse_tnl_file(dwgfile, path):
    """Is given a line of text 'path' which may or may not contain a recognisable name of a scanned file
    which we have already seen when we imported all the files we could find in the surveyscans directories.

    The purpose is to find cross-references between Tunnel drawing files and wallets
    AND to find the names of the scanfiles in that wallet - from reading the Tunnel file not from
    interrogating the wallet.

    Note that this means that the list of scanfiles will be as it was when the drawing was created, not as it is now.
    Perhaps we should not actually do it this way ? Or at least, label the table heading.

    This is used to tie drawings to the wallet, and thus the original survey data. Tunnel files
    contain a centreline which is an embedded survex file.

    If 'path' does not look like a wallet scan and is not an image type, it is
    looked up by name as another DrawingFile and recorded in dwgfile.dwgcontains.
    """
    if mscansdir := rx_wallet.search(path):
        walletname, scanname = mscansdir.group(1), mscansdir.group(2)
        wallet, scansfile = None, None

        # This should be changed to properly detect if a list of folders is returned
        # and do something sensible, not just pick the first.
        wallets = list(Wallet.objects.filter(walletname=walletname))
        if wallets:
            wallet = wallets[0]
            if len(wallets) > 1:
                message = f"! More than one scan FOLDER matches filter query. [{wallets[0]}]: {walletname} {scanname} {dwgfile.dwgpath} {path}"
                print(message)
                DataIssue.objects.create(parser="Tunnel", message=message)

        if wallet:
            scans = list(wallet.singlescan_set.filter(name=scanname))
            if scans:
                if len(scans) > 1:
                    plist = [sf.ffile for sf in scans]
                    message = f"! More than one image FILENAME matches filter query. [{scans[0]}]: {walletname} {scanname} {dwgfile.dwgpath} {path} {plist}"
                    print(message)
                    DataIssue.objects.create(parser="Tunnel", message=message)
                scansfile = scans[0]

        if wallet:
            dwgfile.dwgwallets.add(wallet)
        if scansfile:
            dwgfile.scans.add(scansfile)

    elif path:
        if Path(path).suffix.lower() in IMAGE_EXTS:
            # It's an image/scanned file type; we don't treat it as a referenced drawing
            return

        # Not an image file: perhaps a reference to another drawing (no ext or other ext)
        name = Path(path).name
        # Check if it is another drawing file we have already seen
        referenced = list(DrawingFile.objects.filter(dwgname=name))
        if referenced:
            if len(referenced) > 1:
                plist = [df.dwgpath for df in referenced]
                message = f"- Warning {len(referenced)} files named '{name}' {plist}"  # should not be a problem?
                print(message)
                DataIssue.objects.create(parser="Tunnel", message=message, url=f"/dwgdataraw/{path}")
            dwgfile.dwgcontains.add(referenced[0])

    dwgfile.save()


def findwalletimage(therionfile, foundpath):
    """Tries to link the drawing file (Therion format) to the referenced image (scan) file.

    foundpath is the image path as written in the Therion file; surrounding
    braces are stripped. When the path names a known wallet, the wallet is
    attached to therionfile and, if a scan of that filename exists in the
    wallet, so is the scan. A DataIssue is recorded when the wallet exists
    but the scan does not.
    """
    foundpath = foundpath.strip("{}")
    mscansdir = rx_wallet.search(foundpath)
    if not mscansdir:
        return
    walletname = mscansdir.group(1)

    # This should be changed to properly detect if a list of folders is returned
    # and do something sensible, not just pick the first. Use the __in idiom
    wallets = list(Wallet.objects.filter(walletname=walletname))
    if not wallets:
        return
    wallet = wallets[0]
    if len(wallets) > 1:
        message = f"! More than one scan FOLDER matches filter query. [{therionfile}]: {walletname} {foundpath}"
        print(message)
        DataIssue.objects.create(parser="Therion", message=message)

    therionfile.dwgwallets.add(wallet)

    scanfilename = Path(foundpath).name
    # singlescan_set is already restricted to this wallet.
    scans = list(wallet.singlescan_set.filter(name=scanfilename))
    if not scans:
        message = f'! In {wallet.walletname} scanned file is not actually found {scanfilename} mentioned in "{therionfile.dwgpath}"'
        wurl = f"/survey_scans/{wallet.walletname}/".replace("#", ":")
        DataIssue.objects.create(parser="Therion", message=message, url=wurl)
        return

    if len(scans) > 1:
        plist = [sf.ffile for sf in scans]
        message = f"! More than one image FILENAME matches filter query. [{scans[0]}]: {walletname} {foundpath} {plist}"
        print(message)
        DataIssue.objects.create(parser="Therion", message=message)
    therionfile.scans.add(scans[0])


def findimportinsert(therionfile, imp):
    """Tries to link the scrap (Therion format) to the referenced therion scrap.

    Not yet implemented: currently does nothing.
    """
    pass


def settherionfileinfo(filetuple):
    """Read in the drawing file contents and sets values on the dwgfile object.

    filetuple is (thtype, therionfile) where thtype is "th" or "th2" and
    therionfile is the DrawingFile to update. Nothing is parsed if the file
    is missing, unreadable or empty.
    """
    thtype, therionfile = filetuple

    ff = Path(settings.DRAWINGS_DATA) / therionfile.dwgpath
    if not _set_filesize_and_check(ff, therionfile, "Therion"):
        return

    ttext = _read_text_file(ff)

    # The equivalent for a tunnel 'path' would be a .th2 'line wall' or 'scrap'
    if thtype == "th":
        therionfile.npaths = len(rx_input.findall(ttext))
        if wallet_texts := rx_ref.findall(ttext):
            # A filter using a list of strings.
            for w in Wallet.objects.filter(walletname__in=wallet_texts):
                therionfile.dwgwallets.add(w)
    elif thtype == "th2":
        therionfile.npaths = len(rx_line.findall(ttext))

    # scan and look for survex blocks that might have been included, and image scans (as for tunnel drawings)
    # which would populate dwgfile.survexfile

    # in .th2 files:
    # ##XTHERION## xth_me_image_insert {500 1 1.0} {1700 {}} ../../../expofiles/surveyscans/2014/01popped_elev1.jpeg 0 {}
    # scrap blownout -projection plan -scale [-81.0 -42.0 216.0 -42.0 0.0 0.0 7.5438 0.0 m]

    # WORK IN PROGRESS. Do not clutter up the DataIssues list with this
    # Surely not needed for .th files ?? only .th2 ?
    image_paths = []
    for xth_me in rx_xth_me.findall(ttext):
        tokens = xth_me.split()
        if len(tokens) < 3:
            # The image path is expected third from the end; a shorter match
            # would previously have raised IndexError and aborted the import.
            message = f"! Malformed xth_me_image_insert line in {therionfile.dwgpath}: '{xth_me}'"
            print(message)
            DataIssue.objects.create(parser="xTherion", message=message, url=f"/dwgdataraw/{therionfile.dwgpath}")
            continue
        image_paths.append(tokens[-3])

    if image_paths:
        # e.g. ! Un-parsed image filename: 107coldest : ../../../expofiles/surveyscans/2015/2015#20/notes.jpg - therion/plan/107coldest.th2
        # Written with one open() per drawing file rather than one per match.
        with open(THERION_REFS_LOG, "a", encoding="utf-8") as lg:
            lg.writelines(
                f"! Un-parsed image filename: {therionfile.dwgname} : {image_path} - {therionfile.dwgpath}\n"
                for image_path in image_paths
            )
        for image_path in image_paths:
            findwalletimage(therionfile, image_path)

    for inp in rx_input.findall(ttext):
        # if this 'input' is a .th2 file we have already seen, then we can assign this as a sub-file
        # but we would need to disentangle to get the current path properly
        # NOTE: rx_input has no capture group, so 'inp' is just the text "input ".
        message = f"! Un-set (?) Therion .th2 input: - {therionfile.dwgname} : {inp} - {therionfile.dwgpath}"
        DataIssue.objects.create(parser="xTherion", message=message, url=f"/dwgdataraw/{therionfile.dwgpath}")
        findimportinsert(therionfile, inp)

    therionfile.save()


def settnlfileinfo(dwgfile):
    """Read in the drawing file contents and sets values on the dwgfile object.

    Should try to read the date too e.g. tunneldate="2010-08-16 22:51:57"
    then we could display on the master calendar per expo.

    Tunnel files are unfortunately not fully compliant XML so we can't use any of the XML parsing tools
    available. Thanks Julian.

    *ref wallet identifiers may be found in at least two different places in tunnel files.
    """
    ff = Path(settings.DRAWINGS_DATA) / dwgfile.dwgpath
    if not _set_filesize_and_check(ff, dwgfile, "Tunnel"):
        return

    ttext = _read_text_file(ff)
    dwgfile.npaths = len(rx_skpath.findall(ttext))

    # Sketch references in a Tunnel file look like
    #   sfsketch="surveyscans/2025/2025#41/plan_diddlypot.png"
    for scanfile_path, _style in rx_pcpath.findall(ttext):
        parse_tnl_file(dwgfile, scanfile_path)

    # The embedded survex centreline may carry a wallet reference, e.g.
    #   *file_begin "/home/expo/loser/caves-1623/2025-dw-01/trip1.svx" "trip1.svx" | *begin 1 | ... | *ref 2025#20 |
    for refs in rx_pctext.findall(ttext):
        # Best-effort: a database error here must not abort the whole import.
        # Note that an unknown wallet name yields an empty result, not an exception.
        try:
            for w in Wallet.objects.filter(walletname=refs):
                dwgfile.dwgwallets.add(w)
        except Exception as e:
            message = f" ! wallet not found referenced from {dwgfile} -- '{refs}' ({e}) "
            print(message)
            # The url uses dwgpath, as every other DataIssue in this module does.
            DataIssue.objects.create(parser="Tunnel", message=message, url=f"/dwgdataraw/{dwgfile.dwgpath}")

    # should also scan and look for survex blocks that might have been included, and image scans
    # which would populate dwgfile.survexfile

    dwgfile.save()


def setdrwfileinfo(dwgfile):
    """Record the file size for a generic drawing file.

    These are SVGs, PDFs, images or .txt files, so there is no useful format
    to search for: only the size is stored (and problems reported).
    """
    ff = Path(settings.DRAWINGS_DATA) / dwgfile.dwgpath
    # nothing more to parse for generic files, so the result is not needed
    _set_filesize_and_check(ff, dwgfile, "drawings")


def _is_hidden_or_backup(name):
    """Return True for dot-files/dot-directories and editor backups ending in '~'."""
    return name.startswith(".") or name.endswith("~")


def load_drawings_files():
    """Recursively scan the drawings directory and (re)import every drawing file.

    All existing DrawingFile rows, and the DataIssues previously raised by
    this parser, are deleted first. Files with a supported extension, or with
    no extension at all, each get a DrawingFile row and are then parsed
    according to type.

    Why do we have all this detection of file types/! Why not use get_mime_types ?
    What is it all for ??

    We import JPG, PNG and SVG files; which have already been put on the server,
    but the upload form intentionally refuses to upload PNG and JPG (though it does allow SVG)
    """
    drawdatadir = Path(settings.DRAWINGS_DATA)

    DrawingFile.objects.all().delete()
    for parser in ("drawings", "Therion", "xTherion", "Tunnel"):
        DataIssue.objects.filter(parser=parser).delete()

    if os.path.isfile(THERION_REFS_LOG):
        os.remove(THERION_REFS_LOG)

    supported_extensions = {".txt", ".xml", ".th", ".th2", ".pdf", ".png", ".svg", ".jpg"}

    found = []  # (extension without dot, DrawingFile) pairs
    for p in drawdatadir.rglob("*"):
        rel = p.relative_to(drawdatadir)
        # Test every path component, not just the leaf name: rglob() descends
        # into hidden directories, and e.g. ".git/HEAD" has no suffix so it
        # would otherwise be imported as a drawing.
        if any(_is_hidden_or_backup(part) for part in rel.parts):
            continue
        if p.is_dir():
            continue

        suffix = p.suffix.lower()
        if suffix and suffix not in supported_extensions:
            continue

        dwgname = p.stem if suffix else p.name
        dwgfile = DrawingFile(dwgpath=rel.as_posix(), dwgname=dwgname)
        dwgfile.save()
        found.append((suffix[1:], dwgfile))

    print(f" - {len(found)} Drawings files found")

    # important to import .th2 files before .th so that we can assign them when found in .th files.
    # The path is a secondary key so the order does not depend on the filesystem.
    ext_priority = {"th2": 0, "th": 1}
    found.sort(key=lambda item: (ext_priority.get(item[0], 2), item[1].dwgpath))

    for extension, dwgfile in found:
        if extension == "xml":
            settnlfileinfo(dwgfile)
        elif extension in ("th2", "th"):
            settherionfileinfo((extension, dwgfile))
        else:
            # pdf, txt, svg, jpg, png and extension-less files
            setdrwfileinfo(dwgfile)

    print(" - Drawings parsed")