2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2025-12-18 19:37:09 +00:00

expanded search for wallets inside tunnel files

This commit is contained in:
2025-09-27 16:49:27 +03:00
parent ff739b9e24
commit 4d3821f572
5 changed files with 101 additions and 69 deletions

View File

@@ -15,33 +15,64 @@ for tunnel and therion files
todo = """
- Rename functions more consistently between tunnel and therion variants
- fix missed embedded survex files and their *ref wallet
- Refactor to use pathlib instead of whacky resetting of loop variable inside loop
to scan sub-folders. This will definitely break at some point..
- Recode rx_valid_ext to use pathlib suffix() function
- Recode load_drawings_files() to use a list of suffixes - not the huge if-else monstrosity
- implement: findimportinsert(therionfile, imp)
Tries to link the scrap (Therion format) to the referenced therion scrap
"""
rx_valid_ext = re.compile(r"(?i)\.(?:png|jpg|pdf|jpeg|gif|txt)$")
rx_valid_ext = re.compile(r"(?i)\.(?:png|jpg|pdf|jpeg|gif|txt|svg)$")
rx_wallet = re.compile(r"""
# r"(\d\d\d\d#X?\d+\w?|1995-96kh|92-94Surveybookkh|1991surveybook|smkhs)/(.*?(?:png|jpg|pdf|jpeg|gif|txt))$", path
# This regex is designed to extract a specific directory prefix and a filename
# from the end of a path string.
# --- Group 1: Directory or Survey Prefix ---
( # Start of Capture Group 1
\d{4}\#X?\d+\w? # Matches patterns like "2025#123", "2016#X04" or "1999#45a", NB # must be escaped in VERBOSE mode
| # OR
1995-96kh # Matches the literal string "1995-96kh"
| # OR
92-94Surveybookkh # Matches the literal string "92-94Surveybookkh"
| # OR
1991surveybook # Matches the literal string "1991surveybook"
| # OR
smkhs # Matches the literal string "smkhs"
) # End of Capture Group 1
def find_dwg_file(dwgfile, path):
/ # A literal forward slash separating the parts
# --- Group 2: Filename ---
( # Start of Capture Group 2
.*? # Non-greedily match the filename stem (any character)
(?: # Start of a non-capturing group for the extension
png|jpg|pdf|jpeg|gif|txt|svg
) # End of the extension group
) # End of Capture Group 2
$ # Anchor, ensuring the match is at the end of the string
""", re.VERBOSE | re.IGNORECASE)
def parse_tnl_file(dwgfile, path):
"""Is given a line of text 'path' which may or may not contain a recognisable name of a scanned file
which we have already seen when we imported all the files we could find in the surveyscans directories.
The purpose is to find cross-references between Tunnel drawing files. But this is not reported anywhere yet ?
The purpose is to find cross-references between Tunnel drawing files and wallets
AND to find the names of the scanfiles in that wallet - from reading the Tunnel file not from interrogating the wallet.
Note that this means that the list of scanfiles will be as it was when the drawing was created, not as it is now. Perhaps
we should not actually do it this way ? Or at least, label the table heading.
What is all this really for ?! Is this data used anywhere ??
This is used to tie drawings to the wallet, and thus the original survey data. Tunnel files
contain a centreline which is an embedded survex file.
"""
wallet, scansfile = None, None
mscansdir = re.search(
r"(\d\d\d\d#X?\d+\w?|1995-96kh|92-94Surveybookkh|1991surveybook|smkhs)/(.*?(?:png|jpg|pdf|jpeg|gif|txt))$", path
)
if mscansdir:
wallet, scansfile = None, None
if mscansdir := rx_wallet.search(path): # walrus
# print(f"{path} -- {mscansdir.group(1)=} -- {mscansdir.group(2)=}")
scanswalletl = Wallet.objects.filter(walletname=mscansdir.group(1))
# This should be changed to properly detect if a list of folders is returned and do something sensible, not just pick the first.
if len(scanswalletl):
@@ -89,8 +120,9 @@ def find_dwg_file(dwgfile, path):
def findwalletimage(therionfile, foundpath):
"""Tries to link the drawing file (Therion format) to the referenced image (scan) file"""
wallet, scansfile = None, None
foundpath = foundpath.strip("{}")
mscansdir = re.search(r"(\d\d\d\d#\d+\w?|1995-96kh|92-94Surveybookkh|1991surveybook|smkhs)", foundpath)
mscansdir = rx_wallet.search(foundpath)
if mscansdir:
scanswalletl = Wallet.objects.filter(walletname=mscansdir.group(1))
# This should be changed to properly detect if a list of folders is returned and do something sensible, not just pick the first.
@@ -191,12 +223,18 @@ def settherionfileinfo(filetuple):
rx_skpath = re.compile(rb"<skpath")
rx_pcpath = re.compile(rb'<pcarea area_signal="frame".*?sfsketch="([^"]*)" sfstyle="([^"]*)"')
rx_pctext = re.compile(rb'pctext.*?\*ref&space;([^&]*)')
def settnlfileinfo(dwgfile):
"""Read in the drawing file contents and sets values on the dwgfile object
Should try to read the date too e.g. tunneldate="2010-08-16 22:51:57"
then we could display on the master calendar per expo.
Tunnel files are unfortunately not fully compliant XML so we can't use any of the XML parsing tools
available. Thanks Julian.
*ref wallet identifiers may be found in at least two different places in tunnel files.
"""
ff = os.path.join(settings.DRAWINGS_DATA, dwgfile.dwgpath)
dwgfile.filesize = os.stat(ff)[stat.ST_SIZE]
@@ -205,19 +243,33 @@ def settnlfileinfo(dwgfile):
print(message)
DataIssue.objects.create(parser="Tunnel", message=message, url=f"/dwgdataraw/{dwgfile.dwgpath}")
return
fin = open(ff, "rb")
fin = open(ff, "rb") # tunnel files are not actually ascii, despite what they say. They have weird bytes in them.
ttext = fin.read()
fin.close()
dwgfile.npaths = len(rx_skpath.findall(ttext))
dwgfile.save()
# dwgfile.save()
# example drawing file in Tunnel format.
# <tunnelxml tunnelversion="version2009-06-21 Matienzo" tunnelproject="ireby" tunneluser="goatchurch" tunneldate="2009-06-29 23:22:17">
# <pcarea area_signal="frame" sfscaledown="12.282584" sfrotatedeg="-90.76982" sfxtrans="11.676667377221136" sfytrans="-15.677173422877454" sfsketch="204description/scans/plan(38).png" sfstyle="" nodeconnzsetrelative="0.0">
for path, style in rx_pcpath.findall(ttext):
find_dwg_file(dwgfile, path.decode())
# sfsketch="surveyscans/2025/2025#41/plan_diddlypot.png"
for scanfile_path, style in rx_pcpath.findall(ttext):
parse_tnl_file(dwgfile, scanfile_path.decode())
# <pathcodes>
# <pctext style="survey" nodeposxrel="-1.0" nodeposyrel="-1.0"> *file_begin "/home/expo/loser/caves-1623/2025-dw-01/trip1.svx" "trip1.svx" | *begin 1 | *export 1 25 | | ; Cave: 2025-dw-01 | ; Area in cave/QM: Entrance series | *title "2025-dw-01" | *date 2025.07.13 | *team "Dylan Wase" notes | *team "Daniel Gorst" dog | *instrument SAP "SAP6 Dylan" | *ref 2025#20 |
for refs in rx_pctext.findall(ttext):
try:
wallets = Wallet.objects.filter(walletname=refs.decode())
if wallets:
for w in wallets:
dwgfile.dwgwallets.add(w)
except:
message = f" ! wallet not found referenced from {dwgfile} -- '{refs.decode()}' "
print(message)
DataIssue.objects.create(parser="Tunnel", message=message, url=f"/dwgdataraw/{dwgfile}")
# should also scan and look for survex blocks that might have been included, and image scans
# which would populate dwgfile.survexfile
@@ -261,6 +313,8 @@ def load_drawings_files():
os.remove("therionrefs.log")
drawingsdirs = [""]
supported_extensions = {".txt", ".xml", ".th", ".th2", ".pdf", ".png", ".svg", ".jpg"} # set
while drawingsdirs:
drawdir = drawingsdirs.pop()
for f in os.listdir(os.path.join(drawdatadir, drawdir)):
@@ -272,64 +326,37 @@ def load_drawings_files():
drawingsdirs.append(
lf
) # lunatic! adding to list in middle of list while loop! Replace with pathlib functions.
elif Path(f).suffix.lower() == ".txt":
# Always creates new
dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1])
dwgfile.save()
all_xml.append(("txt", dwgfile))
elif Path(f).suffix.lower() == ".xml":
# Always creates new
dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1])
dwgfile.save()
all_xml.append(("xml", dwgfile))
elif Path(f).suffix.lower() == ".th":
# Always creates new
dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1])
dwgfile.save()
all_xml.append(("th", dwgfile))
elif Path(f).suffix.lower() == ".th2":
# Always creates new
dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1])
dwgfile.save()
all_xml.append(("th2", dwgfile))
elif Path(f).suffix.lower() == ".pdf":
# Always creates new
dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1])
dwgfile.save()
all_xml.append(("pdf", dwgfile))
elif Path(f).suffix.lower() == ".png":
# Always creates new
dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1])
dwgfile.save()
all_xml.append(("png", dwgfile))
elif Path(f).suffix.lower() == ".svg":
# Always creates new
dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1])
dwgfile.save()
all_xml.append(("svg", dwgfile))
elif Path(f).suffix.lower() == ".jpg":
# Always creates new
dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f[:-4])[1])
dwgfile.save()
all_xml.append(("jpg", dwgfile))
elif Path(f).suffix == "":
# therion file
dwgfile = DrawingFile(dwgpath=lf, dwgname=os.path.split(f)[1])
dwgfile.save()
all_xml.append(("", dwgfile))
else:
file_path = Path(f)
suffix = file_path.suffix.lower()
if suffix in supported_extensions:
dwgfile = DrawingFile(dwgpath=lf, dwgname=file_path.stem)
dwgfile.save()
# Get the extension without the dot for the tuple.
all_xml.append((suffix[1:], dwgfile))
elif suffix == "":
# This handles the special case for files with no extension.
dwgfile = DrawingFile(dwgpath=lf, dwgname=file_path.name)
dwgfile.save()
all_xml.append(("", dwgfile))
print(f" - {len(all_xml)} Drawings files found")
for d in all_xml:
if d[0] in ["pdf", "txt", "svg", "jpg", "png", ""]:
setdrwfileinfo(d[1])
if d[0] == "xml":
settnlfileinfo(d[1])
extension, filename = d
if extension in {"pdf", "txt", "svg", "jpg", "png", ""}: # set
setdrwfileinfo(filename)
if extension == "xml":
settnlfileinfo(filename)
# important to import .th2 files before .th so that we can assign them when found in .th files
if d[0] == "th2":
if extension == "th2":
settherionfileinfo(d)
if d[0] == "th":
if extension == "th":
settherionfileinfo(d)
print(f" - Drawings parsed")
# for drawfile in DrawingFile.objects.all():
# SetTunnelfileInfo(drawfile)

View File

@@ -23,7 +23,8 @@
{% include 'cave_red_star.html' %}
<p>
See <em><a href="/caves">All Caves</a></em> for all the caves in areas 1623, 1626, 1624, 1627 <br />
See <em><a href="/caves_undropped">Undropped Caves</a></em> for all unexplored caves
See <em><a href="/caves_undropped">Undropped Caves</a></em> for all unexplored caves<br />
See <em> <a href="/enttags">Lost Caves</a></em> for caves we have mislaid.
<p style="text-align:right">
<a href="{% url "newcave" %}">New Cave</a><br>

View File

@@ -23,6 +23,7 @@
{% include 'cave_red_star.html' %}
<p>See <em><a href="/caves_undropped">Undropped Caves</a></em> for all unexplored caves<br />
See <em> <a href="/enttags">Lost Caves</a></em> for caves we have mislaid.<br />
See <em> <a href="/caves_recent">Recent Caves</a></em> for a shorter list of recent caves.
<p style="text-align:right">

View File

@@ -11,6 +11,7 @@
<p>
See <em><a href="/caves">All Caves</a></em> for all the caves in areas 1623, 1626, 1624, 1627 <br />
See <em><a href="/caves_recent">Recent Caves</a></em> for a full list of recent caves.<br />
See <em> <a href="/enttags">Lost Caves</a></em> for caves we have mislaid.<br />
Download <em><a href="/caves_undropped_gpx" download="undropped.gpx">Undropped GPX file</a></em> (This is only the subset which are fully located.)
<p style="text-align:right">

View File

@@ -10,9 +10,11 @@ has been done <em>after</em> the most recent database reset.
<h3>All Tunnel and Therion files - linked to wallets, survey scans, frames and scraps</h3>
<p>This is the list of drawings as at the most recent database reset.
<p>Note that the list of scanned files is taken from text inside the drawing file, it is not
necessarily what is currently in the wallet.
<table style="font-size: 85%" width=95%>
<tr><th>Drawing (Tunnel or Therion) File</th><th>Size</th><th>Paths</th><th>Wallets</th><th>Scan files in the wallets</th><th>Frames</th></tr>
<tr><th>Drawing (Tunnel or Therion) File</th><th>Size</th><th>Paths</th><th>Wallets</th><th>Scan files when the drawing was created</th><th>Frames</th></tr>
{% for dwgfile in dwgfiles %}
<tr>