2012-06-10 16:56:12 +01:00
import os
import re
2023-08-04 07:08:18 +01:00
import string
2023-09-15 20:41:40 +01:00
import sys
2021-04-14 18:24:08 +01:00
from pathlib import Path
2012-06-10 16:56:12 +01:00
2020-05-24 01:57:06 +01:00
from django . conf import settings
2021-04-06 22:50:57 +01:00
from django . db import transaction
2020-05-24 01:57:06 +01:00
2024-12-15 18:54:47 +00:00
from troggle . core . models . caves import Cave , CaveAndEntrance , Entrance , GetCaveLookup
from troggle . core . models . survex import SurvexStation
2021-04-13 00:43:57 +01:00
from troggle . core . models . troggle import DataIssue
2023-01-19 21:18:42 +00:00
from troggle . settings import CAVEDESCRIPTIONS , ENTRANCEDESCRIPTIONS , EXPOWEB , SURVEX_DATA
2022-07-25 00:57:00 +01:00
2023-04-22 01:24:32 +01:00
""" Reads all the cave description data and entrance description data
by parsing the xml files stored as e . g .
: EXPOWEB : / cave_data / 1623 - 161. html
or
: EXPOWEB : / entrance_data / 1623 - 161 g . html
2021-04-06 22:50:57 +01:00
and creating the various Cave , Entrance and necessary Area objects .
2022-07-27 21:24:40 +01:00
This is the first import that happens after the database is reinitialised .
2022-07-21 19:01:57 +01:00
So it is also the first thing that creates tables.
2023-01-19 21:18:42 +00:00
"""
2012-06-10 16:56:12 +01:00
2023-01-19 21:18:42 +00:00
todo = """
2023-09-11 18:38:14 +01:00
- When reading cave data, to start off with we do not know the cave id (slug) so we can't give a useful url in
the error message , but we do have the filename . Systematize this , and the same thing with reading entrance files .
2023-02-02 21:50:40 +00:00
- Cannot use Edit This Page for pendingcaves . txt_edit as Edit This Page is expecting an html file .
2022-03-05 12:20:26 +00:00
So we will need a separate file - editing capability just for this configuration file ? !
2022-07-19 17:48:11 +01:00
2023-04-22 22:05:12 +01:00
- we want to overwrite a PENDING cave if we are now importing the 1623 - xxx . html file for it
2023-04-22 23:27:06 +01:00
- rewrite archaic regex
re . findall ( " < %(itemname)s >(.*?)</ %(itemname)s > " % { " itemname " : itemname } , text , re . S )
in modern form and pre - compile it .
2023-02-02 21:50:40 +00:00
- crashes on MariaDB in databasereset . py on server when deleting Caves and complains Area needs a
non null parent , But this is not true . The only solution we have found is to let it crash , then
stop and restart MariaDB ( requires a logon able to sudo ) and then restart the databasereset . py
2023-09-11 18:38:14 +01:00
again . ( status as of July 2022 ) . May not happen now that class Area is removed ( Sept .2023 ) .
2023-01-19 21:18:42 +00:00
"""
2023-09-11 18:38:14 +01:00
# Area codes accepted as the first four characters of an entrance filename
# (read_entrance logs a DataIssue for any other prefix).
AREACODES = {"1623", "1624", "1626", "1627"}  # NB set not dict
# Area codes treated as ARGE areas - presumably consulted by the survex parser; verify against caller.
ARGEAREAS = {"1626", "1627", "1624"}  # NB set not dict

# Module-level caches. entrances_xslug is described elsewhere in this file as a cache of
# references to Entrance objects; caves_xslug and areas_xslug are presumably the
# equivalent caches for caves and areas - TODO confirm.
entrances_xslug = {}
caves_xslug = {}
areas_xslug = {}
# Lower-case ASCII letters, used to recognise a trailing entrance letter on an entrance slug.
LETTERS = list(string.ascii_lowercase)
2023-01-19 21:18:42 +00:00
2021-04-26 17:23:23 +01:00
def dummy_entrance(k, slug, msg="DUMMY"):
    """Create and return a placeholder Entrance, for either a PENDING cave or a DUMMY
    entrance if the user forgot to provide one when creating the cave.

    Args:
        k: Used as the name and the slug of the new entrance and, with ".html"
            appended, as its filename.
        slug: Cave slug; only used in the failure message and as the DataIssue url.
        msg: Short tag included in the failure message.

    Returns:
        The new Entrance object (objects.create() has already saved it to the db).

    Raises:
        RuntimeError: if the create call did not hand back a usable object.
    """
    ent = Entrance.objects.create(  # creates object and saves into db
        name=k,
        slug=k,
        filename=k + ".html",
        entrance_description="Dummy entrance: auto-created when registering a new cave "
        + "and you forgot to create an entrance for it. Click on 'Edit' to enter the correct data, then 'Submit'.",
        marking="?",
    )
    if ent:
        return ent

    message = f" ! {k:11s} {msg} - {slug} {k} entrance create failure"
    DataIssue.objects.create(parser="entrances", message=message, url=f"{slug}")
    print(message)
    # A bare `raise` is only valid while an exception is being handled; here it would
    # surface as "RuntimeError: No active exception to re-raise". Raise the same
    # exception type explicitly, carrying the message. Caught and handled by calling routine.
    raise RuntimeError(message)
2023-01-19 21:18:42 +00:00
2021-04-26 17:23:23 +01:00
2023-03-28 19:08:05 +01:00
2021-04-26 17:23:23 +01:00
def set_dummy_entrance(id, slug, cave, msg="DUMMY"):
    """Record that a cave has no usable Entrance; no dummy Entrance is created any more.

    Called only when reading the cave and entrance html files, when the Entrance field
    in a cave_data file is either missing or holds a null string instead of a filename.

    Previously the lack of an entrance, where one was expected, caused troggle to crash
    in several places. It is more robust now, so creating a dummy is not necessary...
    we hope. Cave and Entrance editing also expects there to be a real file (since
    April 2023), so creating a dummy is actually harmful. All this function does now
    is log a DataIssue against the cave's url.

    Args:
        id: Identifier of the missing entrance, quoted in the message.
        slug: Not used.
        cave: Cave object; its string form goes in the message and its `url` on the DataIssue.
        msg: Not used.
    """
    note = f" - Note: Missing Entrance for entrance '{id}' on cave '{cave}' - Is this a problem?"
    issue_url = f"{cave.url}"
    DataIssue.objects.create(parser="entrances", message=note, url=issue_url)
2023-01-19 21:18:42 +00:00
2023-09-25 23:10:50 +01:00
def add_cave_to_pending_list(id, wallet, message):
    """Log (once) a 'wallets' DataIssue that links to the wallet's edit page.

    Typical call: (id, wallet, f"Wallet {wallet} - Could not find id <{id}>")

    Args:
        id: The unrecognised cave id. Not used here; callers put it in `message`.
        wallet: Object with a `walletname` attribute; any '#' in the name becomes ':' in the url.
        message: Text stored on the DataIssue.
    """
    edit_url = "/walletedit/{}".format(wallet.walletname)
    edit_url = edit_url.replace("#", ":")
    DataIssue.objects.update_or_create(parser="wallets", message=message, url=edit_url)
2023-09-25 23:10:50 +01:00
2023-09-15 20:41:40 +01:00
def create_new_cave(svxpath, svxid=None, msg=None):
    """Create a pending Cave for a survex path that does not match any known cave.

    This is called when a new survex file is edited online which has a path on the
    :loser: repo which is not recognised as a known cave.
    ALSO called by the survex parser when it finds a cave it doesn't recognise.

    Args:
        svxpath: Survex path without the suffix, e.g. "caves-1623/666/beast"
            (from the *include tree). The area code is taken from the last four
            characters of the first component, the cave id from the second component.
        svxid: Optional survex id, e.g. "caves-1623/666/beast". Tried before svxpath
            when looking for the survex file on disc.
        msg: Free text appended to log messages.

    Returns:
        An existing Cave if one already has this unofficial_number + areacode, or this
        url; otherwise the newly created pending Cave.

    Raises:
        Whatever do_pending_cave() raises: it is logged as a DataIssue and re-raised.
    """
    print(f"Create new cave at {svxpath} - {msg}")

    # Use the first candidate that really exists on disc. is_file is a method: testing
    # the bare attribute (without calling it) is always true, so missing files went unnoticed.
    survex_file = ""
    for candidate in (svxid, svxpath):
        if candidate and Path(settings.SURVEX_DATA, candidate + ".svx").is_file():
            survex_file = candidate + ".svx"
            break
    if not survex_file:
        message = f"NOT found a survex file {svxpath=} {svxid=}"
        DataIssue.objects.create(parser="caves", message=message)
        print(message, file=sys.stderr)
        print(message)

    parts = svxpath.split("/")
    a = parts[0][-4:]
    caveid = parts[1]
    print(f"parts {parts}, {a}, {caveid}")
    # double check
    if a[0:3] == "162":
        areacode = a[0:4]
    else:
        print(f"WARNING: parsers/caves/create_new_cave called with svxpath '{svxpath}'. Surely it should start 'caves-162*'? {msg}")
        areacode = "1623"
    # Note we are appending the .html as we believe in backwards compatibility.
    # This is to fix Martin's new 2023 app.
    url = f"{areacode}/{caveid}/{caveid}.html"

    k = f"{areacode}-{caveid}"

    caves = Cave.objects.filter(unofficial_number=caveid, areacode=areacode)
    if caves:
        message = f" ! Already exists, caveid: {k} in areacode {areacode} {caves} - {msg}"
        DataIssue.objects.create(parser="caves", message=message)
        print(message)
        return caves[0]

    urltest = Cave.objects.filter(url=url)
    if urltest:
        # The cave being created is identified by k; no name `slug` exists in this function.
        message = f" ! Cave {urltest[0]} already exists with this url {url}. Can't create new cave {k} from {svxpath}"
        DataIssue.objects.create(parser="caves", message=message, url=url)
        print(message)
        return urltest[0]

    try:
        cave = do_pending_cave(k, caveid, url, areacode, msg)
    except Exception:
        message = f" ! Error. Cannot create pending cave and entrance, pending-id: {k} in area {areacode} - {msg}"
        DataIssue.objects.create(parser="caves", message=message)
        print(message)
        raise

    # NOTE(review): do_pending_cave() returns None when the slug is already in
    # GetCaveLookup(); the attribute assignments below would then fail - confirm
    # whether that case can reach here.
    # But this sets the survex file on the Cave from the first one we find, not
    # necessarily the best survex file for this cave
    cave.survex_file = survex_file  # primary survex file for Cave
    cave.areacode = areacode
    cave.save()
    return cave
2023-01-19 21:18:42 +00:00
2023-09-11 20:42:12 +01:00
def do_ARGE_cave(slug, caveid, areacode, svxid):
    """Create a Cave with abbreviated data, as the survex file (from ARGE) is all we have.

    Only called by the survex parser, NOT the cave parser.
    We already know the survex file.
    We already know that the cave doesn't exist... though there are bugs..

    Assumes anything in the ARGE list of cave areas is ARGE, which is not true for 1626...

    Args:
        slug: Cave slug; only used in log messages.
        caveid: May be a kataster number or it may be e.g. LA34.
        areacode: Area code, e.g. "1626".
        svxid: Survex id without the ".svx" suffix, relative to settings.SURVEX_DATA.

    Returns:
        An existing Cave if one already has this url, or this areacode together with
        the upper-cased caveid as unofficial_number; otherwise the new, saved Cave.
    """
    default_note = "This is (probably) an ARGE cave where we only have the survex file and no other information"
    url = f"{areacode}/{caveid}/{caveid}.html"

    urltest = Cave.objects.filter(url=url)
    if urltest:
        message = f" ! Cave {urltest[0]} already exists with this url {url}. Can't create new ARGE cave {slug} from {svxid}"
        DataIssue.objects.create(parser="caves", message=message, url=url)
        print(message)
        return urltest[0]

    numtest = Cave.objects.filter(unofficial_number=caveid.upper(), areacode=areacode)
    if numtest:
        message = f" ! Cave {numtest[0]} already exists with this areacode {areacode} and unofficial_number {caveid.upper()}. Can't create new ARGE cave {slug}"
        DataIssue.objects.create(parser="caves", message=message, url=url)
        print(message)
        return numtest[0]

    # Defaults, so that a missing survex file gives an empty extract rather than a NameError below.
    line1 = line2 = line3 = ""
    rest = []
    sv = Path(settings.SURVEX_DATA, svxid + ".svx")
    if sv.is_file():  # must be called: the bare method object is always truthy
        with open(sv, "r") as s:
            line1 = s.readline()
            line2 = s.readline()
            line3 = s.readline()
            rest = s.readlines()
    else:
        print(f"not correct svxid {svxid} {sv}", file=sys.stderr)

    print(f"{caveid} {rest}")
    # Every *begin line after the first three lines, each keeping its own line ending
    passages = "\n" + "".join(line for line in rest if line.strip().startswith("*begin"))

    cave = Cave(
        underground_description="ARGE cave.<br>3 lines of the survexfile, then all the *begin lines:<br><pre>"
        + line1
        + line2
        + line3
        + passages
        + "</pre>",
        unofficial_number="ARGE",
        survex_file=f"{svxid}.svx",
        url=url,
        notes=default_note,
        areacode=areacode,
    )
    if not cave:
        # NOTE(review): presumably unreachable, as this tests a freshly constructed object; kept as a safety net.
        message = f" ! {slug:11s} ARGE cave create failure {caveid=} {url=} {areacode=} {svxid=}"
        DataIssue.objects.create(parser="caves", message=message)
        print(message)
        return None

    try:
        cave.kataster_number = int(caveid)  # only set when caveid converts to an integer
    except ValueError:
        # must be unofficial 'number' or name
        cave.unofficial_number = caveid
    cave.save()
    return cave
2023-09-10 13:42:36 +01:00
def do_pending_cave(slug, caveid, url, areacode, msg=None):
    """Create and save a placeholder Cave for a cave id which has no cave description yet.

    Default for a PENDING cave: should be overwritten in the db later if a real cave of
    the same name exists in expoweb/cave_data/1623-"k".html

    Note that at this point (parsing caves) in importing the data we have not yet seen
    the survex files, so we can't look inside the relevant survex file to find the year
    and so we can't provide helpful links.
    This also gets called when parsing survex files, when we do have this info.

    Args:
        slug: Cave slug, e.g. "1623-666"; looked up in GetCaveLookup() and used to guess a survex file.
        caveid: Stored as the unofficial_number of the new Cave.
        url: Stored on the new Cave and on the DataIssue records.
        areacode: Stored on the new Cave; also used to build the candidate survex paths.
        msg: Free text appended to log messages.

    Returns:
        None if slug is already in GetCaveLookup(); the existing Cave if one already
        has this url; otherwise the newly saved Cave.
    """

    def get_survex_file(k):
        """Guesses at and finds a survex file for this pending cave.

        Convoluted. Needs rewriting.
        Pointless if this cave is being created because we found a survex file...

        One problem is that the Cave name may have different capitalisation from the
        survex filename, e.g. 2018-NTU-02 has a survex file 2018-ntu-02.svx

        Returns the path relative to settings.SURVEX_DATA, or "" if nothing was found.
        """
        # Strip a leading "162x-" area prefix to leave just the cave id
        if k[0:3] == "162":
            id = Path(k[5:])
        else:
            id = Path(k)

        # First guess: <id>/<id>.svx, second guess: <id>.svx, both under caves-<areacode>/
        survex_file = f"caves-{areacode}/{id}/{id}.svx"
        if Path(settings.SURVEX_DATA, survex_file).is_file():
            return survex_file
        else:
            survex_file = f"caves-{areacode}/{id}.svx"
            if Path(settings.SURVEX_DATA, survex_file).is_file():
                return survex_file

        # This should find the file even if the capitalisation is different, or if the directory name is totally different
        survex_file = ""
        d = Path(settings.SURVEX_DATA, f"caves-{areacode}/{id}")
        if d.is_dir():
            prime_suspect = ""
            dir = d.iterdir()
            for f in dir:
                if f.suffix == ".svx":
                    # NOTE(review): relative_to() gives a Path, whereas the early returns above give a str
                    survex_file = f.relative_to(settings.SURVEX_DATA)
                    # Compare (at most) the first 5 characters, case-insensitively
                    chk = min(5, len(f.name) - 1)
                    if str(f.name)[:chk].lower() == str(id.name)[:chk].lower():  # bodge which mostly works
                        prime_suspect = survex_file
            if prime_suspect:
                survex_file = prime_suspect
                # message = f" ! {k:14} Found a survex file which might be the right one: {survex_file} - {msg}"
                # DataIssue.objects.create(parser='caves', message=message, url=url)
                # print(message)
            if Path(settings.SURVEX_DATA, survex_file).is_file():
                return survex_file
        return ""

    g = GetCaveLookup()
    # NOTE(review): the extent of this atomic block is reconstructed from a listing
    # which had lost its indentation - confirm against the repository copy.
    with transaction.atomic():
        if slug in g:
            message = f" ! {slug:18} cave listed in pendingcaves.txt already exists. - {msg}"
            DataIssue.objects.create(parser="caves", message=message, url=url)
            print(message)
            return

        # HTML instructions stored in the 'notes' field of the new Cave
        default_note = "A reference has been found to this cave id in a survex file in the loser repo, or in a wallet metadata"
        default_note += " in a JSON file in the drawings repo, but no Cave Description exists in expoweb (in /cave_data/)<br><br><br>\n"
        default_note += "INSTRUCTIONS: FIRST read the notes in <a href='/cave_data/pendingcaves.txt'>pendingcaves.txt</a><br />"
        default_note += "Next open 'This survex file' (link above the CaveView panel) to find the date and info. Then "
        default_note += '<br><br>\n\n - (0) look in the <a href="/noinfo/cave-number-index">cave number index</a> for notes on this cave, '
        default_note += "<br><br>\n\n - (1) search in the survex file for the *ref to find a "
        default_note += "relevant wallet, e.g.<a href='/survey_scans/2009%252311/'>2009#11</a> and read the notes image files <br>\n - "
        default_note += (
            "<br><br>\n\n - (2) search in the Expo for that year e.g. <a href='/expedition/2009'>2009</a> to find a "
        )
        default_note += "relevant logbook entry, remember that the date may have been recorded incorrectly, "
        default_note += (
            "so check for trips i.e. logbook entries involving the same people as were listed in the survex file, "
        )
        default_note += (
            "and you should also check the scanned copy of the logbook (linked from each logbook entry page) "
        )
        default_note += "just in case a vital trip was not transcribed, then <br>\n - "
        default_note += (
            "click on 'Edit this cave' and copy the information you find in the survex file and the logbook"
        )
        default_note += " and delete all the text in the 'Notes' section - which is the text you are reading now."
        default_note += "<br><br>\n\n - Only two fields on this form are essential. "
        default_note += "Documentation of all the fields on 'Edit this cave' form is in <a href='/handbook/survey/caveentryfields.html'>handbook/survey/caveentryfields</a>"
        default_note += "<br><br>\n\n - "
        default_note += "You will also need to create a new entrance from the 'Edit this cave' page. Ignore the existing dummy one, it will evaporate on the next full import."
        default_note += "<br><br>\n\n - "
        default_note += "When you Submit it will create a new file in expoweb/cave_data/ "
        default_note += (
            "<br><br>\n\n - Now you can edit the entrance info: click on Edit below for the dummy entrance. "
        )
        default_note += "and then Submit to save it (if you forget to do this, a dummy entrance will be created for your new cave description)."
        default_note += "<br><br>\n\n - Finally, you need to find a nerd to edit the file '<var>expoweb/cave_data/pending.txt</var>'"
        default_note += (
            f" to remove the line <br><var>{slug}</var><br> as it is no longer 'pending' but 'done. Well Done."
        )

        # Refuse to create a second Cave with the same url: hand back the existing one
        urltest = Cave.objects.filter(url=url)
        if urltest:
            message = f" ! Cave {urltest[0]} already exists with this url {url}. Can't create new cave {slug}"
            DataIssue.objects.create(parser="caves", message=message, url=url)
            print(message)
            return urltest[0]

        survex_file = get_survex_file(slug)

        cave = Cave(
            unofficial_number=caveid,
            underground_description="Pending cave write-up - No cave description created yet.",
            survex_file=survex_file,
            url=url,
            notes=default_note,
            areacode=areacode,
        )
        # NOTE(review): this tests a freshly constructed object, so the else branch is presumably unreachable - confirm
        if cave:
            cave.save()  # must save to have id before foreign keys work. This is also a ManyToMany key.
            message = f" ! {slug:18} Pending cave write-up url: {url} - {survex_file=} - {msg}"
            DataIssue.objects.create(parser="caves", message=message, url=url)
            print(message)

        else:
            message = f" ! {slug:11s} PENDING cave create failure - {msg}"
            DataIssue.objects.create(parser="caves", message=message)
            print(message)
    return cave
2021-04-26 02:10:45 +01:00
2023-04-22 01:24:32 +01:00
def getXML(text, itemname, minItems=1, maxItems=None, context=""):
    """Return the contents of every <itemname>...</itemname> element found in text.

    Should throw an exception rather than producing an error message here, then handle
    the exception in the calling routine where it has the context.

    This always succeeds, but when the number of elements found is outside the expected
    range it produces an error message on the terminal and in the DataIssues log.

    Args:
        text: The text to search.
        itemname: Tag name. It is matched literally: regex metacharacters are escaped.
        minItems: Fewest elements expected.
        maxItems: Most elements expected, or None for no upper limit.
        context: Usually the filename; appended to messages and used as the DataIssue url.

    Returns:
        A list of the element contents, in document order. It may be longer than
        maxItems or shorter than minItems (these are only reported), except that when
        minItems is 0 and nothing was found the result is [""] so callers can index [0].
    """

    def report(problem):
        # Log one count problem to the DataIssues table and to the terminal
        message = f" ! {len(items)} x {itemname} found, {problem}. Load may ABORT. in file {context}"
        DataIssue.objects.create(parser="caves", message=message, url=context)
        print(message)

    tag = re.escape(itemname)
    # DOTALL so that element contents may span several lines; non-greedy so that
    # repeated elements are returned separately.
    items = re.findall(f"<{tag}>(.*?)</{tag}>", text, re.DOTALL)

    if len(items) < minItems:
        report(f"at least {minItems} expected")
    if maxItems is not None and len(items) > maxItems:
        report(f"no more than {maxItems} expected in this XML unit")

    if minItems == 0 and not items:
        items = [""]
    return items
2023-04-22 03:26:53 +01:00
def boolify(boolstrs):
    """Convert the first string of a list to a bool.

    Only "True", "False", "true" and "false" are recognised; any other value raises
    KeyError, and an empty list raises IndexError.
    """
    recognised = {}
    recognised["True"] = recognised["true"] = True
    recognised["False"] = recognised["false"] = False
    first = boolstrs[0]
    return recognised[first]
2023-07-25 22:14:46 +01:00
def validate_station(station):
    """Check that a survex station identifier has a usable format.

    It is possible to break troggle entirely by getting this wrong.
    These station identifiers are matched against other stations using .endswith()
    in parsers/locations.py, so a simple number here will match hundreds of
    SurvexStation objects. It should be, e.g. "1623.p240"

    We CANNOT test against locations here as we have not read the survex files yet;
    the identifiers are tested against survex stations after those have been loaded.
    So all that is checked is that the name contains a full stop.

    Args:
        station: The identifier to check. An empty string (no station given) is accepted.

    Returns:
        True if the identifier is empty or contains a full stop.

    Raises:
        RuntimeError: if a non-empty identifier contains no full stop.
    """
    if station == "" or "." in station:
        return True
    # no full stop found. Bad station identifier.
    # should just skip really, and log an error
    # A bare `raise` with no exception being handled only produces "RuntimeError: No
    # active exception to re-raise"; raise the same type with a useful message instead.
    raise RuntimeError(f"Bad station identifier '{station}': no full stop found, should be like '1623.p240'")
2023-04-22 03:26:53 +01:00
2023-04-22 22:05:12 +01:00
def read_entrance(filename, ent=None):
    """Reads an entrance description from the .html file and saves it as an Entrance.

    Runs on initial full import, and also whenever an entrance is edited online.
    The entrance slug is always derived from the filename; a <slug> element in the
    file is only checked against it.

    EDIT href examples
        /1623-1:1623-1a_entrance_edit
        /1623/1/1623-1_cave_edit/

    Args:
        filename: The name of the entrance_data .html file, e.g. 1623-JS-01a.html
        ent: The entrance object, if it already exists.

    Returns:
        The saved entrance object (fetched or created by slug if `ent` is None), or
        None if the file does not hold exactly one <entrance> element.
    """
    # Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
    with open(os.path.join(ENTRANCEDESCRIPTIONS, filename)) as f:
        contents = f.read()
    context = filename

    # Derive the entrance slug and cave slug from the filename
    entslug_fn = filename[:-5]  # remove .html
    if entslug_fn[-1] in LETTERS:
        caveslug_fn = entslug_fn[:-1]  # drop the trailing entrance letter
    else:
        caveslug_fn = entslug_fn
    ent_area = filename[:4]
    cave_name = caveslug_fn[5:]  # remove initial 1623-
    ent_edit_url = f"/{caveslug_fn}:{entslug_fn}_entrance_edit"
    cave_edit_url = f"/{ent_area}/{cave_name}/{cave_name}_cave_edit"

    # validate filename, check areacode
    if ent_area not in AREACODES:
        message = f' ! BAD AREA CODE in "{filename}". Not recognised.'
        DataIssue.objects.create(parser="entrances", message=message, url=ent_edit_url)
        print(message)

    # New system 2024, create the Cave object when parsing Entrances, not Caves.
    # Called for its effect on the db; the returned object is not needed here.
    make_cave(caveslug_fn)

    entrancecontentslist = getXML(contents, "entrance", maxItems=1, context=context)
    if len(entrancecontentslist) != 1:
        message = f' ! BAD ENTRANCE DATA in "{filename}". More than one entrance. Edit file manually, click.'
        DataIssue.objects.create(parser="entrances", message=message, url=ent_edit_url)
        print(message)
        return None
    entrancecontents = entrancecontentslist[0]

    def getXMLmin0(field):
        # optional field: [""] if absent
        return getXML(entrancecontents, field, minItems=0, maxItems=1, context=context)

    def getXMLmax1(field):
        # field expected exactly once
        return getXML(entrancecontents, field, maxItems=1, context=context)

    slugs = getXMLmin0("slug")  # not the full slug, just the id: i.e. without the 1623- prefix
    # we ignore all these, because we now just use the filename. But if they are there, we validate them.
    slug = slugs[0] if slugs else ""
    if len(slugs) > 1:
        # Only ever one of these per entrance in the expo dataset
        message = f" ! - More than one slug for an entrance: {entslug_fn}, slugs: {slugs}. Ignoring all of them."
        DataIssue.objects.create(parser="entrances", message=message, url=cave_edit_url)
        print(message)
    if slug and slug != entslug_fn:
        message = f" ! - Warning, mismatch between entrance slug (or unofficial name) and filename: {slug=} {filename=}. Ignoring slug field, using filename."
        DataIssue.objects.create(parser="xEntrances", message=message, url=cave_edit_url)
        print(message)
    slug = entslug_fn  # force

    lastvisit = getXML(entrancecontents, "lastvisit", maxItems=1, minItems=0, context=cave_edit_url)
    lat_wgs84 = getXML(entrancecontents, "lat_wgs84", maxItems=1, minItems=0, context=cave_edit_url)
    long_wgs84 = getXML(entrancecontents, "long_wgs84", maxItems=1, minItems=0, context=cave_edit_url)

    alt = getXMLmax1("alt")
    approach = getXMLmax1("approach")
    bearings = getXMLmax1("bearings")
    entrance_description = getXMLmax1("entrance_description")
    explorers = getXMLmax1("explorers")
    findability = getXMLmax1("findability")
    findability_description = getXMLmax1("findability_description")
    location_description = getXMLmax1("location_description")
    # map_description = getXMLmax1("map_description")
    marking = getXMLmax1("marking")
    marking_comment = getXMLmax1("marking_comment")
    name = getXMLmax1("name")
    non_public = getXMLmax1("non_public")
    other_description = getXMLmax1("other_description")
    other_station = getXMLmax1("other_station")
    photo = getXMLmax1("photo")
    tag_station = getXMLmax1("tag_station")
    underground_description = getXMLmax1("underground_description")

    if not ent:
        ent, _created = Entrance.objects.update_or_create(slug=slug)

    # NOTE(review): a field missing from the file gives an empty list from getXMLmax1,
    # so the [0] below raises IndexError - presumably the "Load may ABORT" of the getXML message.
    ent.name = name[0]
    ent.non_public = boolify(non_public)
    ent.alt = alt[0]
    ent.approach = approach[0]
    ent.bearings = bearings[0]
    ent.lat_wgs84 = lat_wgs84[0]
    ent.long_wgs84 = long_wgs84[0]
    ent.entrance_description = entrance_description[0]
    ent.explorers = explorers[0]
    ent.filename = filename
    ent.findability = findability[0]
    ent.findability_description = findability_description[0]
    ent.lastvisit = lastvisit[0]
    ent.location_description = location_description[0]
    # ent.map_description = map_description[0]
    ent.marking = marking[0]
    ent.marking_comment = marking_comment[0]
    ent.other_description = other_description[0]
    ent.other_station = other_station[0]
    ent.photo = photo[0]
    # ent.slug = slugs[0]  # set algorithmically
    ent.tag_station = tag_station[0]
    ent.underground_description = underground_description[0]

    for st in [ent.other_station, ent.tag_station]:
        try:
            validate_station(st)
        except Exception:
            # A badly formed station is reported but does not stop the entrance being saved
            message = f" ! BAD ENTRANCE TAG '{st}' in '{filename}'. Must format like '1623.p204'. Edit file manually, click."
            # e.g. http://localhost:8000/1623/2023-EBH-01/1623-2023-EBH-01:1623-2023-EBH-01_entrance_edit
            DataIssue.objects.create(parser="entrances", message=message, url=ent_edit_url)
            print(message)
    ent.save()
    return ent
2012-06-10 16:56:12 +01:00
2024-07-02 18:01:15 +01:00
def make_cave(slug):
    """Return the Cave object stored under the filename "<slug>.html", creating it if needed.

    When we have multiple entrances, the Cave object may already exist, so this uses
    update_or_create() keyed on the filename.

    slug -- cave slug such as "1623-161"; the first four characters are used as the area
            code when building the edit-page url attached to any DataIssue.

    Returns the Cave object. If update_or_create() fails (e.g. several rows share the
    filename) every matching row is reported as a DataIssue and the last one with a
    non-None slug is returned; None is returned if there is no such row.
    """
    filename = f"{slug}.html"
    # Edit-page link for DataIssues; same shape as the 'context' built in read_cave().
    context = f"/{slug[:4]}/{slug}_cave_edit"
    cave = None
    try:
        cave, _created = Cave.objects.update_or_create(filename=filename)  # replace with slug when CaveSlug tidied up
    except Exception:
        print(f" ! FAILED to get only one CAVE in db when updating using: {filename} or not found.")
        # Evaluate the queryset once; assumes filename unique, eeugh.
        kaves = list(Cave.objects.all().filter(filename=filename))
        for k in kaves:
            message = " ! - DUPLICATES in db. kataster:" + str(k.kataster_number) + ", slug:" + str(k.slug())
            DataIssue.objects.create(parser="caves", message=message, url=context)
            print(message)
        for k in kaves:
            if k.slug() is not None:
                print(" ! - OVERWRITING this one: slug:" + str(k.slug()))
                # notes may be unset on the row, so guard the concatenation
                k.notes = "DUPLICATE kataster number found on import. Please fix\n" + (k.notes or "")
                cave = k
    return cave
2024-06-28 16:31:23 +01:00
def read_cave(filename, mvf=None, cave=None):
    """Read one cave description from cave_data/<filename> and update its Cave object.

    Convoluted. Sorry. Needs rewriting.

    The cave slug is taken from the filename (the trailing ".html" is stripped).

    filename -- name of the file inside settings.CAVEDESCRIPTIONS, e.g. "1623-161.html"
    mvf      -- optional writable file handle for misnamed files: a "mv <old> <new>" line is
                written to it when the filename does not match the slug computed from the
                file contents. May be None, in which case nothing is written.
    cave     -- the Cave object to update (online edit). When not given (bulk import) the
                Cave is looked up in the database by filename.

    Returns the saved Cave object; None if the file is missing or does not hold exactly one
    <cave> element; False if no single matching Cave object is found in the database.

    If not called as part of initial import, then the global lists will not be correct
    but this is OK, a search will find them in the db.

    Attempted to use the standard python3.11 xml library (xml.etree.ElementTree) but it
    fails on HTML entities (2023-04-23).
    """

    def getXMLmin0(field):
        """Return the list of values of an optional tag (at most one) inside the <cave> element."""
        return getXML(cavecontents, field, minItems=0, maxItems=1, context=context)

    def getXMLmax1(field):
        """Return the list of values of a tag (at most one) inside the <cave> element.

        Currently identical to getXMLmin0().
        """
        return getXML(cavecontents, field, minItems=0, maxItems=1, context=context)

    def do_entrances():
        """For both bulk import and individual re-reading of cave_data file,
        fix the entrances.

        What is Class CaveAndEntrance for? It was to allow many <=> many relationship between
        caves and entrances, but now we insist only one Cave for any Entrance, so this Class is
        redundant and should be removed.
        """
        for e in entrances:
            eslugs = getXML(e, "entranceslug", maxItems=1, context=context)
            if len(eslugs) < 1:
                print(f"TYPO IN cave_data file <entrance> tag contents\n <entrance> contents: {e}\n {eslugs=}")
                # Nothing to link for this <entrance>; indexing eslugs[0] would raise IndexError.
                continue
            eslug = eslugs[0]

            letter = getXML(e, "letter", maxItems=1, context=context)[0]
            if len(entrances) > 1 and letter == "":
                # Usually the second entrance is 'b', but the first is still unlettered. So probably 'a'
                letter = eslug[-1].lower()
                if letter.lower() not in list(string.ascii_lowercase):
                    letter = "a"
                message = f" - Warning - Empty 'letter' field for '{eslug}' in multiple-entrance cave '{cave}', setting to {letter}."
                # edit recognizer: (?P<path>.*)/(?P<slug>[^/]+)_cave_edit/$
                # Built as a string: a Path cannot be concatenated to a str with '+'.
                eurl = f"{Path(cave.url).parent.as_posix()}/{cave.slug()}_cave_edit/"
                DataIssue.objects.create(parser="entrances", message=message, url=eurl)
                print(message)

            if len(entrances) == 1 and not eslug:  # may be empty: <entranceslug></entranceslug>
                msg = "DUMMY: no entrance slug read from file, so assume textually same as cave slug"
                set_dummy_entrance(slug[5:], slug, cave, msg=msg)
                print(f" ! {msg}\n - {slug} {cave}")
                continue

            if eslug in entrances_xslug:
                entrance = entrances_xslug[eslug]
            else:
                try:
                    entrance = Entrance.objects.get(slug=eslug)
                    entrances_xslug[eslug] = entrance
                except Exception:
                    message = f" ! Fail entrance loading {eslug} /entrance_data/{eslug} file does not exist or loading it from {filename} failed."
                    DataIssue.objects.create(parser="entrances", message=message, url=f"{cave.url}_cave_edit/")
                    print(message)
                    print(e)
                    # Gives up on the remaining entrances of this cave too (existing behaviour).
                    return

            if eslug != f"{entrance}":
                message = f"eslug {eslug} using different entrance {entrance} to set CaveAndEntrance"
                DataIssue.objects.create(
                    parser="xEntrances", message=message, url=f"{cave.areacode}/{cave.areacode}-{cave.url}_cave_edit/"
                )
                print(message)
            try:
                # this fails if there is not an unambiguous letter set.
                CaveAndEntrance.objects.update_or_create(cave=cave, entranceletter=letter, entrance=entrance)
            except Exception:
                print(f" ! Entrance setting failure {slug}")
                message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} letter:"{letter}" cave:"{cave}" filename:"cave_data/{filename}"\n{e}'
                DataIssue.objects.create(
                    parser="entrances", message=message, url=f"{cave.areacode}/{cave.areacode}-{cave.url}_cave_edit/"
                )
                print(message)

    def reload_entrances():
        """For individual re-reading of a cave_data file when editing,
        also re-read the entrance_data files
        """
        for entrance in entrances_xslug.values():
            read_entrance(entrance.filename, ent=entrance)
            entrance.save()

    def do_caveslugstuff(context):
        """This may be a fossil. We only have one slug per cave in troggle.
        Pending destruction of this whole concept and Class CaveSlug.
        What is Class CaveSlug for?
        """
        return

    def check_directory(areacode, caveid, url, cave):
        """Raise a DataIssue if the cave's directory exists only with different capitalisation."""
        cave_dir = Path(settings.EXPOWEB, areacode, caveid)
        if cave_dir.is_dir():
            return
        dir_l = Path(settings.EXPOWEB, areacode, caveid.lower())
        dir_u = Path(settings.EXPOWEB, areacode, caveid.upper())
        if dir_l.is_dir() or dir_u.is_dir():
            message = f" ! Cave URL capitalisation incorrect '{cave_dir}' is not a directory but different capitalisation is. {url=}\n - Fix by renaming cave_data/{caveid}.html which determines the cave id OR by renaming the directory and hand-fixing all the links to the files in the cave description."
            DataIssue.objects.create(parser="caves", message=message, url=f"{cave.newslug()}_cave_edit/")
            print(message)
            return
        # Otherwise the directory does not exist at all. This is FINE. Many don't (yet).

    def check_slug(areacode, kataster_number, unofficial_number, url):
        """There is a <caveslug> field in the .html file, but we now ignore it as we use the
        filename itself to set the slug.
        However we do check it for sanity, if it is there, pending its removal eventually.

        Returns the slug unchanged when it equals "<areacode>-<number>"; otherwise records
        DataIssue(s), queues a "mv" line on mvf (if one was supplied) and returns the
        corrected slug.
        """
        if kataster_number:
            if slug == f"{areacode}-{kataster_number}":
                return slug
            message = f" ! Cave Slug mismatch (kataster): '{slug}' != '{areacode}-{kataster_number}' {url=} in file {filename}. IGNORING caveslug field in the .html file."
            correctslug = f"{areacode}-{kataster_number}"
        else:
            if slug == f"{areacode}-{unofficial_number}":
                return slug
            if slug.lower() == f"{areacode}-{unofficial_number.lower()}":
                message = f" ! Cave Slug capitalisation incorrect (unofficial): '{slug}' != '{areacode}-{unofficial_number}' {url=} in file {filename}. IGNORING caveslug field in the .html file."
                correctslug = slug.lower()
            else:
                message = f" ! Cave Slug mismatch (unofficial): '{slug}' != '{areacode}-{unofficial_number}' {url=} in file {filename} IGNORING caveslug field in the .html file."
                correctslug = f"{areacode}-{unofficial_number}"

        msgurl = f"/{correctslug[0:4]}/{correctslug}_cave_edit/"
        # url here is for the href link to edit the bad data in the DataIssues page
        DataIssue.objects.create(parser="caves", message=message, url=msgurl)
        mvtext = f"mv {filename} {correctslug}.html"
        if filename != f"{correctslug}.html":
            message = f" ! Filename is not the same as the cave slug '{slug}' != '{areacode}-{unofficial_number}' {url=} in file {filename} IGNORING caveslug field in the .html file."
            # url here is for where the file actually is, for editing
            DataIssue.objects.create(parser="caves", message=message, url=msgurl)
            if mvf is not None:  # mvf defaults to None when re-reading a single cave
                mvf.write(mvtext + "\n")
        print(message)
        return correctslug

    # Note: we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
    # Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
    fn = settings.CAVEDESCRIPTIONS / filename

    if not fn.exists():
        message = f" ! Cave_data file reading problem filename:'cave_data/{filename}'"
        DataIssue.objects.create(parser="caves", message=message, url="")
        print(message)
        return None

    # ignore <caveslug>: the slug is read from the filename
    slug = filename[:-5]  # strip off the ".html" at the end of the filename
    areacode = slug[:4]
    context = f"/{areacode}/{slug}_cave_edit"

    with open(fn) as f:
        contents = f.read()
    cavecontentslist = getXML(contents, "cave", maxItems=1, context=context)

    if len(cavecontentslist) != 1:
        message = f' ! BAD CAVE DATA in "{filename}". More than one cave. Edit file manually, click.'
        DataIssue.objects.create(parser="caves", message=message, url="")
        print(message)
        return None
    cavecontents = cavecontentslist[0]

    if slug[-1].lower() in LETTERS:
        message = f" ! Cave name ends in a letter not a number. Fix this! in file {filename}"
        # url here is for where the file actually is, for editing
        DataIssue.objects.create(parser="caves", message=message, url=context)
        print(message)

    non_public = getXMLmax1("non_public")
    official_name = getXMLmax1("official_name")
    kataster_code = getXMLmax1("kataster_code")
    kataster_number = getXMLmax1("kataster_number")
    unofficial_number = getXMLmax1("unofficial_number")
    explorers = getXMLmax1("explorers")
    underground_description = getXMLmax1("underground_description")
    equipment = getXMLmax1("equipment")
    references = getXMLmax1("references")
    survey = getXMLmax1("survey")
    notes = getXMLmax1("notes")
    survex_file = getXMLmax1("survex_file")
    description_file = getXMLmax1("description_file")

    # Optional, but probably deprecated as we should just derive this from the survex data
    length = getXMLmin0("length")
    depth = getXMLmin0("depth")
    extent = getXMLmin0("extent")

    manual_edit = True
    if not cave:
        # we are parsing using databaseReset.py not an online edit
        manual_edit = False

        # The Cave object should already have been created when reading the entrance_data file
        caves = Cave.objects.filter(filename=filename)
        if len(caves) == 1:
            cave = caves[0]
        else:
            lowercased = Cave.objects.filter(filename=filename.lower())
            if len(lowercased) == 1:
                cave = lowercased[0]
            else:
                print(f" * Cannot find single Cave object for cave_data/{filename} from entrance_data file. {len(caves)} found")
                return False

    # From here on the code applies to both edited and newly-imported caves (mostly!)
    do_caveslugstuff(context)  # needs cave!=None

    # We no longer need the <area> tag to define 1623 etc as we get that from the filename.
    areas = getXML(cavecontents, "area", context=context, minItems=0)  # can be multiple <area> tags
    for area_slug in areas:
        if area_slug not in AREACODES:  # only detect subareas
            cave.subarea = area_slug
    if not cave.areacode and areacode in AREACODES:
        cave.areacode = areacode

    cave.non_public = boolify(non_public)
    cave.official_name = official_name[0]
    cave.kataster_code = kataster_code[0]
    if "+" in kataster_code[0]:
        cave.fully_explored = True
    cave.kataster_number = kataster_number[0]
    cave.unofficial_number = unofficial_number[0]
    cave.explorers = explorers[0]
    cave.underground_description = underground_description[0]
    cave.equipment = equipment[0]
    cave.references = references[0]
    cave.survey = survey[0]
    cave.notes = notes[0]
    if length:
        cave.length = length[0]
    if depth:
        cave.depth = depth[0]
    if extent:
        cave.extent = extent[0]
    cave.survex_file = survex_file[0]
    cave.description_file = description_file[0]
    # the url is set algorithmically now:
    cave.url = f"{cave.areacode}/{cave.number()}/{cave.number()}.html"
    check_directory(cave.areacode, cave.number(), cave.url, cave)

    # NB cave.slug is not a field on Cave
    slug = check_slug(cave.areacode, cave.kataster_number, cave.unofficial_number, cave.url)

    # This whole way of doing entrances can be replaced by simply knowing from the
    # entrance_data filename what the cave is. to do.
    entrances = getXML(cavecontents, "entrance", context=context)
    if not entrances:
        # missing entrance link in cave_data/1623-* .html file
        set_dummy_entrance(slug[5:], slug, cave, msg="DUMMY: no entrances")
    else:
        do_entrances()
        if manual_edit:
            reload_entrances()

    if survex_file[0]:
        if not (Path(SURVEX_DATA) / survex_file[0]).is_file():
            message = f' ! {slug:12} survex filename does not exist :LOSER:"{survex_file[0]}" in "{filename}"'
            DataIssue.objects.create(parser="caves", message=message, url=f"/{slug[0:4]}/{slug}_cave_edit/")
            print(message)

    if description_file[0]:  # if not an empty string
        message = f' - {slug:12} Note (not an error): complex description filename "{description_file[0]}" inside "cave_data/{filename}"'
        DataIssue.objects.create(parser="caves ok", message=message, url=f"/{slug}_cave_edit/")

        if not (Path(EXPOWEB) / description_file[0]).is_file():
            message = f' ! {slug:12} description filename "{EXPOWEB}/{description_file[0]}" does not refer to a real file'
            DataIssue.objects.create(parser="caves", message=message, url=f"/{slug}_cave_edit/")
            print(message)

    cave.save()
    return cave
2023-08-02 16:23:04 +01:00
2022-07-23 17:26:47 +01:00
def readcaves():
    """Called from databaseReset mass importer.
    Reads the xml-format HTML 'cave' files in the EXPOWEB repo, the survex files from the loser repo.

    Steps, in order: read the pending-cave ids from pendingcaves.txt; delete the existing
    Cave and Entrance objects and the cave-related DataIssues; read every entrance_data file;
    read every cave_data/*.html file (writing suggested renames to /tmp/mvscript.sh);
    build the cave alias lookup; finally create the pending caves.

    Re-raises any exception from do_pending_cave() after recording it as a DataIssue.
    """
    # Pending is for those caves which do not have cave_data/1623-xxx.html XML files even though
    # they exist and have surveys.
    with transaction.atomic():
        pending = set()
        fpending = Path(CAVEDESCRIPTIONS, "pendingcaves.txt")
        if fpending.is_file():
            with open(fpending, "r") as fo:
                for cid in fo:
                    # split on ";" and take the first bit; strip() also removes the newline
                    pcaveid = cid.split(";", 1)[0].strip()
                    if pcaveid:
                        pending.add(pcaveid)

    with transaction.atomic():
        print(" - Deleting Caves and Entrances")
        # attempting to avoid MariaDB crash when doing this: each delete is best-effort,
        # but a failure is now reported instead of being silently discarded.
        # NOTE(review): Area is not among the names imported at the top of this file, so its
        # lookup presumably raises NameError here - confirm and drop that entry.
        deleters = (
            ("Area", lambda: Area.objects.all().delete()),
            ("Cave", lambda: Cave.objects.all().delete()),
            ("Entrance", lambda: Entrance.objects.all().delete()),
        )
        for label, delete_all in deleters:
            try:
                delete_all()
            except Exception as e:
                print(f" ! Could not delete all {label} objects: {e!r}")

        # Clear the cave data issues and the caves as we are reloading
        for parser in ("areas", "caves", "caves ok", "aliases", "aliases ok"):
            DataIssue.objects.filter(parser=parser).delete()
        # "entrances" and "xEntrances" DataIssues are deliberately not cleared here.

    with transaction.atomic():
        print(" - Reading Entrances from entrance descriptions xml files")
        # Should be a better way of getting a list of files, use pathlib !
        for filename in next(os.walk(ENTRANCEDESCRIPTIONS))[2]:
            read_entrance(filename)

    # Why is this needed ? Without it, we lose these DataIssues!
    ent_issues = DataIssue.objects.filter(parser="entrances")
    print(f" _ We now have {len(ent_issues)} entrance DataIssues")

    with transaction.atomic():
        print(" - Reading Caves from cave descriptions xml files")
        mvscript = "/tmp/mvscript.sh"  # in .gitignore so no problem creating it on server in /troggle/
        with open(mvscript, "w") as mvf:  # overwrite
            mvf.write(f"cd {CAVEDESCRIPTIONS}\n")
            # Should be a better way of getting a list of files
            for filename in next(os.walk(CAVEDESCRIPTIONS))[2]:
                if filename.endswith(".html"):
                    read_cave(filename, mvf)

    print(" - Setting up all the variously useful alias names")
    GetCaveLookup()

    print(" - Setting pending caves")
    # Do this last, so we can detect if they are created and no longer 'pending'
    with transaction.atomic():
        for k in pending:
            if k[0:3] == "162":
                areacode = k[0:4]
                number = k[5:]
                url = f"{areacode}/{number}.html"  # Note we are appending the .html to allow for offline websites
            else:
                areacode = "1623"
                number = k
                url = f"1623/{k}"

            try:
                do_pending_cave(k, number, url, areacode)
            except Exception:
                message = f" ! Error. Cannot create pending cave, pending-id:{k} in area {areacode}"
                DataIssue.objects.create(parser="caves", message=message)
                print(message)
                raise