2012-06-10 16:56:12 +01:00
import os
import re
2023-04-22 23:15:50 +01:00
2021-04-14 18:24:08 +01:00
from pathlib import Path
2012-06-10 16:56:12 +01:00
2020-05-24 01:57:06 +01:00
from django . conf import settings
2021-04-06 22:50:57 +01:00
from django . db import transaction
2020-05-24 01:57:06 +01:00
2023-03-28 20:30:00 +01:00
from troggle . core . models . caves import Area , Cave , CaveAndEntrance , Entrance , GetCaveLookup
2023-01-29 21:45:51 +00:00
from troggle . core . models . logbooks import CaveSlug
2021-04-13 00:43:57 +01:00
from troggle . core . models . troggle import DataIssue
2023-01-19 21:18:42 +00:00
from troggle . settings import CAVEDESCRIPTIONS , ENTRANCEDESCRIPTIONS , EXPOWEB , SURVEX_DATA
2022-07-25 00:57:00 +01:00
2023-04-22 01:24:32 +01:00
""" Reads all the cave description data and entrance description data
by parsing the xml files stored as e . g .
: EXPOWEB : / cave_data / 1623 - 161. html
or
: EXPOWEB : / entrance_data / 1623 - 161 g . html
2021-04-06 22:50:57 +01:00
and creating the various Cave , Entrance and necessary Area objects .
2022-07-27 21:24:40 +01:00
This is the first import that happens after the database is reinitialised .
2022-07-21 19:01:57 +01:00
So it is the first thing that creates tables.
2023-01-19 21:18:42 +00:00
"""
2012-06-10 16:56:12 +01:00
2023-01-19 21:18:42 +00:00
todo = """
2023-02-02 21:50:40 +00:00
- Cannot use Edit This Page for pendingcaves . txt_edit as Edit This Page is expecting an html file .
2022-03-05 12:20:26 +00:00
So we will need a separate file - editing capability just for this configuration file ? !
2022-07-19 17:48:11 +01:00
2023-04-22 22:05:12 +01:00
- we want to overwrite a PENDING cave if we are now importing the 1623 - xxx . html file for it
2023-04-22 23:27:06 +01:00
- rewrite archaic regex
re . findall ( " < %(itemname)s >(.*?)</ %(itemname)s > " % { " itemname " : itemname } , text , re . S )
in modern form and pre - compile it .
2023-02-02 21:50:40 +00:00
- Semi - automagically import all the 1627 - pending caves and create HTML files for them to be
edited individually . ( These are caves we only know about because we have German survex files . )
- crashes on MariaDB in databasereset . py on server when deleting Caves and complains Area needs a
non null parent , But this is not true . The only solution we have found is to let it crash , then
stop and restart MariaDB ( requires a logon able to sudo ) and then restart the databasereset . py
again . ( status as of July 2022 )
2023-01-19 21:18:42 +00:00
"""
2020-07-06 20:27:31 +01:00
# Module-level caches, filled in as files are parsed, to avoid repeated database lookups.
# entrances_xslug: entrance slug -> Entrance object (see read_cave).
entrances_xslug = {}
# caves_xslug: cave slug -> result of the CaveSlug update_or_create call (see read_cave).
caves_xslug = {}
# areas_xslug: area short name -> Area object (see read_cave).
areas_xslug = {}
2023-01-19 21:18:42 +00:00
2021-04-26 17:23:23 +01:00
def dummy_entrance(k, slug, msg="DUMMY"):
    """Create, save and return a placeholder Entrance.

    Used either for a PENDING cave or as a DUMMY entrance when the user forgot
    to provide one when creating the cave.

    Args:
        k: Used as the name and slug of the new entrance, and (with ".html"
            appended) as its filename.
        slug: Cave slug; only used in the failure message and DataIssue url.
        msg: Short label included in the failure message.

    Returns:
        The newly created Entrance object.

    Raises:
        Whatever ``Entrance.objects.create()`` raises. The failure is printed
        and recorded as a DataIssue first, then re-raised for the caller.
    """
    try:
        # objects.create() builds the object and saves it into the db in one step,
        # so it either returns a real object or raises: it never returns a falsy value.
        ent = Entrance.objects.create(
            name=k,
            slug=k,
            filename=k + ".html",
            entrance_description="Dummy entrance: auto-created when registering a new cave "
            + "and you forgot to create an entrance for it. Click on 'Edit' to enter the correct data, then 'Submit'.",
            marking="?",
        )
    except Exception:
        message = f" ! {k:11s} {msg} - {slug} {k} entrance create failure"
        print(message)
        DataIssue.objects.create(parser="entrances", message=message, url=f"{slug}")
        raise  # caught and handled by calling routine.
    return ent
2023-01-19 21:18:42 +00:00
2021-04-26 17:23:23 +01:00
2023-03-28 19:08:05 +01:00
2021-04-26 17:23:23 +01:00
def set_dummy_entrance(id, slug, cave, msg="DUMMY"):
    """Record that a cave has no usable Entrance; no dummy Entrance is created any more.

    Called only when reading the cave and entrance html files, when the Entrance
    field in a cave_data file is either missing or holds a null string instead
    of a filename.

    Previously the lack of an entrance where one was expected caused troggle to
    crash in several places, so a dummy Entrance was created here. Troggle is more
    robust now, and Cave and Entrance editing expects there to be a real file
    (since April 2023), so creating the dummy is actually harmful. The old code is
    kept below, commented out, pending removal after further experience.

    Args:
        id: Entrance identifier, used only in the message.
        slug: Cave slug (only used by the disabled code below).
        cave: Cave object; it is formatted into the message and its ``url``
            becomes the DataIssue url.
        msg: Unused while the dummy-creation code is disabled.
    """
    DataIssue.objects.create(
        parser="entrances",
        message=f" - Note: Missing Entrance for entrance '{id}' on cave '{cave}' - Is this a problem?",
        url=f"{cave.url}",
    )
    # Disabled dummy-entrance creation. The module-level entrances_xslug dict is simply
    # a cache of references to Entrance objects, to speed things up when parsing a lot
    # of caves and entrances, as all DB actions are time-consuming.
    # try:
    #     entrance = dummy_entrance(id, slug, msg="DUMMY")
    #     entrances_xslug[slug] = entrance
    #     CaveAndEntrance.objects.update_or_create(cave=cave, entrance_letter="", entrance=entrance)
    #     pass
    # except:
    #     message = f' ! Entrance Dummy setting failure, slug:"{slug}" cave id :"{id}" '
    #     DataIssue.objects.create(parser="entrances", message=message, url=f"{cave.url}")
    #     print(message)
2023-01-19 21:18:42 +00:00
2023-03-24 00:54:26 +00:00
def make_areas():
    """Create the four top-level Area objects 1623, 1624, 1626 and 1627, each with no parent.

    ``Area.objects.create()`` already saves the new object, so no separate
    ``save()`` call is made afterwards.
    """
    print(" - Creating Areas 1623, 1624, 1627 and 1626")
    # This crashes on the server with MariaDB even though a null parent is explicitly allowed.
    for short_name in ("1623", "1624", "1626", "1627"):
        Area.objects.create(short_name=short_name, super=None)


def get_area(areanum):
    """Given the area number as a string, return the matching Area object.

    If there are no Area objects at all yet, the four standard areas are created
    first. Any value other than "1623", "1624", "1626" or "1627" falls back to
    the default area "1623".

    Args:
        areanum: Area number as a string, e.g. "1626".

    Returns:
        The Area whose short_name is ``areanum``, or the "1623" Area by default.
    """
    # exists() asks the database a cheap yes/no question instead of loading every Area row
    if not Area.objects.exists():
        make_areas()
    short_name = areanum if areanum in ("1623", "1624", "1626", "1627") else "1623"  # 1623 is the default
    return Area.objects.get(short_name=short_name)


def create_new_cave(svxpath):
    """Create a pending Cave for a survex path which is not recognised as a known cave.

    This is called only when a new survex file is edited online which has a path
    on the :loser: repo which is not recognised as a known cave.

    Args:
        svxpath: Survex path without the ".svx" suffix, e.g. "caves-1623/666/antig".
            The last four characters of the first path component give the area
            and the second component gives the cave id.

    Returns:
        The existing Cave if one already has this unofficial_number, otherwise
        the newly created pending Cave. Returns None if do_pending_cave() declined
        to create a cave because the id is already in the cave lookup.

    Raises:
        Re-raises anything raised by do_pending_cave(), after logging a DataIssue.
    """
    # e.g. svxpath = "caves-1623/666/antig"
    print(f"Create new cave at {svxpath}")

    survex_file = svxpath + ".svx"
    parts = svxpath.split("/")
    a = parts[0][-4:]
    caveid = parts[1]
    print(f"parts {parts}, {a}, {caveid}")
    # double check
    if a[0:3] == "162":
        areanum = a[0:4]
    else:
        print(f"WARNING: parsers/caves/create_new_cave called with svxpath '{svxpath}'. Surely it should start 'caves-162*'? ")
        areanum = "1623"
    # Note we are appending the .html as we believe in backwards compatibility.
    # (Earlier code built the url from a[5:] or from k, which produced non-unique urls.)
    url = f"{areanum}/{caveid}.html"

    k = f"{areanum}-{caveid}"
    area = get_area(areanum)

    caves = Cave.objects.filter(unofficial_number=caveid)
    if caves:
        message = f" ! Already exists, caveid: {k} in area {areanum} {caves}"
        DataIssue.objects.create(parser="caves", message=message)
        print(message)
        return caves[0]

    try:
        cave = do_pending_cave(k, caveid, url, area)
    except Exception:
        message = f" ! Error. Cannot create pending cave and entrance, pending-id: {k} in area {areanum}"
        DataIssue.objects.create(parser="caves", message=message)
        print(message)
        raise

    if cave is None:
        # do_pending_cave() returns None (after logging its own DataIssue) when the id is
        # already present in the cave lookup, so there is no new cave to update here.
        message = f" ! Pending cave not created, id already known: {k} in area {areanum}"
        DataIssue.objects.create(parser="caves", message=message)
        print(message)
        return None

    # we know what the survex file is, we don't need to use the guess
    cave.survex_file = survex_file
    cave.save()
    return cave
2023-01-19 21:18:42 +00:00
2023-07-05 18:21:15 +01:00
def _pending_cave_note(slug):
    """Return the HTML instructions stored in the 'notes' field of a pending cave."""
    return (
        "_Survex file found in loser repo but no description in expoweb <br><br><br>\n"
        "INSTRUCTIONS: First open 'This survex file' (link above the CaveView panel) to find the date and info. Then "
        '<br><br>\n\n - (0) look in the <a href="/noinfo/cave-number-index">cave number index</a> for notes on this cave, '
        "<br><br>\n\n - (1) search in the survex file for the *ref to find a "
        "relevant wallet, e.g.<a href='/survey_scans/2009%252311/'>2009#11</a> and read the notes image files <br>\n - "
        "<br><br>\n\n - (2) search in the Expo for that year e.g. <a href='/expedition/2009'>2009</a> to find a "
        "relevant logbook entry, remember that the date may have been recorded incorrectly, "
        "so check for trips i.e. logbook entries involving the same people as were listed in the survex file, "
        "and you should also check the scanned copy of the logbook (linked from each logbook entry page) "
        "just in case a vital trip was not transcribed, then <br>\n - "
        "click on 'Edit this cave' and copy the information you find in the survex file and the logbook "
        "and delete all the text in the 'Notes' section - which is the text you are reading now. "
        "<br><br>\n\n - Only two fields on this form are essential. "
        "Documentation of all the fields on 'Edit this cave' form is in "
        "<a href='/handbook/survey/caveentryfields.html'>handbook/survey/caveentryfields</a> "
        "<br><br>\n\n - "
        "You will also need to create a new entrance from the 'Edit this cave' page. "
        "Ignore the existing dummy one, it will evaporate on the next full import. "
        "<br><br>\n\n - "
        "When you Submit it will create a new file in expoweb/cave_data/ "
        "<br><br>\n\n - Now you can edit the entrance info: click on Edit below for the dummy entrance. "
        "and then Submit to save it (if you forget to do this, a dummy entrance will be created for your new cave description). "
        "<br><br>\n\n - Finally, you need to find a nerd to edit the file '<var>expoweb/cave_data/pending.txt</var>' "
        f"to remove the line <br><var>{slug}</var><br> as it is no longer 'pending' but 'done. Well Done."
    )


def do_pending_cave(k, caveid, url, area):
    """Create and save a placeholder Cave for a PENDING cave.

    This is the default for a PENDING cave; it should be overwritten in the db
    later if a real cave of the same name exists in expoweb/cave_data/1623-"k".html

    Note that at this point in importing the data we have not yet seen the survex
    files, so we can't look inside the relevant survex file to find the year and
    so we can't provide helpful links.

    Args:
        k: Pending cave id, e.g. "1623-666"; also used as the cave slug.
        caveid: Stored as the cave's unofficial_number.
        url: Stored as the cave's url and used as the DataIssue url.
        area: Area object the new cave is added to; its short_name is also used
            to build the candidate survex file paths.

    Returns:
        The new Cave, or None if ``k`` is already present in the cave lookup
        (a DataIssue is logged in that case).
    """

    def get_survex_file(k):
        """Guesses at and finds a survex file for this pending cave.

        Tries caves-<area>/<id>/<id>.svx, then caves-<area>/<id>.svx, then any
        .svx file in the directory caves-<area>/<id>/, preferring one whose name
        starts like the id. Returns "" if nothing is found.
        Convoluted. Needs rewriting
        """
        if k[0:3] == "162":
            cave_id = Path(k[5:])  # strip the leading "162x-" area prefix
        else:
            cave_id = Path(k)

        survex_file = f"caves-{area.short_name}/{cave_id}/{cave_id}.svx"
        if Path(settings.SURVEX_DATA, survex_file).is_file():
            return survex_file

        survex_file = f"caves-{area.short_name}/{cave_id}.svx"
        if Path(settings.SURVEX_DATA, survex_file).is_file():
            return survex_file

        survex_file = ""
        cave_dir = Path(settings.SURVEX_DATA, f"caves-{area.short_name}/{cave_id}")
        if cave_dir.is_dir():
            prime_suspect = ""
            for f in cave_dir.iterdir():
                if f.suffix == ".svx":
                    # any .svx file will do as a fallback: the last one seen wins
                    survex_file = f.relative_to(settings.SURVEX_DATA)
                    chk = min(5, len(f.name) - 1)
                    if str(f.name)[:chk].lower() == str(cave_id.name)[:chk].lower():  # bodge which mostly works
                        prime_suspect = survex_file
            if prime_suspect:
                survex_file = prime_suspect
        return survex_file

    slug = k

    g = GetCaveLookup()
    with transaction.atomic():
        if slug in g:
            message = f" ! {k:18} cave listed in pendingcaves.txt already exists."
            DataIssue.objects.create(parser="caves", message=message, url=url)
            print(message)
            return None

        survex_file = get_survex_file(k)

        # Constructing a model instance always yields a truthy object, so there is no
        # "cave create failure" branch: a real failure surfaces as an exception from save().
        cave = Cave(
            unofficial_number=caveid,
            underground_description="Pending cave write-up - No cave description created yet.",
            survex_file=survex_file,
            url=url,
            notes=_pending_cave_note(slug),
        )
        cave.save()  # must save to have id before foreign keys work. This is also a ManyToMany key.
        cave.area.add(area)
        cave.save()
        message = f" ! {k:18} {cave.underground_description} url: {url}"
        DataIssue.objects.create(parser="caves", message=message, url=url)
        print(message)

        try:  # Now create a cave slug ID
            CaveSlug.objects.update_or_create(cave=cave, slug=slug, primary=False)
        except Exception:
            message = f" ! {k:11s} PENDING CaveSLUG {slug} create failure"
            DataIssue.objects.create(parser="caves", message=message)
            print(message)

        # troggle is more robust against missing entrances now, so no dummy entrance
        # is created for the pending cave any more.
    return cave
2021-04-26 02:10:45 +01:00
2023-04-22 01:24:32 +01:00
def getXML(text, itemname, minItems=1, maxItems=None, context=""):
    """Return the contents of every <itemname>...</itemname> element found in text.

    Should throw an exception rather than producing an error message here, then
    handle the exception in the calling routine where it has the context.
    As it is, this always succeeds, but when the number of items found is outside
    the expected range it produces an error message on the terminal and in the
    DataIssues log.

    Args:
        text: The string to search.
        itemname: Tag name, matched literally (regex metacharacters are escaped).
        minItems: Fewer items than this is reported as a DataIssue.
        maxItems: If not None, more items than this is reported as a DataIssue.
        context: Filename, used in the message and as the DataIssue url.

    Returns:
        List of the element contents, in document order. When minItems is 0 and
        nothing is found, returns [""] so that callers can always take item [0].
    """
    tag = re.escape(itemname)
    # re.S so that "." also matches newlines: element contents may span several lines
    items = re.findall(rf"<{tag}>(.*?)</{tag}>", text, re.S)

    if len(items) < minItems:
        message = (
            f" ! {len(items)} x {itemname} found, at least {minItems} expected. Load ABORT. "
            f"in file {context}"
        )
        DataIssue.objects.create(parser="caves", message=message, url=context)
        print(message)

    if maxItems is not None and len(items) > maxItems:
        message = (
            f" ! {len(items)} x {itemname} found, no more than {maxItems} expected in this XML unit. Load ABORT. "
            f"in file {context}"
        )
        DataIssue.objects.create(parser="caves", message=message, url=context)
        print(message)

    if minItems == 0 and not items:
        items = [""]
    return items
2023-04-22 03:26:53 +01:00
def boolify(boolstrs):
    """Convert the first string of a list, as returned by getXML(), into a bool.

    Case and surrounding whitespace are ignored, so "True", "true" and " TRUE "
    all give True.

    Args:
        boolstrs: List of strings; only the first item is looked at.

    Returns:
        True or False.

    Raises:
        KeyError: if the first item is not a recognised spelling of true or false.
        IndexError: if the list is empty.
    """
    return {"true": True, "false": False}[boolstrs[0].strip().lower()]
2023-07-25 22:14:46 +01:00
def validate_station(station):
    """Check that a survex station identifier has the dotted form, e.g. "1623.p240".

    It is possible to break troggle entirely by getting this wrong.
    These station identifiers are matched against other stations using .endswith()
    in parsers/locations.py, so a simple number here will match hundreds of
    SurvexStation objects.

    Args:
        station: The station identifier; an empty string means "no station" and is accepted.

    Returns:
        True if the identifier is empty or contains a full stop.

    Raises:
        ValueError: if a non-empty identifier contains no full stop.
    """
    if station == "":
        return True
    if "." not in station:
        # no full stop found. Bad station identifier.
        raise ValueError(f"Bad station identifier '{station}': expected a dotted name such as '1623.p240'")
    return True
2023-04-22 03:26:53 +01:00
2023-04-22 22:05:12 +01:00
def read_entrance(filename, ent=None):
    """Reads an entrance description from the .html file.

    If not called as part of initial import, then the global lists will not be correct
    but this is OK, a search will find them in the db.

    Args:
        filename: The name of the .html file in the entrance descriptions directory.
        ent: The entrance object, if it already exists.

    Returns:
        The saved entrance object (``ent`` if given, otherwise the one found or
        created for the slug in the file), or None if the file is unusable:
        not exactly one <entrance>, a missing slug or field, or a badly
        formatted station identifier. A DataIssue is logged in each case.
        Note that when None is returned because of a bad station, ``ent`` has
        already had its attributes set from the file but has not been saved.
    """
    edit_url = f"/entrance_data/{filename}_edit"

    def getXMLmax1(field):
        return getXML(entrancecontents, field, maxItems=1, context=context)

    def reject(message):
        """Log the problem as an entrances DataIssue and report it on the terminal."""
        DataIssue.objects.create(parser="entrances", message=message, url=edit_url)
        print(message)

    # Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
    with open(os.path.join(ENTRANCEDESCRIPTIONS, filename)) as f:
        contents = f.read()
    context = filename

    entrancecontentslist = getXML(contents, "entrance", maxItems=1, context=context)
    if len(entrancecontentslist) != 1:
        reject(
            f' ! BAD ENTRANCE DATA in "{filename}". Expected exactly one entrance, '
            f"found {len(entrancecontentslist)}. Edit file manually, click."
        )
        return None
    entrancecontents = entrancecontentslist[0]

    slugs = getXML(entrancecontents, "slug", context=context)
    if not slugs:
        reject(f' ! BAD ENTRANCE DATA in "{filename}". No slug found. Edit file manually, click.')
        return None
    slug = slugs[0]
    if len(slugs) > 1:
        # Only ever one of these per entrance in the expo dataset
        message = f" ! - More than one slug for an entrance: {filename}, slugs: {slugs}. Ignoring all except first."
        DataIssue.objects.create(parser="entrances", message=message, url=f"/cave/{slug}/edit/")
        print(message)

    # lastvisit is optional; every other field is expected exactly once (it may be empty).
    found = {"lastvisit": getXML(entrancecontents, "lastvisit", maxItems=1, minItems=0, context=context)}
    for field in (
        "alt",
        "approach",
        "bearings",
        "easting",
        "entrance_description",
        "exact_station",
        "explorers",
        "findability",
        "findability_description",
        "location_description",
        "map_description",
        "marking",
        "marking_comment",
        "name",
        "non_public",
        "northing",
        "other_description",
        "other_station",
        "photo",
        "tag_station",
        "underground_description",
        "url",
    ):
        found[field] = getXMLmax1(field)

    missing = ", ".join(field for field, items in found.items() if not items)
    if missing:
        # getXML() has already logged each missing tag; skip the file rather than crash on [0]
        reject(f' ! BAD ENTRANCE DATA in "{filename}". Missing field(s): {missing}. Edit file manually, click.')
        return None

    if not ent:
        ent, _created = Entrance.objects.update_or_create(slug=slug)

    for field, items in found.items():
        if field == "non_public":
            ent.non_public = boolify(items)
        else:
            setattr(ent, field, items[0])
    ent.filename = filename

    for st in (ent.exact_station, ent.other_station, ent.tag_station):
        try:
            validate_station(st)
        except Exception:
            message = f" ! BAD ENTRANCE TAG '{st}' in '{filename}'. Must format like '1623.p204'. Edit file manually, click."
            # e.g. http://localhost:8000/1623/2023-EBH-01/1623-2023-EBH-01:1623-2023-EBH-01_entrance_edit
            DataIssue.objects.create(parser="entrances", message=message, url=f"/1623/{slug}/{slug}:{slug}_entrance_edit")
            print(message)
            return None

    ent.save()
    return ent
2012-06-10 16:56:12 +01:00
2023-04-22 22:05:12 +01:00
def read_cave(filename, cave=None):
    """Reads a cave description from the .html file in the cave descriptions directory.

    Convoluted. Sorry. Needs rewriting

    Assumes any area it hasn't seen before is a subarea of 1623

    If not called as part of initial import, then the global lists will not be correct
    but this is OK, a search will find them in the db.

    Attempted to use standard python3.11 xml library but fails on HTML entities (2023-04-23)
        import xml.etree.ElementTree as ET
        tree = ET.parse(fn)
        xml_root = tree.getroot()
        for t in ["html", "head", "body", "cave", "non_public", "caveslug", "official_name", "entrance"]:
            elements = xml_root.findall(t)

    Args:
        filename: Name of the file inside the cave descriptions directory.
        cave: The Cave object if it already exists (i.e. this is a manual edit);
            if None, the Cave is found or created by filename.

    Returns:
        The saved Cave object, or None if the file does not exist, does not contain
        exactly one <cave>, lacks a slug or another expected field, or no Cave
        object could be obtained. A DataIssue is logged in each of those cases.
    """
    file_edit_url = f"/cave_data/{filename}_edit"

    def getXMLmax1(field):
        return getXML(cavecontents, field, maxItems=1, context=context)

    def reject(message):
        """Log a problem with the file as a caves DataIssue and report it on the terminal."""
        DataIssue.objects.create(parser="caves", message=message, url=file_edit_url)
        print(message)

    def do_entrances():
        """For both bulk import and individual re-reading of cave_data file,
        fix the entrances
        What is Class CaveAndEntrance for?
        """
        for e in entrances:
            eslug = getXML(e, "entranceslug", maxItems=1, context=context)[0]
            letter = getXML(e, "letter", maxItems=1, context=context)[0]
            if len(entrances) == 1 and not eslug:  # may be empty: <entranceslug></entranceslug>
                set_dummy_entrance(slug[5:], slug, cave, msg="DUMMY: no entrance slug read from file")
            else:
                try:
                    if eslug in entrances_xslug:
                        entrance = entrances_xslug[eslug]
                    else:
                        entrance = Entrance.objects.get(slug=eslug)
                        entrances_xslug[eslug] = entrance
                    CaveAndEntrance.objects.update_or_create(
                        cave=cave, entrance_letter=letter, entrance=entrance
                    )
                except Exception:
                    message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {e} letter:"{letter}" cave:"{cave}" filename:"cave_data/{filename}"'
                    DataIssue.objects.create(parser="entrances", message=message, url=f"{cave.url}_edit/")
                    print(message)

    def reload_entrances():
        """For individual re-reading of a cave_data file when editing,
        also re-read the entrance_data files
        """
        for eslug in entrances_xslug:
            entrance = entrances_xslug[eslug]
            read_entrance(entrance.filename, ent=entrance)
            entrance.save()

    def do_caveslugstuff():
        """This may be a fossil. We only have one slug per cave in troggle.
        Pending destruction of this whole concept and Class CaveSlug
        What is Class CaveSlug for?
        """
        primary = True  # this sets the first thing we find to be primary=True and all the others =False
        for cave_slug in slugs:
            if cave_slug not in caves_xslug:
                try:
                    # update_or_create() returns (object, created): cache only the object
                    cs, _created = CaveSlug.objects.update_or_create(cave=cave, slug=cave_slug, primary=primary)
                    caves_xslug[cave_slug] = cs
                except Exception as ex:
                    # This fails to do an update! It just crashes.. to be fixed
                    # NOTE(review): all three values are used as the lookup here, so an existing
                    # CaveSlug with the same slug is not matched - confirm against the model.
                    message = f" ! CaveSlug update/create failure : {cave_slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}"
                    DataIssue.objects.create(parser="caves", message=message, url=f"{cave.url}_edit/")
                    print(message)
            primary = False

    # Note: we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
    # Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
    fn = settings.CAVEDESCRIPTIONS / filename
    context = filename

    if not fn.exists():
        reject(f" ! Cave_data file reading problem filename:'cave_data/{filename}'")
        return None

    with open(fn) as f:
        contents = f.read()
    cavecontentslist = getXML(contents, "cave", maxItems=1, context=context)

    if len(cavecontentslist) != 1:
        reject(
            f' ! BAD CAVE DATA in "{filename}". Expected exactly one cave, '
            f"found {len(cavecontentslist)}. Edit file manually, click."
        )
        return None

    cavecontents = cavecontentslist[0]
    slugs = getXML(cavecontents, "caveslug", maxItems=1, context=context)
    if not slugs:
        reject(f' ! BAD CAVE DATA in "{filename}". No caveslug found. Edit file manually, click.')
        return None
    if len(slugs) > 1:
        # cave may still be None at this point, so link to the file rather than to cave.url
        reject(f" ! - More than one slug for a cave: {cave}, slugs: {slugs}. Ignoring all except first.")
    slug = slugs[0]

    cave_fields = (
        "non_public",
        "official_name",
        "kataster_code",
        "kataster_number",
        "unofficial_number",
        "explorers",
        "underground_description",
        "equipment",
        "references",
        "survey",
        "kataster_status",
        "underground_centre_line",
        "notes",
        "length",
        "depth",
        "extent",
        "survex_file",
        "description_file",
        "url",
    )
    found = {field: getXMLmax1(field) for field in cave_fields}
    missing = ", ".join(field for field in cave_fields if not found[field])
    if missing:
        # getXML() has already logged each missing tag; skip the file rather than crash on [0]
        reject(f' ! BAD CAVE DATA in "{filename}". Missing field(s): {missing}. Edit file manually, click.')
        return None
    survex_file = found["survex_file"]
    description_file = found["description_file"]

    manual_edit = bool(cave)
    if not cave:
        try:
            cave, _created = Cave.objects.update_or_create(filename=filename)  # replace with slug when CaveSlug tidied up
        except Exception:
            print(" ! FAILED to get only one CAVE in db when updating using: " + filename)
            kaves = Cave.objects.all().filter(filename=filename)  # replace with slug when CaveSlug tidied up
            for k in kaves:
                message = " ! - DUPLICATES in db. kataster:" + str(k.kataster_number) + ", slug:" + str(k.slug())
                DataIssue.objects.create(parser="caves", message=message)
                print(message)
            for k in kaves:
                if k.slug() is not None:
                    print(" ! - OVERWRITING this one: slug:" + str(k.slug()))
                    k.notes = "DUPLICATE kataster number found on import. Please fix\n" + k.notes
                    cave = k
        if not cave:
            reject(f' ! No Cave object could be created or found for "cave_data/{filename}", skipping this file.')
            return None

    # From here on the code applies to both edited and newly-imported caves (mostly!)
    do_caveslugstuff()  # needs cave!=None

    cave.non_public = boolify(found["non_public"])
    for field in cave_fields:
        if field != "non_public":
            setattr(cave, field, found[field][0])

    areas = getXML(cavecontents, "area", context=context)
    cave.area.clear()  # Deletes all links to areas in db
    for area_slug in areas:
        if area_slug in areas_xslug:
            newArea = areas_xslug[area_slug]
        else:
            areas_new = Area.objects.filter(short_name=area_slug)
            if areas_new:
                newArea = areas_new[0]  # just the first one we find, but we are going to clean up Areas anyway
            else:
                # Area not seen before. Should not happen with manual edit
                if manual_edit:
                    message = f" ! Cave edit failure due to unrecognised Area: {area_slug}, skipping this field edit. "
                    DataIssue.objects.create(parser="caves", message=message)
                    print(message)
                # NOTE(review): despite the message above, the Area is created anyway.
                # super value is highly dodgy
                newArea = Area(short_name=area_slug, super=Area.objects.get(short_name="1623"))
                newArea.save()
            areas_xslug[area_slug] = newArea
        cave.area.add(newArea)

    entrances = getXML(cavecontents, "entrance", context=context)
    if not entrances:
        # missing entrance link in cave_data/1623-* .html file
        set_dummy_entrance(slug[5:], slug, cave, msg="DUMMY: no entrances")
    else:
        do_entrances()  # once only: a second pass just repeats the same db work
        if manual_edit:
            reload_entrances()

    if survex_file[0]:
        if not (Path(SURVEX_DATA) / survex_file[0]).is_file():
            message = f' ! {slug:12} survex filename does not exist :LOSER:"{survex_file[0]}" in "{filename}"'
            DataIssue.objects.create(parser="caves", message=message, url=f"/{slug[0:4]}/{slug}_cave_edit/")
            print(message)

    if description_file[0]:  # if not an empty string
        message = f' - {slug:12} Note (not an error): complex description filename "{description_file[0]}" inside "cave_data/{filename}"'
        DataIssue.objects.create(parser="caves ok", message=message, url=f"/{slug}_cave_edit/")
        print(message)

        if not (Path(EXPOWEB) / description_file[0]).is_file():
            message = f' ! {slug:12} description filename "{EXPOWEB}/{description_file[0]}" does not refer to a real file'
            DataIssue.objects.create(parser="caves", message=message, url=f"/{slug}_cave_edit/")
            print(message)

    cave.save()
    return cave
2023-04-22 01:24:32 +01:00
2022-07-23 17:26:47 +01:00
def readcaves():
    """Called from databaseReset mass importer.

    Reads the xml-format HTML 'cave' files in the EXPOWEB repo, the survex files from the loser repo.

    Steps, in order:
      1. read the ids of the 'pending' caves from CAVEDESCRIPTIONS/pendingcaves.txt, if that
         file exists (one id per line, upper-cased, blank lines ignored);
      2. delete all Area, Cave and Entrance objects, and the DataIssues recorded by the
         "areas", "caves" and "caves ok" parsers;
      3. call read_entrance() on every file directly inside ENTRANCEDESCRIPTIONS, then
         read_cave() on every ".html" file directly inside CAVEDESCRIPTIONS;
      4. call GetCaveLookup();
      5. call do_pending_cave() for every pending id.

    Raises:
        Exception: whatever do_pending_cave() raised, re-raised after a DataIssue has been
            created for the offending pending id and the message has been printed.
    """
    # Pending is for those caves which do not have cave_data/1623-xxx.html XML files even though
    # they exist and have surveys.
    pending = set()
    fpending = Path(CAVEDESCRIPTIONS, "pendingcaves.txt")
    if fpending.is_file():
        with open(fpending, "r") as fo:
            for line in fo:
                cid = line.strip().upper()
                if cid:  # a blank line is not a cave id, so do not create a pending cave for it
                    pending.add(cid)

    with transaction.atomic():
        print(" - Deleting Caves and Entrances")
        # attempting to avoid MariaDB crash when doing this
        for model in (Area, Cave, Entrance):
            try:
                model.objects.all().delete()
            except Exception as e:
                # Still best-effort: a failed bulk delete must not stop the import, but it is
                # reported instead of being silently swallowed. Catching Exception rather than
                # everything lets KeyboardInterrupt and SystemExit through.
                print(f" ! Could not delete all {model.__name__} objects: {e}")
        # Clear the cave data issues and the caves as we are reloading
        DataIssue.objects.filter(parser="areas").delete()
        DataIssue.objects.filter(parser="caves").delete()
        DataIssue.objects.filter(parser="caves ok").delete()
        #DataIssue.objects.filter(parser="entrances").delete()

    with transaction.atomic():
        # The return value is not used at this point.
        # NOTE(review): presumably called so that the 1623 Area exists before any cave is
        # read - confirm against get_area().
        get_area("1623")

        print(" - Reading Entrances from entrance descriptions xml files")
        # next(os.walk(d))[2] is the list of non-directory names directly inside d
        for filename in next(os.walk(ENTRANCEDESCRIPTIONS))[2]:  # Should be a better way of getting a list of files
            read_entrance(filename)

        # Why is this needed? Without it, we lose these DataIssues!
        # len() is kept (rather than .count()) because it makes Django evaluate the queryset.
        ent_issues = DataIssue.objects.filter(parser="entrances")
        print(f" __ We now have {len(ent_issues)} entrance DataIssues")

        print(" - Reading Caves from cave descriptions xml files")
        for filename in next(os.walk(CAVEDESCRIPTIONS))[2]:  # Should be a better way of getting a list of files
            if filename.endswith(".html"):
                read_cave(filename)

        print(" - Setting up all the variously useful alias names")
        GetCaveLookup()

        print(" - Setting pending caves")
        # Do this last, so we can detect if they are created and no longer 'pending'
        with transaction.atomic():
            # sorted() so that the import order is the same on every run (set order is not)
            for k in sorted(pending):
                if k.startswith("162"):
                    # the first four characters are the area, the cave number starts at index 5
                    areanum = k[0:4]
                    number = k[5:]
                    url = f"{areanum}/{number}.html"  # Note we are appending the .html to allow for offline websites
                else:
                    # no area prefix on the id: the cave is filed under 1623
                    areanum = "1623"
                    number = k
                    url = f"1623/{k}"

                area = get_area(areanum)
                try:
                    do_pending_cave(k, number, url, area)
                except Exception:
                    message = f" ! Error. Cannot create pending cave, pending-id: {k} in area {areanum}"
                    DataIssue.objects.create(parser="caves", message=message)
                    print(message)
                    raise