2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2025-12-14 19:47:12 +00:00

edit cave reads from HTML file not just db

This commit is contained in:
2023-04-22 01:24:32 +01:00
parent 116cfc7c6e
commit 2ed66fe3d0
3 changed files with 342 additions and 197 deletions

View File

@@ -10,7 +10,12 @@ from troggle.core.models.logbooks import CaveSlug
from troggle.core.models.troggle import DataIssue
from troggle.settings import CAVEDESCRIPTIONS, ENTRANCEDESCRIPTIONS, EXPOWEB, SURVEX_DATA
"""Reads all the cave description data by parsing the xml files (stored as e.g. :EXPOWEB:/cave_data/1623-161.html )
"""Reads all the cave description data and entrance description data
by parsing the xml files stored as e.g.
:EXPOWEB:/cave_data/1623-161.html
or
:EXPOWEB:/entrance_data/1623-161g.html
and creating the various Cave, Entrance and necessary Area objects.
This is the first import that happens after the database is reinitialised.
@@ -272,8 +277,41 @@ def do_pending_cave(k, url, area):
print(message)
return cave
def readentrance(filename):
def getXML(text, itemname, minItems=1, maxItems=None, context=""):
    """Return the contents of every <itemname>...</itemname> element in text.

    Matching is non-greedy and spans newlines (re.S), so each returned string
    is the raw inner text of one element. This is a regex scan, not a real
    XML parse.

    text      -- the string to search
    itemname  -- the tag name, matched literally
    minItems  -- fewer matches than this records a DataIssue and prints it
    maxItems  -- if not None, more matches than this records a DataIssue
    context   -- name of the file being read, used in the messages

    Returns the list of matches. When minItems is 0 and nothing matched,
    returns [""] so that callers can always index element 0.

    Count violations are only reported; the (possibly too short or too long)
    list is still returned.
    Should throw exception rather than producing error message here,
    then handle exception in calling routine where it has the context.
    """
    # Escape the tag name so regex metacharacters in it cannot widen the match.
    tag = re.escape(itemname)
    items = re.findall(rf"<{tag}>(.*?)</{tag}>", text, re.S)
    found = len(items)

    if found < minItems:
        message = f" ! {found} x {itemname} found, at least {minItems} expected. Load ABORT.  in file {context}"
        DataIssue.objects.create(parser="caves", message=message, url="" + context)
        print(message)

    if maxItems is not None and found > maxItems:
        message = (
            f" ! {found} x {itemname} found, no more than {maxItems} expected in this XML unit. Load ABORT. "
            f" in file {context}"
        )
        # NOTE(review): unlike the minItems case above, no url is recorded here - confirm whether intentional.
        DataIssue.objects.create(parser="caves", message=message)
        print(message)

    if minItems == 0 and not items:
        # Optional element that is absent: hand back a single empty string.
        items = [""]
    return items
def readentrance(filename, ent=None):
"""Reads an entrance description from the .html file
If not called as part of initial import, then the global lists will not be correct
but this is OK, a search will find them in the db.
"""
def getXMLmax1(field):
return getXML(entrancecontents, field, maxItems=1, context=context)
@@ -362,17 +400,50 @@ def readentrance(filename):
)
def readcave(filename):
def readcave(filename, cave=None):
"""Reads an entrance description from the .html file
Convoluted. Sorry.This is as I inherited it and I haven't fiddled with it. Needs rewriting
Convoluted. Sorry. Needs rewriting
Assumes any area it hasn't seen before is a subarea of 1623
If not called as part of initial import, then the global lists will not be correct
but this is OK, a search will find them in the db.
"""
def do_entrances():
    """Link each <entrance> element of the cave file to the cave object c.

    Reads entrances, slug, c, context and filename from the enclosing
    scope, and uses the entrances_xslug cache of Entrance objects by slug.

    For each entrance element the entranceslug and letter are extracted.
    A sole entrance with an empty slug gets a dummy entrance instead.
    Otherwise the Entrance is taken from the cache or fetched from the
    database by slug, then tied to the cave with
    CaveAndEntrance.update_or_create. Any failure is recorded as a
    DataIssue and printed, and processing carries on with the next entrance.

    NOTE(review): slug comes from the enclosing scope - confirm it is bound
    on every path that calls this helper.
    """
    for entrance_xml in entrances:
        eslug = getXML(entrance_xml, "entranceslug", maxItems=1, context=context)[0]
        letter = getXML(entrance_xml, "letter", maxItems=1, context=context)[0]

        if len(entrances) == 1 and not eslug:  # may be empty: <entranceslug></entranceslug>
            set_dummy_entrance(slug[5:], slug, c, msg="DUMMY: no entrance slug read from file")
            continue

        # Shown in the failure message: the raw XML until the Entrance object is resolved.
        entrance = entrance_xml
        try:
            if eslug in entrances_xslug:
                entrance = entrances_xslug[eslug]
            else:
                entrance = Entrance.objects.get(slug=eslug)
                entrances_xslug[eslug] = entrance
            CaveAndEntrance.objects.update_or_create(
                cave=c, entrance_letter=letter, entrance=entrance
            )
        except Exception:
            # Best-effort import: log and continue, but let KeyboardInterrupt/SystemExit through.
            message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{c}" filename:"cave_data/{filename}"'
            DataIssue.objects.create(parser="entrances", message=message, url=f"{c.url}_edit/")
            print(message)
global entrances_xslug
global caves_xslug
global areas_xslug
# Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
with open(os.path.join(CAVEDESCRIPTIONS, filename)) as f:
fn = settings.CAVEDESCRIPTIONS / filename
# print(f" - Reading Cave from cave descriptions file {fn}")
if not fn.exists():
message = f" ! Cave_data file reading problem filename:'cave_data/{filename}'"
DataIssue.objects.create(parser="caves", message=message, url=None)
print(message)
return
with open(fn) as f:
contents = f.read()
context = filename
cavecontentslist = getXML(contents, "cave", maxItems=1, context=context)
@@ -380,204 +451,271 @@ def readcave(filename):
message = f'! BAD CAVE at "{filename}"'
DataIssue.objects.create(parser="caves", message=message)
print(message)
else:
cavecontents = cavecontentslist[0]
non_public = getXML(cavecontents, "non_public", maxItems=1, context=context)
slugs = getXML(cavecontents, "caveslug", maxItems=1, context=context)
official_name = getXML(cavecontents, "official_name", maxItems=1, context=context)
areas = getXML(cavecontents, "area", context=context)
kataster_code = getXML(cavecontents, "kataster_code", maxItems=1, context=context)
kataster_number = getXML(cavecontents, "kataster_number", maxItems=1, context=context)
unofficial_number = getXML(cavecontents, "unofficial_number", maxItems=1, context=context)
explorers = getXML(cavecontents, "explorers", maxItems=1, context=context)
underground_description = getXML(cavecontents, "underground_description", maxItems=1, context=context)
equipment = getXML(cavecontents, "equipment", maxItems=1, context=context)
references = getXML(cavecontents, "references", maxItems=1, context=context)
survey = getXML(cavecontents, "survey", maxItems=1, context=context)
kataster_status = getXML(cavecontents, "kataster_status", maxItems=1, context=context)
underground_centre_line = getXML(cavecontents, "underground_centre_line", maxItems=1, context=context)
notes = getXML(cavecontents, "notes", maxItems=1, context=context)
length = getXML(cavecontents, "length", maxItems=1, context=context)
depth = getXML(cavecontents, "depth", maxItems=1, context=context)
extent = getXML(cavecontents, "extent", maxItems=1, context=context)
survex_file = getXML(cavecontents, "survex_file", maxItems=1, context=context)
description_file = getXML(cavecontents, "description_file", maxItems=1, context=context)
url = getXML(cavecontents, "url", maxItems=1, context=context)
entrances = getXML(cavecontents, "entrance", context=context)
return
cavecontents = cavecontentslist[0]
non_public = getXML(cavecontents, "non_public", maxItems=1, context=context)
slugs = getXML(cavecontents, "caveslug", maxItems=1, context=context)
official_name = getXML(cavecontents, "official_name", maxItems=1, context=context)
areas = getXML(cavecontents, "area", context=context)
kataster_code = getXML(cavecontents, "kataster_code", maxItems=1, context=context)
kataster_number = getXML(cavecontents, "kataster_number", maxItems=1, context=context)
unofficial_number = getXML(cavecontents, "unofficial_number", maxItems=1, context=context)
explorers = getXML(cavecontents, "explorers", maxItems=1, context=context)
underground_description = getXML(cavecontents, "underground_description", maxItems=1, context=context)
equipment = getXML(cavecontents, "equipment", maxItems=1, context=context)
references = getXML(cavecontents, "references", maxItems=1, context=context)
survey = getXML(cavecontents, "survey", maxItems=1, context=context)
kataster_status = getXML(cavecontents, "kataster_status", maxItems=1, context=context)
underground_centre_line = getXML(cavecontents, "underground_centre_line", maxItems=1, context=context)
notes = getXML(cavecontents, "notes", maxItems=1, context=context)
length = getXML(cavecontents, "length", maxItems=1, context=context)
depth = getXML(cavecontents, "depth", maxItems=1, context=context)
extent = getXML(cavecontents, "extent", maxItems=1, context=context)
survex_file = getXML(cavecontents, "survex_file", maxItems=1, context=context)
description_file = getXML(cavecontents, "description_file", maxItems=1, context=context)
url = getXML(cavecontents, "url", maxItems=1, context=context)
entrances = getXML(cavecontents, "entrance", context=context)
if (
len(non_public) == 1
and len(slugs) >= 1
and len(official_name) == 1
and len(areas) >= 1
and len(kataster_code) == 1
and len(kataster_number) == 1
and len(unofficial_number) == 1
and len(explorers) == 1
and len(underground_description) == 1
and len(equipment) == 1
and len(references) == 1
and len(survey) == 1
and len(kataster_status) == 1
and len(underground_centre_line) == 1
and len(notes) == 1
and len(length) == 1
and len(depth) == 1
and len(extent) == 1
and len(survex_file) == 1
and len(description_file) == 1
and len(url) == 1
):
try:
c, state = Cave.objects.update_or_create(
non_public={
"True": True,
"False": False,
"true": True,
"false": False,
}[non_public[0]],
official_name=official_name[0],
kataster_code=kataster_code[0],
kataster_number=kataster_number[0],
unofficial_number=unofficial_number[0],
explorers=explorers[0],
underground_description=underground_description[0],
equipment=equipment[0],
references=references[0],
survey=survey[0],
kataster_status=kataster_status[0],
underground_centre_line=underground_centre_line[0],
notes=notes[0],
length=length[0],
depth=depth[0],
extent=extent[0],
survex_file=survex_file[0],
description_file=description_file[0],
url=url[0],
filename=filename,
)
except:
print(" ! FAILED to get only one CAVE when updating using: " + filename)
kaves = Cave.objects.all().filter(kataster_number=kataster_number[0])
for k in kaves:
message = " ! - DUPLICATES in db. kataster:" + str(k.kataster_number) + ", slug:" + str(k.slug())
DataIssue.objects.create(parser="caves", message=message)
print(message)
for k in kaves:
if k.slug() is not None:
print(" ! - OVERWRITING this one: slug:" + str(k.slug()))
k.notes = "DUPLICATE kataster number found on import. Please fix\n" + k.notes
c = k
for area_slug in areas:
if area_slug in areas_xslug:
newArea = areas_xslug[area_slug]
else:
area = Area.objects.filter(short_name=area_slug)
if area:
newArea = area[0]
else:
newArea = Area(short_name=area_slug, super=Area.objects.get(short_name="1623"))
newArea.save()
areas_xslug[area_slug] = newArea
c.area.add(newArea)
primary = True # this sets the first thing we find to be primary=True and all the others =False
for slug in slugs:
if slug in caves_xslug:
cs = caves_xslug[slug]
else:
try: # we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
cs = CaveSlug.objects.update_or_create(cave=c, slug=slug, primary=primary)
caves_xslug[slug] = cs
except Exception as ex:
# This fails to do an update! It just crashes.. to be fixed
message = f" ! Cave update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}"
DataIssue.objects.create(parser="caves", message=message)
print(message)
primary = False
if not entrances or len(entrances) < 1:
# missing entrance link in cave_data/1623-* .html file
set_dummy_entrance(slug[5:], slug, c, msg="DUMMY: no entrances")
else:
for entrance in entrances:
eslug = getXML(entrance, "entranceslug", maxItems=1, context=context)[0]
letter = getXML(entrance, "letter", maxItems=1, context=context)[0]
if len(entrances) == 1 and not eslug: # may be empty: <entranceslug></entranceslug>
set_dummy_entrance(slug[5:], slug, c, msg="DUMMY: no entrance slug read from file")
else:
try:
if eslug in entrances_xslug:
entrance = entrances_xslug[eslug]
else:
# entrance = Entrance.objects.get(entranceslug__slug=eslug)
entrance = Entrance.objects.get(slug=eslug)
entrances_xslug[eslug] = entrance
CaveAndEntrance.objects.update_or_create(
cave=c, entrance_letter=letter, entrance=entrance
)
except:
message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{c}" filename:"cave_data/{filename}"'
DataIssue.objects.create(parser="entrances", message=message, url=f"{c.url}_edit/")
print(message)
if survex_file[0]:
if not (Path(SURVEX_DATA) / survex_file[0]).is_file():
message = f' ! {slug:12} survex filename does not exist :LOSER:"{survex_file[0]}" in "{filename}"'
DataIssue.objects.create(parser="caves", message=message, url=f"/{slug[0:4]}/{slug}_cave_edit/")
print(message)
if description_file[0]: # if not an empty string
message = f' - {slug:12} Note (not an error): complex description filename "{description_file[0]}" inside "{CAVEDESCRIPTIONS}/{filename}"'
DataIssue.objects.create(parser="caves ok", message=message, url=f"/{slug}_cave_edit/")
print(message)
if not (Path(EXPOWEB) / description_file[0]).is_file():
message = f' ! {slug:12} description filename "{EXPOWEB}/{description_file[0]}" does not refer to a real file'
DataIssue.objects.create(parser="caves", message=message, url=f"/{slug}_cave_edit/")
print(message)
# c.description_file="" # done only once, to clear out cruft.
# c.save()
else: # more than one item in long list
message = f' ! ABORT loading this cave. in "{filename}"'
DataIssue.objects.create(parser="caves", message=message, url=f"/{slug}_cave_edit/")
if not (
len(non_public) == 1
and len(slugs) >= 1 # is this really correct ?
and len(official_name) == 1
and len(areas) >= 1 # we want to stop using the sub-ares in 2023
and len(kataster_code) == 1
and len(kataster_number) == 1
and len(unofficial_number) == 1
and len(explorers) == 1
and len(underground_description) == 1
and len(equipment) == 1
and len(references) == 1
and len(survey) == 1
and len(kataster_status) == 1
and len(underground_centre_line) == 1
and len(notes) == 1
and len(length) == 1
and len(depth) == 1
and len(extent) == 1
and len(survex_file) == 1
and len(description_file) == 1
and len(url) == 1
):
# more than one item in long list
message = f' ! ABORT loading this cave. in "{filename}"'
DataIssue.objects.create(parser="caves", message=message, url=f"/{slugs}_cave_edit/")
print(message)
return
if cave:
# this is a re-load prior to editing and we already know the cave id
cave.non_public={
"True": True,
"False": False,
"true": True,
"false": False}[non_public[0]]
cave.official_name=official_name[0]
cave.kataster_code=kataster_code[0]
cave.kataster_number=kataster_number[0]
cave.unofficial_number=unofficial_number[0]
cave.explorers=explorers[0]
cave.underground_description=underground_description[0]
cave.equipment=equipment[0]
cave.references=references[0]
cave.survey=survey[0]
cave.kataster_status=kataster_status[0]
cave.underground_centre_line=underground_centre_line[0]
cave.notes=notes[0]
cave.length=length[0]
cave.depth=depth[0]
cave.extent=extent[0]
cave.survex_file=survex_file[0]
cave.description_file=description_file[0]
cave.url=url[0]
if len(slugs) > 1:
message = f" ! Cave edit failure due to more than one slug: {slugs}, skipping this field edit. "
DataIssue.objects.create(parser="caves", message=message)
print(message)
cave.areas = None
cave.save()
for area_slug in areas:
a = Area.objects.filter(short_name=area_slug)
if a:
cave.area.add(a[0])
else:
message = f" ! Cave edit failure due to unrecognised Area: {a}, skipping this field edit. "
DataIssue.objects.create(parser="caves", message=message)
print(message)
c = cave
do_entrances()
cave.save()
else:
try:
c, state = Cave.objects.update_or_create(
non_public={
"True": True,
"False": False,
"true": True,
"false": False,
}[non_public[0]],
official_name=official_name[0],
kataster_code=kataster_code[0],
kataster_number=kataster_number[0],
unofficial_number=unofficial_number[0],
explorers=explorers[0],
underground_description=underground_description[0],
equipment=equipment[0],
references=references[0],
survey=survey[0],
kataster_status=kataster_status[0],
underground_centre_line=underground_centre_line[0],
notes=notes[0],
length=length[0],
depth=depth[0],
extent=extent[0],
survex_file=survex_file[0],
description_file=description_file[0],
url=url[0],
filename=filename,
)
except:
print(" ! FAILED to get only one CAVE when updating using: " + filename)
kaves = Cave.objects.all().filter(kataster_number=kataster_number[0])
for k in kaves:
message = " ! - DUPLICATES in db. kataster:" + str(k.kataster_number) + ", slug:" + str(k.slug())
DataIssue.objects.create(parser="caves", message=message)
print(message)
for k in kaves:
if k.slug() is not None:
print(" ! - OVERWRITING this one: slug:" + str(k.slug()))
k.notes = "DUPLICATE kataster number found on import. Please fix\n" + k.notes
c = k
for area_slug in areas:
if area_slug in areas_xslug:
newArea = areas_xslug[area_slug]
else:
area = Area.objects.filter(short_name=area_slug)
if area:
newArea = area[0]
else:
newArea = Area(short_name=area_slug, super=Area.objects.get(short_name="1623"))
newArea.save()
areas_xslug[area_slug] = newArea
c.area.add(newArea)
primary = True # this sets the first thing we find to be primary=True and all the others =False
for slug in slugs:
if slug in caves_xslug:
cs = caves_xslug[slug]
else:
try: # we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
cs = CaveSlug.objects.update_or_create(cave=c, slug=slug, primary=primary)
caves_xslug[slug] = cs
except Exception as ex:
#raise
# This fails to do an update! It just crashes.. to be fixed
message = f" ! Cave update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}"
DataIssue.objects.create(parser="caves", message=message)
print(message)
def getXML(text, itemname, minItems=1, maxItems=None, context=""):
"""Reads a single XML tag
Should throw exception rather than producing error message here,
then handle exception in calling routine where it has the context.
"""
items = re.findall("<%(itemname)s>(.*?)</%(itemname)s>" % {"itemname": itemname}, text, re.S)
if len(items) < minItems:
message = (
" ! %(count)i x %(itemname)s found, at least %(min)i expected. Load ABORT. "
% {"count": len(items), "itemname": itemname, "min": minItems}
+ " in file "
+ context
)
DataIssue.objects.create(parser="caves", message=message, url="" + context)
primary = False
if not entrances or len(entrances) < 1:
# missing entrance link in cave_data/1623-* .html file
set_dummy_entrance(slug[5:], slug, c, msg="DUMMY: no entrances")
else:
do_entrances()
# From here on the code applies to both edited and newly-imported caves
if survex_file[0]:
if not (Path(SURVEX_DATA) / survex_file[0]).is_file():
message = f' ! {slug:12} survex filename does not exist :LOSER:"{survex_file[0]}" in "{filename}"'
DataIssue.objects.create(parser="caves", message=message, url=f"/{slug[0:4]}/{slug}_cave_edit/")
print(message)
if description_file[0]: # if not an empty string
message = f' - {slug:12} Note (not an error): complex description filename "{description_file[0]}" inside "{CAVEDESCRIPTIONS}/{filename}"'
DataIssue.objects.create(parser="caves ok", message=message, url=f"/{slug}_cave_edit/")
print(message)
if maxItems is not None and len(items) > maxItems:
message = (
" ! %(count)i x %(itemname)s found, no more than %(max)i expected in this XML unit. Load ABORT. "
% {"count": len(items), "itemname": itemname, "max": maxItems}
+ " in file "
+ context
)
DataIssue.objects.create(parser="caves", message=message)
print(message)
if minItems == 0:
if not items:
items = [""]
return items
if not (Path(EXPOWEB) / description_file[0]).is_file():
message = f' ! {slug:12} description filename "{EXPOWEB}/{description_file[0]}" does not refer to a real file'
DataIssue.objects.create(parser="caves", message=message, url=f"/{slug}_cave_edit/")
print(message)
# c.description_file="" # done only once, to clear out cruft.
c.save()
# ChatGPT replacement attempt 2023-04-21. Obviously very incomplete, but some useful ideas
# import os
# import xml.etree.ElementTree as ET
# class BadCaveException(Exception):
# pass
# class FailedCaveUpdateException(Exception):
# pass
# def readcave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug):
# """Reads an entrance description from the .html file and updates the corresponding Cave object"""
# tree = ET.parse(os.path.join(CAVEDESCRIPTIONS, filename))
# root = tree.getroot()
# cavecontents = root.find("cave")
# if cavecontents is None:
# raise BadCaveException(f'! BAD CAVE at "{filename}"')
# non_public = cavecontents.findtext("non_public")
# slugs = cavecontents.findtext("caveslug")
# official_name = cavecontents.findtext("official_name")
# kataster_code = cavecontents.findtext("kataster_code")
# kataster_number = cavecontents.findtext("kataster_number")
# unofficial_number = cavecontents.findtext("unofficial_number")
# explorers = cavecontents.findtext("explorers")
# underground_description = cavecontents.findtext("underground_description")
# equipment = cavecontents.findtext("equipment")
# references = cavecontents.findtext("references")
# survey = cavecontents.findtext("survey")
# kataster_status = cavecontents.findtext("kataster_status")
# underground_centre_line = cavecontents.findtext("underground_centre_line")
# notes = cavecontents.findtext("notes")
# length = cavecontents.findtext("length")
# depth = cavecontents.findtext("depth")
# extent = cavecontents.findtext("extent")
# survex_file = cavecontents.findtext("survex_file")
# description_file = cavecontents.findtext("description_file")
# url = cavecontents.findtext("url")
# areas = cavecontents.findall("area")
# entrances = cavecontents.findall("entrance")
# if (
# non_public is not None
# # etc.
# # wrong, some of these should be ==1 and some >=1
# ):
# try:
# cave = caves_xslug.get(kataster_number)
# if cave is None:
# cave = Cave.objects.create(
# non_public={
# "True": True,
# "False": False,
# "true": True,
# "false": False,
# }[non_public],
# official_name=official_name,
# # kataster [truncated]
def readcaves():
"""Reads the xml-format HTML files in the EXPOWEB repo, not from the loser repo."""
"""Called from databaseReset mass importer.
Reads the xml-format HTML 'cave' files in the EXPOWEB repo, the survex files from the loser repo.
"""
# Pending is for those caves which do not have cave_data/1623-xxx.html XML files even though
# they exist and have surveys.
pending = set()