refactor cave import

This commit is contained in:
Philip Sargent 2023-04-22 22:05:12 +01:00
parent 275adc8efa
commit 30ef427b90
2 changed files with 188 additions and 247 deletions

View File

@ -15,7 +15,7 @@ from troggle.core.models.logbooks import CaveSlug, QM
from troggle.core.utils import write_and_commit
from troggle.core.views import expo
from troggle.settings import CAVEDESCRIPTIONS, ENTRANCEDESCRIPTIONS
from troggle.parsers.caves import readcave, readentrance
from troggle.parsers.caves import read_cave, read_entrance
from .auth import login_required_if_public
@ -392,7 +392,7 @@ def edit_cave(request, path="", slug=None):
else:
# re-read cave data from file.
filename = str(cave.slug() +".html")
readcave(filename, cave=cave)
read_cave(filename, cave=cave)
form = CaveForm(instance=cave)
ceFormSet = CaveAndEntranceFormSet(queryset=cave.caveandentrance_set.all())
@ -446,7 +446,7 @@ def edit_entrance(request, path="", caveslug=None, entslug=None):
if form.is_valid() and entletter.is_valid():
entrance = form.save(commit=False)
entrance_letter = entletter.save(commit=False)
print(f"- POST {caveslug=} {entslug=} {path=}")
# print(f"- POST {caveslug=} {entslug=} {path=}")
if entslug is None:
if entletter.cleaned_data["entrance_letter"]:
slugname = cave.slug() + entletter.cleaned_data["entrance_letter"]
@ -467,7 +467,7 @@ def edit_entrance(request, path="", caveslug=None, entslug=None):
else:
# re-read entrance data from file.
filename = str(entrance.slug +".html")
readentrance(filename, ent=entrance)
read_entrance(filename, ent=entrance)
form = EntranceForm(instance=entrance)
if entslug is None:

View File

@ -27,6 +27,8 @@ todo = """
- Cannot use Edit This Page for pendingcaves.txt_edit as Edit This Page is expecting an html file.
So we will need a separate file-editing capability just for this configuration file ?!
- we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
- Semi-automagically import all the 1627- pending caves and create HTML files for them to be
edited individually. (These are caves we only know about because we have German survex files.)
@ -281,6 +283,9 @@ def getXML(text, itemname, minItems=1, maxItems=None, context=""):
"""Reads a single XML tag
Should throw exception rather than producing error message here,
then handle exception in calling routine where it has the context.
This always succeeds, but it produces an error message on the terminal and in the
DataIssue log.
"""
items = re.findall("<%(itemname)s>(.*?)</%(itemname)s>" % {"itemname": itemname}, text, re.S)
if len(items) < minItems:
@ -300,7 +305,7 @@ def getXML(text, itemname, minItems=1, maxItems=None, context=""):
+ " in file "
+ context
)
DataIssue.objects.create(parser="caves", message=message)
DataIssue.objects.create(parser="caves", message=message, url="" + context)
print(message)
if minItems == 0:
if not items:
@ -315,11 +320,18 @@ def boolify(boolstrs):
"true": True,
"false": False}[boolstrs[0]]
def readentrance(filename, ent=None):
"""Reads an entrance description from the .html file
def read_entrance(filename, ent=None):
"""Reads an entrance description from the .html file.
If not called as part of initial import, then the global lists will not be correct
but this is OK, a search will find them in the db.
Args:
filename: The name of the .html file.
ent: The entrance object, if it already exists.
Returns:
The entrance object, or a new entrance object if `ent` is None.
"""
def getXMLmax1(field):
    """Look up the `<field>` tag inside the entrance XML, allowing at most one.

    Thin wrapper over getXML(): it searches the enclosing function's
    `entrancecontents` text with maxItems=1 and forwards `context` (the
    filename being parsed) so that any problem getXML reports names the file.

    Args:
        field: The tag name to search for.
    Returns:
        Whatever getXML() returns, unchanged; callers index it with [0].
    """
    matches = getXML(entrancecontents, field, maxItems=1, context=context)
    return matches
@ -333,23 +345,22 @@ def readentrance(filename, ent=None):
contents = f.read()
context = filename
# print("Reading file ENTRANCE {} / {}".format(ENTRANCEDESCRIPTIONS, filename))
entrancecontentslist = getXML(contents, "entrance", maxItems=1, context=context)
if len(entrancecontentslist) != 1:
message = f'! BAD ENTRANCE at "{filename}". Loading aborted. '
DataIssue.objects.create(parser="entrances", message=message)
message = f'! BAD ENTRANCE DATA in "{filename}". More than one entrance. Edit file manually, click.'
DataIssue.objects.create(parser="entrances", message=message, url=f"/entrance_data/{filename}_edit")
print(message)
return
return None
entrancecontents = entrancecontentslist[0]
slugs = getXML(entrancecontents, "slug", context=context)
slug = slugs[0]
if len(slugs) >1:
# Only ever one of these per entrance in the expo dataset
message = f" ! - More than one slug for an entrance: {entrance}, slugs: {slugs}. Aborting."
message = f" ! - More than one slug for an entrance: {entrance}, slugs: {slugs}. Ignoring all except first."
DataIssue.objects.create(parser="entrances", message=message, url=f"/cave/{slug}/edit/")
print(message)
return
lastvisit = getXML(entrancecontents, "last visit date", maxItems=1, minItems=0, context=context)
@ -376,7 +387,9 @@ def readentrance(filename, ent=None):
underground_description = getXMLmax1("underground_description")
url = getXMLmax1("url")
if ent:
if not ent:
ent, state = Entrance.objects.update_or_create(slug=slug)
ent.name=name[0]
ent.non_public=boolify(non_public)
ent.alt=alt[0]
@ -398,42 +411,15 @@ def readentrance(filename, ent=None):
ent.other_description=other_description[0]
ent.other_station=other_station[0]
ent.photo=photo[0]
ent.slug=slugs[0]
# ent.slug=slugs[0]
ent.tag_station=tag_station[0]
ent.underground_description=underground_description[0]
ent.url=url[0]
ent.save()
else:
e, state = Entrance.objects.update_or_create(
name=name[0],
non_public=boolify(non_public),
alt=alt[0],
approach=approach[0],
bearings=bearings[0],
easting=easting[0],
entrance_description=entrance_description[0],
exact_station=exact_station[0],
explorers=explorers[0],
filename=filename,
findability=findability[0],
findability_description=findability_description[0],
lastvisit=lastvisit[0],
location_description=location_description[0],
map_description=map_description[0],
marking=marking[0],
marking_comment=marking_comment[0],
northing=northing[0],
other_description=other_description[0],
other_station=other_station[0],
photo=photo[0],
slug=slugs[0],
tag_station=tag_station[0],
underground_description=underground_description[0],
url=url[0],
)
e.save()
def readcave(filename, cave=None):
ent.save()
return ent
def read_cave(filename, cave=None):
"""Reads a cave description from the .html file
Convoluted. Sorry. Needs rewriting
Assumes any area it hasn't seen before is a subarea of 1623
@ -441,9 +427,13 @@ def readcave(filename, cave=None):
If not called as part of initial import, then the global lists will not be correct
but this is OK, a search will find them in the db.
"""
def getXMLmax1(field):
    """Look up the `<field>` tag inside the cave XML, allowing at most one.

    Thin wrapper over getXML(): it searches the enclosing function's
    `cavecontents` text with maxItems=1 and forwards `context` so that any
    problem getXML reports names the file being parsed.

    Args:
        field: The tag name to search for.
    Returns:
        Whatever getXML() returns, unchanged; callers index it with [0].
    """
    matches = getXML(cavecontents, field, maxItems=1, context=context)
    return matches
def do_entrances():
"""For both bulk import and individual re-reading of cave_data file,
fix the entrances
What is Class CaveAndEntrance for?
"""
for e in entrances:
eslug = getXML(e, "entranceslug", maxItems=1, context=context)[0]
@ -458,11 +448,11 @@ def readcave(filename, cave=None):
entrance = Entrance.objects.get(slug=eslug)
entrances_xslug[eslug] = entrance
CaveAndEntrance.objects.update_or_create(
cave=c, entrance_letter=letter, entrance=entrance
cave=cave, entrance_letter=letter, entrance=entrance
)
except:
message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{c}" filename:"cave_data/{filename}"'
DataIssue.objects.create(parser="entrances", message=message, url=f"{c.url}_edit/")
message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{cave}" filename:"cave_data/{filename}"'
DataIssue.objects.create(parser="entrances", message=message, url=f"{cave.url}_edit/")
print(message)
def reload_entrances():
"""For individual re-reading of a cave_data file when editing,
@ -470,87 +460,105 @@ def readcave(filename, cave=None):
"""
for eslug in entrances_xslug:
entrance = entrances_xslug[eslug]
readentrance(entrance.filename, ent=entrance)
read_entrance(entrance.filename, ent=entrance)
entrance.save()
def do_caveslugstuff():
    """Make sure every slug of the cave being read has a CaveSlug entry.

    This may be a fossil. We only have one slug per cave in troggle.
    Pending destruction of this whole concept and Class CaveSlug.
    What is Class CaveSlug for?

    Works on the enclosing function's `slugs` and `cave` and on the
    `caves_xslug` cache. The first slug in `slugs` is treated as primary
    and every later one as non-primary. Slugs already in the cache are left
    alone. A failed update_or_create is recorded as a DataIssue and printed,
    then the loop carries on with the next slug.
    """
    for position, slug in enumerate(slugs):
        if slug in caves_xslug:
            # Already cached: nothing to create for this slug.
            continue

        # Only the very first slug in the list counts as the primary one.
        is_primary = position == 0
        try:
            # NOTE(review): Django's update_or_create returns an (object, created)
            # pair, and it is that pair which is stored in the cache - confirm
            # that this is what readers of caves_xslug expect.
            cs = CaveSlug.objects.update_or_create(cave=cave, slug=slug, primary=is_primary)
            caves_xslug[slug] = cs
        except Exception as ex:
            # This fails to do an update! It just crashes.. to be fixed
            message = f" ! CaveSlug update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}"
            DataIssue.objects.create(parser="caves", message=message, url=f"{cave.url}_edit/")
            print(message)
global entrances_xslug
global caves_xslug
global areas_xslug
# Note: we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
# Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
fn = settings.CAVEDESCRIPTIONS / filename
# print(f" - Reading Cave from cave descriptions file {fn}")
if not fn.exists():
message = f" ! Cave_data file reading problem filename:'cave_data/{filename}'"
DataIssue.objects.create(parser="caves", message=message, url=None)
DataIssue.objects.create(parser="caves", message=message, url=f"/cave_data/{filename}_edit")
print(message)
return
return None
with open(fn) as f:
contents = f.read()
context = filename
cavecontentslist = getXML(contents, "cave", maxItems=1, context=context)
if len(cavecontentslist) != 1:
message = f'! BAD CAVE at "{filename}"'
DataIssue.objects.create(parser="caves", message=message)
message = f'! BAD CAVE DATA in "{filename}". More than one cave. Edit file manually, click.'
DataIssue.objects.create(parser="caves", message=message, url=f"/cave_data/{filename}_edit")
print(message)
return
return None
cavecontents = cavecontentslist[0]
non_public = getXML(cavecontents, "non_public", maxItems=1, context=context)
slugs = getXML(cavecontents, "caveslug", maxItems=1, context=context)
official_name = getXML(cavecontents, "official_name", maxItems=1, context=context)
areas = getXML(cavecontents, "area", context=context)
kataster_code = getXML(cavecontents, "kataster_code", maxItems=1, context=context)
kataster_number = getXML(cavecontents, "kataster_number", maxItems=1, context=context)
unofficial_number = getXML(cavecontents, "unofficial_number", maxItems=1, context=context)
explorers = getXML(cavecontents, "explorers", maxItems=1, context=context)
underground_description = getXML(cavecontents, "underground_description", maxItems=1, context=context)
equipment = getXML(cavecontents, "equipment", maxItems=1, context=context)
references = getXML(cavecontents, "references", maxItems=1, context=context)
survey = getXML(cavecontents, "survey", maxItems=1, context=context)
kataster_status = getXML(cavecontents, "kataster_status", maxItems=1, context=context)
underground_centre_line = getXML(cavecontents, "underground_centre_line", maxItems=1, context=context)
notes = getXML(cavecontents, "notes", maxItems=1, context=context)
length = getXML(cavecontents, "length", maxItems=1, context=context)
depth = getXML(cavecontents, "depth", maxItems=1, context=context)
extent = getXML(cavecontents, "extent", maxItems=1, context=context)
survex_file = getXML(cavecontents, "survex_file", maxItems=1, context=context)
description_file = getXML(cavecontents, "description_file", maxItems=1, context=context)
url = getXML(cavecontents, "url", maxItems=1, context=context)
entrances = getXML(cavecontents, "entrance", context=context)
if not (
len(non_public) == 1
and len(slugs) >= 1 # is this really correct ?
and len(official_name) == 1
and len(areas) >= 1 # we want to stop using the sub-ares in 2023
and len(kataster_code) == 1
and len(kataster_number) == 1
and len(unofficial_number) == 1
and len(explorers) == 1
and len(underground_description) == 1
and len(equipment) == 1
and len(references) == 1
and len(survey) == 1
and len(kataster_status) == 1
and len(underground_centre_line) == 1
and len(notes) == 1
and len(length) == 1
and len(depth) == 1
and len(extent) == 1
and len(survex_file) == 1
and len(description_file) == 1
and len(url) == 1
):
# more than one item in long list
message = f' ! ABORT loading this cave. in "{filename}"'
DataIssue.objects.create(parser="caves", message=message, url=f"/{slugs}_cave_edit/")
if len(slugs) > 1:
message = f" ! - More than one slug for a cave: {cave}, slugs: {slugs}. Ignoring all except first."
DataIssue.objects.create(parser="caves", message=message, url=f"{cave.url}_edit/")
print(message)
return
slug = slugs[0]
non_public = getXMLmax1("non_public")
official_name = getXMLmax1("official_name")
kataster_code = getXMLmax1("kataster_code")
kataster_number = getXMLmax1("kataster_number")
unofficial_number = getXMLmax1("unofficial_number")
explorers = getXMLmax1("explorers")
underground_description = getXMLmax1("underground_description")
equipment = getXMLmax1("equipment")
references = getXMLmax1("references")
survey = getXMLmax1("survey")
kataster_status = getXMLmax1("kataster_status")
underground_centre_line = getXMLmax1("underground_centre_line")
notes = getXMLmax1("notes")
length = getXMLmax1("length")
depth = getXMLmax1("depth")
extent = getXMLmax1("extent")
survex_file = getXMLmax1("survex_file")
description_file = getXMLmax1("description_file")
url = getXMLmax1("url")
manual_edit = True
if not cave:
manual_edit = False
try:
cave, state = Cave.objects.update_or_create(filename=filename) # replace with slug when CaveSlug tidied up
except:
print(" ! FAILED to get only one CAVE in db when updating using: " + filename)
kaves = Cave.objects.all().filter(filename=filename) # replace with slug when CaveSlug tidied up
for k in kaves:
message = " ! - DUPLICATES in db. kataster:" + str(k.kataster_number) + ", slug:" + str(k.slug())
DataIssue.objects.create(parser="caves", message=message)
print(message)
for k in kaves:
if k.slug() is not None:
print(" ! - OVERWRITING this one: slug:" + str(k.slug()))
k.notes = "DUPLICATE kataster number found on import. Please fix\n" + k.notes
cave = k
# From here on the code applies to both edited and newly-imported caves (mostly!)
do_caveslugstuff() # needs cave!=None
if cave:
# this a re-load prior to editing and we already know the cave id
cave.non_public=boolify(non_public)
cave.official_name=official_name[0]
cave.kataster_code=kataster_code[0]
@ -571,101 +579,38 @@ def readcave(filename, cave=None):
cave.description_file=description_file[0]
cave.url=url[0]
if len(slugs) > 1:
message = f" ! Cave edit failure due to more than one slug: {slugs}, skipping this field edit. "
DataIssue.objects.create(parser="caves", message=message)
print(message)
cave.areas = None
cave.save()
for area_slug in areas:
a = Area.objects.filter(short_name=area_slug)
if a:
cave.area.add(a[0])
else:
message = f" ! Cave edit failure due to unrecognised Area: {a}, skipping this field edit. "
DataIssue.objects.create(parser="caves", message=message)
print(message)
c = cave
do_entrances()
print(f"- {entrances_xslug=}")
reload_entrances()
cave.save()
else:
try:
c, state = Cave.objects.update_or_create(
non_public=boolify(non_public),
official_name=official_name[0],
kataster_code=kataster_code[0],
kataster_number=kataster_number[0],
unofficial_number=unofficial_number[0],
explorers=explorers[0],
underground_description=underground_description[0],
equipment=equipment[0],
references=references[0],
survey=survey[0],
kataster_status=kataster_status[0],
underground_centre_line=underground_centre_line[0],
notes=notes[0],
length=length[0],
depth=depth[0],
extent=extent[0],
survex_file=survex_file[0],
description_file=description_file[0],
url=url[0],
filename=filename,
)
except:
print(" ! FAILED to get only one CAVE when updating using: " + filename)
kaves = Cave.objects.all().filter(kataster_number=kataster_number[0])
for k in kaves:
message = " ! - DUPLICATES in db. kataster:" + str(k.kataster_number) + ", slug:" + str(k.slug())
DataIssue.objects.create(parser="caves", message=message)
print(message)
for k in kaves:
if k.slug() is not None:
print(" ! - OVERWRITING this one: slug:" + str(k.slug()))
k.notes = "DUPLICATE kataster number found on import. Please fix\n" + k.notes
c = k
areas = getXML(cavecontents, "area", context=context)
# cave.area_set.clear() # Need to find correct syntax. Does not delete previously loaded areas.. WARNING
for area_slug in areas:
if area_slug in areas_xslug:
newArea = areas_xslug[area_slug]
else:
area = Area.objects.filter(short_name=area_slug)
if area:
newArea = area[0]
areas_new = Area.objects.filter(short_name=area_slug)
if areas_new:
newArea = areas_new[0] # just the first one we find, but we are going to clean up Areas anyway
else:
# Area not seen before. Should not happen with manual edit
if manual_edit:
message = f" ! Cave edit failure due to unrecognised Area: {area_slug[0]}, skipping this field edit. "
DataIssue.objects.create(parser="caves", message=message)
print(message)
# super value is highly dodgy
newArea = Area(short_name=area_slug, super=Area.objects.get(short_name="1623"))
newArea.save()
areas_xslug[area_slug] = newArea
c.area.add(newArea)
primary = True # this sets the first thing we find to be primary=True and all the others =False
for slug in slugs:
if slug in caves_xslug:
cs = caves_xslug[slug]
else:
try: # we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
cs = CaveSlug.objects.update_or_create(cave=c, slug=slug, primary=primary)
caves_xslug[slug] = cs
except Exception as ex:
#raise
# This fails to do an update! It just crashes.. to be fixed
message = f" ! Cave update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}"
DataIssue.objects.create(parser="caves", message=message)
print(message)
primary = False
cave.area.add(newArea)
entrances = getXML(cavecontents, "entrance", context=context)
do_entrances()
# print(f"- {entrances_xslug=}")
if not entrances or len(entrances) < 1:
# missing entrance link in cave_data/1623-* .html file
set_dummy_entrance(slug[5:], slug, c, msg="DUMMY: no entrances")
set_dummy_entrance(slug[5:], slug, cave, msg="DUMMY: no entrances")
else:
do_entrances()
if manual_edit:
reload_entrances()
# From here on the code applies to both edited and newly-imported caves
if survex_file[0]:
if not (Path(SURVEX_DATA) / survex_file[0]).is_file():
message = f' ! {slug:12} survex filename does not exist :LOSER:"{survex_file[0]}" in "{filename}"'
@ -681,8 +626,9 @@ def readcave(filename, cave=None):
message = f' ! {slug:12} description filename "{EXPOWEB}/{description_file[0]}" does not refer to a real file'
DataIssue.objects.create(parser="caves", message=message, url=f"/{slug}_cave_edit/")
print(message)
# c.description_file="" # done only once, to clear out cruft.
c.save()
cave.save()
return cave
# ChatGPT replacement attempt 2023-04-21. Obviously very incomplete, but some useful ideas
@ -695,7 +641,7 @@ def readcave(filename, cave=None):
# class FailedCaveUpdateException(Exception):
# pass
# def readcave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug):
# def read_cave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug):
# """Reads an entrance description from the .html file and updates the corresponding Cave object"""
# tree = ET.parse(os.path.join(CAVEDESCRIPTIONS, filename))
# root = tree.getroot()
@ -789,17 +735,12 @@ def readcaves():
print(" - settings.CAVEDESCRIPTIONS: ", CAVEDESCRIPTIONS)
print(" - Reading Entrances from entrance descriptions xml files")
for filename in next(os.walk(ENTRANCEDESCRIPTIONS))[2]: # Should be a better way of getting a list of files
# if filename.endswith('.html'):
# if Path(filename).stem[5:] in pending:
# print(f'Skipping pending entrance dummy file <{filename}>')
# else:
# readentrance(filename)
readentrance(filename)
read_entrance(filename)
print(" - Reading Caves from cave descriptions xml files")
for filename in next(os.walk(CAVEDESCRIPTIONS))[2]: # Should be a better way of getting a list of files
if filename.endswith(".html"):
readcave(filename)
read_cave(filename)
print(" - Setting up all the variously useful alias names")
GetCaveLookup()