xml parser attempt retracted

This commit is contained in:
Philip Sargent 2023-04-22 23:15:50 +01:00
parent 30ef427b90
commit c5a9bdc724
3 changed files with 15 additions and 69 deletions

View File

@ -1,5 +1,7 @@
import os import os
import re import re
from pathlib import Path from pathlib import Path
from django.conf import settings from django.conf import settings
@ -426,6 +428,14 @@ def read_cave(filename, cave=None):
If not called as part of initial import, then the global lists will not be correct If not called as part of initial import, then the global lists will not be correct
but this is OK, a search will find them in the db. but this is OK, a search will find them in the db.
Attempted to use the standard Python 3.11 xml library, but it fails on HTML entities (2023-04-23):
import xml.etree.ElementTree as ET
tree = ET.parse(fn)
xml_root = tree.getroot()
for t in ["html", "head", "body", "cave","non_public", "caveslug", "official_name","entrance"]:
elements = xml_root.findall(t)
""" """
def getXMLmax1(field): def getXMLmax1(field):
return getXML(cavecontents, field, maxItems=1, context=context) return getXML(cavecontents, field, maxItems=1, context=context)
@ -492,6 +502,8 @@ def read_cave(filename, cave=None):
# Note: these are HTML files in the EXPOWEB repo, not from the loser repo. # Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
fn = settings.CAVEDESCRIPTIONS / filename fn = settings.CAVEDESCRIPTIONS / filename
context = filename
# print(f" - Reading Cave from cave descriptions file {fn}") # print(f" - Reading Cave from cave descriptions file {fn}")
if not fn.exists(): if not fn.exists():
message = f" ! Cave_data file reading problem filename:'cave_data/{filename}'" message = f" ! Cave_data file reading problem filename:'cave_data/{filename}'"
@ -501,7 +513,6 @@ def read_cave(filename, cave=None):
with open(fn) as f: with open(fn) as f:
contents = f.read() contents = f.read()
context = filename
cavecontentslist = getXML(contents, "cave", maxItems=1, context=context) cavecontentslist = getXML(contents, "cave", maxItems=1, context=context)
if len(cavecontentslist) != 1: if len(cavecontentslist) != 1:
@ -580,7 +591,7 @@ def read_cave(filename, cave=None):
cave.url=url[0] cave.url=url[0]
areas = getXML(cavecontents, "area", context=context) areas = getXML(cavecontents, "area", context=context)
# cave.area_set.clear() # Need to find correct syntax. Does not delete previously loaded areas.. WARNING cave.area.clear() # Deletes all links to areas in db
for area_slug in areas: for area_slug in areas:
if area_slug in areas_xslug: if area_slug in areas_xslug:
newArea = areas_xslug[area_slug] newArea = areas_xslug[area_slug]
@ -630,70 +641,6 @@ def read_cave(filename, cave=None):
cave.save() cave.save()
return cave return cave
# ChatGPT replacement attempt 2023-04-21. Obviously very incomplete, but some useful ideas
# import os
# import xml.etree.ElementTree as ET
# class BadCaveException(Exception):
# pass
# class FailedCaveUpdateException(Exception):
# pass
# def read_cave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug):
# """Reads an entrance description from the .html file and updates the corresponding Cave object"""
# tree = ET.parse(os.path.join(CAVEDESCRIPTIONS, filename))
# root = tree.getroot()
# cavecontents = root.find("cave")
# if cavecontents is None:
# raise BadCaveException(f'! BAD CAVE at "{filename}"')
# non_public = cavecontents.findtext("non_public")
# slugs = cavecontents.findtext("caveslug")
# official_name = cavecontents.findtext("official_name")
# kataster_code = cavecontents.findtext("kataster_code")
# kataster_number = cavecontents.findtext("kataster_number")
# unofficial_number = cavecontents.findtext("unofficial_number")
# explorers = cavecontents.findtext("explorers")
# underground_description = cavecontents.findtext("underground_description")
# equipment = cavecontents.findtext("equipment")
# references = cavecontents.findtext("references")
# survey = cavecontents.findtext("survey")
# kataster_status = cavecontents.findtext("kataster_status")
# underground_centre_line = cavecontents.findtext("underground_centre_line")
# notes = cavecontents.findtext("notes")
# length = cavecontents.findtext("length")
# depth = cavecontents.findtext("depth")
# extent = cavecontents.findtext("extent")
# survex_file = cavecontents.findtext("survex_file")
# description_file = cavecontents.findtext("description_file")
# url = cavecontents.findtext("url")
# areas = cavecontents.findall("area")
# entrances = cavecontents.findall("entrance")
# if (
# non_public is not None
# # etc.
# # wrong, some of these should be ==1 and some >=1
# ):
# try:
# cave = caves_xslug.get(kataster_number)
# if cave is None:
# cave = Cave.objects.create(
# non_public={
# "True": True,
# "False": False,
# "true": True,
# "false": False,
# }[non_public],
# official_name=official_name,
# # kataster [truncated]
def readcaves(): def readcaves():
"""Called from databaseReset mass importer. """Called from databaseReset mass importer.
Reads the xml-format HTML 'cave' files in the EXPOWEB repo, the survex files from the loser repo. Reads the xml-format HTML 'cave' files in the EXPOWEB repo, the survex files from the loser repo.
@ -732,7 +679,6 @@ def readcaves():
with transaction.atomic(): with transaction.atomic():
area = get_area("1623") area = get_area("1623")
print(" - settings.CAVEDESCRIPTIONS: ", CAVEDESCRIPTIONS)
print(" - Reading Entrances from entrance descriptions xml files") print(" - Reading Entrances from entrance descriptions xml files")
for filename in next(os.walk(ENTRANCEDESCRIPTIONS))[2]: # Should be a better way of getting a list of files for filename in next(os.walk(ENTRANCEDESCRIPTIONS))[2]: # Should be a better way of getting a list of files
read_entrance(filename) read_entrance(filename)

View File

@ -12,7 +12,7 @@ though, you do not need to do a data import as it happens automatically -->
<body> <body>
<b>This file is generated by troggle</b> on {{date}} UTC using the form documented at <b>This file is generated by troggle</b> on {{date}} UTC using the form documented at
<a ="/handbook/survey/caveentry.html">handbook/survey/caveentry.html</a> <a href="/handbook/survey/caveentry.html">handbook/survey/caveentry.html</a>
<br> <br>
<cave> <cave>

View File

@ -15,7 +15,7 @@ though, you do not need to do a data import as it happens automatically -->
</head> </head>
<body> <body>
<b>This file is generated by troggle</b> on {{date}} UTC using the form documented at <b>This file is generated by troggle</b> on {{date}} UTC using the form documented at
<a ="/handbook/survey/ententry.html">handbook/survey/ententry.html</a> <a href="/handbook/survey/ententry.html">handbook/survey/ententry.html</a>
<br> <br>
<entrance> <entrance>
<non_public>{{ entrance.non_public }}</non_public> <non_public>{{ entrance.non_public }}</non_public>