From 2ed66fe3d0c24ba993425947ee7cd0986ecba83c Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Sat, 22 Apr 2023 01:24:32 +0100 Subject: [PATCH] edit cave reads from HTML file not just db --- core/utils.py | 2 +- core/views/caves.py | 11 +- parsers/caves.py | 526 ++++++++++++++++++++++++++++---------------- 3 files changed, 342 insertions(+), 197 deletions(-) diff --git a/core/utils.py b/core/utils.py index 2c0a3a2..3461543 100644 --- a/core/utils.py +++ b/core/utils.py @@ -117,7 +117,7 @@ def write_and_commit(files, message): kwargs = {} try: with open(filepath, mode, **kwargs) as f: - print(f"WRITING{cwd}---{filename} ") + print(f"WRITING {cwd}---{filename} ") # as the wsgi process www-data, we have group write-access but are not owner, so cannot chmod. # os.chmod(filepath, 0o664) # set file permissions to rw-rw-r-- f.write(content) diff --git a/core/views/caves.py b/core/views/caves.py index 7a3818c..07ea315 100644 --- a/core/views/caves.py +++ b/core/views/caves.py @@ -14,6 +14,9 @@ from troggle.core.models.caves import Cave, CaveAndEntrance, Entrance, GetCaveLo from troggle.core.models.logbooks import CaveSlug, QM from troggle.core.utils import write_and_commit from troggle.core.views import expo +from troggle.settings import CAVEDESCRIPTIONS, ENTRANCEDESCRIPTIONS +from troggle.parsers.caves import readcave, readentrance + from .auth import login_required_if_public @@ -333,7 +336,7 @@ def edit_cave(request, path="", slug=None): The format for the file being saved is in templates/dataformat/cave.xml Warning. This uses Django deep magic. - It does save the data into into the database directly, not by parsing the file. + It saves the data into into the database and into the html file, which it then commits to git. """ message = "" if slug is not None: @@ -373,7 +376,7 @@ def edit_cave(request, path="", slug=None): ceinst.save() try: cave_file = cave.file_output() - print(cave_file) + # print(cave_file) write_and_commit([cave_file], f"Online edit of {cave}") # leave other exceptions unhandled so that they bubble up to user interface except PermissionError: @@ -388,6 +391,10 @@ def edit_cave(request, path="", slug=None): message = f"! POST data is INVALID {cave}" print(message) else: + # re-read cave data from file. + filename = str(cave.slug() +".html") + readcave(filename, cave=cave) + form = CaveForm(instance=cave) ceFormSet = CaveAndEntranceFormSet(queryset=cave.caveandentrance_set.all()) diff --git a/parsers/caves.py b/parsers/caves.py index f7ca541..be1bc5a 100644 --- a/parsers/caves.py +++ b/parsers/caves.py @@ -10,7 +10,12 @@ from troggle.core.models.logbooks import CaveSlug from troggle.core.models.troggle import DataIssue from troggle.settings import CAVEDESCRIPTIONS, ENTRANCEDESCRIPTIONS, EXPOWEB, SURVEX_DATA -"""Reads all the cave description data by parsing the xml files (stored as e.g. :EXPOWEB:/cave_data/1623-161.html ) +"""Reads all the cave description data and entrance description data +by parsing the xml files stored as e.g. +:EXPOWEB:/cave_data/1623-161.html +or +:EXPOWEB:/entrance_data/1623-161g.html + and creating the various Cave, Entrance and necessary Area objects. This is the first import that happens after the database is reinitialised. @@ -272,8 +277,41 @@ def do_pending_cave(k, url, area): print(message) return cave -def readentrance(filename): +def getXML(text, itemname, minItems=1, maxItems=None, context=""): + """Reads a single XML tag + Should throw exception rather than producing error message here, + then handle exception in calling routine where it has the context. + """ + items = re.findall("<%(itemname)s>(.*?)" % {"itemname": itemname}, text, re.S) + if len(items) < minItems: + message = ( + " ! %(count)i x %(itemname)s found, at least %(min)i expected. Load ABORT. " + % {"count": len(items), "itemname": itemname, "min": minItems} + + " in file " + + context + ) + DataIssue.objects.create(parser="caves", message=message, url="" + context) + print(message) + + if maxItems is not None and len(items) > maxItems: + message = ( + " ! %(count)i x %(itemname)s found, no more than %(max)i expected in this XML unit. Load ABORT. " + % {"count": len(items), "itemname": itemname, "max": maxItems} + + " in file " + + context + ) + DataIssue.objects.create(parser="caves", message=message) + print(message) + if minItems == 0: + if not items: + items = [""] + return items + +def readentrance(filename, ent=None): """Reads an entrance description from the .html file + + If not called as part of initial import, then the global lists will not be correct + but this is OK, a search will find them in the db. """ def getXMLmax1(field): return getXML(entrancecontents, field, maxItems=1, context=context) @@ -362,17 +400,50 @@ def readentrance(filename): ) -def readcave(filename): +def readcave(filename, cave=None): """Reads an entrance description from the .html file - Convoluted. Sorry.This is as I inherited it and I haven't fiddled with it. Needs rewriting + Convoluted. Sorry. Needs rewriting Assumes any area it hasn't seen before is a subarea of 1623 + + If not called as part of initial import, then the global lists will not be correct + but this is OK, a search will find them in the db. """ + def do_entrances(): + for entrance in entrances: + eslug = getXML(entrance, "entranceslug", maxItems=1, context=context)[0] + letter = getXML(entrance, "letter", maxItems=1, context=context)[0] + if len(entrances) == 1 and not eslug: # may be empty: + set_dummy_entrance(slug[5:], slug, c, msg="DUMMY: no entrance slug read from file") + else: + try: + if eslug in entrances_xslug: + entrance = entrances_xslug[eslug] + else: + # entrance = Entrance.objects.get(entranceslug__slug=eslug) + entrance = Entrance.objects.get(slug=eslug) + entrances_xslug[eslug] = entrance + CaveAndEntrance.objects.update_or_create( + cave=c, entrance_letter=letter, entrance=entrance + ) + except: + message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{c}" filename:"cave_data/{filename}"' + DataIssue.objects.create(parser="entrances", message=message, url=f"{c.url}_edit/") + print(message) + global entrances_xslug global caves_xslug global areas_xslug # Note: these are HTML files in the EXPOWEB repo, not from the loser repo. - with open(os.path.join(CAVEDESCRIPTIONS, filename)) as f: + fn = settings.CAVEDESCRIPTIONS / filename + # print(f" - Reading Cave from cave descriptions file {fn}") + if not fn.exists(): + message = f" ! Cave_data file reading problem filename:'cave_data/{filename}'" + DataIssue.objects.create(parser="caves", message=message, url=None) + print(message) + return + + with open(fn) as f: contents = f.read() context = filename cavecontentslist = getXML(contents, "cave", maxItems=1, context=context) @@ -380,204 +451,271 @@ def readcave(filename): message = f'! BAD CAVE at "{filename}"' DataIssue.objects.create(parser="caves", message=message) print(message) - else: - cavecontents = cavecontentslist[0] - non_public = getXML(cavecontents, "non_public", maxItems=1, context=context) - slugs = getXML(cavecontents, "caveslug", maxItems=1, context=context) - official_name = getXML(cavecontents, "official_name", maxItems=1, context=context) - areas = getXML(cavecontents, "area", context=context) - kataster_code = getXML(cavecontents, "kataster_code", maxItems=1, context=context) - kataster_number = getXML(cavecontents, "kataster_number", maxItems=1, context=context) - unofficial_number = getXML(cavecontents, "unofficial_number", maxItems=1, context=context) - explorers = getXML(cavecontents, "explorers", maxItems=1, context=context) - underground_description = getXML(cavecontents, "underground_description", maxItems=1, context=context) - equipment = getXML(cavecontents, "equipment", maxItems=1, context=context) - references = getXML(cavecontents, "references", maxItems=1, context=context) - survey = getXML(cavecontents, "survey", maxItems=1, context=context) - kataster_status = getXML(cavecontents, "kataster_status", maxItems=1, context=context) - underground_centre_line = getXML(cavecontents, "underground_centre_line", maxItems=1, context=context) - notes = getXML(cavecontents, "notes", maxItems=1, context=context) - length = getXML(cavecontents, "length", maxItems=1, context=context) - depth = getXML(cavecontents, "depth", maxItems=1, context=context) - extent = getXML(cavecontents, "extent", maxItems=1, context=context) - survex_file = getXML(cavecontents, "survex_file", maxItems=1, context=context) - description_file = getXML(cavecontents, "description_file", maxItems=1, context=context) - url = getXML(cavecontents, "url", maxItems=1, context=context) - entrances = getXML(cavecontents, "entrance", context=context) + return + + cavecontents = cavecontentslist[0] + non_public = getXML(cavecontents, "non_public", maxItems=1, context=context) + slugs = getXML(cavecontents, "caveslug", maxItems=1, context=context) + official_name = getXML(cavecontents, "official_name", maxItems=1, context=context) + areas = getXML(cavecontents, "area", context=context) + kataster_code = getXML(cavecontents, "kataster_code", maxItems=1, context=context) + kataster_number = getXML(cavecontents, "kataster_number", maxItems=1, context=context) + unofficial_number = getXML(cavecontents, "unofficial_number", maxItems=1, context=context) + explorers = getXML(cavecontents, "explorers", maxItems=1, context=context) + underground_description = getXML(cavecontents, "underground_description", maxItems=1, context=context) + equipment = getXML(cavecontents, "equipment", maxItems=1, context=context) + references = getXML(cavecontents, "references", maxItems=1, context=context) + survey = getXML(cavecontents, "survey", maxItems=1, context=context) + kataster_status = getXML(cavecontents, "kataster_status", maxItems=1, context=context) + underground_centre_line = getXML(cavecontents, "underground_centre_line", maxItems=1, context=context) + notes = getXML(cavecontents, "notes", maxItems=1, context=context) + length = getXML(cavecontents, "length", maxItems=1, context=context) + depth = getXML(cavecontents, "depth", maxItems=1, context=context) + extent = getXML(cavecontents, "extent", maxItems=1, context=context) + survex_file = getXML(cavecontents, "survex_file", maxItems=1, context=context) + description_file = getXML(cavecontents, "description_file", maxItems=1, context=context) + url = getXML(cavecontents, "url", maxItems=1, context=context) + entrances = getXML(cavecontents, "entrance", context=context) - if ( - len(non_public) == 1 - and len(slugs) >= 1 - and len(official_name) == 1 - and len(areas) >= 1 - and len(kataster_code) == 1 - and len(kataster_number) == 1 - and len(unofficial_number) == 1 - and len(explorers) == 1 - and len(underground_description) == 1 - and len(equipment) == 1 - and len(references) == 1 - and len(survey) == 1 - and len(kataster_status) == 1 - and len(underground_centre_line) == 1 - and len(notes) == 1 - and len(length) == 1 - and len(depth) == 1 - and len(extent) == 1 - and len(survex_file) == 1 - and len(description_file) == 1 - and len(url) == 1 - ): - try: - c, state = Cave.objects.update_or_create( - non_public={ - "True": True, - "False": False, - "true": True, - "false": False, - }[non_public[0]], - official_name=official_name[0], - kataster_code=kataster_code[0], - kataster_number=kataster_number[0], - unofficial_number=unofficial_number[0], - explorers=explorers[0], - underground_description=underground_description[0], - equipment=equipment[0], - references=references[0], - survey=survey[0], - kataster_status=kataster_status[0], - underground_centre_line=underground_centre_line[0], - notes=notes[0], - length=length[0], - depth=depth[0], - extent=extent[0], - survex_file=survex_file[0], - description_file=description_file[0], - url=url[0], - filename=filename, - ) - except: - print(" ! FAILED to get only one CAVE when updating using: " + filename) - kaves = Cave.objects.all().filter(kataster_number=kataster_number[0]) - for k in kaves: - message = " ! - DUPLICATES in db. kataster:" + str(k.kataster_number) + ", slug:" + str(k.slug()) - DataIssue.objects.create(parser="caves", message=message) - print(message) - for k in kaves: - if k.slug() is not None: - print(" ! - OVERWRITING this one: slug:" + str(k.slug())) - k.notes = "DUPLICATE kataster number found on import. Please fix\n" + k.notes - c = k - - for area_slug in areas: - if area_slug in areas_xslug: - newArea = areas_xslug[area_slug] - else: - area = Area.objects.filter(short_name=area_slug) - if area: - newArea = area[0] - else: - newArea = Area(short_name=area_slug, super=Area.objects.get(short_name="1623")) - newArea.save() - areas_xslug[area_slug] = newArea - c.area.add(newArea) - primary = True # this sets the first thing we find to be primary=True and all the others =False - for slug in slugs: - if slug in caves_xslug: - cs = caves_xslug[slug] - else: - try: # we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it - cs = CaveSlug.objects.update_or_create(cave=c, slug=slug, primary=primary) - caves_xslug[slug] = cs - except Exception as ex: - # This fails to do an update! It just crashes.. to be fixed - message = f" ! Cave update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}" - DataIssue.objects.create(parser="caves", message=message) - print(message) - - primary = False - - if not entrances or len(entrances) < 1: - # missing entrance link in cave_data/1623-* .html file - set_dummy_entrance(slug[5:], slug, c, msg="DUMMY: no entrances") - else: - for entrance in entrances: - eslug = getXML(entrance, "entranceslug", maxItems=1, context=context)[0] - letter = getXML(entrance, "letter", maxItems=1, context=context)[0] - if len(entrances) == 1 and not eslug: # may be empty: - set_dummy_entrance(slug[5:], slug, c, msg="DUMMY: no entrance slug read from file") - else: - try: - if eslug in entrances_xslug: - entrance = entrances_xslug[eslug] - else: - # entrance = Entrance.objects.get(entranceslug__slug=eslug) - entrance = Entrance.objects.get(slug=eslug) - entrances_xslug[eslug] = entrance - CaveAndEntrance.objects.update_or_create( - cave=c, entrance_letter=letter, entrance=entrance - ) - except: - message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{c}" filename:"cave_data/{filename}"' - DataIssue.objects.create(parser="entrances", message=message, url=f"{c.url}_edit/") - print(message) - - if survex_file[0]: - if not (Path(SURVEX_DATA) / survex_file[0]).is_file(): - message = f' ! {slug:12} survex filename does not exist :LOSER:"{survex_file[0]}" in "{filename}"' - DataIssue.objects.create(parser="caves", message=message, url=f"/{slug[0:4]}/{slug}_cave_edit/") - print(message) - - if description_file[0]: # if not an empty string - message = f' - {slug:12} Note (not an error): complex description filename "{description_file[0]}" inside "{CAVEDESCRIPTIONS}/{filename}"' - DataIssue.objects.create(parser="caves ok", message=message, url=f"/{slug}_cave_edit/") - print(message) - - if not (Path(EXPOWEB) / description_file[0]).is_file(): - message = f' ! {slug:12} description filename "{EXPOWEB}/{description_file[0]}" does not refer to a real file' - DataIssue.objects.create(parser="caves", message=message, url=f"/{slug}_cave_edit/") - print(message) - # c.description_file="" # done only once, to clear out cruft. - # c.save() - else: # more than one item in long list - message = f' ! ABORT loading this cave. in "{filename}"' - DataIssue.objects.create(parser="caves", message=message, url=f"/{slug}_cave_edit/") + if not ( + len(non_public) == 1 + and len(slugs) >= 1 # is this really correct ? + and len(official_name) == 1 + and len(areas) >= 1 # we want to stop using the sub-ares in 2023 + and len(kataster_code) == 1 + and len(kataster_number) == 1 + and len(unofficial_number) == 1 + and len(explorers) == 1 + and len(underground_description) == 1 + and len(equipment) == 1 + and len(references) == 1 + and len(survey) == 1 + and len(kataster_status) == 1 + and len(underground_centre_line) == 1 + and len(notes) == 1 + and len(length) == 1 + and len(depth) == 1 + and len(extent) == 1 + and len(survex_file) == 1 + and len(description_file) == 1 + and len(url) == 1 + ): + # more than one item in long list + message = f' ! ABORT loading this cave. in "{filename}"' + DataIssue.objects.create(parser="caves", message=message, url=f"/{slugs}_cave_edit/") + print(message) + return + + if cave: + # this a re-load prior to editing and we already know the cave id + cave.non_public={ + "True": True, + "False": False, + "true": True, + "false": False}[non_public[0]] + cave.official_name=official_name[0] + cave.kataster_code=kataster_code[0] + cave.kataster_number=kataster_number[0] + cave.unofficial_number=unofficial_number[0] + cave.explorers=explorers[0] + cave.underground_description=underground_description[0] + cave.equipment=equipment[0] + cave.references=references[0] + cave.survey=survey[0] + cave.kataster_status=kataster_status[0] + cave.underground_centre_line=underground_centre_line[0] + cave.notes=notes[0] + cave.length=length[0] + cave.depth=depth[0] + cave.extent=extent[0] + cave.survex_file=survex_file[0] + cave.description_file=description_file[0] + cave.url=url[0] + + if len(slugs) > 1: + message = f" ! Cave edit failure due to more than one slug: {slugs}, skipping this field edit. " + DataIssue.objects.create(parser="caves", message=message) print(message) + cave.areas = None + cave.save() + for area_slug in areas: + a = Area.objects.filter(short_name=area_slug) + if a: + cave.area.add(a[0]) + else: + message = f" ! Cave edit failure due to unrecognised Area: {a}, skipping this field edit. " + DataIssue.objects.create(parser="caves", message=message) + print(message) + + c = cave + do_entrances() + cave.save() + else: + try: + c, state = Cave.objects.update_or_create( + non_public={ + "True": True, + "False": False, + "true": True, + "false": False, + }[non_public[0]], + official_name=official_name[0], + kataster_code=kataster_code[0], + kataster_number=kataster_number[0], + unofficial_number=unofficial_number[0], + explorers=explorers[0], + underground_description=underground_description[0], + equipment=equipment[0], + references=references[0], + survey=survey[0], + kataster_status=kataster_status[0], + underground_centre_line=underground_centre_line[0], + notes=notes[0], + length=length[0], + depth=depth[0], + extent=extent[0], + survex_file=survex_file[0], + description_file=description_file[0], + url=url[0], + filename=filename, + ) + except: + print(" ! FAILED to get only one CAVE when updating using: " + filename) + kaves = Cave.objects.all().filter(kataster_number=kataster_number[0]) + for k in kaves: + message = " ! - DUPLICATES in db. kataster:" + str(k.kataster_number) + ", slug:" + str(k.slug()) + DataIssue.objects.create(parser="caves", message=message) + print(message) + for k in kaves: + if k.slug() is not None: + print(" ! - OVERWRITING this one: slug:" + str(k.slug())) + k.notes = "DUPLICATE kataster number found on import. Please fix\n" + k.notes + c = k + + for area_slug in areas: + if area_slug in areas_xslug: + newArea = areas_xslug[area_slug] + else: + area = Area.objects.filter(short_name=area_slug) + if area: + newArea = area[0] + else: + newArea = Area(short_name=area_slug, super=Area.objects.get(short_name="1623")) + newArea.save() + areas_xslug[area_slug] = newArea + c.area.add(newArea) + + primary = True # this sets the first thing we find to be primary=True and all the others =False + for slug in slugs: + if slug in caves_xslug: + cs = caves_xslug[slug] + else: + try: # we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it + cs = CaveSlug.objects.update_or_create(cave=c, slug=slug, primary=primary) + caves_xslug[slug] = cs + except Exception as ex: + #raise + # This fails to do an update! It just crashes.. to be fixed + message = f" ! Cave update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}" + DataIssue.objects.create(parser="caves", message=message) + print(message) -def getXML(text, itemname, minItems=1, maxItems=None, context=""): - """Reads a single XML tag - Should throw exception rather than producing error message here, - then handle exception in calling routine where it has the context. - """ - items = re.findall("<%(itemname)s>(.*?)" % {"itemname": itemname}, text, re.S) - if len(items) < minItems: - message = ( - " ! %(count)i x %(itemname)s found, at least %(min)i expected. Load ABORT. " - % {"count": len(items), "itemname": itemname, "min": minItems} - + " in file " - + context - ) - DataIssue.objects.create(parser="caves", message=message, url="" + context) + primary = False + + if not entrances or len(entrances) < 1: + # missing entrance link in cave_data/1623-* .html file + set_dummy_entrance(slug[5:], slug, c, msg="DUMMY: no entrances") + else: + do_entrances() + + # From here on the code applies to both edited and newly-imported caves + if survex_file[0]: + if not (Path(SURVEX_DATA) / survex_file[0]).is_file(): + message = f' ! {slug:12} survex filename does not exist :LOSER:"{survex_file[0]}" in "{filename}"' + DataIssue.objects.create(parser="caves", message=message, url=f"/{slug[0:4]}/{slug}_cave_edit/") + print(message) + + if description_file[0]: # if not an empty string + message = f' - {slug:12} Note (not an error): complex description filename "{description_file[0]}" inside "{CAVEDESCRIPTIONS}/{filename}"' + DataIssue.objects.create(parser="caves ok", message=message, url=f"/{slug}_cave_edit/") print(message) - if maxItems is not None and len(items) > maxItems: - message = ( - " ! %(count)i x %(itemname)s found, no more than %(max)i expected in this XML unit. Load ABORT. " - % {"count": len(items), "itemname": itemname, "max": maxItems} - + " in file " - + context - ) - DataIssue.objects.create(parser="caves", message=message) - print(message) - if minItems == 0: - if not items: - items = [""] - return items + if not (Path(EXPOWEB) / description_file[0]).is_file(): + message = f' ! {slug:12} description filename "{EXPOWEB}/{description_file[0]}" does not refer to a real file' + DataIssue.objects.create(parser="caves", message=message, url=f"/{slug}_cave_edit/") + print(message) + # c.description_file="" # done only once, to clear out cruft. + c.save() + + +# ChatGPT replacement attempt 2023-04-21. Obviously very incomplete, but some useful ideas +# import os +# import xml.etree.ElementTree as ET + +# class BadCaveException(Exception): + # pass + +# class FailedCaveUpdateException(Exception): + # pass + +# def readcave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug): + # """Reads an entrance description from the .html file and updates the corresponding Cave object""" + # tree = ET.parse(os.path.join(CAVEDESCRIPTIONS, filename)) + # root = tree.getroot() + + # cavecontents = root.find("cave") + # if cavecontents is None: + # raise BadCaveException(f'! BAD CAVE at "{filename}"') + + # non_public = cavecontents.findtext("non_public") + # slugs = cavecontents.findtext("caveslug") + # official_name = cavecontents.findtext("official_name") + # kataster_code = cavecontents.findtext("kataster_code") + # kataster_number = cavecontents.findtext("kataster_number") + # unofficial_number = cavecontents.findtext("unofficial_number") + # explorers = cavecontents.findtext("explorers") + # underground_description = cavecontents.findtext("underground_description") + # equipment = cavecontents.findtext("equipment") + # references = cavecontents.findtext("references") + # survey = cavecontents.findtext("survey") + # kataster_status = cavecontents.findtext("kataster_status") + # underground_centre_line = cavecontents.findtext("underground_centre_line") + # notes = cavecontents.findtext("notes") + # length = cavecontents.findtext("length") + # depth = cavecontents.findtext("depth") + # extent = cavecontents.findtext("extent") + # survex_file = cavecontents.findtext("survex_file") + # description_file = cavecontents.findtext("description_file") + # url = cavecontents.findtext("url") + + # areas = cavecontents.findall("area") + # entrances = cavecontents.findall("entrance") + + # if ( + # non_public is not None + # # etc. + # # wrong, some of these should be ==1 and some >=1 + # ): + # try: + # cave = caves_xslug.get(kataster_number) + # if cave is None: + # cave = Cave.objects.create( + # non_public={ + # "True": True, + # "False": False, + # "true": True, + # "false": False, + # }[non_public], + # official_name=official_name, + # # kataster [truncated] + def readcaves(): - """Reads the xml-format HTML files in the EXPOWEB repo, not from the loser repo.""" + """Called from databaseReset mass importer. + Reads the xml-format HTML 'cave' files in the EXPOWEB repo, the survex files from the loser repo. + """ # Pending is for those caves which do not have cave_data/1623-xxx.html XML files even though # they exist and have surveys. pending = set()