From 30ef427b904f2f1fcf3635d5944b14615b48dcee Mon Sep 17 00:00:00 2001
From: Philip Sargent
Date: Sat, 22 Apr 2023 22:05:12 +0100
Subject: [PATCH] refactor cave import

---
 core/views/caves.py |   8 +-
 parsers/caves.py    | 427 +++++++++++++++++++-------------------------
 2 files changed, 188 insertions(+), 247 deletions(-)

diff --git a/core/views/caves.py b/core/views/caves.py
index e053ffd..e51fb47 100644
--- a/core/views/caves.py
+++ b/core/views/caves.py
@@ -15,7 +15,7 @@ from troggle.core.models.logbooks import CaveSlug, QM
 from troggle.core.utils import write_and_commit
 from troggle.core.views import expo
 from troggle.settings import CAVEDESCRIPTIONS, ENTRANCEDESCRIPTIONS
-from troggle.parsers.caves import readcave, readentrance
+from troggle.parsers.caves import read_cave, read_entrance
 
 from .auth import login_required_if_public
 
@@ -392,7 +392,7 @@ def edit_cave(request, path="", slug=None):
         else:
             # re-read cave data from file.
             filename = str(cave.slug() +".html")
-            readcave(filename, cave=cave)
+            read_cave(filename, cave=cave)
         form = CaveForm(instance=cave)
         ceFormSet = CaveAndEntranceFormSet(queryset=cave.caveandentrance_set.all())
 
@@ -446,7 +446,7 @@ def edit_entrance(request, path="", caveslug=None, entslug=None):
         if form.is_valid() and entletter.is_valid():
             entrance = form.save(commit=False)
             entrance_letter = entletter.save(commit=False)
-            print(f"- POST {caveslug=} {entslug=} {path=}")
+            # print(f"- POST {caveslug=} {entslug=} {path=}")
             if entslug is None:
                 if entletter.cleaned_data["entrance_letter"]:
                     slugname = cave.slug() + entletter.cleaned_data["entrance_letter"]
@@ -467,7 +467,7 @@ def edit_entrance(request, path="", caveslug=None, entslug=None):
         else:
             # re-read entrance data from file.
             filename = str(entrance.slug +".html")
-            readentrance(filename, ent=entrance)
+            read_entrance(filename, ent=entrance)
         form = EntranceForm(instance=entrance)
 
         if entslug is None:
diff --git a/parsers/caves.py b/parsers/caves.py
index 7eba28a..5389a8e 100644
--- a/parsers/caves.py
+++ b/parsers/caves.py
@@ -27,6 +27,8 @@ todo = """
 - Cannot use Edit This Page for pendingcaves.txt_edit as Edit This Page is expecting an html file.
   So we will need a separate file-editing capability just for this configuration file ?!
 
+- we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
+
 - Semi-automagically import all the 1627- pending caves and create HTML files for them to be
   edited individually. (These are caves we only know about because we have German survex files.)
 
@@ -281,6 +283,9 @@ def getXML(text, itemname, minItems=1, maxItems=None, context=""):
     """Reads a single XML tag
     Should throw exception rather than producing error message here,
     then handle exception in calling routine where it has the context.
+
+    This always succeeds, but it produces an error message on the terminal and in the
+    DataIssues log.
""" items = re.findall("<%(itemname)s>(.*?)" % {"itemname": itemname}, text, re.S) if len(items) < minItems: @@ -300,7 +305,7 @@ def getXML(text, itemname, minItems=1, maxItems=None, context=""): + " in file " + context ) - DataIssue.objects.create(parser="caves", message=message) + DataIssue.objects.create(parser="caves", message=message, url="" + context) print(message) if minItems == 0: if not items: @@ -315,12 +320,19 @@ def boolify(boolstrs): "true": True, "false": False}[boolstrs[0]] -def readentrance(filename, ent=None): - """Reads an entrance description from the .html file - - If not called as part of initial import, then the global lists will not be correct - but this is OK, a search will find them in the db. - """ +def read_entrance(filename, ent=None): + """Reads an entrance description from the .html file. + + If not called as part of initial import, then the global lists will not be correct + but this is OK, a search will find them in the db. + + Args: + filename: The name of the .html file. + ent: The entrance object, if it already exists. + + Returns: + The entrance object, or a new entrance object if `ent` is None. + """ def getXMLmax1(field): return getXML(entrancecontents, field, maxItems=1, context=context) @@ -333,23 +345,22 @@ def readentrance(filename, ent=None): contents = f.read() context = filename - # print("Reading file ENTRANCE {} / {}".format(ENTRANCEDESCRIPTIONS, filename)) entrancecontentslist = getXML(contents, "entrance", maxItems=1, context=context) if len(entrancecontentslist) != 1: - message = f'! BAD ENTRANCE at "{filename}". Loading aborted. ' - DataIssue.objects.create(parser="entrances", message=message) + message = f'! BAD ENTRANCE DATA in "{filename}". More than one entrance. Edit file manually, click.' + DataIssue.objects.create(parser="entrances", message=message, url=f"/entrance_data/{filename}_edit") print(message) - return + return None entrancecontents = entrancecontentslist[0] slugs = getXML(entrancecontents, "slug", context=context) + slug = slugs[0] if len(slugs) >1: # Only ever one of these per entrance in the expo dataset - message = f" ! - More than one slug for an entrance: {entrance}, slugs: {slugs}. Aborting." + message = f" ! - More than one slug for an entrance: {entrance}, slugs: {slugs}. Ignoring all except first." 
DataIssue.objects.create(parser="entrances", message=message, url=f"/cave/{slug}/edit/") print(message) - return lastvisit = getXML(entrancecontents, "last visit date", maxItems=1, minItems=0, context=context) @@ -376,64 +387,39 @@ def readentrance(filename, ent=None): underground_description = getXMLmax1("underground_description") url = getXMLmax1("url") - if ent: - ent.name=name[0] - ent.non_public=boolify(non_public) - ent.alt=alt[0] - ent.approach=approach[0] - ent.bearings=bearings[0] - ent.easting=easting[0] - ent.entrance_description=entrance_description[0] - ent.exact_station=exact_station[0] - ent.explorers=explorers[0] - ent.filename=filename - ent.findability=findability[0] - ent.findability_description=findability_description[0] - ent.lastvisit=lastvisit[0] - ent.location_description=location_description[0] - ent.map_description=map_description[0] - ent.marking=marking[0] - ent.marking_comment=marking_comment[0] - ent.northing=northing[0] - ent.other_description=other_description[0] - ent.other_station=other_station[0] - ent.photo=photo[0] - ent.slug=slugs[0] - ent.tag_station=tag_station[0] - ent.underground_description=underground_description[0] - ent.url=url[0] - ent.save() - else: - e, state = Entrance.objects.update_or_create( - name=name[0], - non_public=boolify(non_public), - alt=alt[0], - approach=approach[0], - bearings=bearings[0], - easting=easting[0], - entrance_description=entrance_description[0], - exact_station=exact_station[0], - explorers=explorers[0], - filename=filename, - findability=findability[0], - findability_description=findability_description[0], - lastvisit=lastvisit[0], - location_description=location_description[0], - map_description=map_description[0], - marking=marking[0], - marking_comment=marking_comment[0], - northing=northing[0], - other_description=other_description[0], - other_station=other_station[0], - photo=photo[0], - slug=slugs[0], - tag_station=tag_station[0], - underground_description=underground_description[0], - url=url[0], - ) - e.save() + if not ent: + ent, state = Entrance.objects.update_or_create(slug=slug) + + ent.name=name[0] + ent.non_public=boolify(non_public) + ent.alt=alt[0] + ent.approach=approach[0] + ent.bearings=bearings[0] + ent.easting=easting[0] + ent.entrance_description=entrance_description[0] + ent.exact_station=exact_station[0] + ent.explorers=explorers[0] + ent.filename=filename + ent.findability=findability[0] + ent.findability_description=findability_description[0] + ent.lastvisit=lastvisit[0] + ent.location_description=location_description[0] + ent.map_description=map_description[0] + ent.marking=marking[0] + ent.marking_comment=marking_comment[0] + ent.northing=northing[0] + ent.other_description=other_description[0] + ent.other_station=other_station[0] + ent.photo=photo[0] + # ent.slug=slugs[0] + ent.tag_station=tag_station[0] + ent.underground_description=underground_description[0] + ent.url=url[0] + + ent.save() + return ent -def readcave(filename, cave=None): +def read_cave(filename, cave=None): """Reads an entrance description from the .html file Convoluted. Sorry. Needs rewriting Assumes any area it hasn't seen before is a subarea of 1623 @@ -441,9 +427,13 @@ def readcave(filename, cave=None): If not called as part of initial import, then the global lists will not be correct but this is OK, a search will find them in the db. 
""" + def getXMLmax1(field): + return getXML(cavecontents, field, maxItems=1, context=context) + def do_entrances(): """For both bulk import and individual re-reading of cave_data file, fix the entrances + What is Class CaveAndEntrance for? """ for e in entrances: eslug = getXML(e, "entranceslug", maxItems=1, context=context)[0] @@ -458,11 +448,11 @@ def readcave(filename, cave=None): entrance = Entrance.objects.get(slug=eslug) entrances_xslug[eslug] = entrance CaveAndEntrance.objects.update_or_create( - cave=c, entrance_letter=letter, entrance=entrance + cave=cave, entrance_letter=letter, entrance=entrance ) except: - message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{c}" filename:"cave_data/{filename}"' - DataIssue.objects.create(parser="entrances", message=message, url=f"{c.url}_edit/") + message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{cave}" filename:"cave_data/{filename}"' + DataIssue.objects.create(parser="entrances", message=message, url=f"{cave.url}_edit/") print(message) def reload_entrances(): """For individual re-reading of a cave_data file when editing, @@ -470,155 +460,92 @@ def readcave(filename, cave=None): """ for eslug in entrances_xslug: entrance = entrances_xslug[eslug] - readentrance(entrance.filename, ent=entrance) + read_entrance(entrance.filename, ent=entrance) entrance.save() - + + def do_caveslugstuff(): + """This may be a fossil. We only have one slug per cave in troggle. + Pending destruction of this whole concept and Class CaveSlug + What is Class CaveSlug for? + """ + primary = True # this sets the first thing we find to be primary=True and all the others =False + for slug in slugs: + if slug in caves_xslug: + cs = caves_xslug[slug] + else: + try: + cs = CaveSlug.objects.update_or_create(cave=cave, slug=slug, primary=primary) + caves_xslug[slug] = cs + except Exception as ex: + #raise + # This fails to do an update! It just crashes.. to be fixed + message = f" ! CaveSlug update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}" + DataIssue.objects.create(parser="caves", message=message, url=f"{cave.url}_edit/") + print(message) + primary = False + global entrances_xslug global caves_xslug global areas_xslug + # Note: we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it + # Note: these are HTML files in the EXPOWEB repo, not from the loser repo. fn = settings.CAVEDESCRIPTIONS / filename # print(f" - Reading Cave from cave descriptions file {fn}") if not fn.exists(): message = f" ! Cave_data file reading problem filename:'cave_data/{filename}'" - DataIssue.objects.create(parser="caves", message=message, url=None) + DataIssue.objects.create(parser="caves", message=message, url=f"/cave_data/{filename}_edit") print(message) - return + return None with open(fn) as f: contents = f.read() context = filename cavecontentslist = getXML(contents, "cave", maxItems=1, context=context) + if len(cavecontentslist) != 1: - message = f'! BAD CAVE at "{filename}"' - DataIssue.objects.create(parser="caves", message=message) + message = f'! BAD CAVE DATA in "{filename}". More than one cave. Edit file manually, click.' 
+ DataIssue.objects.create(parser="caves", message=message, url=f"/cave_data/{filename}_edit") print(message) - return + return None cavecontents = cavecontentslist[0] - non_public = getXML(cavecontents, "non_public", maxItems=1, context=context) slugs = getXML(cavecontents, "caveslug", maxItems=1, context=context) - official_name = getXML(cavecontents, "official_name", maxItems=1, context=context) - areas = getXML(cavecontents, "area", context=context) - kataster_code = getXML(cavecontents, "kataster_code", maxItems=1, context=context) - kataster_number = getXML(cavecontents, "kataster_number", maxItems=1, context=context) - unofficial_number = getXML(cavecontents, "unofficial_number", maxItems=1, context=context) - explorers = getXML(cavecontents, "explorers", maxItems=1, context=context) - underground_description = getXML(cavecontents, "underground_description", maxItems=1, context=context) - equipment = getXML(cavecontents, "equipment", maxItems=1, context=context) - references = getXML(cavecontents, "references", maxItems=1, context=context) - survey = getXML(cavecontents, "survey", maxItems=1, context=context) - kataster_status = getXML(cavecontents, "kataster_status", maxItems=1, context=context) - underground_centre_line = getXML(cavecontents, "underground_centre_line", maxItems=1, context=context) - notes = getXML(cavecontents, "notes", maxItems=1, context=context) - length = getXML(cavecontents, "length", maxItems=1, context=context) - depth = getXML(cavecontents, "depth", maxItems=1, context=context) - extent = getXML(cavecontents, "extent", maxItems=1, context=context) - survex_file = getXML(cavecontents, "survex_file", maxItems=1, context=context) - description_file = getXML(cavecontents, "description_file", maxItems=1, context=context) - url = getXML(cavecontents, "url", maxItems=1, context=context) - entrances = getXML(cavecontents, "entrance", context=context) - - if not ( - len(non_public) == 1 - and len(slugs) >= 1 # is this really correct ? - and len(official_name) == 1 - and len(areas) >= 1 # we want to stop using the sub-ares in 2023 - and len(kataster_code) == 1 - and len(kataster_number) == 1 - and len(unofficial_number) == 1 - and len(explorers) == 1 - and len(underground_description) == 1 - and len(equipment) == 1 - and len(references) == 1 - and len(survey) == 1 - and len(kataster_status) == 1 - and len(underground_centre_line) == 1 - and len(notes) == 1 - and len(length) == 1 - and len(depth) == 1 - and len(extent) == 1 - and len(survex_file) == 1 - and len(description_file) == 1 - and len(url) == 1 - ): - # more than one item in long list - message = f' ! ABORT loading this cave. in "{filename}"' - DataIssue.objects.create(parser="caves", message=message, url=f"/{slugs}_cave_edit/") + if len(slugs) > 1: + message = f" ! - More than one slug for a cave: {cave}, slugs: {slugs}. Ignoring all except first." 
+        DataIssue.objects.create(parser="caves", message=message, url=f"/cave_data/{filename}_edit")
         print(message)
-        return
-
-    if cave:
-        # this a re-load prior to editing and we already know the cave id
-        cave.non_public=boolify(non_public)
-        cave.official_name=official_name[0]
-        cave.kataster_code=kataster_code[0]
-        cave.kataster_number=kataster_number[0]
-        cave.unofficial_number=unofficial_number[0]
-        cave.explorers=explorers[0]
-        cave.underground_description=underground_description[0]
-        cave.equipment=equipment[0]
-        cave.references=references[0]
-        cave.survey=survey[0]
-        cave.kataster_status=kataster_status[0]
-        cave.underground_centre_line=underground_centre_line[0]
-        cave.notes=notes[0]
-        cave.length=length[0]
-        cave.depth=depth[0]
-        cave.extent=extent[0]
-        cave.survex_file=survex_file[0]
-        cave.description_file=description_file[0]
-        cave.url=url[0]
-
-        if len(slugs) > 1:
-            message = f" ! Cave edit failure due to more than one slug: {slugs}, skipping this field edit. "
-            DataIssue.objects.create(parser="caves", message=message)
-            print(message)
+    slug = slugs[0]
 
-        cave.areas = None
-        cave.save()
-        for area_slug in areas:
-            a = Area.objects.filter(short_name=area_slug)
-            if a:
-                cave.area.add(a[0])
-            else:
-                message = f" ! Cave edit failure due to unrecognised Area: {a}, skipping this field edit. "
-                DataIssue.objects.create(parser="caves", message=message)
-                print(message)
-
-        c = cave
-        do_entrances()
-        print(f"- {entrances_xslug=}")
-        reload_entrances()
-        cave.save()
-    else:
+    non_public = getXMLmax1("non_public")
+    official_name = getXMLmax1("official_name")
+    kataster_code = getXMLmax1("kataster_code")
+    kataster_number = getXMLmax1("kataster_number")
+    unofficial_number = getXMLmax1("unofficial_number")
+    explorers = getXMLmax1("explorers")
+    underground_description = getXMLmax1("underground_description")
+    equipment = getXMLmax1("equipment")
+    references = getXMLmax1("references")
+    survey = getXMLmax1("survey")
+    kataster_status = getXMLmax1("kataster_status")
+    underground_centre_line = getXMLmax1("underground_centre_line")
+    notes = getXMLmax1("notes")
+    length = getXMLmax1("length")
+    depth = getXMLmax1("depth")
+    extent = getXMLmax1("extent")
+    survex_file = getXMLmax1("survex_file")
+    description_file = getXMLmax1("description_file")
+    url = getXMLmax1("url")
+
+    manual_edit = True
+    if not cave:
+        manual_edit = False
         try:
-            c, state = Cave.objects.update_or_create(
-                non_public=boolify(non_public),
-                official_name=official_name[0],
-                kataster_code=kataster_code[0],
-                kataster_number=kataster_number[0],
-                unofficial_number=unofficial_number[0],
-                explorers=explorers[0],
-                underground_description=underground_description[0],
-                equipment=equipment[0],
-                references=references[0],
-                survey=survey[0],
-                kataster_status=kataster_status[0],
-                underground_centre_line=underground_centre_line[0],
-                notes=notes[0],
-                length=length[0],
-                depth=depth[0],
-                extent=extent[0],
-                survex_file=survex_file[0],
-                description_file=description_file[0],
-                url=url[0],
-                filename=filename,
-            )
+            cave, state = Cave.objects.update_or_create(filename=filename) # replace with slug when CaveSlug tidied up
         except:
-            print(" ! FAILED to get only one CAVE when updating using: " + filename)
-            kaves = Cave.objects.all().filter(kataster_number=kataster_number[0])
+            print(" ! FAILED to get only one CAVE in db when updating using: " + filename)
+            kaves = Cave.objects.all().filter(filename=filename) # replace with slug when CaveSlug tidied up
             for k in kaves:
                 message = " ! - DUPLICATES in db. kataster:" + str(k.kataster_number) + ", slug:" + str(k.slug())
                 DataIssue.objects.create(parser="caves", message=message)
@@ -627,45 +554,63 @@ def readcave(filename, cave=None):
                 if k.slug() is not None:
                     print(" ! - OVERWRITING this one: slug:" + str(k.slug()))
                     k.notes = "DUPLICATE kataster number found on import. Please fix\n" + k.notes
-                    c = k
-
-        for area_slug in areas:
-            if area_slug in areas_xslug:
-                newArea = areas_xslug[area_slug]
-            else:
-                area = Area.objects.filter(short_name=area_slug)
-                if area:
-                    newArea = area[0]
-                else:
-                    newArea = Area(short_name=area_slug, super=Area.objects.get(short_name="1623"))
-                    newArea.save()
-                areas_xslug[area_slug] = newArea
-            c.area.add(newArea)
+                    cave = k
+
+    # From here on the code applies to both edited and newly-imported caves (mostly!)
+    do_caveslugstuff() # needs cave!=None
+
+    cave.non_public=boolify(non_public)
+    cave.official_name=official_name[0]
+    cave.kataster_code=kataster_code[0]
+    cave.kataster_number=kataster_number[0]
+    cave.unofficial_number=unofficial_number[0]
+    cave.explorers=explorers[0]
+    cave.underground_description=underground_description[0]
+    cave.equipment=equipment[0]
+    cave.references=references[0]
+    cave.survey=survey[0]
+    cave.kataster_status=kataster_status[0]
+    cave.underground_centre_line=underground_centre_line[0]
+    cave.notes=notes[0]
+    cave.length=length[0]
+    cave.depth=depth[0]
+    cave.extent=extent[0]
+    cave.survex_file=survex_file[0]
+    cave.description_file=description_file[0]
+    cave.url=url[0]
 
-        primary = True # this sets the first thing we find to be primary=True and all the others =False
-        for slug in slugs:
-            if slug in caves_xslug:
-                cs = caves_xslug[slug]
-            else:
-                try: # we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
-                    cs = CaveSlug.objects.update_or_create(cave=c, slug=slug, primary=primary)
-                    caves_xslug[slug] = cs
-                except Exception as ex:
-                    #raise
-                    # This fails to do an update! It just crashes.. to be fixed
-                    message = f" ! Cave update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}"
-                    DataIssue.objects.create(parser="caves", message=message)
-                    print(message)
-
-            primary = False
-
-        if not entrances or len(entrances) < 1:
-            # missing entrance link in cave_data/1623-* .html file
-            set_dummy_entrance(slug[5:], slug, c, msg="DUMMY: no entrances")
+    areas = getXML(cavecontents, "area", context=context)
+    # cave.area_set.clear() # Need to find correct syntax. Does not delete previously loaded areas.. WARNING
+    for area_slug in areas:
+        if area_slug in areas_xslug:
+            newArea = areas_xslug[area_slug]
         else:
-            do_entrances()
+            areas_new = Area.objects.filter(short_name=area_slug)
+            if areas_new:
+                newArea = areas_new[0] # just the first one we find, but we are going to clean up Areas anyway
+            else:
+                # Area not seen before. Should not happen with a manual edit
+                if manual_edit:
+                    message = f" ! Cave edit failure due to unrecognised Area: {area_slug}, skipping this field edit. "
+                    DataIssue.objects.create(parser="caves", message=message)
+                    print(message)
+                # super value is highly dodgy
+                newArea = Area(short_name=area_slug, super=Area.objects.get(short_name="1623"))
+                newArea.save()
+            areas_xslug[area_slug] = newArea
+        cave.area.add(newArea)
 
-        # From here on the code applies to both edited and newly-imported caves
+    entrances = getXML(cavecontents, "entrance", context=context)
+    # print(f"- {entrances_xslug=}")
+    if not entrances or len(entrances) < 1:
+        # missing entrance link in cave_data/1623-* .html file
+        set_dummy_entrance(slug[5:], slug, cave, msg="DUMMY: no entrances")
+    else:
+        do_entrances()
+    if manual_edit:
+        reload_entrances()
+
     if survex_file[0]:
         if not (Path(SURVEX_DATA) / survex_file[0]).is_file():
             message = f' ! {slug:12} survex filename does not exist :LOSER:"{survex_file[0]}" in "{filename}"'
@@ -681,8 +626,9 @@ def readcave(filename, cave=None):
             message = f' ! {slug:12} description filename "{EXPOWEB}/{description_file[0]}" does not refer to a real file'
             DataIssue.objects.create(parser="caves", message=message, url=f"/{slug}_cave_edit/")
             print(message)
-            # c.description_file="" # done only once, to clear out cruft.
-    c.save()
+
+    cave.save()
+    return cave
 
 
 # ChatGPT replacement attempt 2023-04-21. Obviously very incomplete, but some useful ideas
@@ -695,7 +641,7 @@ def readcave(filename, cave=None):
 # class FailedCaveUpdateException(Exception):
 #     pass
 
-# def readcave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug):
+# def read_cave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug):
 #     """Reads an entrance description from the .html file and updates the corresponding Cave object"""
 #     tree = ET.parse(os.path.join(CAVEDESCRIPTIONS, filename))
 #     root = tree.getroot()
@@ -789,17 +735,12 @@ def readcaves():
         print(" - settings.CAVEDESCRIPTIONS: ", CAVEDESCRIPTIONS)
         print(" - Reading Entrances from entrance descriptions xml files")
        for filename in next(os.walk(ENTRANCEDESCRIPTIONS))[2]:  # Should be a better way of getting a list of files
-            # if filename.endswith('.html'):
-            #     if Path(filename).stem[5:] in pending:
-            #         print(f'Skipping pending entrance dummy file <{filename}>')
-            #     else:
-            #         readentrance(filename)
-            readentrance(filename)
+            read_entrance(filename)
 
         print(" - Reading Caves from cave descriptions xml files")
        for filename in next(os.walk(CAVEDESCRIPTIONS))[2]:  # Should be a better way of getting a list of files
            if filename.endswith(".html"):
-                readcave(filename)
+                read_cave(filename)
 
        print(" - Setting up all the variously useful alias names")
        GetCaveLookup()