refactor cave import

2023-04-22 22:05:12 +01:00
parent 275adc8efa
commit 30ef427b90
2 changed files with 188 additions and 247 deletions
--- a/core/views/caves.py
+++ b/core/views/caves.py
@@ -15,7 +15,7 @@ from troggle.core.models.logbooks import CaveSlug, QM
 from troggle.core.utils import write_and_commit
 from troggle.core.views import expo
 from troggle.settings import CAVEDESCRIPTIONS, ENTRANCEDESCRIPTIONS
-from troggle.parsers.caves import readcave, readentrance
+from troggle.parsers.caves import read_cave, read_entrance
 from .auth import login_required_if_public
@@ -392,7 +392,7 @@ def edit_cave(request, path="", slug=None):
    else:
        # re-read cave data from file.
        filename = str(cave.slug() +".html")
-        readcave(filename, cave=cave)
+        read_cave(filename, cave=cave)
        form = CaveForm(instance=cave)
        ceFormSet = CaveAndEntranceFormSet(queryset=cave.caveandentrance_set.all())
@@ -446,7 +446,7 @@ def edit_entrance(request, path="", caveslug=None, entslug=None):
        if form.is_valid() and entletter.is_valid():
            entrance = form.save(commit=False)
            entrance_letter = entletter.save(commit=False)
-            print(f"- POST {caveslug=} {entslug=} {path=}")
+            # print(f"- POST {caveslug=} {entslug=} {path=}")
            if entslug is None:
                if entletter.cleaned_data["entrance_letter"]:
                    slugname = cave.slug() + entletter.cleaned_data["entrance_letter"]
@@ -467,7 +467,7 @@ def edit_entrance(request, path="", caveslug=None, entslug=None):
    else:
        # re-read entrance data from file.
        filename = str(entrance.slug +".html")
-        readentrance(filename, ent=entrance)
+        read_entrance(filename, ent=entrance)
        form = EntranceForm(instance=entrance)
        if entslug is None:
--- a/parsers/caves.py
+++ b/parsers/caves.py
@@ -27,6 +27,8 @@ todo = """
 - Cannot use Edit This Page for pendingcaves.txt_edit as Edit This Page is expecting an html file.
   So we will need a separate file-editing capability just for this configuration file ?!
 - we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
 - Semi-automagically import all the 1627- pending caves and create HTML files for them to be
  edited individually. (These are caves we only know about because we have German survex files.)
@@ -281,6 +283,9 @@ def getXML(text, itemname, minItems=1, maxItems=None, context=""):
    """Reads a single XML tag
    Should throw exception rather than producing error message here,
    then handle exception in calling routine where it has the context.
    This always succeeds, but it produces an error message on the terminal and in the
    DataIssues log.
    """
    items = re.findall("<%(itemname)s>(.*?)</%(itemname)s>" % {"itemname": itemname}, text, re.S)
    if len(items) < minItems:
@@ -300,7 +305,7 @@ def getXML(text, itemname, minItems=1, maxItems=None, context=""):
            + " in file "
            + context
        )
-        DataIssue.objects.create(parser="caves", message=message)
+        DataIssue.objects.create(parser="caves", message=message, url="" + context)
        print(message)
    if minItems == 0:
        if not items:
@@ -315,11 +320,18 @@ def boolify(boolstrs):
            "true": True,
            "false": False}[boolstrs[0]]
-def readentrance(filename, ent=None):
+def read_entrance(filename, ent=None):
-    """Reads an entrance description from the .html file
+    """Reads an entrance description from the .html file.
      If not called as part of initial import, then the global lists will not be correct
      but this is OK, a search will find them in the db.
      Args:
        filename: The name of the .html file.
        ent: The entrance object, if it already exists.
      Returns:
        The entrance object, or a new entrance object if `ent` is None.
      """
    def getXMLmax1(field):
        return getXML(entrancecontents, field, maxItems=1, context=context)
@@ -333,23 +345,22 @@ def readentrance(filename, ent=None):
        contents = f.read()
    context = filename
    # print("Reading file ENTRANCE {} / {}".format(ENTRANCEDESCRIPTIONS, filename))
    entrancecontentslist = getXML(contents, "entrance", maxItems=1, context=context)
    if len(entrancecontentslist) != 1:
-        message = f'! BAD ENTRANCE at "{filename}". Loading aborted. '
+        message = f'! BAD ENTRANCE DATA in "{filename}". More than one entrance. Edit file manually, click.'
-        DataIssue.objects.create(parser="entrances", message=message)
+        DataIssue.objects.create(parser="entrances", message=message, url=f"/entrance_data/{filename}_edit")
        print(message)
-        return
+        return None
    entrancecontents = entrancecontentslist[0]
    slugs = getXML(entrancecontents, "slug", context=context)
    slug = slugs[0]
    if len(slugs) >1:
        # Only ever one of these per entrance in the expo dataset
-        message = f" ! - More than one slug for an entrance: {entrance}, slugs: {slugs}. Aborting."
+        message = f" ! - More than one slug for an entrance: {entrance}, slugs: {slugs}. Ignoring all except first."
        DataIssue.objects.create(parser="entrances", message=message, url=f"/cave/{slug}/edit/")
        print(message)
        return
    lastvisit = getXML(entrancecontents, "last visit date", maxItems=1, minItems=0, context=context)
@@ -376,7 +387,9 @@ def readentrance(filename, ent=None):
    underground_description = getXMLmax1("underground_description")
    url = getXMLmax1("url")
-    if ent:
+    if not ent:
        ent, state = Entrance.objects.update_or_create(slug=slug)
    ent.name=name[0]
    ent.non_public=boolify(non_public)
    ent.alt=alt[0]
@@ -398,42 +411,15 @@ def readentrance(filename, ent=None):
    ent.other_description=other_description[0]
    ent.other_station=other_station[0]
    ent.photo=photo[0]
-        ent.slug=slugs[0]
+    # ent.slug=slugs[0]
    ent.tag_station=tag_station[0]
    ent.underground_description=underground_description[0]
    ent.url=url[0]
        ent.save()
    else:
        e, state = Entrance.objects.update_or_create(
            name=name[0],
            non_public=boolify(non_public),
            alt=alt[0],
            approach=approach[0],
            bearings=bearings[0],
            easting=easting[0],
            entrance_description=entrance_description[0],
            exact_station=exact_station[0],
            explorers=explorers[0],
            filename=filename,
            findability=findability[0],
            findability_description=findability_description[0],
            lastvisit=lastvisit[0],
            location_description=location_description[0],
            map_description=map_description[0],
            marking=marking[0],
            marking_comment=marking_comment[0],
            northing=northing[0],
            other_description=other_description[0],
            other_station=other_station[0],
            photo=photo[0],
            slug=slugs[0],
            tag_station=tag_station[0],
            underground_description=underground_description[0],
            url=url[0],
        )
        e.save()
-def readcave(filename, cave=None):
+    ent.save()
    return ent
 def read_cave(filename, cave=None):
    """Reads a cave description from the .html file
    Convoluted. Sorry. Needs rewriting
    Assumes any area it hasn't seen before is a subarea of 1623
@@ -441,9 +427,13 @@ def readcave(filename, cave=None):
    If not called as part of initial import, then the global lists will not be correct
    but this is OK, a search will find them in the db.
    """
    def getXMLmax1(field):
        return getXML(cavecontents, field, maxItems=1, context=context)
    def do_entrances():
        """For both bulk import and individual re-reading of cave_data file,
        fix the entrances
        What is Class CaveAndEntrance for?
        """
        for e in entrances:
            eslug = getXML(e, "entranceslug", maxItems=1, context=context)[0]
@@ -458,11 +448,11 @@ def readcave(filename, cave=None):
                        entrance = Entrance.objects.get(slug=eslug)
                        entrances_xslug[eslug] = entrance
                    CaveAndEntrance.objects.update_or_create(
-                        cave=c, entrance_letter=letter, entrance=entrance
+                        cave=cave, entrance_letter=letter, entrance=entrance
                    )
                except:
-                    message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{c}" filename:"cave_data/{filename}"'
+                    message = f' ! Entrance setting failure, slug:"{slug}" #entrances:{len(entrances)} {entrance} letter:"{letter}" cave:"{cave}" filename:"cave_data/{filename}"'
-                    DataIssue.objects.create(parser="entrances", message=message, url=f"{c.url}_edit/")
+                    DataIssue.objects.create(parser="entrances", message=message, url=f"{cave.url}_edit/")
                    print(message)        
    def reload_entrances():
        """For individual re-reading of a cave_data file when editing,
@@ -470,87 +460,105 @@ def readcave(filename, cave=None):
        """
        for eslug in entrances_xslug:
            entrance = entrances_xslug[eslug]
-            readentrance(entrance.filename, ent=entrance)
+            read_entrance(entrance.filename, ent=entrance)
            entrance.save()
    def do_caveslugstuff():
        """This may be a fossil. We only have one slug per cave in troggle.
        Pending destruction of this whole concept and Class CaveSlug
        What is Class CaveSlug for?
        """
        primary = True # this sets the first thing we find to be primary=True and all the others =False
        for slug in slugs:
            if slug in caves_xslug:
                cs = caves_xslug[slug]
            else:
               try:  
                    cs = CaveSlug.objects.update_or_create(cave=cave, slug=slug, primary=primary)
                    caves_xslug[slug] = cs
               except Exception as ex:
                    #raise
                    # This fails to do an update! It just crashes.. to be fixed
                    message = f" ! CaveSlug update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}"
                    DataIssue.objects.create(parser="caves", message=message, url=f"{cave.url}_edit/")
                    print(message)
            primary = False
    global entrances_xslug
    global caves_xslug
    global areas_xslug
    # Note: we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
    # Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
    fn = settings.CAVEDESCRIPTIONS / filename
    # print(f" - Reading Cave from cave descriptions file {fn}")
    if not fn.exists():
        message = f" ! Cave_data file reading problem filename:'cave_data/{filename}'"
-        DataIssue.objects.create(parser="caves", message=message, url=None)
+        DataIssue.objects.create(parser="caves", message=message, url=f"/cave_data/{filename}_edit")
        print(message)
-        return
+        return None
    with open(fn) as f:
        contents = f.read()
    context = filename
    cavecontentslist = getXML(contents, "cave", maxItems=1, context=context)
    if len(cavecontentslist) != 1:
-        message = f'! BAD CAVE at "{filename}"'
+        message = f'! BAD CAVE DATA in "{filename}". More than one cave. Edit file manually, click.'
-        DataIssue.objects.create(parser="caves", message=message)
+        DataIssue.objects.create(parser="caves", message=message, url=f"/cave_data/{filename}_edit")
        print(message)
-        return
+        return None
    cavecontents = cavecontentslist[0]
    non_public = getXML(cavecontents, "non_public", maxItems=1, context=context)
    slugs = getXML(cavecontents, "caveslug", maxItems=1, context=context)
-    official_name = getXML(cavecontents, "official_name", maxItems=1, context=context)
+    if len(slugs) > 1:
-    areas = getXML(cavecontents, "area", context=context)
+        message = f" ! - More than one slug for a cave: {cave}, slugs: {slugs}. Ignoring all except first."
-    kataster_code = getXML(cavecontents, "kataster_code", maxItems=1, context=context)
+        DataIssue.objects.create(parser="caves", message=message, url=f"{cave.url}_edit/")
    kataster_number = getXML(cavecontents, "kataster_number", maxItems=1, context=context)
    unofficial_number = getXML(cavecontents, "unofficial_number", maxItems=1, context=context)
    explorers = getXML(cavecontents, "explorers", maxItems=1, context=context)
    underground_description = getXML(cavecontents, "underground_description", maxItems=1, context=context)
    equipment = getXML(cavecontents, "equipment", maxItems=1, context=context)
    references = getXML(cavecontents, "references", maxItems=1, context=context)
    survey = getXML(cavecontents, "survey", maxItems=1, context=context)
    kataster_status = getXML(cavecontents, "kataster_status", maxItems=1, context=context)
    underground_centre_line = getXML(cavecontents, "underground_centre_line", maxItems=1, context=context)
    notes = getXML(cavecontents, "notes", maxItems=1, context=context)
    length = getXML(cavecontents, "length", maxItems=1, context=context)
    depth = getXML(cavecontents, "depth", maxItems=1, context=context)
    extent = getXML(cavecontents, "extent", maxItems=1, context=context)
    survex_file = getXML(cavecontents, "survex_file", maxItems=1, context=context)
    description_file = getXML(cavecontents, "description_file", maxItems=1, context=context)
    url = getXML(cavecontents, "url", maxItems=1, context=context)
    entrances = getXML(cavecontents, "entrance", context=context)
    if not (
        len(non_public) == 1
        and len(slugs) >= 1 # is this really correct ?
        and len(official_name) == 1
        and len(areas) >= 1 # we want to stop using the sub-areas in 2023
        and len(kataster_code) == 1
        and len(kataster_number) == 1
        and len(unofficial_number) == 1
        and len(explorers) == 1
        and len(underground_description) == 1
        and len(equipment) == 1
        and len(references) == 1
        and len(survey) == 1
        and len(kataster_status) == 1
        and len(underground_centre_line) == 1
        and len(notes) == 1
        and len(length) == 1
        and len(depth) == 1
        and len(extent) == 1
        and len(survex_file) == 1
        and len(description_file) == 1
        and len(url) == 1
    ):
        # more than one item in long list
        message = f' ! ABORT loading this cave. in "{filename}"'
        DataIssue.objects.create(parser="caves", message=message, url=f"/{slugs}_cave_edit/")
        print(message)
-        return
+    slug = slugs[0]
    non_public = getXMLmax1("non_public")
    official_name = getXMLmax1("official_name")
    kataster_code = getXMLmax1("kataster_code")
    kataster_number = getXMLmax1("kataster_number")
    unofficial_number = getXMLmax1("unofficial_number")
    explorers = getXMLmax1("explorers")
    underground_description = getXMLmax1("underground_description")
    equipment = getXMLmax1("equipment")
    references = getXMLmax1("references")
    survey = getXMLmax1("survey")
    kataster_status = getXMLmax1("kataster_status")
    underground_centre_line = getXMLmax1("underground_centre_line")
    notes = getXMLmax1("notes")
    length = getXMLmax1("length")
    depth = getXMLmax1("depth")
    extent = getXMLmax1("extent")
    survex_file = getXMLmax1("survex_file")
    description_file = getXMLmax1("description_file")
    url = getXMLmax1("url")
    manual_edit = True
    if not cave:
        manual_edit = False
        try:
            cave, state = Cave.objects.update_or_create(filename=filename) # replace with slug when CaveSlug tidied up
        except:
            print(" ! FAILED to get only one CAVE in db when updating using: " + filename)
            kaves = Cave.objects.all().filter(filename=filename) # replace with slug when CaveSlug tidied up
            for k in kaves:
                message = " ! - DUPLICATES in db. kataster:" + str(k.kataster_number) + ", slug:" + str(k.slug())
                DataIssue.objects.create(parser="caves", message=message)
                print(message)
            for k in kaves:
                if k.slug() is not None:
                    print(" ! - OVERWRITING this one: slug:" + str(k.slug()))
                    k.notes = "DUPLICATE kataster number found on import. Please fix\n" + k.notes
                    cave = k
    # From here on the code applies to both edited and newly-imported caves (mostly!)
    do_caveslugstuff() # needs cave!=None
    if cave:
        # this a re-load prior to editing and we already know the cave id
    cave.non_public=boolify(non_public)
    cave.official_name=official_name[0]
    cave.kataster_code=kataster_code[0]
@@ -571,101 +579,38 @@ def readcave(filename, cave=None):
    cave.description_file=description_file[0]
    cave.url=url[0]
-        if len(slugs) > 1:
+    areas = getXML(cavecontents, "area", context=context)
-            message = f" ! Cave edit failure due to more than one slug: {slugs}, skipping this field edit. "
+    # cave.area_set.clear() # Need to find correct syntax. Does not delete previously loaded areas.. WARNING
            DataIssue.objects.create(parser="caves", message=message)
            print(message)
        cave.areas = None
        cave.save()
        for area_slug in areas:
            a = Area.objects.filter(short_name=area_slug)
            if a:
                cave.area.add(a[0]) 
            else:
                message = f" ! Cave edit failure due to unrecognised Area: {a}, skipping this field edit. "
                DataIssue.objects.create(parser="caves", message=message)
                print(message)
        c = cave
        do_entrances()
        print(f"- {entrances_xslug=}")
        reload_entrances()
        cave.save()
    else:
        try:
            c, state = Cave.objects.update_or_create(
                non_public=boolify(non_public),
                official_name=official_name[0],
                kataster_code=kataster_code[0],
                kataster_number=kataster_number[0],
                unofficial_number=unofficial_number[0],
                explorers=explorers[0],
                underground_description=underground_description[0],
                equipment=equipment[0],
                references=references[0],
                survey=survey[0],
                kataster_status=kataster_status[0],
                underground_centre_line=underground_centre_line[0],
                notes=notes[0],
                length=length[0],
                depth=depth[0],
                extent=extent[0],
                survex_file=survex_file[0],
                description_file=description_file[0],
                url=url[0],
                filename=filename,
            )
        except:
            print(" ! FAILED to get only one CAVE when updating using: " + filename)
            kaves = Cave.objects.all().filter(kataster_number=kataster_number[0])
            for k in kaves:
                message = " ! - DUPLICATES in db. kataster:" + str(k.kataster_number) + ", slug:" + str(k.slug())
                DataIssue.objects.create(parser="caves", message=message)
                print(message)
            for k in kaves:
                if k.slug() is not None:
                    print(" ! - OVERWRITING this one: slug:" + str(k.slug()))
                    k.notes = "DUPLICATE kataster number found on import. Please fix\n" + k.notes
                    c = k
    for area_slug in areas:
        if area_slug in areas_xslug:
            newArea = areas_xslug[area_slug]
        else:
-                area = Area.objects.filter(short_name=area_slug)
+            areas_new = Area.objects.filter(short_name=area_slug)
-                if area:
+            if areas_new:
-                    newArea = area[0]
+                newArea = areas_new[0] # just the first one we find, but we are going to clean up Areas anyway
            else:
                # Area not seen before. Should not happen with manual edit
                if manual_edit:
                    message = f" ! Cave edit failure due to unrecognised Area: {area_slug[0]}, skipping this field edit. "
                    DataIssue.objects.create(parser="caves", message=message)
                    print(message)                    
                # super value is highly dodgy
                newArea = Area(short_name=area_slug, super=Area.objects.get(short_name="1623"))
                newArea.save()
            areas_xslug[area_slug] = newArea
-            c.area.add(newArea)
+        cave.area.add(newArea)
        primary = True # this sets the first thing we find to be primary=True and all the others =False
        for slug in slugs:
            if slug in caves_xslug:
                cs = caves_xslug[slug]
            else:
               try:  # we want to overwrite a PENDING cave if we are now importing the 1623-xxx.html file for it
                    cs = CaveSlug.objects.update_or_create(cave=c, slug=slug, primary=primary)
                    caves_xslug[slug] = cs
               except Exception as ex:
                    #raise
                    # This fails to do an update! It just crashes.. to be fixed
                    message = f" ! Cave update/create failure : {slug}, skipping file cave_data/{context} with exception\nException: {ex.__class__}"
                    DataIssue.objects.create(parser="caves", message=message)
                    print(message)
            primary = False
    entrances = getXML(cavecontents, "entrance", context=context)
    do_entrances()
    # print(f"- {entrances_xslug=}")
    if not entrances or len(entrances) < 1:
        # missing entrance link in cave_data/1623-* .html file
-            set_dummy_entrance(slug[5:], slug, c, msg="DUMMY: no entrances")
+        set_dummy_entrance(slug[5:], slug, cave, msg="DUMMY: no entrances")
    else:
        do_entrances()
    if manual_edit:
        reload_entrances()
    # From here on the code applies to both edited and newly-imported caves
    if survex_file[0]:
        if not (Path(SURVEX_DATA) / survex_file[0]).is_file():
            message = f' ! {slug:12} survex filename does not exist :LOSER:"{survex_file[0]}" in "{filename}"'
@@ -681,8 +626,9 @@ def readcave(filename, cave=None):
            message = f' ! {slug:12} description filename  "{EXPOWEB}/{description_file[0]}" does not refer to a real file'
            DataIssue.objects.create(parser="caves", message=message, url=f"/{slug}_cave_edit/")
            print(message)
-            # c.description_file="" # done only once, to clear out cruft.
+
-    c.save()
+    cave.save()
    return cave
 # ChatGPT replacement attempt 2023-04-21. Obviously very incomplete, but some useful ideas
@@ -695,7 +641,7 @@ def readcave(filename, cave=None):
 # class FailedCaveUpdateException(Exception):
    # pass
-# def readcave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug):
+# def read_cave_chatgpt(filename, entrances_xslug, caves_xslug, areas_xslug):
    # """Reads an entrance description from the .html file and updates the corresponding Cave object"""
    # tree = ET.parse(os.path.join(CAVEDESCRIPTIONS, filename))
    # root = tree.getroot()
@@ -789,17 +735,12 @@ def readcaves():
        print(" - settings.CAVEDESCRIPTIONS: ", CAVEDESCRIPTIONS)
        print(" - Reading Entrances from entrance descriptions xml files")
        for filename in next(os.walk(ENTRANCEDESCRIPTIONS))[2]:  # Should be a better way of getting a list of files
-            # if filename.endswith('.html'):
+            read_entrance(filename)
            # if Path(filename).stem[5:] in pending:
            # print(f'Skipping pending entrance dummy file <{filename}>')
            # else:
            # readentrance(filename)
            readentrance(filename)
        print(" - Reading Caves from cave descriptions xml files")
        for filename in next(os.walk(CAVEDESCRIPTIONS))[2]:  # Should be a better way of getting a list of files
            if filename.endswith(".html"):
-                readcave(filename)
+                read_cave(filename)
    print(" - Setting up all the variously useful alias names")
    GetCaveLookup()