Fix schema and try caching lookups in the caves import

2025-12-14 17:07:13 +00:00 · 2020-07-06 20:27:31 +01:00
parent 8530b0643d
commit d2833d26cc
3 changed files with 44 additions and 55 deletions
--- a/parsers/caves.py
+++ b/parsers/caves.py
@@ -6,6 +6,10 @@ from django.conf import settings
 from troggle.core.models import DataIssue, get_process_memory
 import troggle.core.models_caves as models_caves

+entrances_xslug = {}
+caves_xslug = {}
+areas_xslug = {}
+
 def readcaves():
    print(" - Deleting Caves and Entrances")
    models_caves.Cave.objects.all().delete()
@@ -37,7 +41,7 @@ def readcaves():
                cave.save() # must save to have id before foreign keys work
                cave.area = area_1623
                cave.save()
-                message = " ! {} {}".format(cave.unofficial_number, cave.underground_description)
+                message = " ! {:11s} {}".format(cave.unofficial_number, cave.underground_description)
                DataIssue.objects.create(parser='caves', message=message)
                print(message)
            else:
@@ -47,17 +51,23 @@ def readcaves():
            DataIssue.objects.create(parser='caves', message=message)
            print(message)
            raise
+
    print(" - Reading Entrances from entrance descriptions xml files")
    for filename in next(os.walk(settings.ENTRANCEDESCRIPTIONS))[2]: #Should be a better way of getting a list of files
        if filename.endswith('.html'):
            readentrance(filename)
+
    print(" - Reading Caves from cave descriptions xml files")
    for filename in next(os.walk(settings.CAVEDESCRIPTIONS))[2]: #Should be a better way of getting a list of files
        if filename.endswith('.html'):
            readcave(filename)

 def readentrance(filename):
-  # Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
+    global entrances_xslug
+    global caves_xslug
+    global areas_xslug
+
+    # Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
    with open(os.path.join(settings.ENTRANCEDESCRIPTIONS, filename)) as f:
        contents = f.read()
    context = "in file %s" % filename
@@ -138,13 +148,15 @@ def readentrance(filename):
                primary = False

 def readcave(filename):
-  # Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
+    global entrances_xslug
+    global caves_xslug
+    global areas_xslug
+    
+    # Note: these are HTML files in the EXPOWEB repo, not from the loser repo.
    with open(os.path.join(settings.CAVEDESCRIPTIONS, filename)) as f:
        contents = f.read()
    context = " in file %s" % filename
-    #print("Reading file CAVE  {}".format(filename))
    cavecontentslist = getXML(contents, "cave", maxItems = 1, context = context)
-    #print cavecontentslist
    if len(cavecontentslist) == 1:
        cavecontents = cavecontentslist[0]
        non_public = getXML(cavecontents, "non_public", maxItems = 1, context = context)
@@ -192,9 +204,6 @@ def readcave(filename):
                         url = url[0],
                         filename = filename)
            except:
-                # this slow db query happens on every cave, but on import we have all this in memory
-                # and don't need to do a db query. Fix this to speed it up!
-                # need to cope with duplicates
                print(" ! FAILED to get only one CAVE when updating using: "+filename)
                kaves = models_caves.Cave.objects.all().filter(kataster_number=kataster_number[0])
                for k in kaves:
@@ -208,27 +217,31 @@ def readcave(filename):
                        c = k
                
            for area_slug in areas:
-                # this slow db query happens on every cave, but on import we have all this in memory
-                # and don't need to do a db query. Fix this to speed it up!
-                area = models_caves.Area.objects.filter(short_name = area_slug)
-                if area:
-                    newArea = area[0]
+                if area_slug in areas_xslug:
+                    newArea = areas_xslug[area_slug]
                else:
-                    newArea = models_caves.Area(short_name = area_slug, parent = models_caves.Area.objects.get(short_name = "1623"))
-                    newArea.save()
+                    area = models_caves.Area.objects.filter(short_name = area_slug)
+                    if area:
+                        newArea = area[0]
+                    else:
+                        newArea = models_caves.Area(short_name = area_slug, parent = models_caves.Area.objects.get(short_name = "1623"))
+                        newArea.save()
+                    areas_xslug[area_slug] = newArea
                c.area.add(newArea)
            primary = True
            for slug in slugs:
-                try:
-                    # this slow db query happens on every cave, but on import we have all this in memory
-                    # and don't need to do a db query. Fix this to speed it up!
-                    cs = models_caves.CaveSlug.objects.update_or_create(cave = c,
-                              slug = slug,
-                              primary = primary)
-                except:
-                    message = " ! Cave update/create failure: %s, skipping file %s" % (slug, context)
-                    DataIssue.objects.create(parser='caves', message=message)
-                    print(message)
+                if slug in caves_xslug:
+                    cs = caves_xslug[slug]
+                else:
+                    try:
+                        cs = models_caves.CaveSlug.objects.update_or_create(cave = c,
+                                  slug = slug,
+                                  primary = primary)
+                        caves_xslug[slug] = cs
+                    except:
+                        message = " ! Cave update/create failure: %s, skipping file %s" % (slug, context)
+                        DataIssue.objects.create(parser='caves', message=message)
+                        print(message)
                    
                primary = False

@@ -236,9 +249,11 @@ def readcave(filename):
                slug = getXML(entrance, "entranceslug", maxItems = 1, context = context)[0]
                letter = getXML(entrance, "letter", maxItems = 1, context = context)[0]
                try:
-                    # this slow db query happens on every entrance, but on import we have all this in memory
-                    # and don't need to do a db query. Fix this to speed it up!
-                    entrance = models_caves.Entrance.objects.get(entranceslug__slug = slug)
+                    if slug in entrances_xslug:
+                        entrance = entrances_xslug[slug]
+                    else:
+                        entrance = models_caves.Entrance.objects.get(entranceslug__slug = slug)
+                        entrances_xslug[slug] = entrance
                    ce = models_caves.CaveAndEntrance.objects.update_or_create(cave = c, entrance_letter = letter, entrance = entrance)
                except:
                    message = " ! Entrance setting failure, slug: %s letter: %s" % (slug, letter)