Mirror of https://expo.survex.com/repositories/troggle/.git
used a cache instead of .get() queries, vastly faster
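The headline change: the folk loader was issuing one Expedition.objects.get() per CSV cell, and now builds an in-memory dict once and does plain lookups. A minimal sketch of the pattern, reusing the Expedition model named in the diff (the loop itself is illustrative, not lifted from the source):

    # One query up front instead of one query per row:
    expos = {e.year: e for e in Expedition.objects.all()}

    for year in years:                  # hot loop over the CSV columns
        expedition = expos[str(year)]   # dict lookup; was Expedition.objects.get(year=year)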
@@ -1,6 +1,7 @@
 import csv
 import os
 import re
+import time
 from html import unescape
 from pathlib import Path
 
@@ -105,17 +106,33 @@ def load_people_expos():
     Given that we need to do stuff for the coming expo, well before we update the folk list,
     the Expedition object for the coming expo is created elsewhere - in addition to
     those created here, if it does not exist.
 
+    Refactored to separate out the creation of objects in the database to use bulk_create to
+    speed things up. Made little difference sadly.
     """
+    # import cProfile
+    # import pstats
+    # from pstats import SortKey
+
+    # pr = cProfile.Profile()
+    # pr.enable()
+    start = time.time()
+
     DataIssue.objects.filter(parser="people").delete()
     Person.objects.all().delete()
     PersonExpedition.objects.all().delete()
 
-    persontab = open(os.path.join(settings.EXPOWEB, "folk", "folk.csv")) # should really be EXPOFOLK I guess
-    personreader = csv.reader(persontab) # this is an iterator
+    splitnick_pattern = re.compile(r"^([\w&;\s\-]+)(?:\(([^)]*)\))?")
+    displayname_pattern = re.compile(r"^([^(]*)(\(([^)]*)\))?") # removes nickname in brackets
+    rawlastname_pattern = re.compile(r"^([\w&;\s]+)(?:\(([^)]*)\))?")
+
+    with open (settings.EXPOWEB / "folk" / "folk.csv", "r") as folkfile:
+        folklines = folkfile.readlines() # list of the lines, read all into memory
+    personreader = csv.reader(folklines) # this is an iterator
     headers = next(personreader)
     header = dict(list(zip(headers, list(range(len(headers))))))
     years = headers[5:]
+    expos = {}
     nexpos = Expedition.objects.count()
     if nexpos <= 0:
         print(" - Creating expeditions")
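A second hoisting in the same hunk: the three name-parsing regexes are compiled once, before the row loop, instead of being passed as literals to re.match() on every row. A sketch of the shape (the sample value is made up); note that CPython's re module also memoises recently compiled patterns internally, so this is more a tidiness win than a large speed win:

    import re

    rawlastname = "Smith"  # made-up sample value

    # Compiled once, outside the hot loop...
    rawlastname_pattern = re.compile(r"^([\w&;\s]+)(?:\(([^)]*)\))?")
    # ...rather than re-supplying the pattern literal on every row:
    matchlastname = rawlastname_pattern.match(rawlastname)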
@@ -124,10 +141,21 @@ def load_people_expos():
             coUniqueAttribs = {"year": year}
             otherAttribs = {"name": f"CUCC expo {year}"}
             e = Expedition.objects.create(**otherAttribs, **coUniqueAttribs)
+
+            expos[year] = e
+    else: # re-running a folk import without a complete reset
+        print(" - Cacheing expeditions")
+        for year in years:
+            year = year.strip()
+            e = Expedition.objects.get(year=year)
+            expos[year] = e
     print("", flush=True)
     print(" - Loading persons and personexpeditions")
+    print(" - Reading folk file")
 
     pe_list = []
+    prep_list = []
+    p_list =[]
     for personline in personreader:
         # This is all horrible: refactor it.
         # CSV: Name,Lastname,Guest,VfHO member,Mugshot,..
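The else branch above (new in this commit) warms the expos cache with one Expedition.objects.get() per year column. Since the whole theme of the commit is one-query caches, Django's in_bulk() offers a single-query equivalent; a sketch, assuming Expedition.year is a unique field (in_bulk requires that) and setting aside the per-year .strip():

    # {year_value: Expedition, ...} built from a single query
    expos = Expedition.objects.in_bulk(field_name="year")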
@@ -135,8 +163,7 @@ def load_people_expos():
         name = personline[header["Name"]]
         plainname = re.sub(r"<.*?>", "", name) # now in slugify
 
-        match = re.match(r"^([^(]*)(\(([^)]*)\))?", name) # removes nickname in brackets
-        displayname = match.group(1).strip()
+        displayname = displayname_pattern.match(name).group(1).strip()
         input_name = displayname
         slug = troggle_slugify(displayname)
 
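For reference, what displayname_pattern captures: group(1) is everything before an optional bracketed nickname, group(3) the nickname itself. The sample name is invented:

    m = displayname_pattern.match("Fred Smith (Smiffy)")
    m.group(1).strip()   # -> 'Fred Smith'
    m.group(3)           # -> 'Smiffy'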
@@ -148,13 +175,14 @@ def load_people_expos():
 
         if rawlastname == "":
             print(f"MISSING SURNAME FIELD for {name} - check against similar names in the list to see what you have done.")
-        if matchlastname := re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?", rawlastname):
+        if matchlastname := rawlastname_pattern.match(rawlastname):
+            #re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?",
             lastname = matchlastname.group(1).strip()
         else:
             print(f"MATCH FAIL {personline=}\n {slug=}\n {name=}\n {rawlastname=}")
             exit(1)
 
-        splitnick = re.match(r"^([\w&;\s\-]+)(?:\(([^)]*)\))?", plainname)
+        splitnick = splitnick_pattern.match(plainname)
         fullname = splitnick.group(1) # removes Nickname in brackets, but also cuts hyphenated names
         nick = splitnick.group(2) or ""
         if nick:
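The := in this hunk is an assignment expression (Python 3.8+): it binds the match object and tests it for truthiness in one step, so a failed match falls straight into the error branch. The equivalent long form, using the same names as the hunk:

    matchlastname = rawlastname_pattern.match(rawlastname)
    if matchlastname:
        lastname = matchlastname.group(1).strip()
    else:
        print(f"MATCH FAIL {personline=}\n {slug=}\n {name=}\n {rawlastname=}")
        exit(1)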
@@ -175,36 +203,41 @@ def load_people_expos():
         else:
             vfho = True
 
         # would be better to just create the python object, and only commit to db once all done inc blurb
         # and better to save all the Persons in a bulk update, then do all the PersonExpeditions
         blurb_fields = parse_blurb(personline=personline, header=header)
 
         attribs = {"slug": slug, "first_name": firstname, "last_name": (lastname or ""),
                    "is_vfho": vfho, "fullname": fullname, "nickname": nick, "input_name": input_name, "input_surname": input_surname,
                    "is_guest": (personline[header["Guest"]] == "1")}
-        person = Person.objects.create(**attribs, **blurb_fields)
-        #person.save()
-
-        #print(" - Loading personexpeditions")
+        p_list.append(Person(**attribs, **blurb_fields))
 
         # make person expedition
         for year, attended in list(zip(headers, personline))[5:]:
-            expedition = Expedition.objects.get(year=year)
+            expedition = expos[str(year)]
 
             if attended in ("1", "-1"):
-                pe_list.append(PersonExpedition(person=person, expedition=expedition, noncaver=(attended == "-1")))
                 # pe = PersonExpedition.objects.create(
                 #     person=person,
                 #     expedition=expedition,
                 #     noncaver=(attended == "-1")
                 # )
+                prep_list.append((slug, expedition, (attended == "-1")))
+                # pe_list.append(PersonExpedition(person=person, expedition=expedition, noncaver=(attended == "-1")))
+    duration = time.time() - start
+    print(f" - duration: {duration:5.1f} s")
+    print(" - Loading persons into db")
+    persons = Person.objects.bulk_create(p_list)
 
+    print(" - Loading personexpeditions into db", flush=True)
+    for pe in prep_list:
+        slug, expedition, noncaver = pe
+        p = Person.objects.get(slug=slug)
+        pe_list.append(PersonExpedition(person=p, expedition=expedition, noncaver=noncaver))
     PersonExpedition.objects.bulk_create(pe_list)
 
     print("", flush=True)
     ensure_users_are_persons()
     most_recent = Expedition.objects.all().first()
     check_new_signups(most_recent)
+    # pr.disable()
+    # with open("folk_reader.prof", "w") as f:
+    #     ps = pstats.Stats(pr, stream=f)
+    #     ps.sort_stats(SortKey.CUMULATIVE)
+    #     ps.print_stats()
 
 
 def check_new_signups(expedition):
     signups_clear = read_signups()
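Person rows are now staged in p_list and written with one bulk_create() instead of a per-row create(). A plausible reason for the slug-keyed prep_list indirection: on database backends without INSERT ... RETURNING support (MySQL, notably), bulk_create() does not populate primary keys on the objects it returns, so each Person has to be fetched again before PersonExpedition rows can point at it. That re-fetch is itself one .get() per person; in the spirit of this commit it could become a single query (a sketch, not the author's code):

    by_slug = {p.slug: p for p in Person.objects.all()}  # one query
    pe_list = [PersonExpedition(person=by_slug[slug], expedition=expedition, noncaver=noncaver)
               for slug, expedition, noncaver in prep_list]
    PersonExpedition.objects.bulk_create(pe_list)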
@@ -226,7 +259,7 @@ def ensure_users_are_persons():
         p = ps[0]
         p.user = u
         p.save()
-        print(f" {p.user} {u=}")
+        # print(f" {p.user} {u=}")
 
 def who_is_this(year, possibleid):
     expo = Expedition.objects.filter(year=year)
@@ -2338,22 +2338,22 @@ def FindAndLoadSurvex():
     fcollate.write(f";*include {survexfileroot.path}\n")
     flinear.write(f"{svx_scan.depthinclude:2} {indent} *include {survexfileroot.path}\n")
 
-    import cProfile
-    import pstats
-    from pstats import SortKey
+    # import cProfile
+    # import pstats
+    # from pstats import SortKey
 
-    pr = cProfile.Profile()
-    pr.enable()
+    # pr = cProfile.Profile()
+    # pr.enable()
     svx_scan.svxpass = svx_scan.TREE
     # ----------------------------------------------------------------
     svx_scan.PushdownStackScan(survexblockroot, survexfileroot.path, finrootname, flinear, fcollate)
     # ----------------------------------------------------------------
     svx_scan.svxpass = ""
-    pr.disable()
-    with open("PushdownStackScan.prof", "w") as f:
-        ps = pstats.Stats(pr, stream=f)
-        ps.sort_stats(SortKey.CUMULATIVE)
-        ps.print_stats()
+    # pr.disable()
+    # with open("PushdownStackScan.prof", "w") as f:
+    #     ps = pstats.Stats(pr, stream=f)
+    #     ps.sort_stats(SortKey.CUMULATIVE)
+    #     ps.print_stats()
 
     flinear.write(f"{svx_scan.depthinclude:2} {indent} *edulcni {survexfileroot.path}\n")
     fcollate.write(f";*edulcni {survexfileroot.path}\n")
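The block being disabled here is the standard inline cProfile recipe: bracket the code under test with enable()/disable(), then write cumulative-time statistics via pstats. For reference, with do_work() as a placeholder for the profiled call:

    import cProfile
    import pstats
    from pstats import SortKey

    pr = cProfile.Profile()
    pr.enable()
    do_work()                              # placeholder: the code being measured
    pr.disable()
    with open("out.prof", "w") as f:
        ps = pstats.Stats(pr, stream=f)
        ps.sort_stats(SortKey.CUMULATIVE)  # heaviest call chains first
        ps.print_stats()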
@@ -2709,7 +2709,8 @@ def parse_one_file(fpath): # --------------------------------------in progress--
         print(f" - Aborting file parsing & import into database.")
         return False
     print(f" - Pre-existing survexfile {svxs}.")
-    existingsvx = SurvexFile.objects.get(path=fpath)
+    existingsvx = svxs[0]
+    #existingsvx = SurvexFile.objects.get(path=fpath)
     existingcave = existingsvx.cave
     print(f" - survexfile id={existingsvx.id} {existingsvx} {existingcave}")
 
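Same theme as the folk loader: svxs was evidently fetched and checked earlier in parse_one_file() (the surrounding lines print and branch on it), so indexing svxs[0] reuses the row already in hand, where the replaced .get(path=fpath) cost a second, redundant round trip. The shape, with the filter line assumed from context; once a queryset has been evaluated (for instance by a truthiness check), indexing is served from its result cache:

    svxs = SurvexFile.objects.filter(path=fpath)  # assumed earlier in the function
    if svxs:                                      # evaluates and caches the queryset
        existingsvx = svxs[0]                     # served from the cache, no new query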
@@ -15,7 +15,10 @@
 <li><a href="/survexfilewild/{{year}}">Wild survex files</a> - survex files containing blocks with no related wallet
 <li><a href="/survexdir">Survex Directories</a> - Every Cave has an associated directory and a Primary survex file
 <li><a href="/surveximport">Survex import record</a> - indented *include and begin/end tree<br /><li><a href="/survexdebug">Survex debug report</a> - warnings and details<br />
-<li><a href="/therionissues">Therion Import issues</a> - warnings from the recent data import<br /><br />
+<li><a href="/therionissues">Therion Import issues</a> - warnings from the recent data import<br />
+<li><a href="/people_ids">List of folk</a> - ordered by first name, registered people in red<br />
+
+<li><a href="/folk_export">Export new_folk.csv</a> - export from data in the database<br /><br />
 
 <li><a href="/kataster/1623-2002-08">Kataster renumber</a> - Rename a cave to a new kataster number <span style="color:red">{{error}}</span>
 