mirror of https://expo.survex.com/repositories/troggle/.git synced 2025-12-19 07:17:20 +00:00

bulk update seriously speeds up logbook database storing

commit a7ec46cb70
parent 9df466de2f
2025-09-21 21:40:30 +03:00
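In outline, the change replaces a per-entry save loop with two bulk_create() calls: one for all the LogbookEntry rows, then one for all the dependent PersonLogEntry rows once the parent objects can be looked up again by their unique slug. The sketch below is a simplified editor's summary of that pattern, not the committed code; it reuses the troggle model and field names from the diff but omits most fields and the duplicate-slug handling.

    # Before (one entry at a time): an INSERT for each LogbookEntry plus a bulk INSERT of its
    # PersonLogEntry rows, repeated for every one of the >2,000 entries. Field list abbreviated.
    for entry in entries:
        lbe = LogbookEntry.objects.create(slug=entry.tid, date=entry.tripdate, title=entry.triptitle)
        PersonLogEntry.objects.bulk_create([
            PersonLogEntry(logbook_entry=lbe, personexpedition=p, nickname_used=n,
                           time_underground=t, is_logbook_entry_author=(p == entry.author))
            for p, n, t in entry.trippersons
        ])

    # After (all entries at once): one bulk INSERT for all LogbookEntry rows, one SELECT to get
    # their primary keys back by slug, then one bulk INSERT for all PersonLogEntry rows.
    LogbookEntry.objects.bulk_create(
        [LogbookEntry(slug=e.tid, date=e.tripdate, title=e.triptitle) for e in entries])
    by_slug = {lbe.slug: lbe for lbe in LogbookEntry.objects.filter(slug__in=[e.tid for e in entries])}
    PersonLogEntry.objects.bulk_create([
        PersonLogEntry(logbook_entry=by_slug[e.tid], personexpedition=p, nickname_used=n,
                       time_underground=t, is_logbook_entry_author=(p == e.author))
        for e in entries for p, n, t in e.trippersons
    ])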


@@ -28,11 +28,6 @@ todo = """
 - check cross-references to specific logbook entries in other logbooks and other HTML fragments
   e.g. cave descriptions
 
-- Most of the time is during the database writing (6s out of 8s).
-  profile the code to find bad repetitive things, of which there are many. But probably we just have too many Django database operations.
-  Currently we store each entry individually. It should be done using Django bulk entry.
-  Look at Person & PersonExpedition all in python in parsers/people.py and then commit as two bulk transactions. test if links between them work when done like that.
 
 - attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted in the DataIssue bug output
 
 - rewrite to use generators rather than storing everything intermediate in lists - to
@@ -299,46 +294,64 @@ def tidy_tid(tid, title, date):
         tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
     return tid
 
-def store_entry_into_database(entry):
-    """saves a single logbook entry and related personlogentry items
-    We could do a bulk update to save all the entries, but then we would need to do a query on
-    each one to get the primary key to assign to the PersonLogEntries. So overall probably not much
-    faster ?
+def bulk_store_entries(entries):
     """
-    other_people = ", ".join(entry.guests)  # join list members separated by comma
-    # if guests:
-    #     print(f"  {date} - {guests}")
-    otherAttribs = {
-        "place": entry.place,
-        "other_people": other_people,  # *Ol's Mum, foreigners..
-        "text": entry.text,
-        "expedition": entry.expedition,
-        "time_underground": entry.tu,
-        "cave": entry.tripcave,
-    }
-    coUniqueAttribs = {"slug": entry.tid, "date": entry.tripdate, "title": entry.triptitle}
-    if LogbookEntry.objects.filter(slug=entry.tid).exists():
-        # oops. Our code should already have ensured this is unique.
-        message = " ! - DUPLICATE SLUG for logbook entry " + entry.tripdate + " - " + entry.tid
-        DataIssue.objects.create(parser="logbooks", message=message)
-        slug = slug + "_" + unique_slug(text,2)
-    lbo = LogbookEntry.objects.create(**otherAttribs, **coUniqueAttribs)
-    pt_list = []
-    for tripperson, nickname_used, time_underground in entry.trippersons:
-        coUniqueAttribs = {
-            "personexpedition": tripperson,
-            "nickname_used": nickname_used,
-            "logbook_entry": lbo
-        }  # lbo is primary key
-        otherAttribs = {
-            "time_underground": time_underground,
-            "is_logbook_entry_author": (tripperson == entry.author)}
-        pt_list.append(PersonLogEntry(**otherAttribs, **coUniqueAttribs))
-    PersonLogEntry.objects.bulk_create(pt_list)
+    Bulk saves logbook entries and related personlogentry items.
+    This reduces the number of database operations significantly.
+    This replaces >2,000 calls to store_entry_into_database()
+    """
+    # 1. Prepare LogbookEntry objects
+    logbook_objs = []
+    slug_to_entrydata = {}
+    for entry in entries:
+        other_people = ", ".join(entry.guests)
+        # Ensure slug is unique, otherwise add suffix
+        slug = entry.tid
+        orig_slug = slug
+        i = 2
+        while slug in slug_to_entrydata:
+            # found duplicate
+            slug = f"{orig_slug}_{i}"
+            message = " ! - DUPLICATE SLUG for logbook entry " + entry.tripdate + " - " + entry.tid
+            DataIssue.objects.create(parser="logbooks", message=message)
+            #slug = slug + "_" + unique_slug(entry.text,2)
+            i += 1
+        slug_to_entrydata[slug] = entry
+        logbook_objs.append(LogbookEntry(
+            place=entry.place,
+            other_people=other_people,  # Ol's mum, foreigners
+            text=entry.text,
+            expedition=entry.expedition,
+            time_underground=entry.tu,
+            cave=entry.tripcave,
+            slug=slug,
+            date=entry.tripdate,
+            title=entry.triptitle,
+        ))
+    # 2. Bulk create LogbookEntry objects
+    LogbookEntry.objects.bulk_create(logbook_objs)
+    # 3. Fetch created LogbookEntry objects by slug for FK assignment
+    created_entries = {lbe.slug: lbe for lbe in LogbookEntry.objects.filter(slug__in=slug_to_entrydata.keys())}
+    # 4. Prepare PersonLogEntry objects
+    personlog_objs = []
+    for slug, entry in slug_to_entrydata.items():
+        lbo = created_entries[slug]
+        for tripperson, nickname_used, time_underground in entry.trippersons:
+            personlog_objs.append(PersonLogEntry(
+                personexpedition=tripperson,
+                nickname_used=nickname_used,
+                logbook_entry=lbo,
+                time_underground=time_underground,
+                is_logbook_entry_author=(tripperson == entry.author),
+            ))
+    # 5. Bulk create PersonLogEntry objects
+    PersonLogEntry.objects.bulk_create(personlog_objs)
 
 
 def parser_date(tripdate, year):
     """Interprets dates in the expo logbooks and returns a correct datetime.date object
@@ -707,6 +720,33 @@ def parse_logbook_for_expedition(expedition, blog=False):
     return logentries
 
+
+def _collect_logbook_entries_for_expos(expos, nologbook, ENTRIES, BLOG_PARSER_SETTINGS):
+    """Helper to collect all logbook and blog entries for a list of expeditions."""
+    allentries = []
+    loglist = []
+    bloglist = []
+
+    for expo in expos:
+        year = expo.year
+        if year not in nologbook:
+            if year in ENTRIES:
+                loglist.append(expo)
+            else:
+                print(" - No Logbook entries count yet for: " + year)
+                loglist.append(expo)
+            if year in BLOG_PARSER_SETTINGS:
+                bloglist.append(expo)
+
+    for ex in loglist:
+        logentries = parse_logbook_for_expedition(ex)
+        allentries += logentries
+
+    for b in bloglist:
+        print(f" - BLOG: {b}")
+        logentries = parse_logbook_for_expedition(b, blog=True)
+        allentries += logentries
+
+    return allentries
+
+
 def LoadLogbook(year):
     """One off logbook for testing purposes, and also reloadable on '/expedition/2023?reload'
@@ -726,13 +766,15 @@ def LoadLogbook(year):
         print(
             f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
         )
-    for entry in logentries:
+    # Bulk store all entries at once
+    bulk_store_entries(logentries)
+    #for entry in logentries:
         #date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
-        if expo == entry.expedition: # unneeded check, we zeroed it before filling it
+        #if expo == entry.expedition: # unneeded check, we zeroed it before filling it
             # print(f" -- {triptitle}")
-            store_entry_into_database(entry)
-        else:
-            print(f" ! unexpected log entry labelled as '{entry.expedition}' {entry.tid}" )
+            #store_entry_into_database(entry)
+        #else:
+            #print(f" ! unexpected log entry labelled as '{entry.expedition}' {entry.tid}" )
 
     expo.save()  # to save logbook name property
 
 
 def LoadLogbooks():
@@ -772,36 +814,8 @@ def LoadLogbooks():
     sqlfail = [""]  # breaks mysql with db constraint fail - all now fixed.]
     nologbook = noexpo + lostlogbook + sqlfail
 
     nlbe = {}
 
-    loglist = []
-    bloglist = []
-
-    for expo in expos:
-        year = expo.year
-        if year in sqlfail:
-            print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
-            message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
-            DataIssue.objects.create(parser="logbooks", message=message)
-            print(message)
-
-        if year not in nologbook:
-            if year in ENTRIES:
-                loglist.append(expo)
-            else:
-                print(" - No Logbook entries count yet for: " + year)  # catch case when preparing for next expo
-                loglist.append(expo)
-            if year in BLOG_PARSER_SETTINGS:
-                bloglist.append(expo)
-
-    for ex in loglist:
-        logentries = parse_logbook_for_expedition(ex)  # this loads the logbook for one expo
-        allentries += logentries
-
-    for b in bloglist:
-        print(f" - BLOG: {b}")
-        logentries = parse_logbook_for_expedition(b, blog=True)  # loads the blog logbook for one expo
-        allentries += logentries
-
+    allentries = _collect_logbook_entries_for_expos(expos, nologbook, ENTRIES, BLOG_PARSER_SETTINGS)
+
     print(f"\n - {len(allentries):,} log entries parsed in all expeditions")
 
     mem = get_process_memory()
@@ -814,9 +828,8 @@ def LoadLogbooks():
     # - Expedition (the 'logbook.html' value)
     # - LogBookEntry (text, who when etc.)
     # - PersonLogEntry (who was on that specific trip mentioned in the logbook entry)
-    for entry in allentries:
-        # date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
-        store_entry_into_database(entry)
+    bulk_store_entries(allentries)
 
     for expo in expos:
         expo.save()  # to save logbook name property
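A final note on the bulk path: the two bulk_create() calls inside bulk_store_entries() run as separate statements, so an error between them could leave LogbookEntry rows with no PersonLogEntry rows. If that matters for a full import, the call could be wrapped in a transaction. This is a sketch using the standard Django API, not something this commit does:

    from django.db import transaction

    with transaction.atomic():
        bulk_store_entries(allentries)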