mirror of https://expo.survex.com/repositories/troggle/.git synced 2025-12-19 07:17:20 +00:00

bulk update seriously speeds up logbook database storing

commit a7ec46cb70
parent 9df466de2f
2025-09-21 21:40:30 +03:00
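In outline, the change replaces a per-entry save loop with two bulk_create() calls: one for all the LogbookEntry rows, then one for all the dependent PersonLogEntry rows once the parent objects can be looked up again by their unique slug. The sketch below is a simplified editor's summary of that pattern, not the committed code; it reuses the troggle model and field names from the diff but omits most fields and the duplicate-slug handling.

    # Before (one entry at a time): an INSERT for each LogbookEntry plus a bulk INSERT of its
    # PersonLogEntry rows, repeated for every one of the >2,000 entries. Field list abbreviated.
    for entry in entries:
        lbe = LogbookEntry.objects.create(slug=entry.tid, date=entry.tripdate, title=entry.triptitle)
        PersonLogEntry.objects.bulk_create([
            PersonLogEntry(logbook_entry=lbe, personexpedition=p, nickname_used=n,
                           time_underground=t, is_logbook_entry_author=(p == entry.author))
            for p, n, t in entry.trippersons
        ])

    # After (all entries at once): one bulk INSERT for all LogbookEntry rows, one SELECT to get
    # their primary keys back by slug, then one bulk INSERT for all PersonLogEntry rows.
    LogbookEntry.objects.bulk_create(
        [LogbookEntry(slug=e.tid, date=e.tripdate, title=e.triptitle) for e in entries])
    by_slug = {lbe.slug: lbe for lbe in LogbookEntry.objects.filter(slug__in=[e.tid for e in entries])}
    PersonLogEntry.objects.bulk_create([
        PersonLogEntry(logbook_entry=by_slug[e.tid], personexpedition=p, nickname_used=n,
                       time_underground=t, is_logbook_entry_author=(p == e.author))
        for e in entries for p, n, t in e.trippersons
    ])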


@@ -28,11 +28,6 @@ todo = """
 - check cross-references to specific logbook entries in other logbooks and other HTML fragments
   e.g. cave descriptions
 
-- Most of the time is during the database writing (6s out of 8s).
-  profile the code to find bad repetitive things, of which there are many. But probably we just have too many Django database operations.
-  Currently we store each entry individually. It should be done using Django bulk entry.
-  Look at Person & PersonExpedition all in python in parsers/people.py and then commit as two bulk transactions. test if links between them work when done like that.
 
 - attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted in the DataIssue bug output
 
 - rewrite to use generators rather than storing everything intermediate in lists - to
@@ -299,46 +294,64 @@ def tidy_tid(tid, title, date):
         tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
     return tid
 
-def store_entry_into_database(entry):
-    """saves a single logbook entry and related personlogentry items
-    We could do a bulk update to save all the entries, but then we would need to do a query on
-    each one to get the primary key to assign to the PersonLogEntries. So overall probably not much
-    faster ?
+def bulk_store_entries(entries):
     """
-    other_people = ", ".join(entry.guests)  # join list members separated by comma
-    # if guests:
-    #     print(f"  {date} - {guests}")
-    otherAttribs = {
-        "place": entry.place,
-        "other_people": other_people,  # *Ol's Mum, foreigners..
-        "text": entry.text,
-        "expedition": entry.expedition,
-        "time_underground": entry.tu,
-        "cave": entry.tripcave,
-    }
-    coUniqueAttribs = {"slug": entry.tid, "date": entry.tripdate, "title": entry.triptitle}
-    if LogbookEntry.objects.filter(slug=entry.tid).exists():
-        # oops. Our code should already have ensured this is unique.
-        message = " ! - DUPLICATE SLUG for logbook entry " + entry.tripdate + " - " + entry.tid
-        DataIssue.objects.create(parser="logbooks", message=message)
-        slug = slug + "_" + unique_slug(text,2)
-    lbo = LogbookEntry.objects.create(**otherAttribs, **coUniqueAttribs)
-    pt_list = []
-    for tripperson, nickname_used, time_underground in entry.trippersons:
-        coUniqueAttribs = {
-            "personexpedition": tripperson,
-            "nickname_used": nickname_used,
-            "logbook_entry": lbo
-        }  # lbo is primary key
-        otherAttribs = {
-            "time_underground": time_underground,
-            "is_logbook_entry_author": (tripperson == entry.author)}
-        pt_list.append(PersonLogEntry(**otherAttribs, **coUniqueAttribs))
-    PersonLogEntry.objects.bulk_create(pt_list)
+    Bulk saves logbook entries and related personlogentry items.
+    This reduces the number of database operations significantly.
+    This replaces >2,000 calls to store_entry_into_database()
+    """
+    # 1. Prepare LogbookEntry objects
+    logbook_objs = []
+    slug_to_entrydata = {}
+    for entry in entries:
+        other_people = ", ".join(entry.guests)
+        # Ensure slug is unique, otherwise add suffix
+        slug = entry.tid
+        orig_slug = slug
+        i = 2
+        while slug in slug_to_entrydata:
+            # found duplicate
+            slug = f"{orig_slug}_{i}"
+            message = " ! - DUPLICATE SLUG for logbook entry " + entry.tripdate + " - " + entry.tid
+            DataIssue.objects.create(parser="logbooks", message=message)
+            #slug = slug + "_" + unique_slug(entry.text,2)
+            i += 1
+        slug_to_entrydata[slug] = entry
+        logbook_objs.append(LogbookEntry(
+            place=entry.place,
+            other_people=other_people,  # Ol's mum, foreigners
+            text=entry.text,
+            expedition=entry.expedition,
+            time_underground=entry.tu,
+            cave=entry.tripcave,
+            slug=slug,
+            date=entry.tripdate,
+            title=entry.triptitle,
+        ))
+    # 2. Bulk create LogbookEntry objects
+    LogbookEntry.objects.bulk_create(logbook_objs)
+    # 3. Fetch created LogbookEntry objects by slug for FK assignment
+    created_entries = {lbe.slug: lbe for lbe in LogbookEntry.objects.filter(slug__in=slug_to_entrydata.keys())}
+    # 4. Prepare PersonLogEntry objects
+    personlog_objs = []
+    for slug, entry in slug_to_entrydata.items():
+        lbo = created_entries[slug]
+        for tripperson, nickname_used, time_underground in entry.trippersons:
+            personlog_objs.append(PersonLogEntry(
+                personexpedition=tripperson,
+                nickname_used=nickname_used,
+                logbook_entry=lbo,
+                time_underground=time_underground,
+                is_logbook_entry_author=(tripperson == entry.author),
+            ))
+    # 5. Bulk create PersonLogEntry objects
+    PersonLogEntry.objects.bulk_create(personlog_objs)
 
 
 def parser_date(tripdate, year):
     """Interprets dates in the expo logbooks and returns a correct datetime.date object
@@ -707,6 +720,33 @@ def parse_logbook_for_expedition(expedition, blog=False):
     return logentries
 
+
+def _collect_logbook_entries_for_expos(expos, nologbook, ENTRIES, BLOG_PARSER_SETTINGS):
+    """Helper to collect all logbook and blog entries for a list of expeditions."""
+    allentries = []
+    loglist = []
+    bloglist = []
+
+    for expo in expos:
+        year = expo.year
+        if year not in nologbook:
+            if year in ENTRIES:
+                loglist.append(expo)
+            else:
+                print(" - No Logbook entries count yet for: " + year)
+                loglist.append(expo)
+            if year in BLOG_PARSER_SETTINGS:
+                bloglist.append(expo)
+
+    for ex in loglist:
+        logentries = parse_logbook_for_expedition(ex)
+        allentries += logentries
+
+    for b in bloglist:
+        print(f" - BLOG: {b}")
+        logentries = parse_logbook_for_expedition(b, blog=True)
+        allentries += logentries
+
+    return allentries
+
+
 def LoadLogbook(year):
     """One off logbook for testing purposes, and also reloadable on '/expedition/2023?reload'
@@ -726,13 +766,15 @@ def LoadLogbook(year):
         print(
             f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
         )
-    for entry in logentries:
+    # Bulk store all entries at once
+    bulk_store_entries(logentries)
+    #for entry in logentries:
         #date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
-        if expo == entry.expedition: # unneeded check, we zeroed it before filling it
+        #if expo == entry.expedition: # unneeded check, we zeroed it before filling it
             # print(f" -- {triptitle}")
-            store_entry_into_database(entry)
-        else:
-            print(f" ! unexpected log entry labelled as '{entry.expedition}' {entry.tid}" )
+            #store_entry_into_database(entry)
+        #else:
+            #print(f" ! unexpected log entry labelled as '{entry.expedition}' {entry.tid}" )
 
     expo.save()  # to save logbook name property
 
 
 def LoadLogbooks():
@@ -772,36 +814,8 @@ def LoadLogbooks():
     sqlfail = [""]  # breaks mysql with db constraint fail - all now fixed.]
     nologbook = noexpo + lostlogbook + sqlfail
 
     nlbe = {}
 
-    loglist = []
-    bloglist = []
-
-    for expo in expos:
-        year = expo.year
-        if year in sqlfail:
-            print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
-            message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
-            DataIssue.objects.create(parser="logbooks", message=message)
-            print(message)
-
-        if year not in nologbook:
-            if year in ENTRIES:
-                loglist.append(expo)
-            else:
-                print(" - No Logbook entries count yet for: " + year)  # catch case when preparing for next expo
-                loglist.append(expo)
-            if year in BLOG_PARSER_SETTINGS:
-                bloglist.append(expo)
-
-    for ex in loglist:
-        logentries = parse_logbook_for_expedition(ex)  # this loads the logbook for one expo
-        allentries += logentries
-
-    for b in bloglist:
-        print(f" - BLOG: {b}")
-        logentries = parse_logbook_for_expedition(b, blog=True)  # loads the blog logbook for one expo
-        allentries += logentries
-
+    allentries = _collect_logbook_entries_for_expos(expos, nologbook, ENTRIES, BLOG_PARSER_SETTINGS)
+
     print(f"\n - {len(allentries):,} log entries parsed in all expeditions")
 
     mem = get_process_memory()
@@ -814,9 +828,8 @@ def LoadLogbooks():
     # - Expedition (the 'logbook.html' value)
     # - LogBookEntry (text, who when etc.)
     # - PersonLogEntry (who was on that specific trip mentioned in the logbook entry)
-    for entry in allentries:
-        # date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
-        store_entry_into_database(entry)
+    bulk_store_entries(allentries)
 
     for expo in expos:
         expo.save()  # to save logbook name property
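A final note on the bulk path: the two bulk_create() calls inside bulk_store_entries() run as separate statements, so an error between them could leave LogbookEntry rows with no PersonLogEntry rows. If that matters for a full import, the call could be wrapped in a transaction. This is a sketch using the standard Django API, not something this commit does:

    from django.db import transaction

    with transaction.atomic():
        bulk_store_entries(allentries)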