Mirror of https://expo.survex.com/repositories/troggle/.git, synced 2025-12-19 07:17:20 +00:00
bulk update seriously speeds up logbook database storing
@@ -28,11 +28,6 @@ todo = """
 - check cross-references to specific logbook entries in other logbooks and other HTML frahments
 e.g. cave descriptions
 
-- Most of the time is during the database writing (6s out of 8s).
-profile the code to find bad repetitive things, of which there are many. But probably we just have too many Django database operations.
-Currently we store each entry individually. It should be done using Django bulk entry.
-Look at Person & PersonExpedition all in python in parsers/people.py and then commit as two bulk transactions. test if links between them work when done like that.
-
 - attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted in the DataIssue bug output
 
 - rewrite to use generators rather than storing everything intermediate in lists - to
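Note: the removed todo items above describe the problem this commit fixes: each logbook entry used to be saved with its own INSERT. Django's bulk_create() batches many rows into one query (or a few batched ones), which is where the speed-up comes from. A minimal generic sketch of the idea, using an illustrative stand-in model that is not part of troggle:

    # Illustrative sketch only; "Item" is a stand-in model, not troggle code.
    from django.db import models

    class Item(models.Model):
        name = models.CharField(max_length=100)

    def store_items_one_by_one(names):
        for n in names:
            Item.objects.create(name=n)  # one INSERT round-trip per object

    def store_items_in_bulk(names):
        # Build unsaved instances in memory, then insert them in batches
        # with a single call; this is the "Django bulk entry" the todo mentions.
        Item.objects.bulk_create([Item(name=n) for n in names], batch_size=500)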
@@ -299,46 +294,64 @@ def tidy_tid(tid, title, date):
         tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
     return tid
 
-def store_entry_into_database(entry):
-    """saves a single logbook entry and related personlogentry items
-
-    We could do a bulk update to save all the entries, but then we would need to do a query on
-    each one to get the primary key to assign to the PersonLogEntries. So overall probably not much
-    faster ?
-    """
-    other_people = ", ".join(entry.guests)  # join list members separated by comma
-    # if guests:
-    #     print(f"  {date} - {guests}")
-
-    otherAttribs = {
-        "place": entry.place,
-        "other_people": other_people,  # *Ol's Mum, foreigners..
-        "text": entry.text,
-        "expedition": entry.expedition,
-        "time_underground": entry.tu,
-        "cave": entry.tripcave,
-    }
-    coUniqueAttribs = {"slug": entry.tid, "date": entry.tripdate, "title": entry.triptitle}
-    if LogbookEntry.objects.filter(slug=entry.tid).exists():
-        # oops. Our code should already have ensured this is unique.
-        message = " ! - DUPLICATE SLUG for logbook entry " + entry.tripdate + " - " + entry.tid
-        DataIssue.objects.create(parser="logbooks", message=message)
-        slug = slug + "_" + unique_slug(text,2)
-
-    lbo = LogbookEntry.objects.create(**otherAttribs, **coUniqueAttribs)
-
-    pt_list = []
-    for tripperson, nickname_used, time_underground in entry.trippersons:
-        coUniqueAttribs = {
-            "personexpedition": tripperson,
-            "nickname_used": nickname_used,
-            "logbook_entry": lbo
-        }  # lbo is primary key
-        otherAttribs = {
-            "time_underground": time_underground,
-            "is_logbook_entry_author": (tripperson == entry.author)}
-        pt_list.append(PersonLogEntry(**otherAttribs, **coUniqueAttribs))
-    PersonLogEntry.objects.bulk_create(pt_list)
+def bulk_store_entries(entries):
+    """
+    Bulk saves logbook entries and related personlogentry items.
+    This reduces the number of database operations significantly.
+    This replaces >2,000 calls to store_entry_into_database()
+    """
+    # 1. Prepare LogbookEntry objects
+    logbook_objs = []
+    slug_to_entrydata = {}
+    for entry in entries:
+        other_people = ", ".join(entry.guests)
+        # Ensure slug is unique, otherwise add suffix
+        slug = entry.tid
+        orig_slug = slug
+        i = 2
+        while slug in slug_to_entrydata:
+            # found duplicate
+            slug = f"{orig_slug}_{i}"
+            message = " ! - DUPLICATE SLUG for logbook entry " + entry.tripdate + " - " + entry.tid
+            DataIssue.objects.create(parser="logbooks", message=message)
+            #slug = slug + "_" + unique_slug(entry.text,2)
+            i += 1
+        slug_to_entrydata[slug] = entry
+
+        logbook_objs.append(LogbookEntry(
+            place=entry.place,
+            other_people=other_people,  # Ol's mum, foreigners
+            text=entry.text,
+            expedition=entry.expedition,
+            time_underground=entry.tu,
+            cave=entry.tripcave,
+            slug=slug,
+            date=entry.tripdate,
+            title=entry.triptitle,
+        ))
+
+    # 2. Bulk create LogbookEntry objects
+    LogbookEntry.objects.bulk_create(logbook_objs)
+
+    # 3. Fetch created LogbookEntry objects by slug for FK assignment
+    created_entries = {lbe.slug: lbe for lbe in LogbookEntry.objects.filter(slug__in=slug_to_entrydata.keys())}
+
+    # 4. Prepare PersonLogEntry objects
+    personlog_objs = []
+    for slug, entry in slug_to_entrydata.items():
+        lbo = created_entries[slug]
+        for tripperson, nickname_used, time_underground in entry.trippersons:
+            personlog_objs.append(PersonLogEntry(
+                personexpedition=tripperson,
+                nickname_used=nickname_used,
+                logbook_entry=lbo,
+                time_underground=time_underground,
+                is_logbook_entry_author=(tripperson == entry.author),
+            ))
+
+    # 5. Bulk create PersonLogEntry objects
+    PersonLogEntry.objects.bulk_create(personlog_objs)
+
 
 def parser_date(tripdate, year):
     """Interprets dates in the expo logbooks and returns a correct datetime.date object
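Note on the new bulk_store_entries() above: bulk_create() only sets primary keys on the returned objects on database backends that support returning IDs, so the code re-queries LogbookEntry by slug (step 3) before building the PersonLogEntry rows that need the foreign key. On a backend that does return IDs, such as PostgreSQL, the re-fetch could in principle be skipped. A sketch of that variant follows; it is an assumption about the backend, not what this commit does, and entries_in_insert_order is a hypothetical list in the same order as logbook_objs:

    # Hypothetical variant: relies on the backend (e.g. PostgreSQL) setting
    # primary keys on the objects returned by bulk_create().
    created = LogbookEntry.objects.bulk_create(logbook_objs)
    personlog_objs = []
    for lbe, entry in zip(created, entries_in_insert_order):
        for tripperson, nickname_used, time_underground in entry.trippersons:
            personlog_objs.append(PersonLogEntry(
                personexpedition=tripperson,
                nickname_used=nickname_used,
                logbook_entry=lbe,  # pk already set, no re-query needed
                time_underground=time_underground,
                is_logbook_entry_author=(tripperson == entry.author),
            ))
    PersonLogEntry.objects.bulk_create(personlog_objs)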
@@ -707,6 +720,33 @@ def parse_logbook_for_expedition(expedition, blog=False):
 
     return logentries
 
+def _collect_logbook_entries_for_expos(expos, nologbook, ENTRIES, BLOG_PARSER_SETTINGS):
+    """Helper to collect all logbook and blog entries for a list of expeditions."""
+    allentries = []
+    loglist = []
+    bloglist = []
+
+    for expo in expos:
+        year = expo.year
+        if year not in nologbook:
+            if year in ENTRIES:
+                loglist.append(expo)
+            else:
+                print(" - No Logbook entries count yet for: " + year)
+                loglist.append(expo)
+            if year in BLOG_PARSER_SETTINGS:
+                bloglist.append(expo)
+
+    for ex in loglist:
+        logentries = parse_logbook_for_expedition(ex)
+        allentries += logentries
+
+    for b in bloglist:
+        print(f" - BLOG: {b}")
+        logentries = parse_logbook_for_expedition(b, blog=True)
+        allentries += logentries
+
+    return allentries
+
 def LoadLogbook(year):
     """One off logbook for testing purposes, and also reloadable on '/expedition/2023?reload'
@@ -726,13 +766,15 @@ def LoadLogbook(year):
         print(
             f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
         )
-    for entry in logentries:
+    # Bulk store all entries at once
+    bulk_store_entries(logentries)
+    #for entry in logentries:
         #date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
-        if expo == entry.expedition: # unneeded check, we zeroed it before filling it
+        #if expo == entry.expedition: # unneeded check, we zeroed it before filling it
             # print(f" -- {triptitle}")
-            store_entry_into_database(entry)
-        else:
-            print(f" ! unexpected log entry labelled as '{entry.expedition}' {entry.tid}" )
+            #store_entry_into_database(entry)
+        #else:
+            #print(f" ! unexpected log entry labelled as '{entry.expedition}' {entry.tid}" )
     expo.save()  # to save logbook name property
 
 def LoadLogbooks():
@@ -772,36 +814,8 @@ def LoadLogbooks():
     sqlfail = [""]  # breaks mysql with db constraint fail - all now fixed.]
     nologbook = noexpo + lostlogbook + sqlfail
 
-    nlbe = {}
-    loglist = []
-    bloglist = []
-
-    for expo in expos:
-        year = expo.year
-        if year in sqlfail:
-            print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
-            message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
-            DataIssue.objects.create(parser="logbooks", message=message)
-            print(message)
-
-        if year not in nologbook:
-            if year in ENTRIES:
-                loglist.append(expo)
-            else:
-                print(" - No Logbook entries count yet for: " + year)  # catch case when preparing for next expo
-                loglist.append(expo)
-
-            if year in BLOG_PARSER_SETTINGS:
-                bloglist.append(expo)
-
-    for ex in loglist:
-        logentries = parse_logbook_for_expedition(ex)  # this loads the logbook for one expo
-        allentries += logentries
-
-    for b in bloglist:
-        print(f" - BLOG: {b}")
-        logentries = parse_logbook_for_expedition(b, blog=True)  # loads the blog logbook for one expo
-        allentries += logentries
+    allentries = _collect_logbook_entries_for_expos(expos, nologbook, ENTRIES, BLOG_PARSER_SETTINGS)
+
 
     print(f"\n - {len(allentries):,} log entries parsed in all expeditions")
     mem = get_process_memory()
@@ -814,9 +828,8 @@ def LoadLogbooks():
     # - Expedition (the 'logbook.html' value)
     # - LogBookEntry (text, who when etc.)
    # - PersonLogEntry (who was on that specific trip mentione din the logbook entry)
-    for entry in allentries:
-        # date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
-        store_entry_into_database(entry)
+    bulk_store_entries(allentries)
+
 
     for expo in expos:
         expo.save()  # to save logbook name property
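Usage note (a suggestion, not part of this commit): the logbook entries and the person-log rows are now written by two separate bulk_create() calls, so a failure between them could leave LogbookEntry rows without their PersonLogEntry rows. Wrapping the call in a transaction would keep the two tables consistent. A minimal sketch:

    from django.db import transaction

    with transaction.atomic():
        bulk_store_entries(allentries)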