diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 5311031..aa1b290 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -28,11 +28,6 @@ todo = """
 - check cross-references to specific logbook entries in other logbooks and other HTML frahments e.g. cave descriptions
 
-- Most of the time is during the database writing (6s out of 8s).
-profile the code to find bad repetitive things, of which there are many. But probably we just have too many Django database operations.
-Currently we store each entry individually. It should be done using Django bulk entry.
-Look at Person & PersonExpedition all in python in parsers/people.py and then commit as two bulk transactions. test if links between them work when done like that.
-
 - attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted in the DataIssue bug output
 
 - rewrite to use generators rather than storing everything intermediate in lists - to
 
@@ -299,46 +294,64 @@ def tidy_tid(tid, title, date):
         tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
     return tid
 
-def store_entry_into_database(entry):
-    """saves a single logbook entry and related personlogentry items
-
-    We could do a bulk update to save all the entries, but then we would need to do a query on
-    each one to get the primary key to assign to the PersonLogEntries. So overall probably not much
-    faster ?
+def bulk_store_entries(entries):
     """
-    other_people = ", ".join(entry.guests)  # join list members separated by comma
-    # if guests:
-    #     print(f"  {date} - {guests}")
-
-    otherAttribs = {
-        "place": entry.place,
-        "other_people": other_people,  # *Ol's Mum, foreigners..
-        "text": entry.text,
-        "expedition": entry.expedition,
-        "time_underground": entry.tu,
-        "cave": entry.tripcave,
-    }
-    coUniqueAttribs = {"slug": entry.tid, "date": entry.tripdate, "title": entry.triptitle}
-    if LogbookEntry.objects.filter(slug=entry.tid).exists():
-        # oops. Our code should already have ensured this is unique.
-        message = " ! - DUPLICATE SLUG for logbook entry " + entry.tripdate + " - " + entry.tid
-        DataIssue.objects.create(parser="logbooks", message=message)
-        slug = slug + "_" + unique_slug(text,2)
+    Bulk saves logbook entries and related personlogentry items.
+    This reduces the number of database operations significantly.
+    This replaces >2,000 calls to store_entry_into_database()
+    """
+    # 1. Prepare LogbookEntry objects
+    logbook_objs = []
+    slug_to_entrydata = {}
+    for entry in entries:
+        other_people = ", ".join(entry.guests)
+        # Ensure slug is unique, otherwise add suffix
+        slug = entry.tid
+        orig_slug = slug
+        i = 2
+        while slug in slug_to_entrydata:
+            # found duplicate
+            slug = f"{orig_slug}_{i}"
+            message = " ! - DUPLICATE SLUG for logbook entry " + entry.tripdate + " - " + entry.tid
+            DataIssue.objects.create(parser="logbooks", message=message)
+            #slug = slug + "_" + unique_slug(entry.text,2)
+            i += 1
+        slug_to_entrydata[slug] = entry
+
+        logbook_objs.append(LogbookEntry(
+            place=entry.place,
+            other_people=other_people,  # Ol's mum, foreigners
+            text=entry.text,
+            expedition=entry.expedition,
+            time_underground=entry.tu,
+            cave=entry.tripcave,
+            slug=slug,
+            date=entry.tripdate,
+            title=entry.triptitle,
+        ))
+
+    # 2. Bulk create LogbookEntry objects
+    LogbookEntry.objects.bulk_create(logbook_objs)
+
+    # 3. Fetch created LogbookEntry objects by slug for FK assignment
+    created_entries = {lbe.slug: lbe for lbe in LogbookEntry.objects.filter(slug__in=slug_to_entrydata.keys())}
+
+    # 4. Prepare PersonLogEntry objects
+    personlog_objs = []
+    for slug, entry in slug_to_entrydata.items():
+        lbo = created_entries[slug]
+        for tripperson, nickname_used, time_underground in entry.trippersons:
+            personlog_objs.append(PersonLogEntry(
+                personexpedition=tripperson,
+                nickname_used=nickname_used,
+                logbook_entry=lbo,
+                time_underground=time_underground,
+                is_logbook_entry_author=(tripperson == entry.author),
+            ))
+
+    # 5. Bulk create PersonLogEntry objects
+    PersonLogEntry.objects.bulk_create(personlog_objs)
-    lbo = LogbookEntry.objects.create(**otherAttribs, **coUniqueAttribs)
-
-    pt_list = []
-    for tripperson, nickname_used, time_underground in entry.trippersons:
-        coUniqueAttribs = {
-            "personexpedition": tripperson,
-            "nickname_used": nickname_used,
-            "logbook_entry": lbo
-        }  # lbo is primary key
-        otherAttribs = {
-            "time_underground": time_underground,
-            "is_logbook_entry_author": (tripperson == entry.author)}
-        pt_list.append(PersonLogEntry(**otherAttribs, **coUniqueAttribs))
-    PersonLogEntry.objects.bulk_create(pt_list)
 
 
 def parser_date(tripdate, year):
     """Interprets dates in the expo logbooks and returns a correct datetime.date object
@@ -707,6 +720,33 @@ def parse_logbook_for_expedition(expedition, blog=False):
 
     return logentries
 
+def _collect_logbook_entries_for_expos(expos, nologbook, ENTRIES, BLOG_PARSER_SETTINGS):
+    """Helper to collect all logbook and blog entries for a list of expeditions."""
+    allentries = []
+    loglist = []
+    bloglist = []
+
+    for expo in expos:
+        year = expo.year
+        if year not in nologbook:
+            if year in ENTRIES:
+                loglist.append(expo)
+            else:
+                print(" - No Logbook entries count yet for: " + year)
+                loglist.append(expo)
+            if year in BLOG_PARSER_SETTINGS:
+                bloglist.append(expo)
+
+    for ex in loglist:
+        logentries = parse_logbook_for_expedition(ex)
+        allentries += logentries
+
+    for b in bloglist:
+        print(f" - BLOG: {b}")
+        logentries = parse_logbook_for_expedition(b, blog=True)
+        allentries += logentries
+
+    return allentries
 
 def LoadLogbook(year):
     """One off logbook for testing purposes, and also reloadable on '/expedition/2023?reload'
@@ -726,13 +766,15 @@ def LoadLogbook(year):
         print(
            f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
        )
-    for entry in logentries:
+    # Bulk store all entries at once
+    bulk_store_entries(logentries)
+    #for entry in logentries:
        #date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
-        if expo == entry.expedition:  # unneeded check, we zeroed it before filling it
+        #if expo == entry.expedition:  # unneeded check, we zeroed it before filling it
            # print(f" -- {triptitle}")
-            store_entry_into_database(entry)
-        else:
-            print(f" ! unexpected log entry labelled as '{entry.expedition}' {entry.tid}" )
+            #store_entry_into_database(entry)
+        #else:
+            #print(f" ! unexpected log entry labelled as '{entry.expedition}' {entry.tid}" )
     expo.save()  # to save logbook name property
 
 def LoadLogbooks():
@@ -772,36 +814,8 @@ def LoadLogbooks():
     sqlfail = [""]  # breaks mysql with db constraint fail - all now fixed.]
     nologbook = noexpo + lostlogbook + sqlfail
 
-    nlbe = {}
-    loglist = []
-    bloglist = []
-
-    for expo in expos:
-        year = expo.year
-        if year in sqlfail:
-            print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
-            message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
-            DataIssue.objects.create(parser="logbooks", message=message)
-            print(message)
-
-        if year not in nologbook:
-            if year in ENTRIES:
-                loglist.append(expo)
-            else:
-                print(" - No Logbook entries count yet for: " + year)  # catch case when preparing for next expo
-                loglist.append(expo)
-
-            if year in BLOG_PARSER_SETTINGS:
-                bloglist.append(expo)
-
-    for ex in loglist:
-        logentries = parse_logbook_for_expedition(ex)  # this loads the logbook for one expo
-        allentries += logentries
-
-    for b in bloglist:
-        print(f" - BLOG: {b}")
-        logentries = parse_logbook_for_expedition(b, blog=True)  # loads the blog logbook for one expo
-        allentries += logentries
+    allentries = _collect_logbook_entries_for_expos(expos, nologbook, ENTRIES, BLOG_PARSER_SETTINGS)
+
     print(f"\n - {len(allentries):,} log entries parsed in all expeditions")
 
     mem = get_process_memory()
@@ -814,9 +828,8 @@ def LoadLogbooks():
 
     # - Expedition (the 'logbook.html' value)
     # - LogBookEntry (text, who when etc.)
     # - PersonLogEntry (who was on that specific trip mentione din the logbook entry)
-    for entry in allentries:
-        # date, place, tripcave, triptitle, text, trippersons, author, guests, expedition, tu, tid = entrytuple
-        store_entry_into_database(entry)
+    bulk_store_entries(allentries)
+
    for expo in expos:
        expo.save()  # to save logbook name property
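
Reviewer note on the pattern used in bulk_store_entries(): Django's bulk_create() is only guaranteed to populate auto-increment primary keys on the created objects on some backends (PostgreSQL, for example; MySQL typically does not), which is presumably why this change bulk-inserts the LogbookEntry rows first, re-fetches them by their unique slug, and only then builds the PersonLogEntry rows that need the foreign key. Below is a minimal, self-contained sketch of that same two-step pattern; the Parent/Child models, field names, and the "demo" app label are hypothetical stand-ins for LogbookEntry/PersonLogEntry, not troggle code:

    # Sketch only: hypothetical models standing in for LogbookEntry / PersonLogEntry.
    # Assumes an already-configured Django project containing an app named "demo".
    from django.db import models

    class Parent(models.Model):
        slug = models.SlugField(unique=True)

        class Meta:
            app_label = "demo"

    class Child(models.Model):
        parent = models.ForeignKey(Parent, on_delete=models.CASCADE)
        name = models.CharField(max_length=100)

        class Meta:
            app_label = "demo"

    def bulk_store(rows):
        """rows: list of (slug, [child names]) pairs; slugs assumed already unique."""
        slugs = [slug for slug, _ in rows]

        # Step 1: one INSERT for all parents instead of one query per row.
        Parent.objects.bulk_create([Parent(slug=slug) for slug in slugs])

        # Step 2: re-fetch by the unique key, so the FK assignment below works even
        # on backends where bulk_create() does not set pk on the created objects.
        by_slug = {p.slug: p for p in Parent.objects.filter(slug__in=slugs)}

        # Step 3: one INSERT for all children, each pointing at its parent.
        Child.objects.bulk_create(
            [Child(parent=by_slug[slug], name=name) for slug, names in rows for name in names]
        )

The extra SELECT in step 2 costs one query for the whole batch, which is still far cheaper than the per-entry pattern this diff removes (>2,000 calls to store_entry_into_database(), each doing its own INSERTs and an existence query).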