From db0504057b988ea0ccc982a53a48334084dc48bc Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Sat, 28 Jan 2023 11:45:30 +0000 Subject: [PATCH] no search before db object creation: faster --- parsers/imports.py | 2 +- parsers/logbooks.py | 79 +++++++++++++++++++++++++-------------------- 2 files changed, 45 insertions(+), 36 deletions(-) diff --git a/parsers/imports.py b/parsers/imports.py index 050d5ea..c1de034 100644 --- a/parsers/imports.py +++ b/parsers/imports.py @@ -40,7 +40,7 @@ def import_logbooks(): troggle.parsers.logbooks.LoadLogbooks() -def import_logbook(year=2016): +def import_logbook(year=2022): print(f"-- Importing Logbook {year}") with transaction.atomic(): troggle.parsers.logbooks.LoadLogbook(year) diff --git a/parsers/logbooks.py b/parsers/logbooks.py index fd1e7eb..f5250a2 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -204,26 +204,19 @@ def tidy_trip_persons(trippeople, expedition, logtime_underground, tid): print(message) return trippersons, author - -def store_entry_into_database(date, place, title, text, trippersons, author, expedition, logtime_underground, tid=None): - """saves a single logbook entry and related persontrips - Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday ! - - troggle.log shows that we are creating lots of duplicates, which is no no problem with SQL as they just overwrite but we are saving the same thing too many times.. - - Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because - lookupAttribs={'date':date, 'title':title} - """ - - # This needs attention. The slug field is derived from 'title' - # both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though? - # tripCave = GetTripCave(place): - + +def tidy_trip_cave(place): + # GetCaveLookup() need to work better. None of this data is *used* though? 
+ # 'tripcave' is converted to a string doing this, which renders as the cave slug. + lplace = place.lower() cave = None if lplace not in noncaveplaces: cave = GetCaveLookup().get(lplace) + return cave + +def tidy_trip_image_urls(text, date): y = str(date)[:4] text = text.replace(' src="', f' src="/years/{y}/') @@ -234,13 +227,26 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp text = text.replace("\t", "") text = text.replace("\n\n\n", "\n\n") + return text + +def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid=None): + """saves a single logbook entry and related persontrips + Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday ! + troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite but we are saving the same thing too many times.. + + Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because + lookupAttribs={'date':date, 'title':title} + """ + text = tidy_trip_image_urls(text, date) + # Check for an existing copy of the current entry, and save expedition.get_expedition_day(date) + lookupAttribs = {"date": date, "title": title} - # 'cave' is converted to a string doing this, which renders as the cave slug. # but it is a db query which we should try to avoid - rewrite this + # This needs attention. The slug field is derived from 'title' # NEW slug for a logbook entry here! Unique id + slugified title fragment if tid is not None: @@ -253,12 +259,13 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp "text": text, "expedition": expedition, "time_underground": logtime_underground, - "cave_slug": str(cave), + "cave_slug": str(tripcave), "slug": slug, } - + # Rewriting as we know prior objects have already been deleted. 
# This creates the lbo instance of LogbookEntry - lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs) + lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs) + # lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs) # for PersonTrip time_underground is float (decimal hours) for tripperson, time_underground in trippersons: @@ -266,7 +273,8 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo} nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)} # this creates the PersonTrip instance. - save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs) + pt = PersonTrip.objects.create(**nonLookupAttribs, **lookupAttribs) + # save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs) def parser_date(tripdate, year): """Interprets dates in the expo logbooks and returns a correct datetime.date object""" @@ -386,9 +394,9 @@ def parser_html(year, expedition, txt, seq=""): ldate = parser_date(tripdate.strip(), year) triptitles = triptitle.split(" - ") if len(triptitles) >= 2: - tripcave = triptitles[0] + place = triptitles[0] else: - tripcave = "UNKNOWN" + place = "UNKNOWN" ltriptext = re.sub(r"

", "", triptext) # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"

", "

", ltriptext).strip() @@ -404,9 +412,10 @@ def parser_html(year, expedition, txt, seq=""): dupl[check] = 1 tu = tidy_time_underground(tu) - trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid) - entrytuple = (ldate, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1) + tripcave = tidy_trip_cave(place) + + entrytuple = (ldate, place, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1) logentries.append(entrytuple) @@ -494,7 +503,7 @@ def parser_blog(year, expedition, txt, sq=""): # print(f" - tid: {tid} '{trippeople}' '{tripdate}'") # triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'. - location = "Unknown" + place = "Unknown" # triptitle must be unique for a given date. We can enforce this here. triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}" tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent) @@ -505,7 +514,9 @@ def parser_blog(year, expedition, txt, sq=""): tripcontent = f"\n\n\nBlog Author: {trippeople}" + tripcontent trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid) - entrytuple = (tripdate, location, triptitle, tripcontent, trippersons, author, expedition, tu, tid) + tripcave = tidy_trip_cave(place) + + entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid) logentries.append(entrytuple) def clean_all_logbooks(): @@ -581,10 +592,6 @@ def parse_logbook_for_expedition(expedition, blog=False): print(f" - {year} parsing with {parsefunc} - {lb}") parser(year, expedition, txt, sq) # this launches the right parser for this year # -------------------- - # move database storage into separate step - # for entrytuple in logentries: - # date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple - # 
store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) if len(logentries) == expect: # print(f"OK {year} {len(logentries):5d} is {expect}\n") @@ -614,10 +621,12 @@ def LoadLogbook(year): f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}" ) for entrytuple in logentries: - date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple - if expo == expedition: + date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple + if expo == expedition: # unneeded check, we zeroed it before filling it #print(f" - {triptitle}") - store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) + store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) + else: + print(f" ! unexpected log entry labelled as '{expedition}' {tripid1}" ) expedition.save() # to save logbook name property def LoadLogbooks(): @@ -703,8 +712,8 @@ def LoadLogbooks(): # - LogBookEntry (text, who when etc.) # - PersonTrip (who was on that specific trip mentione din the logbook entry) for entrytuple in allentries: - date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple - store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) + date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple + store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) for expo in expos: expedition.save() # to save logbook name property