no search before db object creation: faster

This commit is contained in:
Philip Sargent 2023-01-28 11:45:30 +00:00
parent e4c804b305
commit db0504057b
2 changed files with 45 additions and 36 deletions

View File

@ -40,7 +40,7 @@ def import_logbooks():
troggle.parsers.logbooks.LoadLogbooks() troggle.parsers.logbooks.LoadLogbooks()
def import_logbook(year=2016): def import_logbook(year=2022):
print(f"-- Importing Logbook {year}") print(f"-- Importing Logbook {year}")
with transaction.atomic(): with transaction.atomic():
troggle.parsers.logbooks.LoadLogbook(year) troggle.parsers.logbooks.LoadLogbook(year)

View File

@ -205,25 +205,18 @@ def tidy_trip_persons(trippeople, expedition, logtime_underground, tid):
return trippersons, author return trippersons, author
def store_entry_into_database(date, place, title, text, trippersons, author, expedition, logtime_underground, tid=None): def tidy_trip_cave(place):
"""saves a single logbook entry and related persontrips # GetCaveLookup() need to work better. None of this data is *used* though?
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday ! # 'tripcave' is converted to a string doing this, which renders as the cave slug.
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite but we are saving the same thing too many times..
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
lookupAttribs={'date':date, 'title':title}
"""
# This needs attention. The slug field is derived from 'title'
# both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
# tripCave = GetTripCave(place):
lplace = place.lower() lplace = place.lower()
cave = None cave = None
if lplace not in noncaveplaces: if lplace not in noncaveplaces:
cave = GetCaveLookup().get(lplace) cave = GetCaveLookup().get(lplace)
return cave
def tidy_trip_image_urls(text, date):
y = str(date)[:4] y = str(date)[:4]
text = text.replace(' src="', f' src="/years/{y}/') text = text.replace(' src="', f' src="/years/{y}/')
@ -234,13 +227,26 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp
text = text.replace("\t", "") text = text.replace("\t", "")
text = text.replace("\n\n\n", "\n\n") text = text.replace("\n\n\n", "\n\n")
return text
def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid=None):
"""saves a single logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite but we are saving the same thing too many times..
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
lookupAttribs={'date':date, 'title':title}
"""
text = tidy_trip_image_urls(text, date)
# Check for an existing copy of the current entry, and save # Check for an existing copy of the current entry, and save
expedition.get_expedition_day(date) expedition.get_expedition_day(date)
lookupAttribs = {"date": date, "title": title} lookupAttribs = {"date": date, "title": title}
# 'cave' is converted to a string doing this, which renders as the cave slug.
# but it is a db query which we should try to avoid - rewrite this # but it is a db query which we should try to avoid - rewrite this
# This needs attention. The slug field is derived from 'title'
# NEW slug for a logbook entry here! Unique id + slugified title fragment # NEW slug for a logbook entry here! Unique id + slugified title fragment
if tid is not None: if tid is not None:
@ -253,12 +259,13 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp
"text": text, "text": text,
"expedition": expedition, "expedition": expedition,
"time_underground": logtime_underground, "time_underground": logtime_underground,
"cave_slug": str(cave), "cave_slug": str(tripcave),
"slug": slug, "slug": slug,
} }
# Rewriting as we know prior objects have already been deleted.
# This creates the lbo instance of LogbookEntry # This creates the lbo instance of LogbookEntry
lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs) lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
# lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
# for PersonTrip time_underground is float (decimal hours) # for PersonTrip time_underground is float (decimal hours)
for tripperson, time_underground in trippersons: for tripperson, time_underground in trippersons:
@ -266,7 +273,8 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp
lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo} lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo}
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)} nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
# this creates the PersonTrip instance. # this creates the PersonTrip instance.
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs) pt = PersonTrip.objects.create(**nonLookupAttribs, **lookupAttribs)
# save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
def parser_date(tripdate, year): def parser_date(tripdate, year):
"""Interprets dates in the expo logbooks and returns a correct datetime.date object""" """Interprets dates in the expo logbooks and returns a correct datetime.date object"""
@ -386,9 +394,9 @@ def parser_html(year, expedition, txt, seq=""):
ldate = parser_date(tripdate.strip(), year) ldate = parser_date(tripdate.strip(), year)
triptitles = triptitle.split(" - ") triptitles = triptitle.split(" - ")
if len(triptitles) >= 2: if len(triptitles) >= 2:
tripcave = triptitles[0] place = triptitles[0]
else: else:
tripcave = "UNKNOWN" place = "UNKNOWN"
ltriptext = re.sub(r"</p>", "", triptext) ltriptext = re.sub(r"</p>", "", triptext)
# ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip() ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
@ -404,9 +412,10 @@ def parser_html(year, expedition, txt, seq=""):
dupl[check] = 1 dupl[check] = 1
tu = tidy_time_underground(tu) tu = tidy_time_underground(tu)
trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid) trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
entrytuple = (ldate, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1) tripcave = tidy_trip_cave(place)
entrytuple = (ldate, place, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
logentries.append(entrytuple) logentries.append(entrytuple)
@ -494,7 +503,7 @@ def parser_blog(year, expedition, txt, sq=""):
# print(f" - tid: {tid} '{trippeople}' '{tripdate}'") # print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
# triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'. # triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'.
location = "Unknown" place = "Unknown"
# triptitle must be unique for a given date. We can enforce this here. # triptitle must be unique for a given date. We can enforce this here.
triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}" triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"
tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent) tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
@ -505,7 +514,9 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid) trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
entrytuple = (tripdate, location, triptitle, tripcontent, trippersons, author, expedition, tu, tid) tripcave = tidy_trip_cave(place)
entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
logentries.append(entrytuple) logentries.append(entrytuple)
def clean_all_logbooks(): def clean_all_logbooks():
@ -581,10 +592,6 @@ def parse_logbook_for_expedition(expedition, blog=False):
print(f" - {year} parsing with {parsefunc} - {lb}") print(f" - {year} parsing with {parsefunc} - {lb}")
parser(year, expedition, txt, sq) # this launches the right parser for this year parser(year, expedition, txt, sq) # this launches the right parser for this year
# -------------------- # --------------------
# move database storage into separate step
# for entrytuple in logentries:
# date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
# store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
if len(logentries) == expect: if len(logentries) == expect:
# print(f"OK {year} {len(logentries):5d} is {expect}\n") # print(f"OK {year} {len(logentries):5d} is {expect}\n")
@ -614,10 +621,12 @@ def LoadLogbook(year):
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}" f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
) )
for entrytuple in logentries: for entrytuple in logentries:
date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
if expo == expedition: if expo == expedition: # unneeded check, we zeroed it before filling it
#print(f" - {triptitle}") #print(f" - {triptitle}")
store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
else:
print(f" ! unexpected log entry labelled as '{expedition}' {tripid1}" )
expedition.save() # to save logbook name property expedition.save() # to save logbook name property
def LoadLogbooks(): def LoadLogbooks():
@ -703,8 +712,8 @@ def LoadLogbooks():
# - LogBookEntry (text, who when etc.) # - LogBookEntry (text, who when etc.)
# - PersonTrip (who was on that specific trip mentioned in the logbook entry) # - PersonTrip (who was on that specific trip mentioned in the logbook entry)
for entrytuple in allentries: for entrytuple in allentries:
date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
for expo in expos: for expo in expos:
expedition.save() # to save logbook name property expedition.save() # to save logbook name property