no search before db object creation: faster

Philip Sargent 2023-01-28 11:45:30 +00:00
parent e4c804b305
commit db0504057b
2 changed files with 45 additions and 36 deletions
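What this commit changes: store_entry_into_database() previously went through save_carefully(), which searches for an existing row before writing; since the import now deletes all prior logbook objects before re-parsing (see the "prior objects have already been deleted" comment in the diff), that per-entry search is wasted work and the entries can be created directly. A minimal sketch of the two patterns, assuming save_carefully() behaves like Django's get_or_create(); the module path and helper behaviour below are assumptions, not taken from this diff:

    from troggle.core.models.logbooks import LogbookEntry  # module path is an assumption

    def store_with_lookup(lookupAttribs, nonLookupAttribs):
        # Old pattern: one SELECT per entry to look for an existing row,
        # then an INSERT (or update) - the "search before db object creation".
        lbo, created = LogbookEntry.objects.get_or_create(
            defaults=nonLookupAttribs, **lookupAttribs
        )
        return lbo

    def store_direct(lookupAttribs, nonLookupAttribs):
        # New pattern: prior objects were already deleted wholesale,
        # so a plain INSERT is safe and skips the per-entry search.
        return LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)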

View File

@@ -40,7 +40,7 @@ def import_logbooks():
troggle.parsers.logbooks.LoadLogbooks()
def import_logbook(year=2016):
def import_logbook(year=2022):
print(f"-- Importing Logbook {year}")
with transaction.atomic():
troggle.parsers.logbooks.LoadLogbook(year)

View File

@@ -205,25 +205,18 @@ def tidy_trip_persons(trippeople, expedition, logtime_underground, tid):
return trippersons, author
def store_entry_into_database(date, place, title, text, trippersons, author, expedition, logtime_underground, tid=None):
"""saves a single logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. Why? Because we are deprecating expeditionday!
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite, but we are saving the same thing too many times.
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
lookupAttribs={'date':date, 'title':title}
"""
# This needs attention. The slug field is derived from 'title'
# both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
# tripCave = GetTripCave(place):
def tidy_trip_cave(place):
# GetCaveLookup() needs to work better. None of this data is *used* though?
# 'tripcave' is converted to a string doing this, which renders as the cave slug.
lplace = place.lower()
cave = None
if lplace not in noncaveplaces:
cave = GetCaveLookup().get(lplace)
return cave
def tidy_trip_image_urls(text, date):
y = str(date)[:4]
text = text.replace(' src="', f' src="/years/{y}/')
@@ -234,13 +227,26 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp
text = text.replace("\t", "")
text = text.replace("\n\n\n", "\n\n")
return text
def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid=None):
"""saves a single logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. Why? Because we are deprecating expeditionday!
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite, but we are saving the same thing too many times.
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
lookupAttribs={'date':date, 'title':title}
"""
text = tidy_trip_image_urls(text, date)
# Check for an existing copy of the current entry, and save
expedition.get_expedition_day(date)
lookupAttribs = {"date": date, "title": title}
# 'cave' is converted to a string doing this, which renders as the cave slug.
# but it is a db query which we should try to avoid - rewrite this
# This needs attention. The slug field is derived from 'title'
# NEW slug for a logbook entry here! Unique id + slugified title fragment
if tid is not None:
@@ -253,12 +259,13 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp
"text": text,
"expedition": expedition,
"time_underground": logtime_underground,
"cave_slug": str(cave),
"cave_slug": str(tripcave),
"slug": slug,
}
# Rewriting as we know prior objects have already been deleted.
# This creates the lbo instance of LogbookEntry
lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
# lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
# for PersonTrip time_underground is float (decimal hours)
for tripperson, time_underground in trippersons:
@@ -266,7 +273,8 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp
lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo}
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
# this creates the PersonTrip instance.
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
pt = PersonTrip.objects.create(**nonLookupAttribs, **lookupAttribs)
# save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
def parser_date(tripdate, year):
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
@@ -386,9 +394,9 @@ def parser_html(year, expedition, txt, seq=""):
ldate = parser_date(tripdate.strip(), year)
triptitles = triptitle.split(" - ")
if len(triptitles) >= 2:
tripcave = triptitles[0]
place = triptitles[0]
else:
tripcave = "UNKNOWN"
place = "UNKNOWN"
ltriptext = re.sub(r"</p>", "", triptext)
# ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
@@ -404,9 +412,10 @@ def parser_html(year, expedition, txt, seq=""):
dupl[check] = 1
tu = tidy_time_underground(tu)
trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
entrytuple = (ldate, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
tripcave = tidy_trip_cave(place)
entrytuple = (ldate, place, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
logentries.append(entrytuple)
@@ -494,7 +503,7 @@ def parser_blog(year, expedition, txt, sq=""):
# print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
# triptitle must have the location then a hyphen at the beginning, as it is ignored by the export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'.
location = "Unknown"
place = "Unknown"
# triptitle must be unique for a given date. We can enforce this here.
triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"
tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
@@ -505,7 +514,9 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
entrytuple = (tripdate, location, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
tripcave = tidy_trip_cave(place)
entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
logentries.append(entrytuple)
def clean_all_logbooks():
@@ -581,10 +592,6 @@ def parse_logbook_for_expedition(expedition, blog=False):
print(f" - {year} parsing with {parsefunc} - {lb}")
parser(year, expedition, txt, sq) # this launches the right parser for this year
# --------------------
# move database storage into separate step
# for entrytuple in logentries:
# date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
# store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
if len(logentries) == expect:
# print(f"OK {year} {len(logentries):5d} is {expect}\n")
@@ -614,10 +621,12 @@ def LoadLogbook(year):
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
)
for entrytuple in logentries:
date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
if expo == expedition:
date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
if expo == expedition: # unneeded check, we zeroed it before filling it
#print(f" - {triptitle}")
store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
else:
print(f" ! unexpected log entry labelled as '{expedition}' {tripid1}" )
expedition.save() # to save logbook name property
def LoadLogbooks():
@@ -703,8 +712,8 @@ def LoadLogbooks():
# - LogBookEntry (text, who, when etc.)
# - PersonTrip (who was on that specific trip mentioned in the logbook entry)
for entrytuple in allentries:
date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
for expo in expos:
expedition.save() # to save logbook name property