no search before db object creation: faster

This commit is contained in:
Philip Sargent 2023-01-28 11:45:30 +00:00
parent e4c804b305
commit db0504057b
2 changed files with 45 additions and 36 deletions

View File

@ -40,7 +40,7 @@ def import_logbooks():
troggle.parsers.logbooks.LoadLogbooks() troggle.parsers.logbooks.LoadLogbooks()
def import_logbook(year=2016): def import_logbook(year=2022):
print(f"-- Importing Logbook {year}") print(f"-- Importing Logbook {year}")
with transaction.atomic(): with transaction.atomic():
troggle.parsers.logbooks.LoadLogbook(year) troggle.parsers.logbooks.LoadLogbook(year)

View File

@ -205,25 +205,18 @@ def tidy_trip_persons(trippeople, expedition, logtime_underground, tid):
return trippersons, author return trippersons, author
def store_entry_into_database(date, place, title, text, trippersons, author, expedition, logtime_underground, tid=None): def tidy_trip_cave(place):
"""saves a single logbook entry and related persontrips # GetCaveLookup() need to work better. None of this data is *used* though?
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday ! # 'tripcave' is converted to a string doing this, which renders as the cave slug.
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite but we are saving the same thing too many times..
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
lookupAttribs={'date':date, 'title':title}
"""
# This needs attention. The slug field is derived from 'title'
# both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
# tripCave = GetTripCave(place):
lplace = place.lower() lplace = place.lower()
cave = None cave = None
if lplace not in noncaveplaces: if lplace not in noncaveplaces:
cave = GetCaveLookup().get(lplace) cave = GetCaveLookup().get(lplace)
return cave
def tidy_trip_image_urls(text, date):
y = str(date)[:4] y = str(date)[:4]
text = text.replace(' src="', f' src="/years/{y}/') text = text.replace(' src="', f' src="/years/{y}/')
@ -234,13 +227,26 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp
text = text.replace("\t", "") text = text.replace("\t", "")
text = text.replace("\n\n\n", "\n\n") text = text.replace("\n\n\n", "\n\n")
return text
def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid=None):
"""saves a single logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite but we are saving the same thing too many times..
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
lookupAttribs={'date':date, 'title':title}
"""
text = tidy_trip_image_urls(text, date)
# Check for an existing copy of the current entry, and save # Check for an existing copy of the current entry, and save
expedition.get_expedition_day(date) expedition.get_expedition_day(date)
lookupAttribs = {"date": date, "title": title} lookupAttribs = {"date": date, "title": title}
# 'cave' is converted to a string doing this, which renders as the cave slug.
# but it is a db query which we should try to avoid - rewrite this # but it is a db query which we should try to avoid - rewrite this
# This needs attention. The slug field is derived from 'title'
# NEW slug for a logbook entry here! Unique id + slugified title fragment # NEW slug for a logbook entry here! Unique id + slugified title fragment
if tid is not None: if tid is not None:
@ -253,12 +259,13 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp
"text": text, "text": text,
"expedition": expedition, "expedition": expedition,
"time_underground": logtime_underground, "time_underground": logtime_underground,
"cave_slug": str(cave), "cave_slug": str(tripcave),
"slug": slug, "slug": slug,
} }
# Rewriting as we know prior objects have already been deleted.
# This creates the lbo instance of LogbookEntry # This creates the lbo instance of LogbookEntry
lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs) lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
# lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
# for PersonTrip time_underground is float (decimal hours) # for PersonTrip time_underground is float (decimal hours)
for tripperson, time_underground in trippersons: for tripperson, time_underground in trippersons:
@ -266,7 +273,8 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp
lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo} lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo}
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)} nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
# this creates the PersonTrip instance. # this creates the PersonTrip instance.
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs) pt = PersonTrip.objects.create(**nonLookupAttribs, **lookupAttribs)
# save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
def parser_date(tripdate, year): def parser_date(tripdate, year):
"""Interprets dates in the expo logbooks and returns a correct datetime.date object""" """Interprets dates in the expo logbooks and returns a correct datetime.date object"""
@ -386,9 +394,9 @@ def parser_html(year, expedition, txt, seq=""):
ldate = parser_date(tripdate.strip(), year) ldate = parser_date(tripdate.strip(), year)
triptitles = triptitle.split(" - ") triptitles = triptitle.split(" - ")
if len(triptitles) >= 2: if len(triptitles) >= 2:
tripcave = triptitles[0] place = triptitles[0]
else: else:
tripcave = "UNKNOWN" place = "UNKNOWN"
ltriptext = re.sub(r"</p>", "", triptext) ltriptext = re.sub(r"</p>", "", triptext)
# ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip() ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
@ -404,9 +412,10 @@ def parser_html(year, expedition, txt, seq=""):
dupl[check] = 1 dupl[check] = 1
tu = tidy_time_underground(tu) tu = tidy_time_underground(tu)
trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid) trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
entrytuple = (ldate, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1) tripcave = tidy_trip_cave(place)
entrytuple = (ldate, place, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
logentries.append(entrytuple) logentries.append(entrytuple)
@ -494,7 +503,7 @@ def parser_blog(year, expedition, txt, sq=""):
# print(f" - tid: {tid} '{trippeople}' '{tripdate}'") # print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
# triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'. # triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'.
location = "Unknown" place = "Unknown"
# triptitle must be unique for a given date. We can enforce this here. # triptitle must be unique for a given date. We can enforce this here.
triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}" triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"
tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent) tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
@ -505,7 +514,9 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid) trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
entrytuple = (tripdate, location, triptitle, tripcontent, trippersons, author, expedition, tu, tid) tripcave = tidy_trip_cave(place)
entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
logentries.append(entrytuple) logentries.append(entrytuple)
def clean_all_logbooks(): def clean_all_logbooks():
@ -581,10 +592,6 @@ def parse_logbook_for_expedition(expedition, blog=False):
print(f" - {year} parsing with {parsefunc} - {lb}") print(f" - {year} parsing with {parsefunc} - {lb}")
parser(year, expedition, txt, sq) # this launches the right parser for this year parser(year, expedition, txt, sq) # this launches the right parser for this year
# -------------------- # --------------------
# move database storage into separate step
# for entrytuple in logentries:
# date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
# store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
if len(logentries) == expect: if len(logentries) == expect:
# print(f"OK {year} {len(logentries):5d} is {expect}\n") # print(f"OK {year} {len(logentries):5d} is {expect}\n")
@ -614,10 +621,12 @@ def LoadLogbook(year):
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}" f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
) )
for entrytuple in logentries: for entrytuple in logentries:
date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
if expo == expedition: if expo == expedition: # unneeded check, we zeroed it before filling it
#print(f" - {triptitle}") #print(f" - {triptitle}")
store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
else:
print(f" ! unexpected log entry labelled as '{expedition}' {tripid1}" )
expedition.save() # to save logbook name property expedition.save() # to save logbook name property
def LoadLogbooks(): def LoadLogbooks():
@ -703,8 +712,8 @@ def LoadLogbooks():
# - LogBookEntry (text, who when etc.) # - LogBookEntry (text, who when etc.)
# - PersonTrip (who was on that specific trip mentioned in the logbook entry) # - PersonTrip (who was on that specific trip mentioned in the logbook entry)
for entrytuple in allentries: for entrytuple in allentries:
date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
for expo in expos: for expo in expos:
expedition.save() # to save logbook name property expedition.save() # to save logbook name property