From db0504057b988ea0ccc982a53a48334084dc48bc Mon Sep 17 00:00:00 2001 From: Philip Sargent Date: Sat, 28 Jan 2023 11:45:30 +0000 Subject: [PATCH] no search before db object creation: faster --- parsers/imports.py | 2 +- parsers/logbooks.py | 79 +++++++++++++++++++++++++-------------------- 2 files changed, 45 insertions(+), 36 deletions(-) diff --git a/parsers/imports.py b/parsers/imports.py index 050d5ea..c1de034 100644 --- a/parsers/imports.py +++ b/parsers/imports.py @@ -40,7 +40,7 @@ def import_logbooks(): troggle.parsers.logbooks.LoadLogbooks() -def import_logbook(year=2016): +def import_logbook(year=2022): print(f"-- Importing Logbook {year}") with transaction.atomic(): troggle.parsers.logbooks.LoadLogbook(year) diff --git a/parsers/logbooks.py b/parsers/logbooks.py index fd1e7eb..f5250a2 100644 --- a/parsers/logbooks.py +++ b/parsers/logbooks.py @@ -204,26 +204,19 @@ def tidy_trip_persons(trippeople, expedition, logtime_underground, tid): print(message) return trippersons, author - -def store_entry_into_database(date, place, title, text, trippersons, author, expedition, logtime_underground, tid=None): - """saves a single logbook entry and related persontrips - Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday ! - - troggle.log shows that we are creating lots of duplicates, which is no no problem with SQL as they just overwrite but we are saving the same thing too many times.. - - Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because - lookupAttribs={'date':date, 'title':title} - """ - - # This needs attention. The slug field is derived from 'title' - # both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though? - # tripCave = GetTripCave(place): - + +def tidy_trip_cave(place): + # GetCaveLookup() need to work better. None of this data is *used* though? 
+ # 'tripcave' is converted to a string doing this, which renders as the cave slug. + lplace = place.lower() cave = None if lplace not in noncaveplaces: cave = GetCaveLookup().get(lplace) + return cave + +def tidy_trip_image_urls(text, date): y = str(date)[:4] text = text.replace(' src="', f' src="/years/{y}/') @@ -234,13 +227,26 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp text = text.replace("\t", "") text = text.replace("\n\n\n", "\n\n") + return text + +def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid=None): + """saves a single logbook entry and related persontrips + Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday ! + troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite but we are saving the same thing too many times.. + + Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because + lookupAttribs={'date':date, 'title':title} + """ + text = tidy_trip_image_urls(text, date) + # Check for an existing copy of the current entry, and save expedition.get_expedition_day(date) + lookupAttribs = {"date": date, "title": title} - # 'cave' is converted to a string doing this, which renders as the cave slug. # but it is a db query which we should try to avoid - rewrite this + # This needs attention. The slug field is derived from 'title' # NEW slug for a logbook entry here! Unique id + slugified title fragment if tid is not None: @@ -253,12 +259,13 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp "text": text, "expedition": expedition, "time_underground": logtime_underground, - "cave_slug": str(cave), + "cave_slug": str(tripcave), "slug": slug, } - + # Rewriting as we know prior objects have already been deleted. 
# This creates the lbo instance of LogbookEntry - lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs) + lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs) + # lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs) # for PersonTrip time_underground is float (decimal hours) for tripperson, time_underground in trippersons: @@ -266,7 +273,8 @@ def store_entry_into_database(date, place, title, text, trippersons, author, exp lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo} nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)} # this creates the PersonTrip instance. - save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs) + pt = PersonTrip.objects.create(**nonLookupAttribs, **lookupAttribs) + # save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs) def parser_date(tripdate, year): """Interprets dates in the expo logbooks and returns a correct datetime.date object""" @@ -386,9 +394,9 @@ def parser_html(year, expedition, txt, seq=""): ldate = parser_date(tripdate.strip(), year) triptitles = triptitle.split(" - ") if len(triptitles) >= 2: - tripcave = triptitles[0] + place = triptitles[0] else: - tripcave = "UNKNOWN" + place = "UNKNOWN" ltriptext = re.sub(r"

", "", triptext) # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext) ltriptext = re.sub(r"

", "

", ltriptext).strip() @@ -404,9 +412,10 @@ def parser_html(year, expedition, txt, seq=""): dupl[check] = 1 tu = tidy_time_underground(tu) - trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid) - entrytuple = (ldate, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1) + tripcave = tidy_trip_cave(place) + + entrytuple = (ldate, place, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1) logentries.append(entrytuple) @@ -494,7 +503,7 @@ def parser_blog(year, expedition, txt, sq=""): # print(f" - tid: {tid} '{trippeople}' '{tripdate}'") # triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'. - location = "Unknown" + place = "Unknown" # triptitle must be unique for a given date. We can enforce this here. triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}" tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent) @@ -505,7 +514,9 @@ def parser_blog(year, expedition, txt, sq=""): tripcontent = f"\n\n\nBlog Author: {trippeople}" + tripcontent trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid) - entrytuple = (tripdate, location, triptitle, tripcontent, trippersons, author, expedition, tu, tid) + tripcave = tidy_trip_cave(place) + + entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid) logentries.append(entrytuple) def clean_all_logbooks(): @@ -581,10 +592,6 @@ def parse_logbook_for_expedition(expedition, blog=False): print(f" - {year} parsing with {parsefunc} - {lb}") parser(year, expedition, txt, sq) # this launches the right parser for this year # -------------------- - # move database storage into separate step - # for entrytuple in logentries: - # date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple - # 
store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) if len(logentries) == expect: # print(f"OK {year} {len(logentries):5d} is {expect}\n") @@ -614,10 +621,12 @@ def LoadLogbook(year): f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}" ) for entrytuple in logentries: - date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple - if expo == expedition: + date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple + if expo == expedition: # unneeded check, we zeroed it before filling it #print(f" - {triptitle}") - store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) + store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) + else: + print(f" ! unexpected log entry labelled as '{expedition}' {tripid1}" ) expedition.save() # to save logbook name property def LoadLogbooks(): @@ -703,8 +712,8 @@ def LoadLogbooks(): # - LogBookEntry (text, who when etc.) # - PersonTrip (who was on that specific trip mentione din the logbook entry) for entrytuple in allentries: - date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple - store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) + date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple + store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1) for expo in expos: expedition.save() # to save logbook name property