refactored, global removed

2026-06-14 05:31:51 +01:00 · 2023-01-28 13:14:54 +00:00
parent db0504057b
commit 9e71be8169
2 changed files with 53 additions and 77 deletions
@@ -203,6 +203,8 @@ def save_carefully(objectType, lookupAttribs={}, nonLookupAttribs={}):
        defined in core.models.TroggleModel.
        
        We are not using new_since_parsing - it is a fossil from Aaron Curtis's design in 2006. So it is always false.
+        
+        NOTE: this takes twice as long as simply creating a new object with the given values.
    
    """
    try:
@@ -13,7 +13,7 @@ from django.template.defaultfilters import slugify
 from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
 from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
 from troggle.core.models.troggle import DataIssue, Expedition
-from troggle.core.utils import save_carefully, get_process_memory
+from troggle.core.utils import get_process_memory

 """
 Parses and imports logbooks in all their wonderful confusion
@@ -21,27 +21,16 @@ Parses and imports logbooks in all their wonderful confusion
    https://expo.survex.com/handbook/computing/logbooks-parsing.html
 """
 todo = """
- Most of the time is during the database writing (13s out of 14s).
+- Most of the time is during the database writing (6s out of 8s).

- Move a lot of non-db code from store_entry_into_database()
-into parse_logbook_for_expedition()
-
- call GetTripPersons at parsing time, not db writing time
 - this is a slow and uncertain function too:  cave = getCaveByReference(caveRef)

- if I am certain that we are creating from scratch, don't use save_carefully() to
-create the Django objects. And I am, because I delete the outdated stuff.
-
 - pre-compile all the heavily used regular expressions !

- refactor to get rid of the global 'logentries', very ugly indeed.
-
 - profile the code to find bad repetitive things, of which there are many.

 - attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted

- far too many uses of Django field dereferencing to get values, which is SLOW
-
 - replace explicit 1970 date with a constant EPOCH

 - rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
@@ -228,53 +217,41 @@ def tidy_trip_image_urls(text, date):
    text = text.replace("\t", "")
    text = text.replace("\n\n\n", "\n\n")
    return text
-    
-def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid=None):
-    """saves a single logbook entry and related persontrips
-    Does NOT save the expeditionday_id  - all NULLs. why? Because we are deprecating expeditionday !

-    troggle.log shows that we are creating lots of duplicates, which is no no problem with SQL as they just overwrite but we are saving the  same thing too many times..
-
-    Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
-    lookupAttribs={'date':date, 'title':title}
-    """
-    text = tidy_trip_image_urls(text, date)
-    
-    # Check for an existing copy of the current entry, and save
-    expedition.get_expedition_day(date)
-
-    lookupAttribs = {"date": date, "title": title}
-    # but it is a db query which we should try to avoid - rewrite this
-
-    # This needs attention. The slug field is derived from 'title'
-    # NEW slug for a logbook entry here! Unique id + slugified title fragment
+def tidy_tid(tid, title):

    if tid is not None:
-        slug = tid
-        # slug = tid + "_" + slugify(title)[:10].replace('-','_')
-    else:
-        slug = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
+        return tid
+
+    # print(f"!    {title=} ")
+    tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")    
+    return tid
+    
+def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid):
+    """saves a single logbook entry and related persontrips
+    Does NOT save the expeditionday_id  - all NULLs. why? Because we are deprecating expeditionday !
+    """
+    
+    # Gets the current ExpeditionDay and saves it as an object attached to
+    # the expedition, but does not attach it to the logbook entry. Why?
+    
+    # expedition.get_expedition_day(date)
+
    nonLookupAttribs = {
        "place": place,
        "text": text,
        "expedition": expedition,
        "time_underground": logtime_underground,
        "cave_slug": str(tripcave),
-        "slug": slug,
+        "slug": tid,
    }
-    # Rewriting as we know prior objects have already been deleted.
-    # This creates the lbo instance of LogbookEntry
+    lookupAttribs = {"date": date, "title": title}
    lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
-    # lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
-
-    # for PersonTrip time_underground is float (decimal hours)
+    
    for tripperson, time_underground in trippersons:
-        # print(f" -  {tid} '{tripperson}' author:{tripperson == author}")
        lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo}
        nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
-        # this creates the PersonTrip instance.
        pt = PersonTrip.objects.create(**nonLookupAttribs, **lookupAttribs)
-        # save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
        
 def parser_date(tripdate, year):
    """Interprets dates in the expo logbooks and returns a correct datetime.date object"""
@@ -321,7 +298,7 @@ def parser_html(year, expedition, txt, seq=""):
    the endmatter up to the frontmatter. This made sense when translating
    from parser_html_01 format logfiles, believe me.
    """
-    global logentries
+    logentries = []
    dupl = {}

    # extract front material and stash for later use when rebuilding from list of entries
@@ -397,9 +374,8 @@ def parser_html(year, expedition, txt, seq=""):
            place = triptitles[0]
        else:
            place = "UNKNOWN"
-        ltriptext = re.sub(r"</p>", "", triptext)
-        # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-        ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
+        tripcontent = re.sub(r"</p>", "", triptext)
+        tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()

        triptitle = triptitle.strip()
        # triptitle must be unique for a given date. We fix this here.
@@ -414,9 +390,12 @@ def parser_html(year, expedition, txt, seq=""):
        tu = tidy_time_underground(tu)
        trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
        tripcave = tidy_trip_cave(place)
-    
-        entrytuple = (ldate, place, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
+        tripcontent = tidy_trip_image_urls(tripcontent, date)
+        tid = tidy_tid(tid, triptitle)
+   
+        entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
        logentries.append(entrytuple)
+    return logentries


 def parser_blog(year, expedition, txt, sq=""):
@@ -437,7 +416,7 @@ def parser_blog(year, expedition, txt, sq=""):
    So the content is nested inside the header. Attachments (images) come after the content.
    It's a bugger, but it's out of our control.
    """
-    global logentries
+    logentries = []

    tripheads = re.findall(
        r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
@@ -515,9 +494,12 @@ def parser_blog(year, expedition, txt, sq=""):

        trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
        tripcave = tidy_trip_cave(place)
-            
+        tripcontent = tidy_trip_image_urls(tripcontent, date)
+        tid = tidy_tid(tid, triptitle)
+
        entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
        logentries.append(entrytuple)
+    return logentries

 def clean_all_logbooks():
    DataIssue.objects.filter(parser="logbooks").delete()
@@ -538,7 +520,6 @@ def clean_logbook_for_expedition(expedition):
 def parse_logbook_for_expedition(expedition, blog=False):
    """Parses all logbook entries for one expedition
    """
-    global logentries
    global ENTRIES
    logentries = [] 
    
@@ -590,7 +571,7 @@ def parse_logbook_for_expedition(expedition, blog=False):
            # --------------------
            parser = globals()[parsefunc]
            print(f" - {year} parsing with {parsefunc} - {lb}")
-            parser(year, expedition, txt, sq)  # this launches the right parser for this year
+            logentries = parser(year, expedition, txt, sq)  # this launches the right parser
            # --------------------

    if len(logentries) == expect:
@@ -599,35 +580,33 @@ def parse_logbook_for_expedition(expedition, blog=False):
    else:
        print(f"Mismatch in number of log entries: {year} {len(logentries):5d} is not {expect}\n")

-    return len(logentries)
+    return logentries


 def LoadLogbook(year):
    """One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
    This is inside an atomic transaction"""
-    global logentries
-    nlbe = {}

    expo = Expedition.objects.get(year=year)
    year = expo.year  # some type funny
    clean_logbook_for_expedition(expo)
    logentries = []
    
-    nlbe[expo] = parse_logbook_for_expedition(expo)  # this actually loads the logbook for one expo
+    logentries = parse_logbook_for_expedition(expo)  # this actually loads the logbook for one expo
    if year in BLOG_PARSER_SETTINGS:
-         nlbe[expo] = parse_logbook_for_expedition(expo, blog=True)  # this  loads the blog logbook
+         logentries += parse_logbook_for_expedition(expo, blog=True)  # this  loads the blog logbook
    else:
        print(
            f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
        )
    for entrytuple in logentries:
-        date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
-        if expo == expedition: # unneeded check, we zeroed it bbefore filling it
+        date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
+        if expo == expedition: # unneeded check, we zeroed it before filling it
            #print(f" - {triptitle}")
-            store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
+            store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
        else:
-            print(f" ! unexpected log entry labelled as '{expedition}' {tripid1}" ) 
-    expedition.save() # to save logbook name property
+            print(f" ! unexpected log entry labelled as '{expedition}' {tid}" ) 
+    expo.save() # to save logbook name property
    
 def LoadLogbooks():
    """This is the master function for parsing all logbooks into the Troggle database.
@@ -688,20 +667,15 @@ def LoadLogbooks():
            bloglist.append(expo)

    for ex in loglist:
-        nlbe[ex] = parse_logbook_for_expedition(ex)  # this  loads the logbook for one expo
+        logentries = parse_logbook_for_expedition(ex)  # this  loads the logbook for one expo
        allentries += logentries

    for b in bloglist:
        print(f" - BLOG: {b}")
-        nlbe[b] += parse_logbook_for_expedition(b, blog=True)  # loads the blog logbook for one expo
+        logentries = parse_logbook_for_expedition(b, blog=True)  # loads the blog logbook for one expo
        allentries += logentries

-    yt = 0
-    for exp in nlbe:
-        yt += nlbe[exp]
-    print(f"total {yt:,} log entries parsed in all expeditions")
-
-    print(f"total {len(allentries):,} log entries in complete dict")
+    print(f"total {len(allentries):,} log entries parsed in all expeditions")
    mem = get_process_memory()
    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
    duration = time.time() - start
@@ -712,11 +686,11 @@ def LoadLogbooks():
    # - LogBookEntry (text, who when etc.)
    # - PersonTrip (who was on that specific trip mentioned in the logbook entry)
    for entrytuple in allentries:
-        date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
-        store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
-
+        date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
+        store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
+ 
    for expo in expos: 
-        expedition.save() # to save logbook name property
+        expo.save() # to save logbook name property
    mem = get_process_memory()
    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
    duration = time.time() - start