refactored, global removed

2026-02-08 14:37:53 +00:00 · 2023-01-28 13:14:54 +00:00
parent db0504057b
commit 9e71be8169
2 changed files with 53 additions and 77 deletions
--- a/core/utils.py
+++ b/core/utils.py
@@ -203,6 +203,8 @@ def save_carefully(objectType, lookupAttribs={}, nonLookupAttribs={}):
        defined in core.models.TroggleModel.
        We are not using new_since_parsing - it is a fossil from Aaron Curtis's design in 2006. So it is always false.
        NOTE: this takes twice as long as simply creating a new object with the given values.
    """
    try:
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -13,7 +13,7 @@ from django.template.defaultfilters import slugify
 from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
 from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
 from troggle.core.models.troggle import DataIssue, Expedition
-from troggle.core.utils import save_carefully, get_process_memory
+from troggle.core.utils import get_process_memory
 """
 Parses and imports logbooks in all their wonderful confusion
@@ -21,27 +21,16 @@ Parses and imports logbooks in all their wonderful confusion
    https://expo.survex.com/handbook/computing/logbooks-parsing.html
 """
 todo = """
- Most of the time is during the database writing (13s out of 14s).
+- Most of the time is during the database writing (6s out of 8s).
 - Move a lot of non-db code from store_entry_into_database()
 into parse_logbook_for_expedition()
 - call GetTripPersons at parsing time, not db writing time
 - this is a slow and uncertain function too:  cave = getCaveByReference(caveRef)
 - if I am certain that we are creating from scratch, don't use save_carefully() to
 create the Django objects. And I am, because I delete the outdated stuff.
 - pre-compile all the heavily used regular expressions !
 - refactor to get rid of the global 'logentries', very ugly indeed.
 - profile the code to find bad repetitive things, of which there are many.
 - attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted
 - far too many uses of Django field dereferencing to get values, which is SLOW
 - replace explicit 1970 date with a constant EPOCH
 - rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
@@ -228,53 +217,41 @@ def tidy_trip_image_urls(text, date):
    text = text.replace("\t", "")
    text = text.replace("\n\n\n", "\n\n")
    return text
 def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid=None):
    """saves a single logbook entry and related persontrips
    Does NOT save the expeditionday_id  - all NULLs. why? Because we are deprecating expeditionday !
-    troggle.log shows that we are creating lots of duplicates, which is no no problem with SQL as they just overwrite but we are saving the  same thing too many times..
+def tidy_tid(tid, title):
    Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
    lookupAttribs={'date':date, 'title':title}
    """
    text = tidy_trip_image_urls(text, date)
    # Check for an existing copy of the current entry, and save
    expedition.get_expedition_day(date)
    lookupAttribs = {"date": date, "title": title}
    # but it is a db query which we should try to avoid - rewrite this
    # This needs attention. The slug field is derived from 'title'
    # NEW slug for a logbook entry here! Unique id + slugified title fragment
    if tid is not None:
-        slug = tid
+        return tid
-        # slug = tid + "_" + slugify(title)[:10].replace('-','_')
+
-    else:
+    # print(f"!    {title=} ")
-        slug = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
+    tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")    
    return tid
 def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid):
    """saves a single logbook entry and related persontrips
    Does NOT save the expeditionday_id  - all NULLs. why? Because we are deprecating expeditionday !
    """
    # gets the current ExpeditionDay, and saves it as an object attached to 
    # the expedition, but does not attach it to the logbook entry. Why ?
    # expedition.get_expedition_day(date)
    nonLookupAttribs = {
        "place": place,
        "text": text,
        "expedition": expedition,
        "time_underground": logtime_underground,
        "cave_slug": str(tripcave),
-        "slug": slug,
+        "slug": tid,
    }
-    # Rewriting as we know prior objects have already been deleted.
+    lookupAttribs = {"date": date, "title": title}
    # This creates the lbo instance of LogbookEntry
    lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
-    # lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
+    
    # for PersonTrip time_underground is float (decimal hours)
    for tripperson, time_underground in trippersons:
        # print(f" -  {tid} '{tripperson}' author:{tripperson == author}")
        lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo}
        nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
        # this creates the PersonTrip instance.
        pt = PersonTrip.objects.create(**nonLookupAttribs, **lookupAttribs)
        # save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
 def parser_date(tripdate, year):
    """Interprets dates in the expo logbooks and returns a correct datetime.date object"""
@@ -321,7 +298,7 @@ def parser_html(year, expedition, txt, seq=""):
    the endmatter up to the frontmatter. This made sense when translating
    from parser_html_01 format logfiles, believe me.
    """
-    global logentries
+    logentries = []
    dupl = {}
    # extract front material and stash for later use when rebuilding from list of entries
@@ -397,9 +374,8 @@ def parser_html(year, expedition, txt, seq=""):
            place = triptitles[0]
        else:
            place = "UNKNOWN"
-        ltriptext = re.sub(r"</p>", "", triptext)
+        tripcontent = re.sub(r"</p>", "", triptext)
-        # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+        tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()
        ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
        triptitle = triptitle.strip()
        # triptitle must be unique for a given date. We fix this here.
@@ -414,9 +390,12 @@ def parser_html(year, expedition, txt, seq=""):
        tu = tidy_time_underground(tu)
        trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
        tripcave = tidy_trip_cave(place)
-    
+        tripcontent = tidy_trip_image_urls(tripcontent, date)
-        entrytuple = (ldate, place, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
+        tid = tidy_tid(tid, triptitle)
        entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
        logentries.append(entrytuple)
    return logentries
 def parser_blog(year, expedition, txt, sq=""):
@@ -437,7 +416,7 @@ def parser_blog(year, expedition, txt, sq=""):
    So the content is nested inside the header. Attachments (images) come after the content.
    It's a bugger, but it's out of our control.
    """
-    global logentries
+    logentries = []
    tripheads = re.findall(
        r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
@@ -515,9 +494,12 @@ def parser_blog(year, expedition, txt, sq=""):
        trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
        tripcave = tidy_trip_cave(place)
-            
+        tripcontent = tidy_trip_image_urls(tripcontent, date)
        tid = tidy_tid(tid, triptitle)
        entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
        logentries.append(entrytuple)
    return logentries
 def clean_all_logbooks():
    DataIssue.objects.filter(parser="logbooks").delete()
@@ -538,7 +520,6 @@ def clean_logbook_for_expedition(expedition):
 def parse_logbook_for_expedition(expedition, blog=False):
    """Parses all logbook entries for one expedition
    """
    global logentries
    global ENTRIES
    logentries = [] 
@@ -590,7 +571,7 @@ def parse_logbook_for_expedition(expedition, blog=False):
            # --------------------
            parser = globals()[parsefunc]
            print(f" - {year} parsing with {parsefunc} - {lb}")
-            parser(year, expedition, txt, sq)  # this launches the right parser for this year
+            logentries = parser(year, expedition, txt, sq)  # this launches the right parser
            # --------------------
    if len(logentries) == expect:
@@ -599,35 +580,33 @@ def parse_logbook_for_expedition(expedition, blog=False):
    else:
        print(f"Mismatch in number of log entries: {year} {len(logentries):5d} is not {expect}\n")
-    return len(logentries)
+    return logentries
 def LoadLogbook(year):
    """One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
    This is inside an atomic transaction"""
    global logentries
    nlbe = {}
    expo = Expedition.objects.get(year=year)
    year = expo.year  # some type funny
    clean_logbook_for_expedition(expo)
    logentries = []
-    nlbe[expo] = parse_logbook_for_expedition(expo)  # this actually loads the logbook for one expo
+    logentries = parse_logbook_for_expedition(expo)  # this actually loads the logbook for one expo
    if year in BLOG_PARSER_SETTINGS:
-         nlbe[expo] = parse_logbook_for_expedition(expo, blog=True)  # this  loads the blog logbook
+         logentries += parse_logbook_for_expedition(expo, blog=True)  # this  loads the blog logbook
    else:
        print(
            f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
        )
    for entrytuple in logentries:
-        date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
+        date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
-        if expo == expedition: # unneeded check, we zeroed it bbefore filling it
+        if expo == expedition: # unneeded check, we zeroed it before filling it
            #print(f" - {triptitle}")
-            store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
+            store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
        else:
-            print(f" ! unexpected log entry labelled as '{expedition}' {tripid1}" ) 
+            print(f" ! unexpected log entry labelled as '{expedition}' {tid}" ) 
-    expedition.save() # to save logbook name property
+    expo.save() # to save logbook name property
 def LoadLogbooks():
    """This is the master function for parsing all logbooks into the Troggle database.
@@ -688,20 +667,15 @@ def LoadLogbooks():
            bloglist.append(expo)
    for ex in loglist:
-        nlbe[ex] = parse_logbook_for_expedition(ex)  # this  loads the logbook for one expo
+        logentries = parse_logbook_for_expedition(ex)  # this  loads the logbook for one expo
        allentries += logentries
    for b in bloglist:
        print(f" - BLOG: {b}")
-        nlbe[b] += parse_logbook_for_expedition(b, blog=True)  # loads the blog logbook for one expo
+        logentries = parse_logbook_for_expedition(b, blog=True)  # loads the blog logbook for one expo
        allentries += logentries
-    yt = 0
+    print(f"total {len(allentries):,} log entries parsed in all expeditions")
    for exp in nlbe:
        yt += nlbe[exp]
    print(f"total {yt:,} log entries parsed in all expeditions")
    print(f"total {len(allentries):,} log entries in complete dict")
    mem = get_process_memory()
    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
    duration = time.time() - start
@@ -712,11 +686,11 @@ def LoadLogbooks():
    # - LogBookEntry (text, who when etc.)
    # - PersonTrip (who was on that specific trip mentioned in the logbook entry)
    for entrytuple in allentries:
-        date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
+        date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
-        store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
+        store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
-
+ 
    for expo in expos: 
-        expedition.save() # to save logbook name property
+        expo.save() # to save logbook name property
    mem = get_process_memory()
    print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
    duration = time.time() - start