Remove logdataissues from TROG

2024-11-25 08:41:51 +00:00 · 2023-01-27 17:24:31 +00:00 · 2023-01-27 17:24:31 +00:00 · 2fee216e80
commit 2fee216e80
parent 75834902f2
2 changed files with 38 additions and 68 deletions
--- a/core/utils.py
+++ b/core/utils.py
@ -33,15 +33,15 @@ save_carefully() - core function that saves troggle objects in the database

 various git add/commit functions that need refactoring together

+NOTE that access to TROG is not serialized! Two users can update it at the same time and conflict.
+This data needs to be held in a multi-user database with transactions.
+
 '''

 TROG = {
    'pagecache' : {
        'expedition' : {}
    },
-    'issues' : {
-        'logdataissues' : {}
-    },
    'caves' : {
        'gcavelookup' : {},
        'gcavecount' : {}
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@ -10,7 +10,7 @@ from django.template.defaultfilters import slugify
 from parsers.people import GetPersonExpeditionNameLookup
 from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
 from troggle.core.models.troggle import DataIssue, Expedition
-from troggle.core.utils import TROG, save_carefully
+from troggle.core.utils import save_carefully

 """
 Parses and imports logbooks in all their wonderful confusion
@ -18,12 +18,16 @@ Parses and imports logbooks in all their wonderful confusion
    https://expo.survex.com/handbook/computing/logbooks-parsing.html
 """
 todo = """
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
+- refactor everything with some urgency, esp. parse_logbook_for_expedition()

- remove the TROG things since we need the database for multiuser access? Or not?
+- break out the code that hits the database from that which parses the logbook
+so that the file-reading and parsing can be parallelized, while writing to the
+database remains serialized (sqlite is single-user).

 - profile the code to find bad repetitive things, of which there are many.

+- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted
+
 - far too many uses of Django field dereferencing to get values, which is SLOW

 - rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
@ -96,12 +100,8 @@ entries = {

 logentries = []  # the entire logbook for one year is a single object: a list of entries
 noncaveplaces = ["Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
-logdataissues = TROG["issues"]["logdataissues"]
 trips = {}

-#
-# the logbook loading section
-#
 def set_trip_id(year, seq):
    tid = f"{year}_s{seq:02d}"
    return tid
@ -149,7 +149,6 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
                message = f" ! - {expedition.year} No name match for: '{tripperson}' in entry {tid=} for this expedition year."
                print(message)
                DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues[tid] = message
            res.append((personyear, logtime_underground))
            if mul:
                author = personyear
@ -163,7 +162,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):


 def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
-    """saves a logbook entry and related persontrips
+    """saves a single logbook entry and related persontrips
    Does NOT save the expeditionday_id  - all NULLs. why? Because we are deprecating expeditionday !

    troggle.log shows that we are creating lots of duplicates, which is no no problem with SQL as they just overwrite but we are saving the  same thing too many times..
@ -193,7 +192,6 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
    except:
        message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
        DataIssue.objects.create(parser="logbooks", message=message)
-        logdataissues["title"] = message
        print(message)
        raise
        return
@ -201,7 +199,6 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
    if not author:
        message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
        DataIssue.objects.create(parser="logbooks", message=message)
-        logdataissues["title"] = message
        print(message)
        # return

@ -261,7 +258,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_

 def ParseDate(tripdate, year):
    """Interprets dates in the expo logbooks and returns a correct datetime.date object"""
-    dummydate = date(1970, 1, 1)
+    dummydate = date(1970, 1, 1) # replace with _EPOCH
    month = 1
    day = 1
    # message = f" ! - Trying to parse date in logbook: {tripdate} - {year}"
@ -273,7 +270,6 @@ def ParseDate(tripdate, year):
            if not (mdatestandard.group(1) == year):
                message = f" ! - Bad date (year) in logbook: {tripdate} - {year}"
                DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues["tripdate"] = message
                return dummydate
            else:
                year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
@ -281,23 +277,20 @@ def ParseDate(tripdate, year):
            if not (not mdategoof.group(3) or mdategoof.group(3) == year[:2]):
                message = " ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + mdategoof.group(3)
                DataIssue.objects.create(parser="logbooks", message=message)
-                logdataissues["tripdate"] = message
                return dummydate
            else:
                yadd = int(year[:2]) * 100
                day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
        else:
-            year = 1970
+            year = 1970 # replace with _EPOCH
            message = f" ! - Bad date in logbook: {tripdate} - {year}"
            DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues["tripdate"] = message

        return date(year, month, day)
    except:
        message = f" ! - Failed to parse date in logbook: {tripdate} - {year}"
        DataIssue.objects.create(parser="logbooks", message=message)
-        logdataissues["tripdate"] = message
-        return datetime.date(1970, 1, 1)
+        return datetime.date(1970, 1, 1) # replace with _EPOCH


 def parser_html(year, expedition, txt, seq=""):
@ -309,7 +302,6 @@ def parser_html(year, expedition, txt, seq=""):
    from parser_html_01 format logfiles, believe me.
    """
    global logentries
-    global logdataissues

    # extract front material and stash for later use when rebuilding from list of entries
    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
@ -356,7 +348,6 @@ def parser_html(year, expedition, txt, seq=""):
            msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:50]}'..."
            print(msg)
            DataIssue.objects.create(parser="logbooks", message=msg)
-            logdataissues[tid] = msg

            s2 = re.match(
                r"""(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
@ -377,7 +368,6 @@ def parser_html(year, expedition, txt, seq=""):
                msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:50]}'..."
                print(msg)
                DataIssue.objects.create(parser="logbooks", message=msg)
-                logdataissues[tid] = msg
                continue

        ldate = ParseDate(tripdate.strip(), year)
@ -413,7 +403,6 @@ def parser_blog(year, expedition, txt, sq=""):
    So the content is nested inside the header. Attachments (images) come after the content.
    """
    global logentries
-    global logdataissues

    tripheads = re.findall(
        r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
@ -455,7 +444,6 @@ def parser_blog(year, expedition, txt, sq=""):
        if not (match_author):
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author  {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[tid] = message
            print(message)
            break
        trippeople = match_author.group(1)
@ -465,7 +453,6 @@ def parser_blog(year, expedition, txt, sq=""):
        if not (match_datetime):
            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime  {tid} {triphead[:400]}..."
            DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[tid] = message
            print(message)
            break
        datestamp = match_datetime.group(1)
@ -475,7 +462,6 @@ def parser_blog(year, expedition, txt, sq=""):
        except:
            message = f" ! - FROMISOFORMAT fail logentry {year}:{logbook_entry_count} {tid} '{datestamp}'"
            DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[tid] = message
            print(message)
            # fallback, ignore the timestamp bits:
            tripdate = datetime.fromisoformat(datestamp[0:10])
@ -494,14 +480,30 @@ def parser_blog(year, expedition, txt, sq=""):
        entrytuple = (tripdate, location, tripname, tripcontent, trippeople, expedition, tu, tid)
        logentries.append(entrytuple)

+def clean_logbook_for_expedition(expedition):
+    def cleanerrors(year):
+        dataissues = DataIssue.objects.filter(parser="logbooks")
+        for di in dataissues:
+            ph = year
+            if re.search(ph, di.message) is not None:  # SLOW just to delete issues for one year
+                # print(f'   - CLEANING dataissue {di.message}')
+                di.delete()

-def LoadLogbookForExpedition(expedition, clean=True):
+            
+
+    year = expedition.year
+    cleanerrors(year)
+    
+    lbes = LogbookEntry.objects.filter(expedition=expedition) # must be a quicker way
+    for lbe in lbes:
+        lbe.delete()
+
+def parse_logbook_for_expedition(expedition):
    """Parses all logbook entries for one expedition
    if clean==True then it deletes all entries for this year first.
    """
    global logentries
    # absolutely horrid. REFACTOR THIS (all my fault..)
-    global logdataissues
    global entries

    logbook_parseable = False
@ -513,28 +515,6 @@ def LoadLogbookForExpedition(expedition, clean=True):
    expect = entries[year]
    # print(" - Logbook for: " + year)

-    def cleanerrors(year):
-        global logdataissues
-        dataissues = DataIssue.objects.filter(parser="logbooks")
-        for di in dataissues:
-            ph = year
-            if re.search(ph, di.message) is not None:
-                # print(f'   - CLEANING dataissue {di.message}')
-                di.delete()
-
-        # print(f'   - CLEAN {year} {len(logdataissues)} {type(logdataissues)} data issues for this year')
-        dellist = []
-        for key, value in logdataissues.items():
-            # print(f'   - CLEANING logdataissues [{key}]: {value}')
-            if key.startswith(year):
-                # print(f'   - CLEANING logdataissues [{key:12}]: {value} ')
-                dellist.append(key)
-        for i in dellist:
-            del logdataissues[i]
-
-    if clean:
-        cleanerrors(year)
-
    if year in yearlinks:
        yearfile, yearparser = yearlinks[year]
        logbookpath = Path(yearfile)
@ -549,11 +529,6 @@ def LoadLogbookForExpedition(expedition, clean=True):

    expedition.save()

-    lbes = LogbookEntry.objects.filter(expedition=expedition)
-    if clean:
-        for lbe in lbes:
-            lbe.delete()
-
    for sq in ["", "2", "3", "4"]:  # cope with blog saved as many separate files
        lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
        if not (lb.is_file()):
@ -603,15 +578,15 @@ def LoadLogbook(year):
    global LOGBOOK_PARSER_SETTINGS

    nlbe = {}
-    TROG["pagecache"]["expedition"][year] = None  # clear cache

    expo = Expedition.objects.get(year=year)
    year = expo.year  # some type funny
-    nlbe[expo] = LoadLogbookForExpedition(expo)  # this actually loads the logbook for one expo
+    clean_logbook_for_expedition(expo)
+    nlbe[expo] = parse_logbook_for_expedition(expo)  # this actually loads the logbook for one expo
    if year in BLOG_PARSER_SETTINGS:
        print("BLOG parsing")
        LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year]
-        nlbe[expo] = LoadLogbookForExpedition(expo, clean=False)  # this  loads the blog logbook for one expo
+        nlbe[expo] = parse_logbook_for_expedition(expo)  # this  loads the blog logbook for one expo
    else:
        print(
            f"Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
@ -623,16 +598,13 @@ def LoadLogbooks():
    This should be rewritten to use coroutines to load all logbooks from disc in parallel,
    but must be serialised to write to database as sqlite is single-user.
    """
-    global logdataissues
    global entries

-    logdataissues = {}
    DataIssue.objects.filter(parser="logbooks").delete()
    expos = Expedition.objects.all()
    if len(expos) <= 1:
        message = " ! - No expeditions found. Load 'people' first"
        DataIssue.objects.create(parser="logbooks", message=message)
-        logdataissues["sqlfail 0000"] = message
        print(message)
        return

@ -651,12 +623,10 @@ def LoadLogbooks():

    for expo in expos:  # pointless as we explicitly know the years in this code.
        year = expo.year
-        TROG["pagecache"]["expedition"][year] = None  # clear cache
        if year in sqlfail:
            print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
            message = f" ! - Not even attempting to parse logbook for {year} until code fixed"
            DataIssue.objects.create(parser="logbooks", message=message)
-            logdataissues[f"sqlfail {year}"] = message
            print(message)

        if year not in nologbook:
@ -669,7 +639,7 @@ def LoadLogbooks():
            bloglist.append(expo)

    for ex in loglist:
-        nlbe[ex] = LoadLogbookForExpedition(ex)  # this  loads the logbook for one expo
+        nlbe[ex] = parse_logbook_for_expedition(ex)  # this  loads the logbook for one expo

    for b in bloglist:
        if str(b) in LOGBOOK_PARSER_SETTINGS:
@ -678,12 +648,12 @@ def LoadLogbooks():
            orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
        LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
        print(f" - BLOG: {b}")
-        nlbe[b] = LoadLogbookForExpedition(b, clean=False)  # this  loads the blog logbook for one expo
+        nlbe[b] = parse_logbook_for_expedition(b, clean=False)  # this  loads the blog logbook for one expo
        LOGBOOK_PARSER_SETTINGS[str(b)] = orig

    # tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
    # yt = 0
-    # for r in map(LoadLogbookForExpedition, loglist):
+    # for r in map(parse_logbook_for_expedition, loglist):
    # yt = r

    yt = 0