2003 logbook export/re-import now as HTML format

This commit is contained in:
2022-12-09 23:45:07 +00:00
parent 17b2b7b89c
commit cabcada0b8
6 changed files with 98 additions and 70 deletions

View File

@@ -26,6 +26,8 @@ Parses and imports logbooks in all their wonderful confusion
todo='''
- refactor everything with some urgency, esp. LoadLogbookForExpedition()
- remove the TROG and lbo things since we need the database for multiuser access? Or not?
- profile the code to find bad repetitive things, of which there are many.
- far too many uses of Django field dereferencing to get values, which is SLOW
@@ -55,15 +57,15 @@ DEFAULT_LOGBOOK_FILE = "logbook.html"
# but several don't work, and are skipped by the parsing code, e.g. 1983
LOGBOOK_PARSER_SETTINGS = {
"2010": ("logbook.html", "parser_html"),
"2009": ("2009logbook.txt", "parser_wiki"),
"2008": ("2008logbook.txt", "parser_wiki"),
"2009": ("2009logbook.txt", "wiki_parser"),
"2008": ("2008logbook.txt", "wiki_parser"),
"2007": ("logbook.html", "parser_html"),
"2006": ("logbook.html", "parser_html"),
# "2006": ("logbook/logbook_06.txt", "parser_wiki"),
# "2006": ("logbook/logbook_06.txt", "wiki_parser"),
"2006": ("logbook.html", "parser_html"),
"2005": ("logbook.html", "parser_html"),
"2004": ("logbook.html", "parser_html"),
"2003": ("logbook.html", "parser_html_03"),
"2003": ("logbook.html", "parser_html"),
"2002": ("logbook.html", "parser_html"),
"2001": ("log.htm", "parser_html_01"),
"2000": ("log.htm", "parser_html_01"),
@@ -88,7 +90,7 @@ LOGBOOK_PARSER_SETTINGS = {
entries = { "2022": 64, "2019": 56, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
"2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
"2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31,
"2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
"1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
"1985": 24,"1984": 32,"1983": 52,"1982": 42,}
@@ -114,8 +116,9 @@ rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
res = [ ]
author = None
#print(f'# {tid}')
# print(f'# {tid}')
# print(f" - {tid} '{trippeople}' ")
for tripperson in re.split(r",|\+|&|&(?!\w+;)| and ", trippeople):
tripperson = tripperson.strip()
# mul = re.match(r"(?i)<u>(.*?)</u>$", tripperson)
@@ -147,6 +150,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
return None, None
author = res[-1][0]
#print(f" - {tid} [{author.person}] '{res[0][0].person}'...")
return res, author
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
@@ -195,9 +199,10 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
#NEW slug for a logbook entry here! Unique id + slugified title fragment
if tid is not None:
slug = tid + "_" + slugify(title)[:10].replace('-','_')
slug = tid
# slug = tid + "_" + slugify(title)[:10].replace('-','_')
else:
slug = str(randint(1000,10000)) + "_" + slugify(title)[:10].replace('-','_')
slug = str(randint(1000,9999)) + "_" + slugify(title)[:10].replace('-','_')
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave_slug':str(cave), 'slug': slug}
# This creates the lbo instance of LogbookEntry
@@ -205,6 +210,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
for tripperson, time_underground in trippersons:
# print(f" - {tid} '{tripperson}' author:{tripperson == author}")
lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
# this creates the PersonTrip instance.
@@ -251,7 +257,7 @@ def ParseDate(tripdate, year):
return datetime.date(1970, 1, 1)
# (2006 - not any more), 2008 - 2009
def parser_wiki(year, expedition, txt):
def wiki_parser(year, expedition, txt):
global logentries
global logdataissues
@@ -316,6 +322,11 @@ def parser_html(year, expedition, txt):
if s:
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
else: # allow title and people to be swapped in order
msg = f" !- {year} Can't parse:{logbook_entry_count} '{trippara[:40]}'..."
print(msg)
DataIssue.objects.create(parser='logbooks', message=msg)
logdataissues[tid]=msg
s2 = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
\s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
\s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
@@ -329,7 +340,7 @@ def parser_html(year, expedition, txt):
tripid, tripid1, tripdate, triptitle, trippeople, triptext, tu = s2.groups()
else:
if not re.search(r"Rigging Guide", trippara):
msg = f" !- Logbook. Can't parse {tripid1}: {trippara} entry:{logbook_entry_count} "
msg = f" !- Logbook. Can't parse entry on 2nd pass:{logbook_entry_count} '{trippara[:40]}'..."
print(msg)
DataIssue.objects.create(parser='logbooks', message=msg)
logdataissues[tid]=msg
@@ -343,7 +354,7 @@ def parser_html(year, expedition, txt):
tripcave = "UNKNOWN"
ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip()
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
entrytuple = (ldate, tripcave, triptitle, ltriptext,
trippeople, expedition, tu, tripid1)
@@ -430,11 +441,11 @@ def parser_html_01(year, expedition, txt):
ltriptext = ltriptext[:mtail.start(0)]
ltriptext = re.sub(r"</p>", "", ltriptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
ltriptext = re.sub(r"</?u>", "_", ltriptext)
ltriptext = re.sub(r"</?i>", "''", ltriptext)
ltriptext = re.sub(r"</?b>", "'''", ltriptext)
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
if ltriptext == "":
message = " ! - Zero content for logbook entry!: " + tid
DataIssue.objects.create(parser='logbooks', message=message)
@@ -469,7 +480,7 @@ def parser_html_03(year, expedition, txt):
logbook_entry_count = 0
for trippara in tripparas:
logbook_entry_count += 1
tid = set_trip_id(year,logbook_entry_count)
tid = set_trip_id(year,logbook_entry_count) # default trip id, before we read the date
s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
if not ( s ) :
@@ -485,23 +496,30 @@ def parser_html_03(year, expedition, txt):
sheader = tripheader.split(" -- ")
tu = ""
if re.match("T/U|Time underwater", sheader[-1]):
tu = sheader.pop()
tu = sheader.pop() # not a number in 2003 usually
# print(f" - {logbook_entry_count} '{tu}' ")
if len(sheader) != 3:
print(" ! Header not three pieces", sheader)
print(" ! Header not three pieces for parser_html_03() ", sheader)
tripdate, triptitle, trippeople = sheader
ldate = ParseDate(tripdate.strip(), year)
triptitles = triptitle.split(" , ")
if len(triptitles) >= 2:
tripcave = triptitles[0]
# print(f" - {logbook_entry_count} '{ldate}' from '{tripdate.strip()}' ")
# print(f" - {logbook_entry_count} '{trippeople}' ")
titlelist = triptitle.split(" , ")
if len(titlelist) >= 2:
location, *namelist = titlelist # list unpacking operator
tripname = ", ".join(namelist) # concatenate strings
# print(f" - {logbook_entry_count} {location} '{tripname}'")
else:
tripcave = "UNKNOWN"
ltriptext = re.sub(r"</p>", "", triptext)
ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
location = "UNKNOWN"
ltriptext = triptext + "<br /><br />\n\n" + tu
ltriptext = re.sub(r"</p>", "", ltriptext)
#ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub(r"<p>", "<br /><br />\n\n", ltriptext).strip()
#ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
entrytuple = (ldate, tripcave, triptitle, ltriptext,
entrytuple = (ldate, location, tripname, ltriptext,
trippeople, expedition, tu, tid)
logentries.append(entrytuple)
@@ -509,8 +527,8 @@ def parser_html_03(year, expedition, txt):
def LoadLogbookForExpedition(expedition):
""" Parses all logbook entries for one expedition
"""
# absolutely horrid. REFACTOR THIS (all my fault..)
global logentries
# absolutely horrid. REFACTOR THIS (all my fault..)
global logdataissues
global entries
@@ -556,6 +574,10 @@ def LoadLogbookForExpedition(expedition):
parsefunc = DEFAULT_LOGBOOK_PARSER
expedition.save()
lbes = LogbookEntry.objects.filter(expedition=expedition)
for lbe in lbes:
lbe.delete()
try:
file_in = open(logbookpath,'rb')
@@ -594,6 +616,14 @@ def LoadLogbookForExpedition(expedition):
return len(logentries)
def LoadLogbook(year):
nlbe={}
TROG['pagecache']['expedition'][year] = None # clear cache
expo = Expedition.objects.get(year=year)
nlbe[expo] = LoadLogbookForExpedition(expo) # this actually loads the logbook for one expo
def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database.
This should be rewritten to use coroutines to load all logbooks from disc in parallel,