converting 1987 logbook

2022-12-20 19:59:36 +00:00
parent dc3a61addd
commit 05df2e084c
2 changed files with 134 additions and 19 deletions
--- a/parsers/imports.py
+++ b/parsers/imports.py
@@ -41,7 +41,7 @@ def import_logbooks():
    with transaction.atomic():
        troggle.parsers.logbooks.LoadLogbooks()

-def import_logbook(year=1982):
+def import_logbook(year=1987):
    print(f"-- Importing Logbook {year}")
    with transaction.atomic():
        troggle.parsers.logbooks.LoadLogbook(year)
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -76,7 +76,8 @@ LOGBOOK_PARSER_SETTINGS = {
                "1990": ("log.htm", "parser_html_01"), 
                "1989": ("log.htm", "parser_html_01"), #crashes MySQL
                "1988": ("log.htm", "parser_html_01"), #crashes MySQL
-                "1987": ("log.htm", "parser_html_01"), #crashes MySQL
+                #"1987": ("log.htm", "parser_02"), #crashes MySQL
+                "1987": ("logbook.html", "parser_html"), 
                "1985": ("logbook.html", "parser_html"), 
                "1984": ("logbook.html", "parser_html"), 
                "1983": ("logbook.html", "parser_html"), 
@@ -87,7 +88,7 @@ entries = { "2022": 86, "2019": 56, "2018": 100, "2017": 76, "2016": 83, "2015":
    "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 53, 
    "2008": 49, "2007": 113, "2006": 60, "2005": 55, "2004": 76, "2003": 42, "2002": 31, 
    "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 95, "1995": 42, 
-    "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
+    "1994": 32, "1993": 41, "1992": 62, "1991": 39, "1990": 87, "1989": 1,"1988": 1,"1987": 34,
    "1985": 24, "1984": 32, "1983": 52, "1982": 42,}
 # Logbooks log.htm exist for 87, 88, 89 but have no full-working parser, or need hand-editing.

@@ -366,27 +367,13 @@ def parser_html(year, expedition, txt, seq=""):
                trippeople, expedition, tu, tripid1)
        logentries.append(entrytuple)

-
 # main parser for 1991 - 2001.  simpler because the data has been hacked so much to fit it
 # trying it out for years 1982 - 1990 too. Some logbook editing required by hand.. place
 def parser_html_01(year, expedition, txt, seq=""):
-    '''This uses some of the more obscure capabilities of regular expressions,
-    see https://docs.python.org/3/library/re.html
-    '''
    global logentries
    global logdataissues
    errorcount = 0

-    # extract front material and stash for later use when rebuilding from list of entries
-    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
-    headpara = headmatch.groups()[0].strip()
-    
-    # print(f" - headpara:\n'{headpara}'")
-    if(len(headpara)>0):
-        frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
-        with open(frontpath,"w") as front:
-            front.write(headpara+"\n")
-    
    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    logbook_entry_count = 0
    for trippara in tripparas:
@@ -439,7 +426,6 @@ def parser_html_01(year, expedition, txt, seq=""):
                    print(message)
                    break
            #print(f" #3 - tid: {tid}")
-            triptitle = triptitle.strip()
            ldate = ParseDate(tripdate.strip(), year)
            #print(f" # -             tid: {tid} <{tripdate}> <{triptitle}> <{trippeople}>")
            #print(f" #4 - tid: {tid}")
@@ -491,6 +477,135 @@ def parser_html_01(year, expedition, txt, seq=""):
                print(message)
                return

+# variant parser for 1987
+def parser_02(year, expedition, txt, seq=""):
+    '''Variant logbook parser written for 1987: the trip header is the text of an <a> tag.
+
+    Whatever lies between <body> and the first <hr> is saved to
+    years/<year>/frontmatter.html under settings.EXPOWEB. Each chunk that follows
+    an <hr> and is itself followed by another <hr> is then parsed as one trip:
+    the first <a>...</a> holds "date|title|people" (people may be missing) and
+    everything after it is the trip text.
+
+    For each trip one tuple is appended to the global list logentries:
+        (date, cave, title, text, people, expedition, tu, trip id)
+    Problems are stored as DataIssue rows, put in the global dict logdataissues
+    (keyed by trip id) and printed.
+
+    year       -- logbook year as a string (it is used as a path component)
+    expedition -- copied unchanged into every entry tuple
+    txt        -- the whole logbook HTML as one string
+    seq        -- not used by this parser
+
+    Returns None. An exception raised while parsing an entry is reported and
+    then re-raised, so it aborts the import rather than being skipped.
+
+    This uses some of the more obscure capabilities of regular expressions,
+    see https://docs.python.org/3/library/re.html
+    '''
+    global logentries
+    global logdataissues
+
+    def report(tid, message):
+        # Record a problem in all three places: database, issue dict, console.
+        DataIssue.objects.create(parser='logbooks', message=message)
+        logdataissues[tid]=message
+        print(message)
+
+    # extract front material and stash for later use when rebuilding from list of entries
+    headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
+    # A file with no <body>...<hr> has no front matter: skip it instead of
+    # failing on the missing match object.
+    headpara = headmatch.group(1).strip() if headmatch else ""
+    if headpara:
+        frontpath = Path(settings.EXPOWEB, "years", year, "frontmatter.html")
+        with open(frontpath,"w") as front:
+            front.write(headpara+"\n")
+
+    # The lookahead leaves the closing <hr> in place to open the next trip,
+    # so text after the last <hr> is never returned as a trip.
+    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
+    for logbook_entry_count, trippara in enumerate(tripparas, start=1):
+        tid = set_trip_id(year,logbook_entry_count)
+        try:
+            # group 2 = text of the first <a> (the header), group 3 = all that follows it
+            s = re.match(r"(?i)(?s)\s*(<hr[\s/]*>)?.*?<a[^>]*>([\s\S]*?)</a>(.*)$", trippara)
+            if not s:
+                # an f-string, so that {tid} is filled in rather than printed literally
+                message = (f" ! - Skipping logentry {tid} failure to parse header: "
+                    + trippara[:300] + "...")
+                report(tid, message)
+                # NB break abandons every remaining entry, not only this one
+                break
+            # s matched, so both groups exist and these lookups cannot fail
+            tripheader, triptext = s.group(2), s.group(3)
+
+            print(f" {tid} {tripheader}")
+            if not tripheader:
+                continue
+
+            # drop <a>, <b> and <span> tags but keep the text they wrap
+            tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
+            try:
+                tripdate, triptitle, trippeople = tripheader.split("|")
+            except ValueError:
+                # not exactly three fields: report it, then retry as date|title only
+                message = f" ! - Fail 3 to split out date|title|people. trip:<{tid}>\n '{tripheader.split('|')}'"
+                report(tid, message)
+                try:
+                    tripdate, triptitle = tripheader.split("|")
+                    trippeople = "GUESS ANON"
+                except ValueError:
+                    message = (f" ! - Skipping logentry {year} Fail 2 to split out date|title (anon). "
+                        f"trip:<{tid}> '{tripheader.split('|')}' CRASHES MySQL !")
+                    report(tid, message)
+                    break
+            triptitle = triptitle.strip()
+            ldate = ParseDate(tripdate.strip(), year)
+
+            # lift the first <p> that begins T/U or TU (up to the end of its line)
+            # out of the text and keep it separately (presumably time underground -- confirm)
+            mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
+            if mtu:
+                tu = mtu.group(1)
+                triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
+            else:
+                tu = ""
+
+            # the cave is whatever precedes the first " - " in the title
+            tripcave = triptitle.split(" - ")[0].strip()
+
+            # trim trailing links, whitespace, "/", "-", "&amp;", <p> tags and
+            # "(same day)" / "(<digits>)" markers from the end of the text
+            ltriptext = triptext
+            mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
+            if mtail:
+                ltriptext = ltriptext[:mtail.start(0)]
+            # rewrite the markup: drop </p>, fold line breaks into single spaces,
+            # <u> becomes _, <i> becomes '', <b> becomes ''', <p> becomes two <br />
+            ltriptext = re.sub(r"</p>", "", ltriptext)
+            ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
+            ltriptext = re.sub(r"</?u>", "_", ltriptext)
+            ltriptext = re.sub(r"</?i>", "''", ltriptext)
+            ltriptext = re.sub(r"</?b>", "'''", ltriptext)
+            ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
+
+            if ltriptext == "":
+                # reported but not skipped: the empty entry is still appended below
+                report(tid, " ! - Zero content for logbook entry!: " + tid)
+
+            entrytuple = (ldate, tripcave, triptitle, ltriptext,
+                    trippeople, expedition, tu, tid)
+            logentries.append(entrytuple)
+
+        except Exception:
+            report(tid, f" ! - Skipping logentry {year} due to exception in: {tid}")
+            # Re-raise: one entry that cannot be parsed stops the whole import.
+            # (No error counter is kept, because nothing after a raise can run.
+            # To tolerate bad entries instead, count the failures here and
+            # continue, giving up once some limit is passed.)
+            raise
+
 def parser_blog(year, expedition, txt, sq=""):
    '''Parses the format of web pages collected as 'Save As HTML" from the UK Caving blog website.
    Note that the entries have dates and authors, but no titles.