exptl parse UK Caving blog

2022-12-14 23:46:14 +00:00
parent 6dd8e5a75c
commit cb50528e2d
3 changed files with 81 additions and 62 deletions
--- a/databaseReset.py
+++ b/databaseReset.py
@@ -238,15 +238,16 @@ class JobQueue():
        for runfunction in self.queue:
            start = time.time()
            memstart = get_process_memory()
+            jobname, jobparser = runfunction
            #--------------------
-            runfunction[1]()    #  invokes function passed in the second item in the tuple
+            jobparser()    #  invokes function passed in the second item in the tuple
            #--------------------
            memend = get_process_memory()
            duration = time.time()-start
            #print(" - MEMORY start:{:.3f} MB end:{:.3f} MB change={:.3f} MB".format(memstart,memend, ))
-            print("\n*- Ended \"",  runfunction[0], f"\"  {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
-            self.results[runfunction[0]].pop()  # the null item
-            self.results[runfunction[0]].append(duration)
+            print("\n*- Ended \"",  jobname, f"\"  {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
+            self.results[jobname].pop()  # the null item
+            self.results[jobname].append(duration)
               

        jobend = time.time()
--- a/parsers/imports.py
+++ b/parsers/imports.py
@@ -41,10 +41,10 @@ def import_logbooks():
    with transaction.atomic():
        troggle.parsers.logbooks.LoadLogbooks()

-def import_logbook(year=2003):
+def import_logbook(year=2019):
    print(f"-- Importing Logbook {year}")
    with transaction.atomic():
-        troggle.parsers.logbooks.LoadLogbook(year)
+        troggle.parsers.logbooks.LoadLogbook(year, format="blog")

 def import_QMs():
    print("-- Importing old QMs for 161, 204, 234 from CSV files")
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -56,6 +56,7 @@ DEFAULT_LOGBOOK_FILE = "logbook.html"
 # All years since 2010 use the default value for Logbook parser
 # but several don't work, and are skipped by the parsing code, e.g. 1983
 LOGBOOK_PARSER_SETTINGS = {
+                "2019": ("logbook.html", "parser_html"), 
                "2010": ("logbook.html", "parser_html"), 
                "2009": ("2009logbook.txt", "wiki_parser"), 
                "2008": ("2008logbook.txt", "wiki_parser"), 
@@ -171,7 +172,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
        return
        
    if not author:
-        message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
+        message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues["title"]=message
        print(message)
@@ -471,60 +472,68 @@ def parser_html_01(year, expedition, txt):
                print(message)
                return

-# parser for 2003. Retired after conversion of the logbook.html
-# KEEP THIS COMMENTED-OUT example until after we have doen the same thing with the html_01 parser
-# def parser_html_03(year, expedition, txt):
-    # global logentries
-    # global logdataissues
+def parser_blog(year, expedition, txt):
+    '''Parses the format of web pages collected as "Save As HTML" from the UK Caving blog website.
+    Note that the entries have dates and authors, but no titles.
+    '''
+    global logentries
+    global logdataissues
+    errorcount = 0

-    # tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
-    # logbook_entry_count = 0
-    # for trippara in tripparas:
-        # logbook_entry_count += 1
-        # tid = set_trip_id(year,logbook_entry_count) # default trip id, before we read the date
+    tripheads = re.findall(r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt)
+    if not ( tripheads ) :
+        message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
+        print(message)
+
+    tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
+    if not ( tripparas ) :
+        message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
+        print(message)
        
-        # s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
-        # if not ( s ) :
-            # message = " ! - Skipping logentry {year} on failure to parse parser_html_03: {} {} {}...".format(tid,s,trippara[:300])
-            # DataIssue.objects.create(parser='logbooks', message=message)
-            # logdataissues[tid]=message
-            # print(message)
-            # break
+    if (len(tripheads) !=len(tripparas)):
+        print(f"{len(tripheads)} != {len(tripparas)}")
+
+    location = "Plateau"
+    tripname = "UK Caving Blog post"
+    tu = 0
+    logbook_entry_count = 0
+    for i in range(0, len(tripparas)):
+        trippara = tripparas[i]
+        triphead = tripheads[i]
+        logbook_entry_count += 1
+        tid = set_trip_id(year,logbook_entry_count) +"_blog"
+        # print(f" -  tid: {tid}")
        
-        # tripheader, triptext = s.group(1), s.group(2)
-        # tripheader = re.sub(r"&nbsp;", " ", tripheader)
-        # tripheader = re.sub(r"\s+", " ", tripheader).strip()
-        # sheader = tripheader.split(" -- ")
-        # tu = ""
-        # if re.match("T/U|Time underwater", sheader[-1]):
-            # tu = sheader.pop() # not a number in 2003 usually
-            # # print(f" -  {logbook_entry_count} '{tu}' ")
-        # if len(sheader) != 3:
-            # print(" ! Header not three pieces for parser_html_03() ", sheader)
-        # tripdate, triptitle, trippeople = sheader
-        # ldate = ParseDate(tripdate.strip(), year)
-        # # print(f" -  {logbook_entry_count} '{ldate}' from '{tripdate.strip()}' ")
-        # # print(f" -  {logbook_entry_count} '{trippeople}'  ")
-        # titlelist = triptitle.split(" , ")
-        # if len(titlelist) >= 2:
-            # location, *namelist = titlelist # list unpacking operator
-            # tripname = ", ".join(namelist) # concatenate strings
-            # # print(f" -  {logbook_entry_count} {location}  '{tripname}'")
-        # else:
-            # location = "UNKNOWN"
-            
-        # ltriptext = triptext + "<br /><br />\n\n" + tu
-        # ltriptext = re.sub(r"</p>", "", ltriptext)
-        # #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-        # ltriptext = re.sub(r"<p>", "<br /><br />\n\n", ltriptext).strip()
-        # #ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
-
-
-        # entrytuple = (ldate, location, tripname, ltriptext, 
-                # trippeople, expedition, tu, tid)
-        # logentries.append(entrytuple)
+        # data-author="tcacrossley"
+        match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
+        if not ( match_author ) :
+            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author  {tid} {triphead[:400]}..."
+            DataIssue.objects.create(parser='logbooks', message=message)
+            logdataissues[tid]=message
+            print(message)
+            break
+        trippeople = match_author.group(1)
+        # print(f" -  tid: {tid} {trippeople}")
+        # datetime="2019-07-11T13:16:18+0100"
+        match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
+        if not ( match_datetime ) :
+            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime  {tid} {triphead[:400]}..."
+            DataIssue.objects.create(parser='logbooks', message=message)
+            logdataissues[tid]=message
+            print(message)
+            break
+        datestamp = match_datetime.group(1)
+        
+        tripdate = datetime.fromisoformat(datestamp)
+        print(f" -  tid: {tid} '{trippeople}' '{tripdate}'")
+        
+        tripname = f"UK Caving Blog post {logbook_entry_count}" # must be unique for a given date

+        entrytuple = (tripdate, location, tripname, trippara, 
+            trippeople, expedition, tu, tid)
+        logentries.append(entrytuple)

+    
 def LoadLogbookForExpedition(expedition):
    """ Parses all logbook entries for one expedition 
    """
@@ -566,9 +575,12 @@ def LoadLogbookForExpedition(expedition):
    cleanerrors(year)

    if year in yearlinks:
-        logbookpath = Path(expologbase) /  year / yearlinks[year][0]
-        expedition.logbookfile = yearlinks[year][0] 
-        parsefunc   = yearlinks[year][1]
+        yearfile, yearparser = yearlinks[year]
+        logbookpath = Path(expologbase) /  year / yearfile
+        expedition.logbookfile = yearfile 
+        parsefunc   = yearparser
+        print(f" - Logbook file {yearfile} using parser {yearparser}")
+
    else:
        logbookpath = Path(expologbase) /  year / DEFAULT_LOGBOOK_FILE
        expedition.logbookfile = DEFAULT_LOGBOOK_FILE
@@ -617,14 +629,20 @@ def LoadLogbookForExpedition(expedition):

    return len(logentries)

-def LoadLogbook(year):
+def LoadLogbook(year, format="cucc"):
+    global LOGBOOK_PARSER_SETTINGS
+     
    nlbe={}
    TROG['pagecache']['expedition'][year] = None # clear cache
    
    expo = Expedition.objects.get(year=year)
-    nlbe[expo] = LoadLogbookForExpedition(expo)  # this actually loads the logbook for one expo
-     
    
+    if (format=="blog"):
+        LOGBOOK_PARSER_SETTINGS[str(year)] = ("ukcavingblog.html", "parser_blog")
+    # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
+    
+    nlbe[expo] = LoadLogbookForExpedition(expo)  # this actually loads the logbook for one expo
+
 def LoadLogbooks():
    """ This is the master function for parsing all logbooks into the Troggle database. 
    This should be rewritten to use coroutines to load all logbooks from disc in parallel,