diff --git a/databaseReset.py b/databaseReset.py
index db7a781..2d0af58 100644
--- a/databaseReset.py
+++ b/databaseReset.py
@@ -238,15 +238,16 @@ class JobQueue():
         for runfunction in self.queue:
             start = time.time()
             memstart = get_process_memory()
+            jobname, jobparser = runfunction
             #--------------------
-            runfunction[1]() # invokes function passed in the second item in the tuple
+            jobparser() # invokes function passed in the second item in the tuple
             #--------------------
             memend = get_process_memory()
             duration = time.time()-start
             #print(" - MEMORY start:{:.3f} MB end:{:.3f} MB change={:.3f} MB".format(memstart,memend, ))
-            print("\n*- Ended \"", runfunction[0], f"\" {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
-            self.results[runfunction[0]].pop() # the null item
-            self.results[runfunction[0]].append(duration)
+            print("\n*- Ended \"", jobname, f"\" {duration:.1f} seconds + {memend - memstart:.3f} MB ({memend:.3f} MB)")
+            self.results[jobname].pop() # the null item
+            self.results[jobname].append(duration)
 
         jobend = time.time()
diff --git a/parsers/imports.py b/parsers/imports.py
index 9cc945c..3723ce3 100644
--- a/parsers/imports.py
+++ b/parsers/imports.py
@@ -41,10 +41,10 @@ def import_logbooks():
     with transaction.atomic():
         troggle.parsers.logbooks.LoadLogbooks()
 
-def import_logbook(year=2003):
+def import_logbook(year=2019):
     print(f"-- Importing Logbook {year}")
     with transaction.atomic():
-        troggle.parsers.logbooks.LoadLogbook(year)
+        troggle.parsers.logbooks.LoadLogbook(year, format="blog")
 
 def import_QMs():
     print("-- Importing old QMs for 161, 204, 234 from CSV files")
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 0bbc23d..7e2870b 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -56,6 +56,7 @@ DEFAULT_LOGBOOK_FILE = "logbook.html"
 # All years since 2010 use the default value for Logbook parser
 # but several don't work, and are skipped by the parsing code, e.g. 1983
 LOGBOOK_PARSER_SETTINGS = {
+    "2019": ("logbook.html", "parser_html"),
     "2010": ("logbook.html", "parser_html"),
     "2009": ("2009logbook.txt", "wiki_parser"),
     "2008": ("2008logbook.txt", "wiki_parser"),
@@ -171,7 +172,7 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
         return
 
     if not author:
-        message = f" ! - {expedition.year} Warning: logentry: {title} - no author for entry '{tid}'"
+        message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
         DataIssue.objects.create(parser='logbooks', message=message)
         logdataissues["title"]=message
         print(message)
@@ -471,60 +472,68 @@ def parser_html_01(year, expedition, txt):
             print(message)
             return
 
-# parser for 2003. Retired after conversion of the logbook.html
-# KEEP THIS COMMENTED-OUT example until after we have doen the same thing with the html_01 parser
-# def parser_html_03(year, expedition, txt):
-    # global logentries
-    # global logdataissues
+def parser_blog(year, expedition, txt):
+    '''Parses the format of web pages collected as "Save As HTML" from the UK Caving blog website.
+    Note that the entries have dates and authors, but no titles.
+    '''
+    global logentries
+    global logdataissues
+    errorcount = 0
 
-    # tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
-    # logbook_entry_count = 0
-    # for trippara in tripparas:
-        # logbook_entry_count += 1
-        # tid = set_trip_id(year,logbook_entry_count)
+    tripheads = re.findall(r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt)
+    if not ( tripheads ) :
+        message = f" ! - Skipping on failure to parse article header: {txt[:500]}"
+        print(message)
+
+    tripparas = re.findall(r"<article class=\"message-body js-selectToQuote\"\>\s*([\s\S]*?)(?=</article)", txt)
+    if not ( tripparas ) :
+        message = f" ! - Skipping on failure to parse article content: {txt[:500]}"
+        print(message)
 
-        # s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
-        # if not ( s ) :
-            # message = " ! - Skipping logentry {year} on failure to parse parser_html_03: {} {} {}...".format(tid,s,trippara[:300])
-            # DataIssue.objects.create(parser='logbooks', message=message)
-            # logdataissues[tid]=message
-            # print(message)
-            # break
+    if (len(tripheads) !=len(tripparas)):
+        print(f"{len(tripheads)} != {len(tripparas)}")
+
+    location = "Plateau"
+    tripname = "UK Caving Blog post"
+    tu = 0
+    logbook_entry_count = 0
+    for i in range(0, len(tripparas)):
+        trippara = tripparas[i]
+        triphead = tripheads[i]
+        logbook_entry_count += 1
+        tid = set_trip_id(year,logbook_entry_count) +"_blog"
+        # print(f"  - tid: {tid}")
 
-        # tripheader, triptext = s.group(1), s.group(2)
-        # tripheader = re.sub(r"&nbsp;", " ", tripheader)
-        # tripheader = re.sub(r"\s+", " ", tripheader).strip()
-        # sheader = tripheader.split(" -- ")
-        # tu = ""
-        # if re.match("T/U|Time underwater", sheader[-1]):
-            # tu = sheader.pop() # not a number in 2003 usually
-        # # print(f"  - {logbook_entry_count} '{tu}' ")
-        # if len(sheader) != 3:
-            # print(" ! Header not three pieces for parser_html_03() ", sheader)
-        # tripdate, triptitle, trippeople = sheader
-        # ldate = ParseDate(tripdate.strip(), year)
-        # # print(f" - {logbook_entry_count} '{ldate}' from '{tripdate.strip()}' ")
-        # # print(f" - {logbook_entry_count} '{trippeople}' ")
-        # titlelist = triptitle.split(" , ")
-        # if len(titlelist) >= 2:
-            # location, *namelist = titlelist # list unpacking operator
-            # tripname = ", ".join(namelist) # concatenate strings
-            # # print(f" - {logbook_entry_count} {location} '{tripname}'")
-        # else:
-            # location = "UNKNOWN"
-
-        # ltriptext = triptext + "<br /><br />\n\n" + tu
-        # ltriptext = re.sub(r"</p>", "", ltriptext)
-        # #ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-        # ltriptext = re.sub(r"<p>", "<br /><br />\n\n", ltriptext).strip()
-        # #ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
-
-
-        # entrytuple = (ldate, location, tripname, ltriptext,
-                            # trippeople, expedition, tu, tid)
-        # logentries.append(entrytuple)
+        # data-author="tcacrossley"
+        match_author = re.search(r".*data-author=\"([^\"]*)\" data-content=.*", triphead)
+        if not ( match_author ) :
+            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse data-author {tid} {triphead[:400]}..."
+            DataIssue.objects.create(parser='logbooks', message=message)
+            logdataissues[tid]=message
+            print(message)
+            break
+        trippeople = match_author.group(1)
+        # print(f"  - tid: {tid} {trippeople}")
+        # datetime="2019-07-11T13:16:18+0100"
+        match_datetime = re.search(r".*datetime=\"([^\"]*)\" data-time=.*", triphead)
+        if not ( match_datetime ) :
+            message = f" ! - Skipping logentry {year}:{logbook_entry_count} on failure to parse datetime {tid} {triphead[:400]}..."
+            DataIssue.objects.create(parser='logbooks', message=message)
+            logdataissues[tid]=message
+            print(message)
+            break
+        datestamp = match_datetime.group(1)
+
+        tripdate = datetime.fromisoformat(datestamp)
+        print(f"  - tid: {tid} '{trippeople}' '{tripdate}'")
+
+        tripname = f"UK Caving Blog post {logbook_entry_count}" # must be unique for a given date
+        entrytuple = (tripdate, location, tripname, trippara,
+                        trippeople, expedition, tu, tid)
+        logentries.append(entrytuple)
+
 
 def LoadLogbookForExpedition(expedition):
     """ Parses all logbook entries for one expedition """
@@ -566,9 +575,12 @@ def LoadLogbookForExpedition(expedition):
     cleanerrors(year)
 
     if year in yearlinks:
-        logbookpath = Path(expologbase) / year / yearlinks[year][0]
-        expedition.logbookfile = yearlinks[year][0]
-        parsefunc = yearlinks[year][1]
+        yearfile, yearparser = yearlinks[year]
+        logbookpath = Path(expologbase) / year / yearfile
+        expedition.logbookfile = yearfile
+        parsefunc = yearparser
+        print(f" - Logbook file {yearfile} using parser {yearparser}")
+
     else:
         logbookpath = Path(expologbase) / year / DEFAULT_LOGBOOK_FILE
         expedition.logbookfile = DEFAULT_LOGBOOK_FILE
@@ -617,14 +629,20 @@ def LoadLogbookForExpedition(expedition):
 
     return len(logentries)
 
-def LoadLogbook(year):
+def LoadLogbook(year, format="cucc"):
+    global LOGBOOK_PARSER_SETTINGS
+
     nlbe={}
     TROG['pagecache']['expedition'][year] = None # clear cache
 
     expo = Expedition.objects.get(year=year)
-    nlbe[expo] = LoadLogbookForExpedition(expo)  # this actually loads the logbook for one expo
-
+    if (format=="blog"):
+        LOGBOOK_PARSER_SETTINGS[str(year)] = ("ukcavingblog.html", "parser_blog")
+    # print(f" - Logbook file {LOGBOOK_PARSER_SETTINGS[str(year)][0]} using parser {LOGBOOK_PARSER_SETTINGS[str(year)][1]}")
+
+    nlbe[expo] = LoadLogbookForExpedition(expo)  # this actually loads the logbook for one expo
+
 
 def LoadLogbooks():
     """ This is the master function for parsing all logbooks into the Troggle database.
         This should be rewritten to use coroutines to load all logbooks from disc in parallel,