forked from expo/troggle
refactored logbooks parser
This commit is contained in:
parent
6565b3f9c4
commit
e01bd39609
@ -549,6 +549,10 @@ class PersonTrip(TroggleModel):
|
||||
logbook_entry = models.ForeignKey(LogbookEntry,on_delete=models.CASCADE)
|
||||
is_logbook_entry_author = models.BooleanField(default=False)
|
||||
|
||||
class Meta:
|
||||
ordering = ('-personexpedition',)
|
||||
#order_with_respect_to = 'personexpedition'
|
||||
|
||||
def persontrip_next(self):
|
||||
futurePTs = PersonTrip.objects.filter(personexpedition = self.personexpedition, logbook_entry__date__gt = self.logbook_entry.date).order_by('logbook_entry__date').all()
|
||||
if len(futurePTs) > 0:
|
||||
|
@ -18,7 +18,7 @@ from troggle.core.models.caves import LogbookEntry, PersonTrip
|
||||
from troggle.core.models.survex import SurvexBlock, Wallet
|
||||
from troggle.core.models.troggle import Expedition, Person, PersonExpedition
|
||||
from troggle.core.utils import TROG
|
||||
from troggle.parsers.logbooks import parse_logbook_for_expedition
|
||||
from troggle.parsers.imports import import_logbook
|
||||
from troggle.parsers.people import GetPersonExpeditionNameLookup
|
||||
|
||||
from .auth import login_required_if_public
|
||||
@ -69,7 +69,7 @@ def expedition(request, expeditionname):
|
||||
#print(f'! - delete entry: "{entry}"')
|
||||
entry.delete()
|
||||
entries = this_expedition.logbookentry_set.all()
|
||||
LoadLogbookForExpedition(this_expedition)
|
||||
import_logbook(year=this_expedition.year)
|
||||
logged_in = True
|
||||
else:
|
||||
logged_in = False
|
||||
|
@ -40,7 +40,7 @@ def import_logbooks():
|
||||
troggle.parsers.logbooks.LoadLogbooks()
|
||||
|
||||
|
||||
def import_logbook(year=2022):
|
||||
def import_logbook(year=2016):
|
||||
print(f"-- Importing Logbook {year}")
|
||||
with transaction.atomic():
|
||||
troggle.parsers.logbooks.LoadLogbook(year)
|
||||
|
@ -1,5 +1,8 @@
|
||||
import os
|
||||
import re
|
||||
import sys
|
||||
import time
|
||||
|
||||
from datetime import date, datetime
|
||||
from pathlib import Path
|
||||
from random import randint
|
||||
@ -7,10 +10,10 @@ from random import randint
|
||||
from django.conf import settings
|
||||
from django.template.defaultfilters import slugify
|
||||
|
||||
from parsers.people import GetPersonExpeditionNameLookup
|
||||
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
|
||||
from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
|
||||
from troggle.core.models.troggle import DataIssue, Expedition
|
||||
from troggle.core.utils import save_carefully
|
||||
from troggle.core.utils import save_carefully, get_process_memory
|
||||
|
||||
"""
|
||||
Parses and imports logbooks in all their wonderful confusion
|
||||
@ -20,6 +23,8 @@ Parses and imports logbooks in all their wonderful confusion
|
||||
todo = """
|
||||
- refactor everything with some urgency, esp. parse_logbook_for_expedition()
|
||||
|
||||
- pre-compile all the heavily used regular expressions !
|
||||
|
||||
- break out the code that hits the database from that which parses the logbook
|
||||
so that the file-reading and parsing can be parallelized, while writing to the
|
||||
database remains serialized (sqlite is single-user).
|
||||
@ -43,7 +48,7 @@ data for old logbooks? Not worth it..
|
||||
|
||||
"""
|
||||
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
|
||||
BLOG_PARSER_SETTINGS = {
|
||||
BLOG_PARSER_SETTINGS = { # no default, must be explicit
|
||||
# "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
||||
# "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
||||
# "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
||||
@ -57,7 +62,7 @@ LOGBOOK_PARSER_SETTINGS = {
|
||||
"1982": ("logbook.html", "parser_html"),
|
||||
}
|
||||
|
||||
entries = {
|
||||
ENTRIES = {
|
||||
"2022": 90,
|
||||
"2019": 55,
|
||||
"2018": 95,
|
||||
@ -99,18 +104,15 @@ entries = {
|
||||
}
|
||||
|
||||
logentries = [] # the entire logbook for one year is a single object: a list of entries
|
||||
noncaveplaces = ["Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
|
||||
trips = {}
|
||||
noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
|
||||
|
||||
def set_trip_id(year, seq):
|
||||
tid = f"{year}_s{seq:02d}"
|
||||
return tid
|
||||
|
||||
|
||||
rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$")
|
||||
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
|
||||
|
||||
|
||||
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
|
||||
res = []
|
||||
author = None
|
||||
@ -161,7 +163,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
|
||||
return res, author
|
||||
|
||||
|
||||
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
|
||||
def store_entry_into_database(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
|
||||
"""saves a single logbook entry and related persontrips
|
||||
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
|
||||
|
||||
@ -254,9 +256,11 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
|
||||
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
|
||||
# this creates the PersonTrip instance.
|
||||
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
|
||||
|
||||
|
||||
|
||||
|
||||
def ParseDate(tripdate, year):
|
||||
def parser_date(tripdate, year):
|
||||
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
|
||||
dummydate = date(1970, 1, 1) # replace with _EPOCH
|
||||
month = 1
|
||||
@ -302,6 +306,7 @@ def parser_html(year, expedition, txt, seq=""):
|
||||
from parser_html_01 format logfiles, believe me.
|
||||
"""
|
||||
global logentries
|
||||
dupl = {}
|
||||
|
||||
# extract front material and stash for later use when rebuilding from list of entries
|
||||
headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
|
||||
@ -370,7 +375,7 @@ def parser_html(year, expedition, txt, seq=""):
|
||||
DataIssue.objects.create(parser="logbooks", message=msg)
|
||||
continue
|
||||
|
||||
ldate = ParseDate(tripdate.strip(), year)
|
||||
ldate = parser_date(tripdate.strip(), year)
|
||||
triptitles = triptitle.split(" - ")
|
||||
if len(triptitles) >= 2:
|
||||
tripcave = triptitles[0]
|
||||
@ -381,6 +386,15 @@ def parser_html(year, expedition, txt, seq=""):
|
||||
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
|
||||
|
||||
triptitle = triptitle.strip()
|
||||
# triptitle must be unique for a given date. We fix this here.
|
||||
check = (ldate, triptitle)
|
||||
if check in dupl:
|
||||
dupl[check] += 1
|
||||
triptitle = f"{triptitle} #{dupl[check]}"
|
||||
print(f" - {triptitle} -- {date}")
|
||||
else:
|
||||
dupl[check] = 1
|
||||
|
||||
entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tripid1)
|
||||
logentries.append(entrytuple)
|
||||
|
||||
@ -401,6 +415,7 @@ def parser_blog(year, expedition, txt, sq=""):
|
||||
</article>
|
||||
</article>
|
||||
So the content is nested inside the header. Attachments (images) come after the content.
|
||||
It's a bugger, but it's out of our control.
|
||||
"""
|
||||
global logentries
|
||||
|
||||
@ -467,9 +482,10 @@ def parser_blog(year, expedition, txt, sq=""):
|
||||
tripdate = datetime.fromisoformat(datestamp[0:10])
|
||||
# print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
|
||||
|
||||
# tripname must have the location then a hyphen at the beginning as it is ignored by export function
|
||||
# triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'.
|
||||
location = "Unknown"
|
||||
tripname = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}" # must be unique for a given date
|
||||
# triptitle must be unique for a given date. We can enforce this here.
|
||||
triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"
|
||||
tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
|
||||
tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
|
||||
tripcontent = re.sub(r"width: \d+px", "", tripcontent)
|
||||
@ -477,7 +493,7 @@ def parser_blog(year, expedition, txt, sq=""):
|
||||
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
|
||||
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
|
||||
|
||||
entrytuple = (tripdate, location, tripname, tripcontent, trippeople, expedition, tu, tid)
|
||||
entrytuple = (tripdate, location, triptitle, tripcontent, trippeople, expedition, tu, tid)
|
||||
logentries.append(entrytuple)
|
||||
|
||||
def clean_all_logbooks():
|
||||
@ -485,53 +501,51 @@ def clean_all_logbooks():
|
||||
LogbookEntry.objects.all().delete()
|
||||
|
||||
def clean_logbook_for_expedition(expedition):
|
||||
def cleanerrors(year):
|
||||
dataissues = DataIssue.objects.filter(parser="logbooks")
|
||||
for di in dataissues:
|
||||
ph = year
|
||||
if re.search(ph, di.message) is not None: # SLOW just to delete issues for one year
|
||||
# print(f' - CLEANING dataissue {di.message}')
|
||||
di.delete()
|
||||
"""Only used when loading a single logbook. Deletes database LogBookEntries and
|
||||
DataIssues for this expedition year.
|
||||
"""
|
||||
lbes = LogbookEntry.objects.filter(expedition=expedition).delete()
|
||||
dataissues = DataIssue.objects.filter(parser="logbooks")
|
||||
for di in dataissues:
|
||||
ph = expedition.year
|
||||
if re.search(ph, di.message) is not None: # SLOW just to delete issues for one year
|
||||
# print(f' - CLEANING dataissue {di.message}')
|
||||
di.delete()
|
||||
|
||||
|
||||
|
||||
year = expedition.year
|
||||
cleanerrors(year)
|
||||
|
||||
lbes = LogbookEntry.objects.filter(expedition=expedition) # must be a quicker way
|
||||
for lbe in lbes:
|
||||
lbe.delete()
|
||||
|
||||
def parse_logbook_for_expedition(expedition):
|
||||
def parse_logbook_for_expedition(expedition, blog=False):
|
||||
"""Parses all logbook entries for one expedition
|
||||
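There is no 'clean' parameter: existing entries for the year are deleted separately, by clean_logbook_for_expedition() or clean_all_logbooks(), before this is called.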
|
||||
"""
|
||||
global logentries
|
||||
# absolutely horrid. REFACTOR THIS (all my fault..)
|
||||
global entries
|
||||
|
||||
global ENTRIES
|
||||
logentries = []
|
||||
|
||||
logbook_parseable = False
|
||||
yearlinks = LOGBOOK_PARSER_SETTINGS
|
||||
expologbase = os.path.join(settings.EXPOWEB, "years")
|
||||
logentries = []
|
||||
|
||||
expologbase = Path(settings.EXPOWEB, "years")
|
||||
|
||||
year = expedition.year
|
||||
expect = entries[year]
|
||||
expect = ENTRIES[year]
|
||||
# print(" - Logbook for: " + year)
|
||||
|
||||
if year in yearlinks:
|
||||
yearfile, yearparser = yearlinks[year]
|
||||
logbookpath = Path(yearfile)
|
||||
expedition.logbookfile = yearfile
|
||||
parsefunc = yearparser
|
||||
# print(f" - Logbook file {yearfile} using parser {yearparser}")
|
||||
|
||||
if year in LOGBOOK_PARSER_SETTINGS:
|
||||
yearfile, parsefunc = LOGBOOK_PARSER_SETTINGS[year]
|
||||
expedition.logbookfile = yearfile # don't change this if a blog
|
||||
else:
|
||||
logbookpath = Path(DEFAULT_LOGBOOK_FILE)
|
||||
expedition.logbookfile = DEFAULT_LOGBOOK_FILE
|
||||
yearfile = DEFAULT_LOGBOOK_FILE
|
||||
expedition.logbookfile = DEFAULT_LOGBOOK_FILE # don't change this if a blog
|
||||
parsefunc = DEFAULT_LOGBOOK_PARSER
|
||||
|
||||
if blog:
|
||||
print(f" - BLOG file {yearfile} using parser {parsefunc}")
|
||||
if year not in BLOG_PARSER_SETTINGS:
|
||||
message = f" ! - Expecting blog parser buut none specified for {year}"
|
||||
DataIssue.objects.create(parser="logbooks", message=message)
|
||||
print(message)
|
||||
else:
|
||||
yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]
|
||||
|
||||
expedition.save()
|
||||
logbookpath = Path(yearfile)
|
||||
# print(f" - Logbook file {yearfile} using parser {parsefunc}")
|
||||
# expedition.save()
|
||||
|
||||
for sq in ["", "2", "3", "4"]: # cope with blog saved as many separate files
|
||||
lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
|
||||
@ -556,17 +570,10 @@ def parse_logbook_for_expedition(expedition):
|
||||
print(f" - {year} parsing with {parsefunc} - {lb}")
|
||||
parser(year, expedition, txt, sq) # this launches the right parser for this year
|
||||
# --------------------
|
||||
dupl = {}
|
||||
for entrytuple in logentries:
|
||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||
check = (date, triptitle)
|
||||
if check in dupl:
|
||||
dupl[check] += 1
|
||||
triptitle = f"{triptitle} #{dupl[check]}"
|
||||
print(f" - {triptitle} -- {date}")
|
||||
else:
|
||||
dupl[check] = 1
|
||||
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
|
||||
# move database storage into separate step
|
||||
# for entrytuple in logentries:
|
||||
# date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||
# store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
|
||||
|
||||
if len(logentries) == expect:
|
||||
# print(f"OK {year} {len(logentries):5d} is {expect}\n")
|
||||
@ -578,39 +585,57 @@ def parse_logbook_for_expedition(expedition):
|
||||
|
||||
|
||||
def LoadLogbook(year):
|
||||
"""One off logbook for testing purposes"""
|
||||
global LOGBOOK_PARSER_SETTINGS
|
||||
"""One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
|
||||
This is inside an atomic transaction"""
|
||||
global logentries
|
||||
nlbe = {}
|
||||
|
||||
expo = Expedition.objects.get(year=year)
|
||||
year = expo.year # some type funny
|
||||
clean_logbook_for_expedition(expo)
|
||||
logentries = []
|
||||
|
||||
nlbe[expo] = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
|
||||
if year in BLOG_PARSER_SETTINGS:
|
||||
print("BLOG parsing")
|
||||
LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year]
|
||||
nlbe[expo] = parse_logbook_for_expedition(expo) # this loads the blog logbook for one expo
|
||||
nlbe[expo] = parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
|
||||
else:
|
||||
print(
|
||||
f"Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
|
||||
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
|
||||
)
|
||||
|
||||
|
||||
for entrytuple in logentries:
|
||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||
if expo == expedition:
|
||||
#print(f" - {triptitle}")
|
||||
store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
|
||||
expedition.save() # to save logbook name property
|
||||
|
||||
def LoadLogbooks():
|
||||
"""This is the master function for parsing all logbooks into the Troggle database.
|
||||
This should be rewritten to use coroutines to load all logbooks from disc in parallel,
|
||||
but must be serialised to write to database as sqlite is single-user.
|
||||
|
||||
This is inside an atomic transaction. Maybe it shouldn't be..
|
||||
"""
|
||||
global entries
|
||||
global ENTRIES
|
||||
global logentries
|
||||
allentries = []
|
||||
mem1 = get_process_memory()
|
||||
print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
|
||||
start = time.time()
|
||||
|
||||
clean_all_logbooks()
|
||||
expos = Expedition.objects.all()
|
||||
if len(expos) <= 1:
|
||||
message = " ! - No expeditions found. Load 'people' first"
|
||||
message = " ! - No expeditions found. Attempting to 'people' first"
|
||||
DataIssue.objects.create(parser="logbooks", message=message)
|
||||
print(message)
|
||||
return
|
||||
load_people_expos()
|
||||
expos = Expedition.objects.all()
|
||||
if len(expos) <= 1:
|
||||
message = " ! - No expeditions found, even after attempting to load 'people'. Abort."
|
||||
DataIssue.objects.create(parser="logbooks", message=message)
|
||||
print(message)
|
||||
return
|
||||
|
||||
noexpo = [
|
||||
"1986",
|
||||
@ -625,7 +650,7 @@ def LoadLogbooks():
|
||||
loglist = []
|
||||
bloglist = []
|
||||
|
||||
for expo in expos: # pointless as we explicitly know the years in this code.
|
||||
for expo in expos:
|
||||
year = expo.year
|
||||
if year in sqlfail:
|
||||
print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
|
||||
@ -634,7 +659,7 @@ def LoadLogbooks():
|
||||
print(message)
|
||||
|
||||
if year not in nologbook:
|
||||
if year in entries:
|
||||
if year in ENTRIES:
|
||||
loglist.append(expo)
|
||||
else:
|
||||
print(" - No Logbook yet for: " + year) # catch case when preparing for next expo
|
||||
@ -644,28 +669,38 @@ def LoadLogbooks():
|
||||
|
||||
for ex in loglist:
|
||||
nlbe[ex] = parse_logbook_for_expedition(ex) # this loads the logbook for one expo
|
||||
allentries += logentries
|
||||
|
||||
for b in bloglist:
|
||||
if str(b) in LOGBOOK_PARSER_SETTINGS:
|
||||
orig = LOGBOOK_PARSER_SETTINGS[str(b)]
|
||||
else:
|
||||
orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
|
||||
LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
|
||||
print(f" - BLOG: {b}")
|
||||
nlbe[b] = parse_logbook_for_expedition(b) # no clean. loads the blog logbook for one expo
|
||||
LOGBOOK_PARSER_SETTINGS[str(b)] = orig
|
||||
|
||||
# tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
|
||||
# yt = 0
|
||||
# for r in map(parse_logbook_for_expedition, loglist):
|
||||
# yt = r
|
||||
nlbe[b] += parse_logbook_for_expedition(b, blog=True) # loads the blog logbook for one expo
|
||||
allentries += logentries
|
||||
|
||||
yt = 0
|
||||
for e in nlbe:
|
||||
yt += nlbe[e]
|
||||
for exp in nlbe:
|
||||
yt += nlbe[exp]
|
||||
print(f"total {yt:,} log entries parsed in all expeditions")
|
||||
|
||||
print(f"total {len(allentries):,} log entries in complete dict")
|
||||
mem = get_process_memory()
|
||||
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
|
||||
duration = time.time() - start
|
||||
print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
|
||||
|
||||
# Now we serially store the parsed data in the database, updating 3 types of object:
|
||||
# - Expedition (the 'logbook.html' value)
|
||||
# - LogBookEntry (text, who when etc.)
|
||||
# - PersonTrip (who was on that specific trip mentioned in the logbook entry)
|
||||
for entrytuple in allentries:
|
||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||
store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
|
||||
|
||||
for expo in expos:
|
||||
expedition.save() # to save logbook name property
|
||||
mem = get_process_memory()
|
||||
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
|
||||
duration = time.time() - start
|
||||
print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
|
||||
# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
|
||||
# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
|
||||
# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
|
||||
|
Loading…
Reference in New Issue
Block a user