diff --git a/core/models/caves.py b/core/models/caves.py
index 70a13ae..0b3aa1d 100644
--- a/core/models/caves.py
+++ b/core/models/caves.py
@@ -549,6 +549,10 @@ class PersonTrip(TroggleModel):
logbook_entry = models.ForeignKey(LogbookEntry,on_delete=models.CASCADE)
is_logbook_entry_author = models.BooleanField(default=False)
+ class Meta:
+ ordering = ('-personexpedition',)
+ #order_with_respect_to = 'personexpedition'
+
def persontrip_next(self):
futurePTs = PersonTrip.objects.filter(personexpedition = self.personexpedition, logbook_entry__date__gt = self.logbook_entry.date).order_by('logbook_entry__date').all()
if len(futurePTs) > 0:
diff --git a/core/views/logbooks.py b/core/views/logbooks.py
index 415c7f8..9748aed 100644
--- a/core/views/logbooks.py
+++ b/core/views/logbooks.py
@@ -18,7 +18,7 @@ from troggle.core.models.caves import LogbookEntry, PersonTrip
from troggle.core.models.survex import SurvexBlock, Wallet
from troggle.core.models.troggle import Expedition, Person, PersonExpedition
from troggle.core.utils import TROG
-from troggle.parsers.logbooks import parse_logbook_for_expedition
+from troggle.parsers.imports import import_logbook
from troggle.parsers.people import GetPersonExpeditionNameLookup
from .auth import login_required_if_public
@@ -69,7 +69,7 @@ def expedition(request, expeditionname):
#print(f'! - delete entry: "{entry}"')
entry.delete()
entries = this_expedition.logbookentry_set.all()
- LoadLogbookForExpedition(this_expedition)
+ import_logbook(year=this_expedition.year)
logged_in = True
else:
logged_in = False
diff --git a/parsers/imports.py b/parsers/imports.py
index c1de034..050d5ea 100644
--- a/parsers/imports.py
+++ b/parsers/imports.py
@@ -40,7 +40,7 @@ def import_logbooks():
troggle.parsers.logbooks.LoadLogbooks()
-def import_logbook(year=2022):
+def import_logbook(year=2016):
print(f"-- Importing Logbook {year}")
with transaction.atomic():
troggle.parsers.logbooks.LoadLogbook(year)
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index 86016af..a65774d 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -1,5 +1,8 @@
import os
import re
+import sys
+import time
+
from datetime import date, datetime
from pathlib import Path
from random import randint
@@ -7,10 +10,10 @@ from random import randint
from django.conf import settings
from django.template.defaultfilters import slugify
-from parsers.people import GetPersonExpeditionNameLookup
+from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
from troggle.core.models.troggle import DataIssue, Expedition
-from troggle.core.utils import save_carefully
+from troggle.core.utils import save_carefully, get_process_memory
"""
Parses and imports logbooks in all their wonderful confusion
@@ -20,6 +23,8 @@ Parses and imports logbooks in all their wonderful confusion
todo = """
- refactor everything with some urgency, esp. parse_logbook_for_expedition()
+- pre-compile all the heavily used regular expressions !
+
- break out the code that hits the database from that which parses the logbook
so that the file-reading and parsing can be parallelized, while writing to the
database remains serialized (sqlite is single-user).
@@ -43,7 +48,7 @@ data for old logbooks? Not worth it..
"""
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
-BLOG_PARSER_SETTINGS = {
+BLOG_PARSER_SETTINGS = { # no default, must be explicit
# "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
# "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
@@ -57,7 +62,7 @@ LOGBOOK_PARSER_SETTINGS = {
"1982": ("logbook.html", "parser_html"),
}
-entries = {
+ENTRIES = {
"2022": 90,
"2019": 55,
"2018": 95,
@@ -99,18 +104,15 @@ entries = {
}
logentries = [] # the entire logbook for one year is a single object: a list of entries
-noncaveplaces = ["Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
-trips = {}
+noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
def set_trip_id(year, seq):
tid = f"{year}_s{seq:02d}"
return tid
-
rx_tripperson = re.compile(r"(?i)(.*?)$")
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
-
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
res = []
author = None
@@ -161,7 +163,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
return res, author
-def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
+def store_entry_into_database(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
"""saves a single logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
@@ -254,9 +256,11 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
# this creates the PersonTrip instance.
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
+
+
-def ParseDate(tripdate, year):
+def parser_date(tripdate, year):
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
dummydate = date(1970, 1, 1) # replace with _EPOCH
month = 1
@@ -302,6 +306,7 @@ def parser_html(year, expedition, txt, seq=""):
from parser_html_01 format logfiles, believe me.
"""
global logentries
+ dupl = {}
# extract front material and stash for later use when rebuilding from list of entries
headmatch = re.match(r"(?i)(?s).*
]*>(.*?)= 2:
tripcave = triptitles[0]
@@ -381,6 +386,15 @@ def parser_html(year, expedition, txt, seq=""):
ltriptext = re.sub(r"", "
", ltriptext).strip()
triptitle = triptitle.strip()
+ # triptitle must be unique for a given date. We fix this here.
+ check = (ldate, triptitle)
+ if check in dupl:
+ dupl[check] += 1
+ triptitle = f"{triptitle} #{dupl[check]}"
+ print(f" - {triptitle} -- {date}")
+ else:
+ dupl[check] = 1
+
entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tripid1)
logentries.append(entrytuple)
@@ -401,6 +415,7 @@ def parser_blog(year, expedition, txt, sq=""):
So the content is nested inside the header. Attachments (images) come after the content.
+ It's a bugger, but it's out of our control.
"""
global logentries
@@ -467,9 +482,10 @@ def parser_blog(year, expedition, txt, sq=""):
tripdate = datetime.fromisoformat(datestamp[0:10])
# print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
- # tripname must have the location then a hyphen at the beginning as it is ignored by export function
+ # triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'.
location = "Unknown"
- tripname = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}" # must be unique for a given date
+ # triptitle must be unique for a given date. We can enforce this here.
+ triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"
tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
tripcontent = re.sub(r"width: \d+px", "", tripcontent)
@@ -477,7 +493,7 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = re.sub(r"
", "", tripcontent)
tripcontent = f"\n\n\nBlog Author: {trippeople}" + tripcontent
- entrytuple = (tripdate, location, tripname, tripcontent, trippeople, expedition, tu, tid)
+ entrytuple = (tripdate, location, triptitle, tripcontent, trippeople, expedition, tu, tid)
logentries.append(entrytuple)
def clean_all_logbooks():
@@ -485,53 +501,51 @@ def clean_all_logbooks():
LogbookEntry.objects.all().delete()
def clean_logbook_for_expedition(expedition):
- def cleanerrors(year):
- dataissues = DataIssue.objects.filter(parser="logbooks")
- for di in dataissues:
- ph = year
- if re.search(ph, di.message) is not None: # SLOW just to delete issues for one year
- # print(f' - CLEANING dataissue {di.message}')
- di.delete()
+ """Only used when loading a single logbook. Deletes database LogBookEntries and
+ DataIssues for this expedition year.
+ """
+ lbes = LogbookEntry.objects.filter(expedition=expedition).delete()
+ dataissues = DataIssue.objects.filter(parser="logbooks")
+ for di in dataissues:
+ ph = expedition.year
+ if re.search(ph, di.message) is not None: # SLOW just to delete issues for one year
+ # print(f' - CLEANING dataissue {di.message}')
+ di.delete()
-
-
- year = expedition.year
- cleanerrors(year)
-
- lbes = LogbookEntry.objects.filter(expedition=expedition) # must be a quicker way
- for lbe in lbes:
- lbe.delete()
-
-def parse_logbook_for_expedition(expedition):
+def parse_logbook_for_expedition(expedition, blog=False):
"""Parses all logbook entries for one expedition
- if clean==True then it deletes all entries for this year first.
"""
global logentries
- # absolutely horrid. REFACTOR THIS (all my fault..)
- global entries
-
+ global ENTRIES
+ logentries = []
+
logbook_parseable = False
- yearlinks = LOGBOOK_PARSER_SETTINGS
- expologbase = os.path.join(settings.EXPOWEB, "years")
- logentries = []
-
+ expologbase = Path(settings.EXPOWEB, "years")
+
year = expedition.year
- expect = entries[year]
+ expect = ENTRIES[year]
# print(" - Logbook for: " + year)
- if year in yearlinks:
- yearfile, yearparser = yearlinks[year]
- logbookpath = Path(yearfile)
- expedition.logbookfile = yearfile
- parsefunc = yearparser
- # print(f" - Logbook file {yearfile} using parser {yearparser}")
-
+ if year in LOGBOOK_PARSER_SETTINGS:
+ yearfile, parsefunc = LOGBOOK_PARSER_SETTINGS[year]
+ expedition.logbookfile = yearfile # don't change this if a blog
else:
- logbookpath = Path(DEFAULT_LOGBOOK_FILE)
- expedition.logbookfile = DEFAULT_LOGBOOK_FILE
+ yearfile = DEFAULT_LOGBOOK_FILE
+ expedition.logbookfile = DEFAULT_LOGBOOK_FILE # don't change this if a blog
parsefunc = DEFAULT_LOGBOOK_PARSER
+
+ if blog:
+ print(f" - BLOG file {yearfile} using parser {parsefunc}")
+ if year not in BLOG_PARSER_SETTINGS:
+ message = f" ! - Expecting blog parser buut none specified for {year}"
+ DataIssue.objects.create(parser="logbooks", message=message)
+ print(message)
+ else:
+ yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]
- expedition.save()
+ logbookpath = Path(yearfile)
+ # print(f" - Logbook file {yearfile} using parser {parsefunc}")
+ # expedition.save()
for sq in ["", "2", "3", "4"]: # cope with blog saved as many separate files
lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
@@ -556,17 +570,10 @@ def parse_logbook_for_expedition(expedition):
print(f" - {year} parsing with {parsefunc} - {lb}")
parser(year, expedition, txt, sq) # this launches the right parser for this year
# --------------------
- dupl = {}
- for entrytuple in logentries:
- date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
- check = (date, triptitle)
- if check in dupl:
- dupl[check] += 1
- triptitle = f"{triptitle} #{dupl[check]}"
- print(f" - {triptitle} -- {date}")
- else:
- dupl[check] = 1
- EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+ # move database storage into separate step
+ # for entrytuple in logentries:
+ # date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+ # store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
if len(logentries) == expect:
# print(f"OK {year} {len(logentries):5d} is {expect}\n")
@@ -578,39 +585,57 @@ def parse_logbook_for_expedition(expedition):
def LoadLogbook(year):
- """One off logbook for testing purposes"""
- global LOGBOOK_PARSER_SETTINGS
+ """One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
+ This is inside an atomic transaction"""
+ global logentries
nlbe = {}
expo = Expedition.objects.get(year=year)
year = expo.year # some type funny
clean_logbook_for_expedition(expo)
+ logentries = []
nlbe[expo] = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
if year in BLOG_PARSER_SETTINGS:
- print("BLOG parsing")
- LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year]
- nlbe[expo] = parse_logbook_for_expedition(expo) # this loads the blog logbook for one expo
+ nlbe[expo] = parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
else:
print(
- f"Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
+ f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
)
-
-
+ for entrytuple in logentries:
+ date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+ if expo == expedition:
+ #print(f" - {triptitle}")
+ store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+ expedition.save() # to save logbook name property
+
def LoadLogbooks():
"""This is the master function for parsing all logbooks into the Troggle database.
This should be rewritten to use coroutines to load all logbooks from disc in parallel,
but must be serialised to write to database as sqlite is single-user.
+
+ This is inside an atomic transaction. Maybe it shouldn't be..
"""
- global entries
+ global ENTRIES
+ global logentries
+ allentries = []
+ mem1 = get_process_memory()
+ print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
+ start = time.time()
clean_all_logbooks()
expos = Expedition.objects.all()
if len(expos) <= 1:
- message = " ! - No expeditions found. Load 'people' first"
+ message = " ! - No expeditions found. Attempting to 'people' first"
DataIssue.objects.create(parser="logbooks", message=message)
print(message)
- return
+ load_people_expos()
+ expos = Expedition.objects.all()
+ if len(expos) <= 1:
+ message = " ! - No expeditions found, even after attempting to load 'people'. Abort."
+ DataIssue.objects.create(parser="logbooks", message=message)
+ print(message)
+ return
noexpo = [
"1986",
@@ -625,7 +650,7 @@ def LoadLogbooks():
loglist = []
bloglist = []
- for expo in expos: # pointless as we explicitly know the years in this code.
+ for expo in expos:
year = expo.year
if year in sqlfail:
print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
@@ -634,7 +659,7 @@ def LoadLogbooks():
print(message)
if year not in nologbook:
- if year in entries:
+ if year in ENTRIES:
loglist.append(expo)
else:
print(" - No Logbook yet for: " + year) # catch case when preparing for next expo
@@ -644,28 +669,38 @@ def LoadLogbooks():
for ex in loglist:
nlbe[ex] = parse_logbook_for_expedition(ex) # this loads the logbook for one expo
+ allentries += logentries
for b in bloglist:
- if str(b) in LOGBOOK_PARSER_SETTINGS:
- orig = LOGBOOK_PARSER_SETTINGS[str(b)]
- else:
- orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
- LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
print(f" - BLOG: {b}")
- nlbe[b] = parse_logbook_for_expedition(b) # no clean. loads the blog logbook for one expo
- LOGBOOK_PARSER_SETTINGS[str(b)] = orig
-
- # tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
- # yt = 0
- # for r in map(parse_logbook_for_expedition, loglist):
- # yt = r
+ nlbe[b] += parse_logbook_for_expedition(b, blog=True) # loads the blog logbook for one expo
+ allentries += logentries
yt = 0
- for e in nlbe:
- yt += nlbe[e]
+ for exp in nlbe:
+ yt += nlbe[exp]
print(f"total {yt:,} log entries parsed in all expeditions")
+ print(f"total {len(allentries):,} log entries in complete dict")
+ mem = get_process_memory()
+ print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
+ duration = time.time() - start
+ print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
+
+ # Now we serially store the parsed data in the database, updating 3 types of object:
+ # - Expedition (the 'logbook.html' value)
+ # - LogBookEntry (text, who when etc.)
+ # - PersonTrip (who was on that specific trip mentioned in the logbook entry)
+ for entrytuple in allentries:
+ date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+ store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+ for expo in expos:
+ expedition.save() # to save logbook name property
+ mem = get_process_memory()
+ print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
+ duration = time.time() - start
+ print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
# dateRegex = re.compile(r'(\d\d\d\d)-(\d\d)-(\d\d)', re.S)
# expeditionYearRegex = re.compile(r'(.*?)', re.S)
# titleRegex = re.compile(r'(.*?)
', re.S)