mirror of
https://expo.survex.com/repositories/troggle/.git
synced 2024-11-25 00:31:55 +00:00
refactored logbooks parser
This commit is contained in:
parent
6565b3f9c4
commit
e01bd39609
@ -549,6 +549,10 @@ class PersonTrip(TroggleModel):
|
|||||||
logbook_entry = models.ForeignKey(LogbookEntry,on_delete=models.CASCADE)
|
logbook_entry = models.ForeignKey(LogbookEntry,on_delete=models.CASCADE)
|
||||||
is_logbook_entry_author = models.BooleanField(default=False)
|
is_logbook_entry_author = models.BooleanField(default=False)
|
||||||
|
|
||||||
|
class Meta:
|
||||||
|
ordering = ('-personexpedition',)
|
||||||
|
#order_with_respect_to = 'personexpedition'
|
||||||
|
|
||||||
def persontrip_next(self):
|
def persontrip_next(self):
|
||||||
futurePTs = PersonTrip.objects.filter(personexpedition = self.personexpedition, logbook_entry__date__gt = self.logbook_entry.date).order_by('logbook_entry__date').all()
|
futurePTs = PersonTrip.objects.filter(personexpedition = self.personexpedition, logbook_entry__date__gt = self.logbook_entry.date).order_by('logbook_entry__date').all()
|
||||||
if len(futurePTs) > 0:
|
if len(futurePTs) > 0:
|
||||||
|
@ -18,7 +18,7 @@ from troggle.core.models.caves import LogbookEntry, PersonTrip
|
|||||||
from troggle.core.models.survex import SurvexBlock, Wallet
|
from troggle.core.models.survex import SurvexBlock, Wallet
|
||||||
from troggle.core.models.troggle import Expedition, Person, PersonExpedition
|
from troggle.core.models.troggle import Expedition, Person, PersonExpedition
|
||||||
from troggle.core.utils import TROG
|
from troggle.core.utils import TROG
|
||||||
from troggle.parsers.logbooks import parse_logbook_for_expedition
|
from troggle.parsers.imports import import_logbook
|
||||||
from troggle.parsers.people import GetPersonExpeditionNameLookup
|
from troggle.parsers.people import GetPersonExpeditionNameLookup
|
||||||
|
|
||||||
from .auth import login_required_if_public
|
from .auth import login_required_if_public
|
||||||
@ -69,7 +69,7 @@ def expedition(request, expeditionname):
|
|||||||
#print(f'! - delete entry: "{entry}"')
|
#print(f'! - delete entry: "{entry}"')
|
||||||
entry.delete()
|
entry.delete()
|
||||||
entries = this_expedition.logbookentry_set.all()
|
entries = this_expedition.logbookentry_set.all()
|
||||||
LoadLogbookForExpedition(this_expedition)
|
import_logbook(year=this_expedition.year)
|
||||||
logged_in = True
|
logged_in = True
|
||||||
else:
|
else:
|
||||||
logged_in = False
|
logged_in = False
|
||||||
|
@ -40,7 +40,7 @@ def import_logbooks():
|
|||||||
troggle.parsers.logbooks.LoadLogbooks()
|
troggle.parsers.logbooks.LoadLogbooks()
|
||||||
|
|
||||||
|
|
||||||
def import_logbook(year=2022):
|
def import_logbook(year=2016):
|
||||||
print(f"-- Importing Logbook {year}")
|
print(f"-- Importing Logbook {year}")
|
||||||
with transaction.atomic():
|
with transaction.atomic():
|
||||||
troggle.parsers.logbooks.LoadLogbook(year)
|
troggle.parsers.logbooks.LoadLogbook(year)
|
||||||
|
@ -1,5 +1,8 @@
|
|||||||
import os
|
import os
|
||||||
import re
|
import re
|
||||||
|
import sys
|
||||||
|
import time
|
||||||
|
|
||||||
from datetime import date, datetime
|
from datetime import date, datetime
|
||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from random import randint
|
from random import randint
|
||||||
@ -7,10 +10,10 @@ from random import randint
|
|||||||
from django.conf import settings
|
from django.conf import settings
|
||||||
from django.template.defaultfilters import slugify
|
from django.template.defaultfilters import slugify
|
||||||
|
|
||||||
from parsers.people import GetPersonExpeditionNameLookup
|
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
|
||||||
from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
|
from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
|
||||||
from troggle.core.models.troggle import DataIssue, Expedition
|
from troggle.core.models.troggle import DataIssue, Expedition
|
||||||
from troggle.core.utils import save_carefully
|
from troggle.core.utils import save_carefully, get_process_memory
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Parses and imports logbooks in all their wonderful confusion
|
Parses and imports logbooks in all their wonderful confusion
|
||||||
@ -20,6 +23,8 @@ Parses and imports logbooks in all their wonderful confusion
|
|||||||
todo = """
|
todo = """
|
||||||
- refactor everything with some urgency, esp. parse_logbook_for_expedition()
|
- refactor everything with some urgency, esp. parse_logbook_for_expedition()
|
||||||
|
|
||||||
|
- pre-compile all the heavily used regular expressions !
|
||||||
|
|
||||||
- break out the code that hits the database from that which parses the logbook
|
- break out the code that hits the database from that which parses the logbook
|
||||||
so that the file-reading and parsing can be parallelized, while writing to the
|
so that the file-reading and parsing can be parallelized, while writing to the
|
||||||
database remains serialized (sqlite is single-user).
|
database remains serialized (sqlite is single-user).
|
||||||
@ -43,7 +48,7 @@ data for old logbooks? Not worth it..
|
|||||||
|
|
||||||
"""
|
"""
|
||||||
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
|
MAX_LOGBOOK_ENTRY_TITLE_LENGTH = 200
|
||||||
BLOG_PARSER_SETTINGS = {
|
BLOG_PARSER_SETTINGS = { # no default, must be explicit
|
||||||
# "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
# "2022": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
||||||
# "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
# "2019": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
||||||
# "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
# "2018": ("ukcavingblog.html", "parser_blog"), # now folded in to logbooks.html
|
||||||
@ -57,7 +62,7 @@ LOGBOOK_PARSER_SETTINGS = {
|
|||||||
"1982": ("logbook.html", "parser_html"),
|
"1982": ("logbook.html", "parser_html"),
|
||||||
}
|
}
|
||||||
|
|
||||||
entries = {
|
ENTRIES = {
|
||||||
"2022": 90,
|
"2022": 90,
|
||||||
"2019": 55,
|
"2019": 55,
|
||||||
"2018": 95,
|
"2018": 95,
|
||||||
@ -99,18 +104,15 @@ entries = {
|
|||||||
}
|
}
|
||||||
|
|
||||||
logentries = [] # the entire logbook for one year is a single object: a list of entries
|
logentries = [] # the entire logbook for one year is a single object: a list of entries
|
||||||
noncaveplaces = ["Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
|
noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
|
||||||
trips = {}
|
|
||||||
|
|
||||||
def set_trip_id(year, seq):
|
def set_trip_id(year, seq):
|
||||||
tid = f"{year}_s{seq:02d}"
|
tid = f"{year}_s{seq:02d}"
|
||||||
return tid
|
return tid
|
||||||
|
|
||||||
|
|
||||||
rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$")
|
rx_tripperson = re.compile(r"(?i)<u>(.*?)</u>$")
|
||||||
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
|
rx_round_bracket = re.compile(r"[\(\[].*?[\)\]]")
|
||||||
|
|
||||||
|
|
||||||
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
|
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
|
||||||
res = []
|
res = []
|
||||||
author = None
|
author = None
|
||||||
@ -161,7 +163,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
|
|||||||
return res, author
|
return res, author
|
||||||
|
|
||||||
|
|
||||||
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
|
def store_entry_into_database(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
|
||||||
"""saves a single logbook entry and related persontrips
|
"""saves a single logbook entry and related persontrips
|
||||||
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
|
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
|
||||||
|
|
||||||
@ -254,9 +256,11 @@ def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_
|
|||||||
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
|
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
|
||||||
# this creates the PersonTrip instance.
|
# this creates the PersonTrip instance.
|
||||||
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
|
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def ParseDate(tripdate, year):
|
def parser_date(tripdate, year):
|
||||||
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
|
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
|
||||||
dummydate = date(1970, 1, 1) # replace with _EPOCH
|
dummydate = date(1970, 1, 1) # replace with _EPOCH
|
||||||
month = 1
|
month = 1
|
||||||
@ -302,6 +306,7 @@ def parser_html(year, expedition, txt, seq=""):
|
|||||||
from parser_html_01 format logfiles, believe me.
|
from parser_html_01 format logfiles, believe me.
|
||||||
"""
|
"""
|
||||||
global logentries
|
global logentries
|
||||||
|
dupl = {}
|
||||||
|
|
||||||
# extract front material and stash for later use when rebuilding from list of entries
|
# extract front material and stash for later use when rebuilding from list of entries
|
||||||
headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
|
headmatch = re.match(r"(?i)(?s).*<body[^>]*>(.*?)<hr.*", txt)
|
||||||
@ -370,7 +375,7 @@ def parser_html(year, expedition, txt, seq=""):
|
|||||||
DataIssue.objects.create(parser="logbooks", message=msg)
|
DataIssue.objects.create(parser="logbooks", message=msg)
|
||||||
continue
|
continue
|
||||||
|
|
||||||
ldate = ParseDate(tripdate.strip(), year)
|
ldate = parser_date(tripdate.strip(), year)
|
||||||
triptitles = triptitle.split(" - ")
|
triptitles = triptitle.split(" - ")
|
||||||
if len(triptitles) >= 2:
|
if len(triptitles) >= 2:
|
||||||
tripcave = triptitles[0]
|
tripcave = triptitles[0]
|
||||||
@ -381,6 +386,15 @@ def parser_html(year, expedition, txt, seq=""):
|
|||||||
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
|
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
|
||||||
|
|
||||||
triptitle = triptitle.strip()
|
triptitle = triptitle.strip()
|
||||||
|
# triptitle must be unique for a given date. We fix this here.
|
||||||
|
check = (ldate, triptitle)
|
||||||
|
if check in dupl:
|
||||||
|
dupl[check] += 1
|
||||||
|
triptitle = f"{triptitle} #{dupl[check]}"
|
||||||
|
print(f" - {triptitle} -- {date}")
|
||||||
|
else:
|
||||||
|
dupl[check] = 1
|
||||||
|
|
||||||
entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tripid1)
|
entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tripid1)
|
||||||
logentries.append(entrytuple)
|
logentries.append(entrytuple)
|
||||||
|
|
||||||
@ -401,6 +415,7 @@ def parser_blog(year, expedition, txt, sq=""):
|
|||||||
</article>
|
</article>
|
||||||
</article>
|
</article>
|
||||||
So the content is nested inside the header. Attachments (images) come after the content.
|
So the content is nested inside the header. Attachments (images) come after the content.
|
||||||
|
It's a bugger, but it's out of our control.
|
||||||
"""
|
"""
|
||||||
global logentries
|
global logentries
|
||||||
|
|
||||||
@ -467,9 +482,10 @@ def parser_blog(year, expedition, txt, sq=""):
|
|||||||
tripdate = datetime.fromisoformat(datestamp[0:10])
|
tripdate = datetime.fromisoformat(datestamp[0:10])
|
||||||
# print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
|
# print(f" - tid: {tid} '{trippeople}' '{tripdate}'")
|
||||||
|
|
||||||
# tripname must have the location then a hyphen at the beginning as it is ignored by export function
|
# triptitle must have the location then a hyphen at the beginning as it is ignored by export function. We can't know what this is, so we set it as 'Expo' and 'Unknown'.
|
||||||
location = "Unknown"
|
location = "Unknown"
|
||||||
tripname = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}" # must be unique for a given date
|
# triptitle must be unique for a given date. We can enforce this here.
|
||||||
|
triptitle = f"Expo - UK Caving Blog{sq} post {logbook_entry_count}"
|
||||||
tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
|
tripcontent = re.sub(r"(width=\"\d+\")", "", tripcontent)
|
||||||
tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
|
tripcontent = re.sub(r"height=\"\d+\"", "", tripcontent)
|
||||||
tripcontent = re.sub(r"width: \d+px", "", tripcontent)
|
tripcontent = re.sub(r"width: \d+px", "", tripcontent)
|
||||||
@ -477,7 +493,7 @@ def parser_blog(year, expedition, txt, sq=""):
|
|||||||
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
|
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
|
||||||
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
|
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
|
||||||
|
|
||||||
entrytuple = (tripdate, location, tripname, tripcontent, trippeople, expedition, tu, tid)
|
entrytuple = (tripdate, location, triptitle, tripcontent, trippeople, expedition, tu, tid)
|
||||||
logentries.append(entrytuple)
|
logentries.append(entrytuple)
|
||||||
|
|
||||||
def clean_all_logbooks():
|
def clean_all_logbooks():
|
||||||
@ -485,53 +501,51 @@ def clean_all_logbooks():
|
|||||||
LogbookEntry.objects.all().delete()
|
LogbookEntry.objects.all().delete()
|
||||||
|
|
||||||
def clean_logbook_for_expedition(expedition):
|
def clean_logbook_for_expedition(expedition):
|
||||||
def cleanerrors(year):
|
"""Only used when loading a single logbook. Deletes database LogBookEntries and
|
||||||
dataissues = DataIssue.objects.filter(parser="logbooks")
|
DataIssues for this expedition year.
|
||||||
for di in dataissues:
|
"""
|
||||||
ph = year
|
lbes = LogbookEntry.objects.filter(expedition=expedition).delete()
|
||||||
if re.search(ph, di.message) is not None: # SLOW just to delete issues for one year
|
dataissues = DataIssue.objects.filter(parser="logbooks")
|
||||||
# print(f' - CLEANING dataissue {di.message}')
|
for di in dataissues:
|
||||||
di.delete()
|
ph = expedition.year
|
||||||
|
if re.search(ph, di.message) is not None: # SLOW just to delete issues for one year
|
||||||
|
# print(f' - CLEANING dataissue {di.message}')
|
||||||
|
di.delete()
|
||||||
|
|
||||||
|
def parse_logbook_for_expedition(expedition, blog=False):
|
||||||
|
|
||||||
year = expedition.year
|
|
||||||
cleanerrors(year)
|
|
||||||
|
|
||||||
lbes = LogbookEntry.objects.filter(expedition=expedition) # must be a quicker way
|
|
||||||
for lbe in lbes:
|
|
||||||
lbe.delete()
|
|
||||||
|
|
||||||
def parse_logbook_for_expedition(expedition):
|
|
||||||
"""Parses all logbook entries for one expedition
|
"""Parses all logbook entries for one expedition
|
||||||
if clean==True then it deletes all entries for this year first.
|
|
||||||
"""
|
"""
|
||||||
global logentries
|
global logentries
|
||||||
# absolutely horrid. REFACTOR THIS (all my fault..)
|
global ENTRIES
|
||||||
global entries
|
logentries = []
|
||||||
|
|
||||||
logbook_parseable = False
|
logbook_parseable = False
|
||||||
yearlinks = LOGBOOK_PARSER_SETTINGS
|
expologbase = Path(settings.EXPOWEB, "years")
|
||||||
expologbase = os.path.join(settings.EXPOWEB, "years")
|
|
||||||
logentries = []
|
|
||||||
|
|
||||||
year = expedition.year
|
year = expedition.year
|
||||||
expect = entries[year]
|
expect = ENTRIES[year]
|
||||||
# print(" - Logbook for: " + year)
|
# print(" - Logbook for: " + year)
|
||||||
|
|
||||||
if year in yearlinks:
|
if year in LOGBOOK_PARSER_SETTINGS:
|
||||||
yearfile, yearparser = yearlinks[year]
|
yearfile, parsefunc = LOGBOOK_PARSER_SETTINGS[year]
|
||||||
logbookpath = Path(yearfile)
|
expedition.logbookfile = yearfile # don't change this if a blog
|
||||||
expedition.logbookfile = yearfile
|
|
||||||
parsefunc = yearparser
|
|
||||||
# print(f" - Logbook file {yearfile} using parser {yearparser}")
|
|
||||||
|
|
||||||
else:
|
else:
|
||||||
logbookpath = Path(DEFAULT_LOGBOOK_FILE)
|
yearfile = DEFAULT_LOGBOOK_FILE
|
||||||
expedition.logbookfile = DEFAULT_LOGBOOK_FILE
|
expedition.logbookfile = DEFAULT_LOGBOOK_FILE # don't change this if a blog
|
||||||
parsefunc = DEFAULT_LOGBOOK_PARSER
|
parsefunc = DEFAULT_LOGBOOK_PARSER
|
||||||
|
|
||||||
|
if blog:
|
||||||
|
print(f" - BLOG file {yearfile} using parser {parsefunc}")
|
||||||
|
if year not in BLOG_PARSER_SETTINGS:
|
||||||
|
message = f" ! - Expecting blog parser buut none specified for {year}"
|
||||||
|
DataIssue.objects.create(parser="logbooks", message=message)
|
||||||
|
print(message)
|
||||||
|
else:
|
||||||
|
yearfile, parsefunc = BLOG_PARSER_SETTINGS[year]
|
||||||
|
|
||||||
expedition.save()
|
logbookpath = Path(yearfile)
|
||||||
|
# print(f" - Logbook file {yearfile} using parser {parsefunc}")
|
||||||
|
# expedition.save()
|
||||||
|
|
||||||
for sq in ["", "2", "3", "4"]: # cope with blog saved as many separate files
|
for sq in ["", "2", "3", "4"]: # cope with blog saved as many separate files
|
||||||
lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
|
lb = Path(expologbase, year, logbookpath.stem + sq + logbookpath.suffix)
|
||||||
@ -556,17 +570,10 @@ def parse_logbook_for_expedition(expedition):
|
|||||||
print(f" - {year} parsing with {parsefunc} - {lb}")
|
print(f" - {year} parsing with {parsefunc} - {lb}")
|
||||||
parser(year, expedition, txt, sq) # this launches the right parser for this year
|
parser(year, expedition, txt, sq) # this launches the right parser for this year
|
||||||
# --------------------
|
# --------------------
|
||||||
dupl = {}
|
# move database storage into separate step
|
||||||
for entrytuple in logentries:
|
# for entrytuple in logentries:
|
||||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
# date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||||
check = (date, triptitle)
|
# store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
|
||||||
if check in dupl:
|
|
||||||
dupl[check] += 1
|
|
||||||
triptitle = f"{triptitle} #{dupl[check]}"
|
|
||||||
print(f" - {triptitle} -- {date}")
|
|
||||||
else:
|
|
||||||
dupl[check] = 1
|
|
||||||
EnterLogIntoDbase(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
|
|
||||||
|
|
||||||
if len(logentries) == expect:
|
if len(logentries) == expect:
|
||||||
# print(f"OK {year} {len(logentries):5d} is {expect}\n")
|
# print(f"OK {year} {len(logentries):5d} is {expect}\n")
|
||||||
@ -578,39 +585,57 @@ def parse_logbook_for_expedition(expedition):
|
|||||||
|
|
||||||
|
|
||||||
def LoadLogbook(year):
|
def LoadLogbook(year):
|
||||||
"""One off logbook for testing purposes"""
|
"""One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
|
||||||
global LOGBOOK_PARSER_SETTINGS
|
This is inside an atomic transaction"""
|
||||||
|
global logentries
|
||||||
nlbe = {}
|
nlbe = {}
|
||||||
|
|
||||||
expo = Expedition.objects.get(year=year)
|
expo = Expedition.objects.get(year=year)
|
||||||
year = expo.year # some type funny
|
year = expo.year # some type funny
|
||||||
clean_logbook_for_expedition(expo)
|
clean_logbook_for_expedition(expo)
|
||||||
|
logentries = []
|
||||||
|
|
||||||
nlbe[expo] = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
|
nlbe[expo] = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
|
||||||
if year in BLOG_PARSER_SETTINGS:
|
if year in BLOG_PARSER_SETTINGS:
|
||||||
print("BLOG parsing")
|
nlbe[expo] = parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
|
||||||
LOGBOOK_PARSER_SETTINGS[year] = BLOG_PARSER_SETTINGS[year]
|
|
||||||
nlbe[expo] = parse_logbook_for_expedition(expo) # this loads the blog logbook for one expo
|
|
||||||
else:
|
else:
|
||||||
print(
|
print(
|
||||||
f"Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
|
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
|
||||||
)
|
)
|
||||||
|
for entrytuple in logentries:
|
||||||
|
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||||
|
if expo == expedition:
|
||||||
|
#print(f" - {triptitle}")
|
||||||
|
store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
|
||||||
|
expedition.save() # to save logbook name property
|
||||||
|
|
||||||
def LoadLogbooks():
|
def LoadLogbooks():
|
||||||
"""This is the master function for parsing all logbooks into the Troggle database.
|
"""This is the master function for parsing all logbooks into the Troggle database.
|
||||||
This should be rewritten to use coroutines to load all logbooks from disc in parallel,
|
This should be rewritten to use coroutines to load all logbooks from disc in parallel,
|
||||||
but must be serialised to write to database as sqlite is single-user.
|
but must be serialised to write to database as sqlite is single-user.
|
||||||
|
|
||||||
|
This is inside an atomic transaction. Maybe it shouldn't be..
|
||||||
"""
|
"""
|
||||||
global entries
|
global ENTRIES
|
||||||
|
global logentries
|
||||||
|
allentries = []
|
||||||
|
mem1 = get_process_memory()
|
||||||
|
print(f" - MEM:{mem1:7.2f} MB now ", file=sys.stderr)
|
||||||
|
start = time.time()
|
||||||
|
|
||||||
clean_all_logbooks()
|
clean_all_logbooks()
|
||||||
expos = Expedition.objects.all()
|
expos = Expedition.objects.all()
|
||||||
if len(expos) <= 1:
|
if len(expos) <= 1:
|
||||||
message = " ! - No expeditions found. Load 'people' first"
|
message = " ! - No expeditions found. Attempting to 'people' first"
|
||||||
DataIssue.objects.create(parser="logbooks", message=message)
|
DataIssue.objects.create(parser="logbooks", message=message)
|
||||||
print(message)
|
print(message)
|
||||||
return
|
load_people_expos()
|
||||||
|
expos = Expedition.objects.all()
|
||||||
|
if len(expos) <= 1:
|
||||||
|
message = " ! - No expeditions found, even after attempting to load 'people'. Abort."
|
||||||
|
DataIssue.objects.create(parser="logbooks", message=message)
|
||||||
|
print(message)
|
||||||
|
return
|
||||||
|
|
||||||
noexpo = [
|
noexpo = [
|
||||||
"1986",
|
"1986",
|
||||||
@ -625,7 +650,7 @@ def LoadLogbooks():
|
|||||||
loglist = []
|
loglist = []
|
||||||
bloglist = []
|
bloglist = []
|
||||||
|
|
||||||
for expo in expos: # pointless as we explicitly know the years in this code.
|
for expo in expos:
|
||||||
year = expo.year
|
year = expo.year
|
||||||
if year in sqlfail:
|
if year in sqlfail:
|
||||||
print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
|
print(" - Logbook for: " + year + " NO parsing attempted - known sql failures")
|
||||||
@ -634,7 +659,7 @@ def LoadLogbooks():
|
|||||||
print(message)
|
print(message)
|
||||||
|
|
||||||
if year not in nologbook:
|
if year not in nologbook:
|
||||||
if year in entries:
|
if year in ENTRIES:
|
||||||
loglist.append(expo)
|
loglist.append(expo)
|
||||||
else:
|
else:
|
||||||
print(" - No Logbook yet for: " + year) # catch case when preparing for next expo
|
print(" - No Logbook yet for: " + year) # catch case when preparing for next expo
|
||||||
@ -644,28 +669,38 @@ def LoadLogbooks():
|
|||||||
|
|
||||||
for ex in loglist:
|
for ex in loglist:
|
||||||
nlbe[ex] = parse_logbook_for_expedition(ex) # this loads the logbook for one expo
|
nlbe[ex] = parse_logbook_for_expedition(ex) # this loads the logbook for one expo
|
||||||
|
allentries += logentries
|
||||||
|
|
||||||
for b in bloglist:
|
for b in bloglist:
|
||||||
if str(b) in LOGBOOK_PARSER_SETTINGS:
|
|
||||||
orig = LOGBOOK_PARSER_SETTINGS[str(b)]
|
|
||||||
else:
|
|
||||||
orig = (DEFAULT_LOGBOOK_FILE, DEFAULT_LOGBOOK_PARSER)
|
|
||||||
LOGBOOK_PARSER_SETTINGS[str(b)] = BLOG_PARSER_SETTINGS[str(b)]
|
|
||||||
print(f" - BLOG: {b}")
|
print(f" - BLOG: {b}")
|
||||||
nlbe[b] = parse_logbook_for_expedition(b) # no clean. loads the blog logbook for one expo
|
nlbe[b] += parse_logbook_for_expedition(b, blog=True) # loads the blog logbook for one expo
|
||||||
LOGBOOK_PARSER_SETTINGS[str(b)] = orig
|
allentries += logentries
|
||||||
|
|
||||||
# tried to use map with concurrent threads - but sqlite database is not concurrent, so failed with database lock
|
|
||||||
# yt = 0
|
|
||||||
# for r in map(parse_logbook_for_expedition, loglist):
|
|
||||||
# yt = r
|
|
||||||
|
|
||||||
yt = 0
|
yt = 0
|
||||||
for e in nlbe:
|
for exp in nlbe:
|
||||||
yt += nlbe[e]
|
yt += nlbe[exp]
|
||||||
print(f"total {yt:,} log entries parsed in all expeditions")
|
print(f"total {yt:,} log entries parsed in all expeditions")
|
||||||
|
|
||||||
|
print(f"total {len(allentries):,} log entries in complete dict")
|
||||||
|
mem = get_process_memory()
|
||||||
|
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
|
||||||
|
duration = time.time() - start
|
||||||
|
print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
|
||||||
|
|
||||||
|
# Now we serially store the parsed data in the database, updating 3 types of object:
|
||||||
|
# - Expedition (the 'logbook.html' value)
|
||||||
|
# - LogbookEntry (text, who, when, etc.)
|
||||||
|
# - PersonTrip (who was on that specific trip mentioned in the logbook entry)
|
||||||
|
for entrytuple in allentries:
|
||||||
|
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
||||||
|
store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
|
||||||
|
|
||||||
|
for expo in expos:
|
||||||
|
expedition.save() # to save logbook name property
|
||||||
|
mem = get_process_memory()
|
||||||
|
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
|
||||||
|
duration = time.time() - start
|
||||||
|
print(f" - TIME: {duration:7.2f} s", file=sys.stderr)
|
||||||
# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
|
# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
|
||||||
# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
|
# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
|
||||||
# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
|
# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
|
||||||
|
Loading…
Reference in New Issue
Block a user