mirror of
https://expo.survex.com/repositories/troggle/.git
synced 2024-11-25 08:41:51 +00:00
refactored, global removed
This commit is contained in:
parent
db0504057b
commit
9e71be8169
@ -203,6 +203,8 @@ def save_carefully(objectType, lookupAttribs={}, nonLookupAttribs={}):
|
|||||||
defined in core.models.TroggleModel.
|
defined in core.models.TroggleModel.
|
||||||
|
|
||||||
We are not using new_since_parsing - it is a fossil from Aaron Curtis's design in 2006. So it is always false.
|
We are not using new_since_parsing - it is a fossil from Aaron Curtis's design in 2006. So it is always false.
|
||||||
|
|
||||||
|
NOTE: this takes twice as long as simply creating a new object with the given values.
|
||||||
|
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
@ -13,7 +13,7 @@ from django.template.defaultfilters import slugify
|
|||||||
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
|
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
|
||||||
from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
|
from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
|
||||||
from troggle.core.models.troggle import DataIssue, Expedition
|
from troggle.core.models.troggle import DataIssue, Expedition
|
||||||
from troggle.core.utils import save_carefully, get_process_memory
|
from troggle.core.utils import get_process_memory
|
||||||
|
|
||||||
"""
|
"""
|
||||||
Parses and imports logbooks in all their wonderful confusion
|
Parses and imports logbooks in all their wonderful confusion
|
||||||
@ -21,27 +21,16 @@ Parses and imports logbooks in all their wonderful confusion
|
|||||||
https://expo.survex.com/handbook/computing/logbooks-parsing.html
|
https://expo.survex.com/handbook/computing/logbooks-parsing.html
|
||||||
"""
|
"""
|
||||||
todo = """
|
todo = """
|
||||||
- Most of the time is during the database writing (13s out of 14s).
|
- Most of the time is during the database writing (6s out of 8s).
|
||||||
|
|
||||||
- Move a lot of non-db code from store_entry_into_database()
|
|
||||||
into parse_logbook_for_expedition()
|
|
||||||
|
|
||||||
- call GetTripPersons at parsing time, not db writing time
|
|
||||||
- this is a slow and uncertain function too: cave = getCaveByReference(caveRef)
|
- this is a slow and uncertain function too: cave = getCaveByReference(caveRef)
|
||||||
|
|
||||||
- if I am certain that we are creating from scratch, don't use save_carefully() to
|
|
||||||
create the Django objects. And I am, because I delete the outdated stuff.
|
|
||||||
|
|
||||||
- pre-compile all the heavily used regular expressions !
|
- pre-compile all the heavily used regular expressions !
|
||||||
|
|
||||||
- refactor to get rid of the global 'logentries', very ugly indeed.
|
|
||||||
|
|
||||||
- profile the code to find bad repetitive things, of which there are many.
|
- profile the code to find bad repetitive things, of which there are many.
|
||||||
|
|
||||||
- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted
|
- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted
|
||||||
|
|
||||||
- far too many uses of Django field dereferencing to get values, which is SLOW
|
|
||||||
|
|
||||||
- replace explicit 1970 date with a constant EPOCH
|
- replace explicit 1970 date with a constant EPOCH
|
||||||
|
|
||||||
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
|
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
|
||||||
@ -228,53 +217,41 @@ def tidy_trip_image_urls(text, date):
|
|||||||
text = text.replace("\t", "")
|
text = text.replace("\t", "")
|
||||||
text = text.replace("\n\n\n", "\n\n")
|
text = text.replace("\n\n\n", "\n\n")
|
||||||
return text
|
return text
|
||||||
|
|
||||||
def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid=None):
|
|
||||||
"""saves a single logbook entry and related persontrips
|
|
||||||
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
|
|
||||||
|
|
||||||
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite, but we are saving the same thing too many times.
|
def tidy_tid(tid, title):
|
||||||
|
|
||||||
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
|
|
||||||
lookupAttribs={'date':date, 'title':title}
|
|
||||||
"""
|
|
||||||
text = tidy_trip_image_urls(text, date)
|
|
||||||
|
|
||||||
# Check for an existing copy of the current entry, and save
|
|
||||||
expedition.get_expedition_day(date)
|
|
||||||
|
|
||||||
lookupAttribs = {"date": date, "title": title}
|
|
||||||
# but it is a db query which we should try to avoid - rewrite this
|
|
||||||
|
|
||||||
# This needs attention. The slug field is derived from 'title'
|
|
||||||
# NEW slug for a logbook entry here! Unique id + slugified title fragment
|
|
||||||
|
|
||||||
if tid is not None:
|
if tid is not None:
|
||||||
slug = tid
|
return tid
|
||||||
# slug = tid + "_" + slugify(title)[:10].replace('-','_')
|
|
||||||
else:
|
# print(f"! {title=} ")
|
||||||
slug = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
|
tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
|
||||||
|
return tid
|
||||||
|
|
||||||
|
def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid):
|
||||||
|
"""saves a single logbook entry and related persontrips
|
||||||
|
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
|
||||||
|
"""
|
||||||
|
|
||||||
|
# gets the current ExpeditionDay, and saves it as an object attached to
|
||||||
|
# the expedition, but does not attach it to the logbook entry. Why ?
|
||||||
|
|
||||||
|
# expedition.get_expedition_day(date)
|
||||||
|
|
||||||
nonLookupAttribs = {
|
nonLookupAttribs = {
|
||||||
"place": place,
|
"place": place,
|
||||||
"text": text,
|
"text": text,
|
||||||
"expedition": expedition,
|
"expedition": expedition,
|
||||||
"time_underground": logtime_underground,
|
"time_underground": logtime_underground,
|
||||||
"cave_slug": str(tripcave),
|
"cave_slug": str(tripcave),
|
||||||
"slug": slug,
|
"slug": tid,
|
||||||
}
|
}
|
||||||
# Rewriting as we know prior objects have already been deleted.
|
lookupAttribs = {"date": date, "title": title}
|
||||||
# This creates the lbo instance of LogbookEntry
|
|
||||||
lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
|
lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
|
||||||
# lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
|
|
||||||
|
|
||||||
# for PersonTrip time_underground is float (decimal hours)
|
|
||||||
for tripperson, time_underground in trippersons:
|
for tripperson, time_underground in trippersons:
|
||||||
# print(f" - {tid} '{tripperson}' author:{tripperson == author}")
|
|
||||||
lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo}
|
lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo}
|
||||||
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
|
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
|
||||||
# this creates the PersonTrip instance.
|
|
||||||
pt = PersonTrip.objects.create(**nonLookupAttribs, **lookupAttribs)
|
pt = PersonTrip.objects.create(**nonLookupAttribs, **lookupAttribs)
|
||||||
# save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
|
|
||||||
|
|
||||||
def parser_date(tripdate, year):
|
def parser_date(tripdate, year):
|
||||||
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
|
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
|
||||||
@ -321,7 +298,7 @@ def parser_html(year, expedition, txt, seq=""):
|
|||||||
the endmatter up to the frontmatter. This made sense when translating
|
the endmatter up to the frontmatter. This made sense when translating
|
||||||
from parser_html_01 format logfiles, believe me.
|
from parser_html_01 format logfiles, believe me.
|
||||||
"""
|
"""
|
||||||
global logentries
|
logentries = []
|
||||||
dupl = {}
|
dupl = {}
|
||||||
|
|
||||||
# extract front material and stash for later use when rebuilding from list of entries
|
# extract front material and stash for later use when rebuilding from list of entries
|
||||||
@ -397,9 +374,8 @@ def parser_html(year, expedition, txt, seq=""):
|
|||||||
place = triptitles[0]
|
place = triptitles[0]
|
||||||
else:
|
else:
|
||||||
place = "UNKNOWN"
|
place = "UNKNOWN"
|
||||||
ltriptext = re.sub(r"</p>", "", triptext)
|
tripcontent = re.sub(r"</p>", "", triptext)
|
||||||
# ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
|
tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()
|
||||||
ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
|
|
||||||
|
|
||||||
triptitle = triptitle.strip()
|
triptitle = triptitle.strip()
|
||||||
# triptitle must be unique for a given date. We fix this here.
|
# triptitle must be unique for a given date. We fix this here.
|
||||||
@ -414,9 +390,12 @@ def parser_html(year, expedition, txt, seq=""):
|
|||||||
tu = tidy_time_underground(tu)
|
tu = tidy_time_underground(tu)
|
||||||
trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
|
trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
|
||||||
tripcave = tidy_trip_cave(place)
|
tripcave = tidy_trip_cave(place)
|
||||||
|
tripcontent = tidy_trip_image_urls(tripcontent, date)
|
||||||
entrytuple = (ldate, place, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
|
tid = tidy_tid(tid, triptitle)
|
||||||
|
|
||||||
|
entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
|
||||||
logentries.append(entrytuple)
|
logentries.append(entrytuple)
|
||||||
|
return logentries
|
||||||
|
|
||||||
|
|
||||||
def parser_blog(year, expedition, txt, sq=""):
|
def parser_blog(year, expedition, txt, sq=""):
|
||||||
@ -437,7 +416,7 @@ def parser_blog(year, expedition, txt, sq=""):
|
|||||||
So the content is nested inside the header. Attachments (images) come after the content.
|
So the content is nested inside the header. Attachments (images) come after the content.
|
||||||
It's a bugger, but it's out of our control.
|
It's a bugger, but it's out of our control.
|
||||||
"""
|
"""
|
||||||
global logentries
|
logentries = []
|
||||||
|
|
||||||
tripheads = re.findall(
|
tripheads = re.findall(
|
||||||
r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
|
r"<article class=\"message message--post js-post js-inlineModContainer\s*\"\s*([\s\S]*?)(?=</article)", txt
|
||||||
@ -515,9 +494,12 @@ def parser_blog(year, expedition, txt, sq=""):
|
|||||||
|
|
||||||
trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
|
trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
|
||||||
tripcave = tidy_trip_cave(place)
|
tripcave = tidy_trip_cave(place)
|
||||||
|
tripcontent = tidy_trip_image_urls(tripcontent, date)
|
||||||
|
tid = tidy_tid(tid, triptitle)
|
||||||
|
|
||||||
entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
|
entrytuple = (tripdate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
|
||||||
logentries.append(entrytuple)
|
logentries.append(entrytuple)
|
||||||
|
return logentries
|
||||||
|
|
||||||
def clean_all_logbooks():
|
def clean_all_logbooks():
|
||||||
DataIssue.objects.filter(parser="logbooks").delete()
|
DataIssue.objects.filter(parser="logbooks").delete()
|
||||||
@ -538,7 +520,6 @@ def clean_logbook_for_expedition(expedition):
|
|||||||
def parse_logbook_for_expedition(expedition, blog=False):
|
def parse_logbook_for_expedition(expedition, blog=False):
|
||||||
"""Parses all logbook entries for one expedition
|
"""Parses all logbook entries for one expedition
|
||||||
"""
|
"""
|
||||||
global logentries
|
|
||||||
global ENTRIES
|
global ENTRIES
|
||||||
logentries = []
|
logentries = []
|
||||||
|
|
||||||
@ -590,7 +571,7 @@ def parse_logbook_for_expedition(expedition, blog=False):
|
|||||||
# --------------------
|
# --------------------
|
||||||
parser = globals()[parsefunc]
|
parser = globals()[parsefunc]
|
||||||
print(f" - {year} parsing with {parsefunc} - {lb}")
|
print(f" - {year} parsing with {parsefunc} - {lb}")
|
||||||
parser(year, expedition, txt, sq) # this launches the right parser for this year
|
logentries = parser(year, expedition, txt, sq) # this launches the right parser
|
||||||
# --------------------
|
# --------------------
|
||||||
|
|
||||||
if len(logentries) == expect:
|
if len(logentries) == expect:
|
||||||
@ -599,35 +580,33 @@ def parse_logbook_for_expedition(expedition, blog=False):
|
|||||||
else:
|
else:
|
||||||
print(f"Mismatch in number of log entries: {year} {len(logentries):5d} is not {expect}\n")
|
print(f"Mismatch in number of log entries: {year} {len(logentries):5d} is not {expect}\n")
|
||||||
|
|
||||||
return len(logentries)
|
return logentries
|
||||||
|
|
||||||
|
|
||||||
def LoadLogbook(year):
|
def LoadLogbook(year):
|
||||||
"""One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
|
"""One off logbook for testing purposes, and also reloadable on '/expedition/2022?reload'
|
||||||
This is inside an atomic transaction"""
|
This is inside an atomic transaction"""
|
||||||
global logentries
|
|
||||||
nlbe = {}
|
|
||||||
|
|
||||||
expo = Expedition.objects.get(year=year)
|
expo = Expedition.objects.get(year=year)
|
||||||
year = expo.year # some type funny
|
year = expo.year # some type funny
|
||||||
clean_logbook_for_expedition(expo)
|
clean_logbook_for_expedition(expo)
|
||||||
logentries = []
|
logentries = []
|
||||||
|
|
||||||
nlbe[expo] = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
|
logentries = parse_logbook_for_expedition(expo) # this actually loads the logbook for one expo
|
||||||
if year in BLOG_PARSER_SETTINGS:
|
if year in BLOG_PARSER_SETTINGS:
|
||||||
nlbe[expo] = parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
|
logentries += parse_logbook_for_expedition(expo, blog=True) # this loads the blog logbook
|
||||||
else:
|
else:
|
||||||
print(
|
print(
|
||||||
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
|
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
|
||||||
)
|
)
|
||||||
for entrytuple in logentries:
|
for entrytuple in logentries:
|
||||||
date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
|
date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
|
||||||
if expo == expedition: # unneeded check, we zeroed it bbefore filling it
|
if expo == expedition: # unneeded check, we zeroed it before filling it
|
||||||
#print(f" - {triptitle}")
|
#print(f" - {triptitle}")
|
||||||
store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
|
store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
|
||||||
else:
|
else:
|
||||||
print(f" ! unexpected log entry labelled as '{expedition}' {tripid1}" )
|
print(f" ! unexpected log entry labelled as '{expedition}' {tid}" )
|
||||||
expedition.save() # to save logbook name property
|
expo.save() # to save logbook name property
|
||||||
|
|
||||||
def LoadLogbooks():
|
def LoadLogbooks():
|
||||||
"""This is the master function for parsing all logbooks into the Troggle database.
|
"""This is the master function for parsing all logbooks into the Troggle database.
|
||||||
@ -688,20 +667,15 @@ def LoadLogbooks():
|
|||||||
bloglist.append(expo)
|
bloglist.append(expo)
|
||||||
|
|
||||||
for ex in loglist:
|
for ex in loglist:
|
||||||
nlbe[ex] = parse_logbook_for_expedition(ex) # this loads the logbook for one expo
|
logentries = parse_logbook_for_expedition(ex) # this loads the logbook for one expo
|
||||||
allentries += logentries
|
allentries += logentries
|
||||||
|
|
||||||
for b in bloglist:
|
for b in bloglist:
|
||||||
print(f" - BLOG: {b}")
|
print(f" - BLOG: {b}")
|
||||||
nlbe[b] += parse_logbook_for_expedition(b, blog=True) # loads the blog logbook for one expo
|
logentries = parse_logbook_for_expedition(b, blog=True) # loads the blog logbook for one expo
|
||||||
allentries += logentries
|
allentries += logentries
|
||||||
|
|
||||||
yt = 0
|
print(f"total {len(allentries):,} log entries parsed in all expeditions")
|
||||||
for exp in nlbe:
|
|
||||||
yt += nlbe[exp]
|
|
||||||
print(f"total {yt:,} log entries parsed in all expeditions")
|
|
||||||
|
|
||||||
print(f"total {len(allentries):,} log entries in complete dict")
|
|
||||||
mem = get_process_memory()
|
mem = get_process_memory()
|
||||||
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
|
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
|
||||||
duration = time.time() - start
|
duration = time.time() - start
|
||||||
@ -712,11 +686,11 @@ def LoadLogbooks():
|
|||||||
# - LogBookEntry (text, who when etc.)
|
# - LogBookEntry (text, who when etc.)
|
||||||
# - PersonTrip (who was on that specific trip mentioned in the logbook entry)
|
# - PersonTrip (who was on that specific trip mentioned in the logbook entry)
|
||||||
for entrytuple in allentries:
|
for entrytuple in allentries:
|
||||||
date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
|
date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid = entrytuple
|
||||||
store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
|
store_entry_into_database(date, place, tripcave, triptitle, text, trippersons, author, expedition, tu, tid)
|
||||||
|
|
||||||
for expo in expos:
|
for expo in expos:
|
||||||
expedition.save() # to save logbook name property
|
expo.save() # to save logbook name property
|
||||||
mem = get_process_memory()
|
mem = get_process_memory()
|
||||||
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
|
print(f" - MEM: {mem:7.2f} MB in use, {mem-mem1:7.2f} MB more", file=sys.stderr)
|
||||||
duration = time.time() - start
|
duration = time.time() - start
|
||||||
|
Loading…
Reference in New Issue
Block a user