forked from expo/troggle
refactoring author checks
This commit is contained in:
parent
e01bd39609
commit
e4c804b305
@ -21,13 +21,20 @@ Parses and imports logbooks in all their wonderful confusion
|
|||||||
https://expo.survex.com/handbook/computing/logbooks-parsing.html
|
https://expo.survex.com/handbook/computing/logbooks-parsing.html
|
||||||
"""
|
"""
|
||||||
todo = """
|
todo = """
|
||||||
- refactor everything with some urgency, esp. parse_logbook_for_expedition()
|
- Most of the time is during the database writing (13s out of 14s).
|
||||||
|
|
||||||
|
- Move a lot of non-db code from store_entry_into_database()
|
||||||
|
into parse_logbook_for_expedition()
|
||||||
|
|
||||||
|
- call GetTripPersons at parsing time, not db writing time
|
||||||
|
- this is a slow and uncertain function too: cave = getCaveByReference(caveRef)
|
||||||
|
|
||||||
|
- if I am certain that we are creating from scratch, don't use save_carefully() to
|
||||||
|
create the Django objects. And I am, because I delete the outdated stuff.
|
||||||
|
|
||||||
- pre-compile all the heavily used regular expressions !
|
- pre-compile all the heavily used regular expressions !
|
||||||
|
|
||||||
- break out the code that hits the database from that which parses the logbook
|
- refactor to get rid of the global 'logentries', very ugly indeed.
|
||||||
so that the file-reading and parsing can be parallelized, while writing to the
|
|
||||||
database remains serialized (sqlite is single-user).
|
|
||||||
|
|
||||||
- profile the code to find bad repetitive things, of which there are many.
|
- profile the code to find bad repetitive things, of which there are many.
|
||||||
|
|
||||||
@ -35,13 +42,14 @@ database remains serialized (sqlite is single-user).
|
|||||||
|
|
||||||
- far too many uses of Django field dereferencing to get values, which is SLOW
|
- far too many uses of Django field dereferencing to get values, which is SLOW
|
||||||
|
|
||||||
|
- replace explicit 1970 date with a constant EPOCH
|
||||||
|
|
||||||
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
|
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
|
||||||
|
|
||||||
- We should ensure logbook.html is utf-8 and stop this crap:
|
- We should ensure logbook.html is utf-8 and stop this crap:
|
||||||
file_in = open(logbookfile,'rb')
|
file_in = open(logbookfile,'rb')
|
||||||
txt = file_in.read().decode("latin1")
|
txt = file_in.read().decode("latin1")
|
||||||
|
|
||||||
- this is a slow and uncertain function: cave = getCaveByReference(caveRef)
|
|
||||||
|
|
||||||
- use Fixtures https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-loaddata to cache
|
- use Fixtures https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-loaddata to cache
|
||||||
data for old logbooks? Not worth it..
|
data for old logbooks? Not worth it..
|
||||||
@ -162,17 +170,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
|
|||||||
# print(f" - {tid} [{author.person}] '{res[0][0].person}'...")
|
# print(f" - {tid} [{author.person}] '{res[0][0].person}'...")
|
||||||
return res, author
|
return res, author
|
||||||
|
|
||||||
|
def tidy_time_underground(logtime_underground):
|
||||||
def store_entry_into_database(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
|
|
||||||
"""saves a single logbook entry and related persontrips
|
|
||||||
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
|
|
||||||
|
|
||||||
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite, but we are saving the same thing too many times.
|
|
||||||
|
|
||||||
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
|
|
||||||
lookupAttribs={'date':date, 'title':title}
|
|
||||||
"""
|
|
||||||
|
|
||||||
# Nasty hack, must tidy this up..
|
# Nasty hack, must tidy this up..
|
||||||
if logtime_underground:
|
if logtime_underground:
|
||||||
try:
|
try:
|
||||||
@ -187,7 +185,9 @@ def store_entry_into_database(date, place, title, text, trippeople, expedition,
|
|||||||
logtime_underground = 0
|
logtime_underground = 0
|
||||||
else:
|
else:
|
||||||
logtime_underground = 0
|
logtime_underground = 0
|
||||||
|
return logtime_underground
|
||||||
|
|
||||||
|
def tidy_trip_persons(trippeople, expedition, logtime_underground, tid):
|
||||||
try:
|
try:
|
||||||
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
|
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
|
||||||
# print(f" - {author} - {logtime_underground}")
|
# print(f" - {author} - {logtime_underground}")
|
||||||
@ -195,14 +195,25 @@ def store_entry_into_database(date, place, title, text, trippeople, expedition,
|
|||||||
message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
|
message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
|
||||||
DataIssue.objects.create(parser="logbooks", message=message)
|
DataIssue.objects.create(parser="logbooks", message=message)
|
||||||
print(message)
|
print(message)
|
||||||
raise
|
# raise
|
||||||
return
|
return
|
||||||
|
|
||||||
if not author:
|
if not author:
|
||||||
message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
|
message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
|
||||||
DataIssue.objects.create(parser="logbooks", message=message)
|
DataIssue.objects.create(parser="logbooks", message=message)
|
||||||
print(message)
|
print(message)
|
||||||
# return
|
|
||||||
|
return trippersons, author
|
||||||
|
|
||||||
|
def store_entry_into_database(date, place, title, text, trippersons, author, expedition, logtime_underground, tid=None):
|
||||||
|
"""saves a single logbook entry and related persontrips
|
||||||
|
Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
|
||||||
|
|
||||||
|
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite, but we are saving the same thing too many times.
|
||||||
|
|
||||||
|
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
|
||||||
|
lookupAttribs={'date':date, 'title':title}
|
||||||
|
"""
|
||||||
|
|
||||||
# This needs attention. The slug field is derived from 'title'
|
# This needs attention. The slug field is derived from 'title'
|
||||||
# both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
|
# both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
|
||||||
@ -257,9 +268,6 @@ def store_entry_into_database(date, place, title, text, trippeople, expedition,
|
|||||||
# this creates the PersonTrip instance.
|
# this creates the PersonTrip instance.
|
||||||
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
|
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
def parser_date(tripdate, year):
|
def parser_date(tripdate, year):
|
||||||
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
|
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
|
||||||
dummydate = date(1970, 1, 1) # replace with _EPOCH
|
dummydate = date(1970, 1, 1) # replace with _EPOCH
|
||||||
@ -395,7 +403,10 @@ def parser_html(year, expedition, txt, seq=""):
|
|||||||
else:
|
else:
|
||||||
dupl[check] = 1
|
dupl[check] = 1
|
||||||
|
|
||||||
entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tripid1)
|
tu = tidy_time_underground(tu)
|
||||||
|
|
||||||
|
trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
|
||||||
|
entrytuple = (ldate, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
|
||||||
logentries.append(entrytuple)
|
logentries.append(entrytuple)
|
||||||
|
|
||||||
|
|
||||||
@ -439,7 +450,7 @@ def parser_blog(year, expedition, txt, sq=""):
|
|||||||
print(f"{len(tripheads)} - {len(tripparas)}")
|
print(f"{len(tripheads)} - {len(tripparas)}")
|
||||||
|
|
||||||
location = "Plateau" # best guess, fix manually later
|
location = "Plateau" # best guess, fix manually later
|
||||||
tu = 0
|
tu = 0 # no logged time underground in a blog entry
|
||||||
logbook_entry_count = 0
|
logbook_entry_count = 0
|
||||||
for i in range(0, len(tripparas)):
|
for i in range(0, len(tripparas)):
|
||||||
tripstuff = tripparas[i]
|
tripstuff = tripparas[i]
|
||||||
@ -493,7 +504,8 @@ def parser_blog(year, expedition, txt, sq=""):
|
|||||||
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
|
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
|
||||||
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
|
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
|
||||||
|
|
||||||
entrytuple = (tripdate, location, triptitle, tripcontent, trippeople, expedition, tu, tid)
|
trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
|
||||||
|
entrytuple = (tripdate, location, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
|
||||||
logentries.append(entrytuple)
|
logentries.append(entrytuple)
|
||||||
|
|
||||||
def clean_all_logbooks():
|
def clean_all_logbooks():
|
||||||
@ -564,7 +576,6 @@ def parse_logbook_for_expedition(expedition, blog=False):
|
|||||||
print(f" ! Very Bad Error opening {lb}")
|
print(f" ! Very Bad Error opening {lb}")
|
||||||
|
|
||||||
if logbook_parseable:
|
if logbook_parseable:
|
||||||
|
|
||||||
# --------------------
|
# --------------------
|
||||||
parser = globals()[parsefunc]
|
parser = globals()[parsefunc]
|
||||||
print(f" - {year} parsing with {parsefunc} - {lb}")
|
print(f" - {year} parsing with {parsefunc} - {lb}")
|
||||||
@ -572,8 +583,8 @@ def parse_logbook_for_expedition(expedition, blog=False):
|
|||||||
# --------------------
|
# --------------------
|
||||||
# move database storage into separate step
|
# move database storage into separate step
|
||||||
# for entrytuple in logentries:
|
# for entrytuple in logentries:
|
||||||
# date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
# date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
|
||||||
# store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
|
# store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
|
||||||
|
|
||||||
if len(logentries) == expect:
|
if len(logentries) == expect:
|
||||||
# print(f"OK {year} {len(logentries):5d} is {expect}\n")
|
# print(f"OK {year} {len(logentries):5d} is {expect}\n")
|
||||||
@ -603,10 +614,10 @@ def LoadLogbook(year):
|
|||||||
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
|
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
|
||||||
)
|
)
|
||||||
for entrytuple in logentries:
|
for entrytuple in logentries:
|
||||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
|
||||||
if expo == expedition:
|
if expo == expedition:
|
||||||
#print(f" - {triptitle}")
|
#print(f" - {triptitle}")
|
||||||
store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
|
store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
|
||||||
expedition.save() # to save logbook name property
|
expedition.save() # to save logbook name property
|
||||||
|
|
||||||
def LoadLogbooks():
|
def LoadLogbooks():
|
||||||
@ -692,8 +703,8 @@ def LoadLogbooks():
|
|||||||
# - LogBookEntry (text, who when etc.)
|
# - LogBookEntry (text, who when etc.)
|
||||||
# - PersonTrip (who was on that specific trip mentioned in the logbook entry)
|
# - PersonTrip (who was on that specific trip mentioned in the logbook entry)
|
||||||
for entrytuple in allentries:
|
for entrytuple in allentries:
|
||||||
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
|
date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
|
||||||
store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
|
store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
|
||||||
|
|
||||||
for expo in expos:
|
for expo in expos:
|
||||||
expedition.save() # to save logbook name property
|
expedition.save() # to save logbook name property
|
||||||
|
Loading…
Reference in New Issue
Block a user