refactoring author checks

This commit is contained in:
Philip Sargent 2023-01-28 10:47:25 +00:00
parent e01bd39609
commit e4c804b305

View File

@ -21,13 +21,20 @@ Parses and imports logbooks in all their wonderful confusion
https://expo.survex.com/handbook/computing/logbooks-parsing.html
"""
todo = """
- refactor everything with some urgency, esp. parse_logbook_for_expedition()
- Most of the time is during the database writing (13s out of 14s).
- Move a lot of non-db code from store_entry_into_database()
into parse_logbook_for_expedition()
- call GetTripPersons at parsing time, not db writing time
- this is a slow and uncertain function too: cave = getCaveByReference(caveRef)
- if I am certain that we are creating from scratch, don't use save_carefully() to
create the Django objects. And I am, because I delete the outdated stuff.
- pre-compile all the heavily used regular expressions !
- break out the code that hits the database from that which parses the logbook
so that the file-reading and parsing can be parallelized, while writing to the
database remains serialized (sqlite is single-user).
- refactor to get rid of the global 'logentries', very ugly indeed.
- profile the code to find bad repetitive things, of which there are many.
@ -35,13 +42,14 @@ database remains serialized (sqlite is single-user).
- far too many uses of Django field dereferencing to get values, which is SLOW
- replace explicit 1970 date with a constant EPOCH
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
- We should ensure logbook.html is utf-8 and stop this crap:
file_in = open(logbookfile,'rb')
txt = file_in.read().decode("latin1")
- this is a slow and uncertain function: cave = getCaveByReference(caveRef)
- use Fixtures https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-loaddata to cache
data for old logbooks? Not worth it..
@ -162,17 +170,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
# print(f" - {tid} [{author.person}] '{res[0][0].person}'...")
return res, author
def store_entry_into_database(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
"""saves a single logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. Why? Because we are deprecating expeditionday!
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite, but we are saving the same thing too many times.
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
lookupAttribs={'date':date, 'title':title}
"""
def tidy_time_underground(logtime_underground):
# Nasty hack, must tidy this up..
if logtime_underground:
try:
@ -187,7 +185,9 @@ def store_entry_into_database(date, place, title, text, trippeople, expedition,
logtime_underground = 0
else:
logtime_underground = 0
return logtime_underground
def tidy_trip_persons(trippeople, expedition, logtime_underground, tid):
try:
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
# print(f" - {author} - {logtime_underground}")
@ -195,14 +195,25 @@ def store_entry_into_database(date, place, title, text, trippeople, expedition,
message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
DataIssue.objects.create(parser="logbooks", message=message)
print(message)
raise
# raise
return
if not author:
message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
DataIssue.objects.create(parser="logbooks", message=message)
print(message)
# return
return trippersons, author
def store_entry_into_database(date, place, title, text, trippersons, author, expedition, logtime_underground, tid=None):
"""saves a single logbook entry and related persontrips
Does NOT save the expeditionday_id - all NULLs. Why? Because we are deprecating expeditionday!
troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite, but we are saving the same thing too many times.
Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
lookupAttribs={'date':date, 'title':title}
"""
# This needs attention. The slug field is derived from 'title'
# both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
@ -257,9 +268,6 @@ def store_entry_into_database(date, place, title, text, trippeople, expedition,
# this creates the PersonTrip instance.
save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
def parser_date(tripdate, year):
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
dummydate = date(1970, 1, 1) # replace with _EPOCH
@ -395,7 +403,10 @@ def parser_html(year, expedition, txt, seq=""):
else:
dupl[check] = 1
entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tripid1)
tu = tidy_time_underground(tu)
trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
entrytuple = (ldate, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
logentries.append(entrytuple)
@ -439,7 +450,7 @@ def parser_blog(year, expedition, txt, sq=""):
print(f"{len(tripheads)} - {len(tripparas)}")
location = "Plateau" # best guess, fix manually later
tu = 0
tu = 0 # no logged time underground in a blog entry
logbook_entry_count = 0
for i in range(0, len(tripparas)):
tripstuff = tripparas[i]
@ -493,7 +504,8 @@ def parser_blog(year, expedition, txt, sq=""):
tripcontent = re.sub(r"<hr\s*>", "", tripcontent)
tripcontent = f"\n\n<!-- Content parsed from UK Caving Blog -->\nBlog Author: {trippeople}" + tripcontent
entrytuple = (tripdate, location, triptitle, tripcontent, trippeople, expedition, tu, tid)
trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
entrytuple = (tripdate, location, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
logentries.append(entrytuple)
def clean_all_logbooks():
@ -564,7 +576,6 @@ def parse_logbook_for_expedition(expedition, blog=False):
print(f" ! Very Bad Error opening {lb}")
if logbook_parseable:
# --------------------
parser = globals()[parsefunc]
print(f" - {year} parsing with {parsefunc} - {lb}")
@ -572,8 +583,8 @@ def parse_logbook_for_expedition(expedition, blog=False):
# --------------------
# move database storage into separate step
# for entrytuple in logentries:
# date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
# store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
# date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
# store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
if len(logentries) == expect:
# print(f"OK {year} {len(logentries):5d} is {expect}\n")
@ -603,10 +614,10 @@ def LoadLogbook(year):
f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
)
for entrytuple in logentries:
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
if expo == expedition:
#print(f" - {triptitle}")
store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
expedition.save() # to save logbook name property
def LoadLogbooks():
@ -692,8 +703,8 @@ def LoadLogbooks():
# - LogBookEntry (text, who when etc.)
# - PersonTrip (who was on that specific trip mentioned in the logbook entry)
for entrytuple in allentries:
date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
for expo in expos:
expedition.save() # to save logbook name property