From e4c804b30585da3ff300659ed364b00c65e850b7 Mon Sep 17 00:00:00 2001
From: Philip Sargent
Date: Sat, 28 Jan 2023 10:47:25 +0000
Subject: [PATCH] refactoring author checks

---
 parsers/logbooks.py | 73 ++++++++++++++++++++++++++------------------
 1 file changed, 42 insertions(+), 31 deletions(-)

diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index a65774d..fd1e7eb 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -21,13 +21,20 @@ Parses and imports logbooks in all their wonderful confusion
 https://expo.survex.com/handbook/computing/logbooks-parsing.html
 """
 todo = """
-- refactor everything with some urgency, esp. parse_logbook_for_expedition()
+- Most of the time is during the database writing (13s out of 14s).
+
+- Move a lot of non-db code from store_entry_into_database()
+into parse_logbook_for_expedition()
+
+- call GetTripPersons at parsing time, not db writing time
+- this is a slow and uncertain function too: cave = getCaveByReference(caveRef)
+
+- if I am certain that we are creating from scratch, don't use save_carefully() to
+create the Django objects. And I am, because I delete the outdated stuff.

 - pre-compile all the heavily used regular expressions !

-- break out the code that hits the database from that which parses the logbook
-so that the file-reading and parsing can be parallelized, while writing to the
-database remains serialized (sqlite is single-user).
+- refactor to get rid of the global 'logentries', very ugly indeed.

 - profile the code to find bad repetitive things, of which there are many.

@@ -35,13 +42,14 @@ database remains serialized (sqlite is single-user).

 - far too many uses of Django field dereferencing to get values, which is SLOW

+- replace explicit 1970 date with a constant EPOCH
+
 - rewrite to use generators rather than storing everything intermediate in lists
 - to reduce memory impact.

 - We should ensure logbook.html is utf-8 and stop this crap:
   file_in = open(logbookfile,'rb')
   txt = file_in.read().decode("latin1")

-- this is a slow and uncertain function: cave = getCaveByReference(caveRef)

 - use Fixtures https://docs.djangoproject.com/en/4.1/ref/django-admin/#django-admin-loaddata
   to cache data for old logbooks? Not worth it..
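Two of the todo items above are mechanical enough to sketch here: pre-compiling the heavily used regular expressions and replacing the literal 1970 date with an EPOCH constant. This is only a rough illustration, not code from the patch; _EPOCH echoes the "# replace with _EPOCH" comment already present in parser_date(), while TRIPDATE_RE and parse_tripdate are hypothetical names.

    import re
    from datetime import date

    _EPOCH = date(1970, 1, 1)  # single sentinel instead of scattered date(1970, 1, 1) literals
    TRIPDATE_RE = re.compile(r"(\d\d\d\d)-(\d\d)-(\d\d)")  # compiled once at import time

    def parse_tripdate(tripdate):
        # reuse the compiled pattern for every logbook entry instead of re-compiling it
        m = TRIPDATE_RE.match(tripdate.strip())
        if not m:
            return _EPOCH
        return date(int(m.group(1)), int(m.group(2)), int(m.group(3)))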
@@ -162,17 +170,7 @@ def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
             # print(f" - {tid} [{author.person}] '{res[0][0].person}'...")
     return res, author

-
-def store_entry_into_database(date, place, title, text, trippeople, expedition, logtime_underground, tid=None):
-    """saves a single logbook entry and related persontrips
-    Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
-
-    troggle.log shows that we are creating lots of duplicates, which is no no problem with SQL as they just overwrite but we are saving the same thing too many times..
-
-    Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
-    lookupAttribs={'date':date, 'title':title}
-    """
-
+def tidy_time_underground(logtime_underground):
     # Nasty hack, must tidy this up..
     if logtime_underground:
         try:
@@ -187,7 +185,9 @@ def store_entry_into_database(date, place, title, text, trippeople, expedition,
             logtime_underground = 0
     else:
         logtime_underground = 0
+    return logtime_underground

+def tidy_trip_persons(trippeople, expedition, logtime_underground, tid):
     try:
         trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
         # print(f" - {author} - {logtime_underground}")
@@ -195,14 +195,25 @@ def store_entry_into_database(date, place, title, text, trippeople, expedition,
         message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
         DataIssue.objects.create(parser="logbooks", message=message)
         print(message)
-        raise
+        # raise
        return

     if not author:
         message = f" ! - {expedition.year} Warning: logentry: {title} - no expo member author for entry '{tid}'"
         DataIssue.objects.create(parser="logbooks", message=message)
         print(message)
-        # return
+
+    return trippersons, author
+
+def store_entry_into_database(date, place, title, text, trippersons, author, expedition, logtime_underground, tid=None):
+    """saves a single logbook entry and related persontrips
+    Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
+
+    troggle.log shows that we are creating lots of duplicates, which is no problem with SQL as they just overwrite but we are saving the same thing too many times..
+
+    Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
+    lookupAttribs={'date':date, 'title':title}
+    """

     # This needs attention. The slug field is derived from 'title'
     # both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
@@ -257,9 +268,6 @@ def store_entry_into_database(date, place, title, text, trippeople, expedition,
         # this creates the PersonTrip instance.
         save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)

-
-
-
 def parser_date(tripdate, year):
     """Interprets dates in the expo logbooks and returns a correct datetime.date object"""
     dummydate = date(1970, 1, 1) # replace with _EPOCH
@@ -395,7 +403,10 @@ def parser_html(year, expedition, txt, seq=""):
         else:
             dupl[check] = 1

-        entrytuple = (ldate, tripcave, triptitle, ltriptext, trippeople, expedition, tu, tripid1)
+        tu = tidy_time_underground(tu)
+
+        trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
+        entrytuple = (ldate, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)

         logentries.append(entrytuple)

@@ -439,7 +450,7 @@ def parser_blog(year, expedition, txt, sq=""):
         print(f"{len(tripheads)} - {len(tripparas)}")

     location = "Plateau" # best guess, fix manually later
-    tu = 0
+    tu = 0 # no logged time underground in a blog entry
     logbook_entry_count = 0
     for i in range(0, len(tripparas)):
         tripstuff = tripparas[i]
@@ -493,7 +504,8 @@ def parser_blog(year, expedition, txt, sq=""):
         tripcontent = re.sub(r"", "", tripcontent)
         tripcontent = f"\n\n\nBlog Author: {trippeople}" + tripcontent

-        entrytuple = (tripdate, location, triptitle, tripcontent, trippeople, expedition, tu, tid)
+        trippersons, author = tidy_trip_persons(trippeople, expedition, logtime_underground, tid)
+        entrytuple = (tripdate, location, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
         logentries.append(entrytuple)

 def clean_all_logbooks():
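The hunks above change what an entrytuple carries: tidy_time_underground() and tidy_trip_persons() now run in the parsers, so trippersons and author are resolved before anything touches the database. Below is a minimal sketch of the resulting two-phase flow, assuming the functions defined in this file are importable; parse_entries and write_entries are stand-in names for the work done inside parser_html()/parser_blog() and at the end of LoadLogbooks(), and raw_entries is a hypothetical pre-parsed input.

    def parse_entries(raw_entries, expedition):
        # phase 1: parsing and person lookup, no database writes
        logentries = []
        for (ldate, place, title, text, trippeople, tu, tid) in raw_entries:
            tu = tidy_time_underground(tu)  # normalise blank or malformed values to 0
            trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
            logentries.append((ldate, place, title, text, trippersons, author, expedition, tu, tid))
        return logentries

    def write_entries(allentries):
        # phase 2: serialized database writes (sqlite is single-user)
        for (ldate, place, title, text, trippersons, author, expedition, tu, tid) in allentries:
            store_entry_into_database(ldate, place, title, text, trippersons, author, expedition, tu, tid)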
@@ -564,7 +576,6 @@ def parse_logbook_for_expedition(expedition, blog=False):
                 print(f" ! Very Bad Error opening {lb}")

     if logbook_parseable:
-        # --------------------
         parser = globals()[parsefunc]
         print(f" - {year} parsing with {parsefunc} - {lb}")

@@ -572,8 +583,8 @@ def parse_logbook_for_expedition(expedition, blog=False):
         # --------------------
         # move database storage into separate step
         # for entrytuple in logentries:
-        #     date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
-        #     store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+        #     date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
+        #     store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)

     if len(logentries) == expect:
         # print(f"OK {year} {len(logentries):5d} is {expect}\n")
@@ -603,10 +614,10 @@ def LoadLogbook(year):
             f" - Not a year with extant blog entries to import: '{year}' not in BLOG_PARSER_SETTINGS {BLOG_PARSER_SETTINGS}"
         )
     for entrytuple in logentries:
-        date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
+        date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
         if expo == expedition:
             #print(f" - {triptitle}")
-            store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+            store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)
     expedition.save() # to save logbook name property

 def LoadLogbooks():
@@ -692,8 +703,8 @@ def LoadLogbooks():
     # - LogBookEntry (text, who when etc.)
     # - PersonTrip (who was on that specific trip mentioned in the logbook entry)
     for entrytuple in allentries:
-        date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1 = entrytuple
-        store_entry_into_database(date, tripcave, triptitle, text, trippeople, expedition, logtime_underground, tripid1)
+        date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
+        store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)

     for expo in expos:
         expedition.save() # to save logbook name property
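One way to tackle the "refactor to get rid of the global 'logentries'" todo item, sketched purely as a suggestion and not as part of this patch: have parse_logbook_for_expedition() return its list of entrytuples and let LoadLogbooks() accumulate them. expeditions_to_import below is a placeholder for whatever selection of expeditions the real LoadLogbooks() builds.

    def LoadLogbooks():
        allentries = []
        for expo in expeditions_to_import:  # placeholder, see note above
            # assumes parse_logbook_for_expedition() gained a 'return logentries' at the end
            allentries.extend(parse_logbook_for_expedition(expo))
        for entrytuple in allentries:
            date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1 = entrytuple
            store_entry_into_database(date, tripcave, triptitle, text, trippersons, author, expedition, logtime_underground, tripid1)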