diff --git a/core/utils.py b/core/utils.py
index 8bb7c2a..0b18e3b 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -203,6 +203,8 @@ def save_carefully(objectType, lookupAttribs={}, nonLookupAttribs={}):
     defined in core.models.TroggleModel.
     We are not using new_since_parsing - it is a fossil from Aaron Curtis's design in 2006.
     So it is always false.
+
+    NOTE: this takes twice as long as simply creating a new object with the given values.
     """
 
     try:
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index f5250a2..bb592f3 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -13,7 +13,7 @@ from django.template.defaultfilters import slugify
 from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
 from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
 from troggle.core.models.troggle import DataIssue, Expedition
-from troggle.core.utils import save_carefully, get_process_memory
+from troggle.core.utils import get_process_memory
 
 """
 Parses and imports logbooks in all their wonderful confusion
@@ -21,27 +21,16 @@ Parses and imports logbooks in all their wonderful confusion
 https://expo.survex.com/handbook/computing/logbooks-parsing.html
 """
 todo = """
-- Most of the time is during the database writing (13s out of 14s).
+- Most of the time is during the database writing (6s out of 8s).
 
-- Move a lot of non-db code from store_entry_into_database()
-into parse_logbook_for_expedition()
-
-- call GetTripPersons at parsing time, not db writing time - this is a slow and uncertain function too: cave = getCaveByReference(caveRef)
-- if I am certain that we are creating from scratch, don't use save_carefully() to
-create the Django objects. And I am, because I delete the outdated stuff.
-
 - pre-compile all the heavily used regular expressions !
 
-- refactor to get rid of the global 'logentries', very ugly indeed.
-
 - profile the code to find bad repetitive things, of which there are many.
 
 - attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted
 
-- far too many uses of Django field dereferencing to get values, which is SLOW
-
 - replace explicit 1970 date with a constant EPOCH
 
 - rewrite to use generators rather than storing everything intermediate in lists
 - to reduce memory impact.
 
@@ -228,53 +217,41 @@ def tidy_trip_image_urls(text, date):
     text = text.replace("\t", "")
     text = text.replace("\n\n\n", "\n\n")
     return text
-
-def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid=None):
-    """saves a single logbook entry and related persontrips
-    Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
-    troggle.log shows that we are creating lots of duplicates, which is no no problem with SQL as they just overwrite but we are saving the same thing too many times..
-
-    Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
-    lookupAttribs={'date':date, 'title':title}
-    """
-    text = tidy_trip_image_urls(text, date)
-
-    # Check for an existing copy of the current entry, and save
-    expedition.get_expedition_day(date)
-
-    lookupAttribs = {"date": date, "title": title}
-    # but it is a db query which we should try to avoid - rewrite this
-
-    # This needs attention. The slug field is derived from 'title'
-    # NEW slug for a logbook entry here! Unique id + slugified title fragment
+def tidy_tid(tid, title):
     if tid is not None:
-        slug = tid
-        # slug = tid + "_" + slugify(title)[:10].replace('-','_')
-    else:
-        slug = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
+        return tid
+
+    # print(f"! {title=} ")
+    tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
+    return tid
+
+def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid):
+    """saves a single logbook entry and related persontrips
+    Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
+    """
+
+    # gets the current ExpeditionDay, and saves it as an object attached to
+    # the expedition, but does not attach it to the logbook entry. Why ?
+
+    # expedition.get_expedition_day(date)
+
     nonLookupAttribs = {
         "place": place,
         "text": text,
         "expedition": expedition,
         "time_underground": logtime_underground,
         "cave_slug": str(tripcave),
-        "slug": slug,
+        "slug": tid,
     }
 
-    # Rewriting as we know prior objects have already been deleted.
-    # This creates the lbo instance of LogbookEntry
+    lookupAttribs = {"date": date, "title": title}
     lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
-    # lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
-
-    # for PersonTrip time_underground is float (decimal hours)
+
     for tripperson, time_underground in trippersons:
-        # print(f" - {tid} '{tripperson}' author:{tripperson == author}")
         lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo}
         nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
-        # this creates the PersonTrip instance.
         pt = PersonTrip.objects.create(**nonLookupAttribs, **lookupAttribs)
-        # save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
 
 
 def parser_date(tripdate, year):
     """Interprets dates in the expo logbooks and returns a correct datetime.date object"""
@@ -321,7 +298,7 @@ def parser_html(year, expedition, txt, seq=""):
     the endmatter up to the frontmatter. This made sense when translating from parser_html_01 format logfiles, believe me.
     """
-    global logentries
+    logentries = []
 
     dupl = {}
 
     # extract front material and stash for later use when rebuilding from list of entries
@@ -397,9 +374,8 @@ def parser_html(year, expedition, txt, seq=""):
             place = triptitles[0]
         else:
             place = "UNKNOWN"
-        ltriptext = re.sub(r"</p>", "", triptext)
-        # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-        ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
+        tripcontent = re.sub(r"</p>", "", triptext)
+        tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()
         triptitle = triptitle.strip()
 
         # triptitle must be unique for a given date. We fix this here.
@@ -414,9 +390,12 @@ def parser_html(year, expedition, txt, seq=""):
         tu = tidy_time_underground(tu)
         trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
         tripcave = tidy_trip_cave(place)
-
-        entrytuple = (ldate, place, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
+        tripcontent = tidy_trip_image_urls(tripcontent, date)
+        tid = tidy_tid(tid, triptitle)
+
+        entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
         logentries.append(entrytuple)
+    return logentries
 
 
 def parser_blog(year, expedition, txt, sq=""):
@@ -437,7 +416,7 @@ def parser_blog(year, expedition, txt, sq=""):
     So the content is nested inside the header. Attachments (images) come after the content.
     It's a bugger, but it's out of our control.
    """
-    global logentries
+    logentries = []
 
     tripheads = re.findall(
         r"