From 9e71be8169e77ea71d080f3535ee0fce036cf838 Mon Sep 17 00:00:00 2001
From: Philip Sargent
Date: Sat, 28 Jan 2023 13:14:54 +0000
Subject: [PATCH] refactored, global removed
---
core/utils.py | 2 +
parsers/logbooks.py | 128 ++++++++++++++++++--------------------------
2 files changed, 53 insertions(+), 77 deletions(-)
diff --git a/core/utils.py b/core/utils.py
index 8bb7c2a..0b18e3b 100644
--- a/core/utils.py
+++ b/core/utils.py
@@ -203,6 +203,8 @@ def save_carefully(objectType, lookupAttribs={}, nonLookupAttribs={}):
defined in core.models.TroggleModel.
We are not using new_since_parsing - it is a fossil from Aaron Curtis's design in 2006. So it is always false.
+
+ NOTE: this takes twice as long as simply creating a new object with the given values.
"""
try:
diff --git a/parsers/logbooks.py b/parsers/logbooks.py
index f5250a2..bb592f3 100644
--- a/parsers/logbooks.py
+++ b/parsers/logbooks.py
@@ -13,7 +13,7 @@ from django.template.defaultfilters import slugify
from parsers.people import GetPersonExpeditionNameLookup, load_people_expos
from troggle.core.models.caves import GetCaveLookup, LogbookEntry, PersonTrip
from troggle.core.models.troggle import DataIssue, Expedition
-from troggle.core.utils import save_carefully, get_process_memory
+from troggle.core.utils import get_process_memory
"""
Parses and imports logbooks in all their wonderful confusion
@@ -21,27 +21,16 @@ Parses and imports logbooks in all their wonderful confusion
https://expo.survex.com/handbook/computing/logbooks-parsing.html
"""
todo = """
-- Most of the time is during the database writing (13s out of 14s).
+- Most of the time is during the database writing (6s out of 8s).
-- Move a lot of non-db code from store_entry_into_database()
-into parse_logbook_for_expedition()
-
-- call GetTripPersons at parsing time, not db writing time
- this is a slow and uncertain function too: cave = getCaveByReference(caveRef)
-- if I am certain that we are creating from scratch, don't use save_carefully() to
-create the Django objects. And I am, because I delete the outdated stuff.
-
- pre-compile all the heavily used regular expressions !
-- refactor to get rid of the global 'logentries', very ugly indeed.
-
- profile the code to find bad repetitive things, of which there are many.
- attach or link a DataIssue to an individual expo (logbook) so that it can be found and deleted
-- far too many uses of Django field dereferencing to get values, which is SLOW
-
- replace explicit 1970 date with a constant EPOCH
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.
@@ -228,53 +217,41 @@ def tidy_trip_image_urls(text, date):
text = text.replace("\t", "")
text = text.replace("\n\n\n", "\n\n")
return text
-
-def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid=None):
- """saves a single logbook entry and related persontrips
- Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
- troggle.log shows that we are creating lots of duplicates, which is no no problem with SQL as they just overwrite but we are saving the same thing too many times..
-
- Until 18 Dec.2022, this was overwriting logbook entries for the same date with the same title, because
- lookupAttribs={'date':date, 'title':title}
- """
- text = tidy_trip_image_urls(text, date)
-
- # Check for an existing copy of the current entry, and save
- expedition.get_expedition_day(date)
-
- lookupAttribs = {"date": date, "title": title}
- # but it is a db query which we should try to avoid - rewrite this
-
- # This needs attention. The slug field is derived from 'title'
- # NEW slug for a logbook entry here! Unique id + slugified title fragment
+def tidy_tid(tid, title):
if tid is not None:
- slug = tid
- # slug = tid + "_" + slugify(title)[:10].replace('-','_')
- else:
- slug = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
+ return tid
+
+ # print(f"! {title=} ")
+ tid = str(randint(1000, 9999)) + "_" + slugify(title)[:10].replace("-", "_")
+ return tid
+
+def store_entry_into_database(date, place, tripcave, title, text, trippersons, author, expedition, logtime_underground, tid):
+ """saves a single logbook entry and related persontrips
+ Does NOT save the expeditionday_id - all NULLs. why? Because we are deprecating expeditionday !
+ """
+
+ # gets the current ExpeditionDay, and saves it as an object attached to
+ # the expedition, but does not attach it to the logbook entry. Why ?
+
+ # expedition.get_expedition_day(date)
+
nonLookupAttribs = {
"place": place,
"text": text,
"expedition": expedition,
"time_underground": logtime_underground,
"cave_slug": str(tripcave),
- "slug": slug,
+ "slug": tid,
}
- # Rewriting as we know prior objects have already been deleted.
- # This creates the lbo instance of LogbookEntry
+ lookupAttribs = {"date": date, "title": title}
lbo = LogbookEntry.objects.create(**nonLookupAttribs, **lookupAttribs)
- # lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
-
- # for PersonTrip time_underground is float (decimal hours)
+
for tripperson, time_underground in trippersons:
- # print(f" - {tid} '{tripperson}' author:{tripperson == author}")
lookupAttribs = {"personexpedition": tripperson, "logbook_entry": lbo}
nonLookupAttribs = {"time_underground": time_underground, "is_logbook_entry_author": (tripperson == author)}
- # this creates the PersonTrip instance.
pt = PersonTrip.objects.create(**nonLookupAttribs, **lookupAttribs)
- # save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs)
def parser_date(tripdate, year):
"""Interprets dates in the expo logbooks and returns a correct datetime.date object"""
@@ -321,7 +298,7 @@ def parser_html(year, expedition, txt, seq=""):
the endmatter up to the frontmatter. This made sense when translating
from parser_html_01 format logfiles, believe me.
"""
- global logentries
+ logentries = []
dupl = {}
# extract front material and stash for later use when rebuilding from list of entries
@@ -397,9 +374,8 @@ def parser_html(year, expedition, txt, seq=""):
place = triptitles[0]
else:
place = "UNKNOWN"
-            ltriptext = re.sub(r"</p>", "", triptext)
-            # ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
-            ltriptext = re.sub(r"<p>", "<br /><br />", ltriptext).strip()
+            tripcontent = re.sub(r"</p>", "", triptext)
+            tripcontent = re.sub(r"<p>", "<br /><br />", tripcontent).strip()
triptitle = triptitle.strip()
# triptitle must be unique for a given date. We fix this here.
@@ -414,9 +390,12 @@ def parser_html(year, expedition, txt, seq=""):
tu = tidy_time_underground(tu)
trippersons, author = tidy_trip_persons(trippeople, expedition, tu, tid)
tripcave = tidy_trip_cave(place)
-
- entrytuple = (ldate, place, tripcave, triptitle, ltriptext, trippersons, author, expedition, tu, tripid1)
+ tripcontent = tidy_trip_image_urls(tripcontent, date)
+ tid = tidy_tid(tid, triptitle)
+
+ entrytuple = (ldate, place, tripcave, triptitle, tripcontent, trippersons, author, expedition, tu, tid)
logentries.append(entrytuple)
+ return logentries
def parser_blog(year, expedition, txt, sq=""):
@@ -437,7 +416,7 @@ def parser_blog(year, expedition, txt, sq=""):
So the content is nested inside the header. Attachments (images) come after the content.
It's a bugger, but it's out of our control.
"""
- global logentries
+ logentries = []
tripheads = re.findall(
r"