# core/utils.py -- slug helpers introduced by the "more robust tripid labelling" change.
import hashlib
import random
import string


def unique_slug(text: str, n: int) -> str:
    """Return the first `n` hex digits of the SHA-256 digest of `text`.

    This gives an almost-unique id based on the text alone. A short prefix
    (for example 2 hex digits) can still collide, so callers that need
    uniqueness must check for clashes themselves.

    Args:
        text: The string to hash; it is encoded as UTF-8 first.
        n: Number of leading hex digits to keep (the full digest has 64).

    Returns:
        The truncated lowercase hex digest.
    """
    # Hash each text with a fresh digest object. Feeding a shared module-level
    # hash object via update() accumulates every text ever passed in, so the
    # same input would produce a different slug on each call.
    return hashlib.sha256(text.encode("utf-8")).hexdigest()[:n]


def alphabet_suffix(n: int) -> str:
    """Return the letter suffix for the 0-based index `n`.

    Index 0 maps to "a", 1 to "b", ... 25 to "z". Any index outside that
    range (including negative values) yields the overflow marker "_X_"
    followed by two random lowercase letters.

    Args:
        n: 0-based position of the entry among those sharing the same date.

    Returns:
        A single lowercase letter, or an "_X_??" overflow suffix.
    """
    letters = string.ascii_lowercase
    # The explicit lower bound stops a negative index from silently wrapping
    # round to the end of the alphabet.
    if 0 <= n < len(letters):
        return letters[n]
    # Overflow suffixes are random, so they are neither repeatable between
    # runs nor guaranteed to be distinct from one another.
    return "_X_" + random.choice(letters) + random.choice(letters)

# core/views/uploads.py
def create_new_lbe_slug(date):
    """Return the slug for a new logbook entry on `date`.

    The slug is the date followed by a letter suffix. The suffix index is the
    number of LogbookEntry rows already stored for that date, so the first
    entry on a date gets "a", the second "b", and so on.

    Relies on `LogbookEntry` and `alphabet_suffix` being imported at module level.
    """
    # NOTE(review): the suffix is derived from a row count, so if an earlier
    # entry for this date has been deleted the new slug may equal an existing
    # one -- confirm whether callers guard against that.
    # count() asks the database for the number instead of loading every row.
    n = LogbookEntry.objects.filter(date=date).count()
    tid = f"{date}{alphabet_suffix(n)}"
    print(tid)
    return tid


# parsers/logbooks.py
def reset_trip_id(date):
    """Return the trip id (the logbook-entry slug) for the next entry on `date`.

    The id has the standard form <date><letter>, e.g. '2003-07-30b'. It is
    regenerated every time the logbook is imported, and the letter depends
    only on how many entries for the same date have already been seen in this
    run, i.e. on the order in which the entries are parsed.

    Relies on the module-level `tripsdate` dict (date -> entries seen so far)
    and on `alphabet_suffix` being imported at module level.
    """
    already = tripsdate.get(date, 0)  # zero if this date has not been seen yet
    tripsdate[date] = already + 1
    # alphabet_suffix() takes a 0-based index, so pass the number of entries
    # seen *before* this one: the first entry on a date is 'a', not 'b'.
    tid = f"{date}{alphabet_suffix(already)}"
    return tid