mirror of https://expo.survex.com/repositories/troggle/.git synced 2024-11-21 23:01:52 +00:00

more robust tripid labelling

Philip Sargent 2023-09-01 20:31:19 +03:00
parent 1cf02afec9
commit 98412c140d
3 changed files with 38 additions and 31 deletions

View File

@@ -1,14 +1,15 @@
import hashlib
import logging
import os
import random
import resource
import string
import subprocess
import os
from decimal import getcontext
from pathlib import Path
getcontext().prec = 2 # use 2 significant figures for decimal calculations
import settings
"""This file declares TROG a globally visible object for caches.
@@ -30,6 +31,8 @@ thread.
"""
TROG = {"pagecache": {"expedition": {}}, "caves": {"gcavelookup": {}, "gcavecount": {}}}
alphabet = []
sha = hashlib.new('sha256')
# This is module-level executable code. This is a Bad Thing. Especially when it touches the file system.
try:
@@ -50,7 +53,27 @@ def chaosmonkey(n):
        return False
    # print("CHAOS strikes !", file=sys.stderr)
    return True
def unique_slug(text, n):
    """This gives an almost-unique id based on the text.
    2 hex digits would seem adequate, but we might get a collision.
    Not used anywhere.
    """
    sha.update(text.encode('utf-8'))
    return sha.hexdigest()[0:n]
def alphabet_suffix(n):
    """This is called repeatedly during initial parsing import, hence the cached list
    """
    global alphabet
    if not alphabet:
        alphabet = list(string.ascii_lowercase)
    if n < len(alphabet):
        suffix = alphabet[n]
    else:
        suffix = "_X_" + random.choice(string.ascii_lowercase) + random.choice(string.ascii_lowercase)
    return suffix
def only_commit(fname, message):
    """Only used to commit a survex file edited and saved in view/survex.py"""

View File

@@ -1,5 +1,4 @@
import subprocess
import hashlib
import string
from datetime import datetime
from pathlib import Path
@@ -9,10 +8,11 @@ from django.core.files.storage import FileSystemStorage
from django.shortcuts import render, redirect
import settings
from troggle.core.models.caves import GetCaveLookup
from troggle.core.models.logbooks import LogbookEntry, writelogbook, PersonLogEntry
from troggle.core.models.survex import DrawingFile
from troggle.core.models.troggle import DataIssue, Expedition, PersonExpedition
from troggle.core.utils import alphabet_suffix
from troggle.parsers.people import GetPersonExpeditionNameLookup, known_foreigner
# from databaseReset import reinit_db # don't do this. databaseReset runs code *at import time*
@@ -45,29 +45,15 @@ todo = """
- Make file rename utility less ugly.
"""
sha = hashlib.new('sha256')
def unique_slug(text, n):
    """This gives each logbook entry a unique id based on the date+content, so the order of entries on a particular day
    does not matter. This is a change (August 2023) from the previous process.
    2 hex digits would seem adequate for each expo day, but we might get a collision.
    The hash is based on the content after substitution of <p>, so it should be stable. Which means these ids
    can be used elsewhere in the troggle system as permanent slugs.
    When SAVING an edited entry (as opposed to a new one) we will have a different hash, so we will have to
    delete the original database object.
    """
    sha.update(text.encode('utf-8'))
    return sha.hexdigest()[0:n]
def create_new_lbe_slug(date):
    onthisdate = LogbookEntry.objects.filter(date=date)
    n = len(onthisdate)
    # print(f" Already entries on this date: {n}\n {onthisdate}")
    alphabet = list(string.ascii_lowercase)
    tid = f"{date}{alphabet[n]}"
    suffix = alphabet_suffix(n)
    tid = f"{date}{suffix}"
    print(tid)
    return tid
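
A rough sketch of the slug shape create_new_lbe_slug now produces, with the Django queryset count replaced by a plain integer so it runs on its own (the function name and second argument here are illustrative, not troggle API):

from datetime import date

def sketch_new_lbe_slug(d, entries_already_on_date):
    # alphabet_suffix(0) == 'a', so the first entry on a day gets an
    # 'a' suffix, the second a 'b', and so on.
    alphabet = "abcdefghijklmnopqrstuvwxyz"
    return f"{d}{alphabet[entries_already_on_date]}"

print(sketch_new_lbe_slug(date(2023, 7, 30), 0))  # 2023-07-30a
print(sketch_new_lbe_slug(date(2023, 7, 30), 1))  # 2023-07-30b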

View File

@@ -15,8 +15,7 @@ from parsers.people import GetPersonExpeditionNameLookup, load_people_expos, kno
from troggle.core.models.caves import GetCaveLookup
from troggle.core.models.logbooks import LogbookEntry, PersonLogEntry
from troggle.core.models.troggle import DataIssue, Expedition
from troggle.core.utils import get_process_memory
from troggle.core.views.uploads import unique_slug
from troggle.core.utils import get_process_memory, alphabet_suffix, unique_slug
"""
Parses and imports logbooks in all their wonderful confusion
@@ -109,7 +108,7 @@ ENTRIES = {
logentries = [] # the entire logbook for one year is a single object: a list of entries
noncaveplaces = ["travel", "Journey", "Loser Plateau", "UNKNOWN", "plateau", "base camp", "basecamp", "top camp", "topcamp"]
tripsdate = {}
alphabet = []
def set_trip_seq_id(year, seq):
    '''We have not parsed the trip date yet, so this is a sequence number
@@ -121,15 +120,14 @@ def reset_trip_id(date):
    '''Now we have the date, we can set the tripid (the lbe slug) to be in our standard form
    of <date><letter>, i.e. '2003-07-30b'
    BUT this gets re-set every time the logbook is imported,
    so they are not persistent as we would much prefer.
    However these are persistent as the entries are ordered on this field.
    '''
    global alphabet
    already = tripsdate.get(date, 0)  # returns zero if none found
    tripsdate[date] = already + 1
    if not alphabet:
        alphabet = list(string.ascii_lowercase)
    n = already + 1
    tripsdate[date] = n
    suffix = alphabet_suffix(n)
    tid = f"{date}{alphabet[already]}"
    tid = f"{date}{suffix}"
    # print(tid)
    return tid
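
A self-contained sketch of the reworked reset_trip_id flow, with alphabet_suffix inlined so the snippet runs on its own. As the diff lines read, the per-date counter is incremented before the suffix lookup, so the first entry on a date gets a 'b' suffix, matching the '2003-07-30b' example in the docstring:

import random
import string

tripsdate = {}  # per-date entry count, rebuilt on every logbook import

def alphabet_suffix(n):
    # mirrors the helper added to core/utils.py in this commit
    alphabet = list(string.ascii_lowercase)
    if n < len(alphabet):
        return alphabet[n]
    return "_X_" + random.choice(string.ascii_lowercase) + random.choice(string.ascii_lowercase)

def reset_trip_id(date):
    already = tripsdate.get(date, 0)  # zero if no entry seen for this date yet
    n = already + 1
    tripsdate[date] = n
    return f"{date}{alphabet_suffix(n)}"

print(reset_trip_id("2003-07-30"))  # 2003-07-30b
print(reset_trip_id("2003-07-30"))  # 2003-07-30c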