Mirror of https://expo.survex.com/repositories/troggle/.git (synced 2024-11-21 23:01:52 +00:00)

Replaced maintenance headache with cleaner folder walking

parent c3672b476c
commit 6e3fdd35c1
@@ -70,27 +70,6 @@ def chaosmonkey(n):
    # print("CHAOS strikes !", file=sys.stderr)
    return True

#
def GetListDir(sdir):
    '''handles url or file, so we can refer to a set of scans (not drawings) on another server
    returns a list of f (file), ff (file full path), is_dir (bool)

    REPLACE all use of this with Path.rglob() !
    '''
    res = [ ]
    if type(sdir) is str and sdir[:7] == "http://":
        # s = urllib.request.urlopen(sdir)
        message = f"! Requesting loading from http:// NOT IMPLEMENTED. [{sdir}]"
        print(message)
        DataIssue.objects.create(parser='Drawings', message=message)
        sdir[:7] = ""

    for f in os.listdir(sdir):
        if f[0] != ".":
            ff = os.path.join(sdir, f)
            res.append((f, ff, os.path.isdir(ff)))
    return res

def only_commit(fname, message):
    '''Only used to commit a survex file edited and saved in view/survex.py
    '''

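The docstring of the removed GetListDir asks callers to move to Path.rglob(). A minimal sketch, for local paths only, of a pathlib equivalent that returns the same (f, ff, is_dir) tuples; the name list_dir_entries is invented here for illustration and is not part of the commit:

from pathlib import Path

def list_dir_entries(sdir):
    '''Sketch of what GetListDir returned: (name, full path, is_dir) tuples for one
    directory level, skipping dotfiles. The http:// branch was never implemented.'''
    res = []
    for p in Path(sdir).iterdir():
        if not p.name.startswith("."):
            res.append((p.name, str(p), p.is_dir()))
    return res

# Callers that want every file below a directory can skip the helper entirely:
# for p in Path(sdir).rglob("*"): ...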
@@ -208,8 +208,8 @@ def cavewallets(request, caveid):
            wallets.add(z)
        else:
            wurl = f"/scanupload/{z.walletname.replace('#',':')}"
            print(f' - Unrecognised cave name \'{zcaveid}\' in {z.walletname}')
            message = f" ! In {z.walletname} there is an unrecognised cave name '{zcaveid}'"
            print(f' - Unrecognised cave name \'{zcaveid}\' in {z.walletname} (out of {len(Gcavelookup):,} cave names')
            message = f" ! In {z.walletname} there is an unrecognised cave name '{zcaveid}' (out of {len(Gcavelookup):,} cave names"
            DataIssue.objects.update_or_create(parser='scans', message=message, url=wurl)

    manywallets = list(set(wallets))

@@ -208,7 +208,7 @@ def get_complaints(complaints, waldata, svxfiles, files, wallet, wurl):
    if not waldata["description written"]:
        complaints.append("The guidebook description needs writing into the survex file. Tick the 'Cave description written' checkbox when this is done.")
    # QMs
    if not waldata["qms written"] and int(w.year()) >= 2015:
    if not waldata["qms written"] and w.year() and int(w.year()) >= 2015:
        complaints.append("The QMs needs writing into the survex file. Tick the 'QMs written' checkbox when this is done.")

    # Website

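The only change in this hunk is the extra w.year() guard. A small sketch of why it matters, assuming (as the surrounding code suggests) that w is the wallet and w.year() can return an empty string or None when the wallet is undated; the helper name is invented for illustration:

def needs_qm_complaint(qms_written, year):
    # int("") and int(None) both raise, so test truthiness first;
    # short-circuit evaluation skips int() when year is "" or None.
    return not qms_written and year and int(year) >= 2015

assert needs_qm_complaint(False, "2019") is True
assert not needs_qm_complaint(False, "")   # no crash on an undated wallet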
@@ -12,7 +12,7 @@ from functools import reduce
import settings
from troggle.core.models.survex import SingleScan, Wallet, DrawingFile
from troggle.core.models.troggle import DataIssue
from troggle.core.utils import save_carefully, GetListDir
from troggle.core.utils import save_carefully

'''Searches through all the :drawings: repository looking
for tunnel and therion files

parsers/scans.py (182 changed lines)
@@ -14,7 +14,7 @@ from pathlib import Path
import settings
from troggle.core.models.survex import SingleScan, Wallet, DrawingFile
from troggle.core.models.troggle import DataIssue
from troggle.core.utils import save_carefully, GetListDir
from troggle.core.utils import save_carefully
from troggle.core.views.scans import datewallet

'''Searches through all the survey scans directories (wallets) in expofiles, looking for images to be referenced.

@@ -26,66 +26,63 @@ git = settings.GIT

# to do: Actually read all the JSON files and set the survex file field appropriately!


def CheckEmptyDate(wallet):
    '''If date is not set, get it from a linked survex file.
    Could also look at filedates for the scans in expofiles/surveyscans/ , but these can be re-set by copying.
    '''
    earliest = datetime.datetime.now().date()
# def GetListDir(sdir):
#     '''handles url or file, so we can refer to a set of scans (not drawings) on another server
#     returns a list of f (file), ff (file full path), is_dir (bool)

    # This is not working, can't see why. An scans parser now taking a very long time..
    #datewallet(wallet, earliest)
    return

def CheckEmptyPeople(wallet):
    '''If people list is empty, copy them from the survex files: all of them

    To be a Troggle model change; a many:many relationship between wallets and people,
    as well as being a list in the JSON file (which is the permanent repository). We want the many:many
    relationship so that we can filter wallets based on a person.

    For the moment, we will just get a list..
    '''
    return
#     REPLACE all use of this with Path.rglob() !
#     '''
#     res = [ ]
#     if type(sdir) is str and sdir[:7] == "http://":
#         # s = urllib.request.urlopen(sdir)
#         message = f"! Requesting loading from http:// NOT IMPLEMENTED. [{sdir}]"
#         print(message)
#         DataIssue.objects.create(parser='Drawings', message=message)
#         sdir[:7] = ""

def LoadListScansFile(wallet):
    gld = [ ]
    # flatten out any directories in these wallet folders - should not be any
    for (fyf, ffyf, fisdiryf) in GetListDir(wallet.fpath):
        if fisdiryf:
            gld.extend(GetListDir(ffyf))
        else:
            gld.append((fyf, ffyf, fisdiryf))
#     for f in os.listdir(sdir):
#         if f[0] != ".":
#             ff = os.path.join(sdir, f)
#             res.append((f, ff, os.path.isdir(ff)))
#     return res


# def LoadListScansFile(wallet):
#     # formerly a generic troggle utility, written by who ? Being gradually expunged and replaced by python standard library functions
#     gld = [ ]
#     # flatten out any directories in these wallet folders - should not be any
#     for (fyf, ffyf, fisdiryf) in GetListDir(wallet.fpath):
#         if fisdiryf:
#             gld.extend(GetListDir(ffyf))
#         else:
#             gld.append((fyf, ffyf, fisdiryf))

    c=0
    for (fyf, ffyf, fisdiryf) in gld:
        if re.search(r"\.(?:png|jpg|jpeg|pdf|svg|gif|xvi)(?i)$", fyf):
            singlescan = SingleScan(ffile=ffyf, name=fyf, wallet=wallet)
            singlescan.save()
            c+=1
            if c>=10:
                print(".", end='')
                c = 0
#     c=0
#     for (fyf, ffyf, fisdiryf) in gld:
#         if re.search(r"\.(?:png|jpg|jpeg|pdf|svg|gif|xvi)(?i)$", fyf):
#             singlescan = SingleScan(ffile=ffyf, name=fyf, wallet=wallet)
#             singlescan.save()
#             c+=1
#             if c>=10:
#                 print(".", end='')
#                 c = 0

def load_all_scans():
    '''This iterates through the scans directories (either here or on the remote server)
    and builds up the models we can access later.

    It does NOT read or validate anything in the JSON data attached to each wallet. Those checks
    are done at runtime, when a wallet is accessed, not at import time.

    Replace GetListDir with a more modern Path.iter idiom
        path = Path("scans")
        for p in path.rglob("*"):
            print(p.name)

    '''
    print(' - Loading Survey Scans')

    SingleScan.objects.all().delete()
    Wallet.objects.all().delete()
    print(' - deleting all Wallet and SingleScan objects')
    print(' - deleting all Wallet and SingleScan objects')
    DataIssue.objects.filter(parser='scans').delete()

    # These are valid old file types to be visible, they are not necessarily allowed to be uploaded to a new wallet.
    valids = [".top",".txt",".tif",".png",".jpg",".jpeg",".pdf",".svg",".gif",".xvi",
              ".json",".autosave",".sxd",".svx",".th",".th2",".tdr",".sql",".zip",".dxf",".3d",
              ".ods",".csv",".xcf",".xml"]

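The load_all_scans docstring above names the Path.rglob idiom that replaces GetListDir, and the valids list says which suffixes count as acceptable scan files. A minimal sketch of how the two combine, simplified from the real loop in the hunks that follow; the function name is illustrative and validnames is the module-level list referenced further down:

from pathlib import Path

def walk_scan_files(scans_root, valids, validnames):
    '''Illustrative only: yield every acceptable scan file below scans_root,
    using Path.rglob() instead of the deleted GetListDir().'''
    for p in Path(scans_root).rglob("*"):
        if not p.is_file():
            continue                      # directories are implied by the file paths
        if p.suffix.lower() in valids or p.name.lower() in validnames:
            yield p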
@@ -95,10 +92,12 @@ def load_all_scans():
    # Not all folders with files in them are wallets.
    # they are if they are /2010/2010#33
    # or /1996-1999NotKHbook/
    # but not if they are /2010/1010#33/therion or /1998/
    print(' - ', end=' ')
    # but not if they are /2010/2010#33/therion/ : the wallet is /2010#33/ not /therion/
    print(' - ', end='')
    scans_path = Path(settings.SCANS_ROOT)
    seen = []
    c=0
    wallets = {}
    for p in scans_path.rglob('*'):
        if p.is_file():
            if p.suffix.lower() not in valids and p.name.lower() not in validnames:
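The comments above state which directory counts as the wallet: a file inside a subfolder such as therion/ still belongs to the wallet directory above it. A small worked example of the depth test that the following hunks apply; the example paths are assumptions, not real data:

from pathlib import Path

scans_path = Path("/expofiles/surveyscans")
f1 = scans_path / "2010" / "2010#33" / "plan.png"             # wallet is 2010#33
f2 = scans_path / "2010" / "2010#33" / "therion" / "x.th2"    # wallet is still 2010#33

def wallet_dir(p):
    # A file one directory too deep belongs to the wallet directory above it.
    if p.parent.parent.parent.parent == scans_path:
        return p.parent.parent
    return p.parent

assert wallet_dir(f1).name == "2010#33"
assert wallet_dir(f2).name == "2010#33"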
@@ -107,6 +106,13 @@ def load_all_scans():
            elif p.parent == scans_path: # skip files directly in /surveyscans/
                pass
            else:

                c+=1
                if c % 15 == 0 :
                    print(".", end='')
                if c % 500 == 0 :
                    print("\n -", end='')

                if p.parent.parent.parent.parent == scans_path:
                    # print(f"too deep {p}", end='\n')
                    fpath = p.parent.parent
@@ -114,54 +120,66 @@ def load_all_scans():
                else:
                    fpath = p.parent
                    walletname = p.parent.name

                # UNFINISHED

                if walletname in wallets:
                    wallet = wallets[walletname]
                else:
                    print("", flush=True, end='')
                    wallet = Wallet(fpath=fpath, walletname=walletname)
                    wallet.save()
                    wallets[walletname] = wallet

                singlescan = SingleScan(ffile=fpath, name=p.name, wallet=wallet)
                singlescan.save()


                # only printing progress:
                tag = p.parent
                if len(walletname)>4:
                    if walletname[4] == "#":
                        tag = p.parent.parent

                if tag not in seen:
                    print(f"{tag.name}", end=' ')
                    print(f" {tag.name} ", end='')
                    seen.append(tag)
                #wallet = Wallet(fpath=fpath, walletname=walletname)




    print('\n UNFINISHED \n\n--- ')
    for topfolder, fpath, fisdir in GetListDir(settings.SCANS_ROOT):
        if not fisdir:
            continue
    print(f'\n - found and loaded {c:,} acceptable scan files in {len(wallets):,} wallets')

    # if False:
    #     n=0
    #     for topfolder, fpath, fisdir in GetListDir(settings.SCANS_ROOT):
    #         if not fisdir:
    #             continue

        # do the year folders
        if re.match(r"\d\d\d\d$", topfolder):
            print(f"{topfolder}", end=' ')
            for walletname, fpath, fisdir in GetListDir(fpath):
                if fisdir:
                    wallet = Wallet(fpath=fpath, walletname=walletname)
                    # this is where we should record the year explicitly
                    # line 347 of view/uploads.py and needs refactoring for loading contentsjson
                    CheckEmptyDate(wallet)
                    CheckEmptyPeople(wallet)
                    wallet.save()
                    LoadListScansFile(wallet)
                else:
                    # but We *should* load all the scans, even for nonstandard names.
                    print(f'\n - IGNORE {walletname} - {fpath}')

    # but we also need to check if JSON exists, even if there are no uploaded scan files
    #         # do the year folders
    #         # if re.match(r"\d\d\d\d$", topfolder):
    #         print(f"{topfolder}", end=' ')
    #         for walletname, fpath, fisdir in GetListDir(fpath):
    #             if fisdir:
    #                 wallet = Wallet(fpath=fpath, walletname=walletname)
    #                 # this is where we should record the year explicitly
    #                 # line 347 of view/uploads.py and needs refactoring for loading contentsjson
    #                 wallet.save()
    #                 LoadListScansFile(wallet)
    #         # else:
    #         #     # but We *should* load all the scans, even for nonstandard names.
    #         #     print(f'\n - IGNORE {topfolder} - {fpath}')
    # print("", flush=True)

    # but we also need to check if JSON exists, even if there are no uploaded scan files.
    # Here we know there is a rigid folder structure, so no need to look for sub folders
    contents_path = Path(settings.DRAWINGS_DATA, "walletjson")
    for yeardir in contents_path.iterdir():
        if yeardir.is_dir():
            for walletpath in yeardir.iterdir():
                if Path(walletpath, contentsjson).is_file():
                    walletname = walletpath.name
                    wallet, created = Wallet.objects.update_or_create(walletname=walletname)
                    # should now also load the json and use it ! check &ref is correct or missing too
                    if created:
                        print(f"\n{walletname} created: only JSON, no actual uploaded scan files.", end=' ')
                    CheckEmptyDate(wallet)
                    CheckEmptyPeople(wallet)
                    wallet.save()


    print("", flush=True)

                    if walletname not in wallets:
                        print(f" - {walletname} creation attempting: only JSON, no actual uploaded scan files.", end=' ')
                        wallet, created = Wallet.objects.update_or_create(walletname=walletname)
                        # should now also load the json and use it ! check &ref is correct or missing too
                        if created:
                            print(f" - {walletname} created: only JSON, no actual uploaded scan files.", end=' ')
                        wallet.save()

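The comment in the hunk above says the wallet's JSON should also be loaded and its "&ref" entry checked. A sketch only, with the filename and key name taken as assumptions from the surrounding code and comment, not from the commit itself:

import json
from pathlib import Path

def check_wallet_json(walletpath, contentsjson="contents.json"):
    '''Sketch: load a wallet's JSON and flag a missing or empty "&ref" entry.
    walletpath is assumed to be a Path; the key and filename may differ in reality.'''
    jsonfile = Path(walletpath, contentsjson)
    if not jsonfile.is_file():
        return None
    with open(jsonfile) as f:
        waldata = json.load(f)
    if not waldata.get("&ref"):
        print(f" ! {walletpath.name}: '&ref' missing or empty in {contentsjson}")
    return waldata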
@@ -323,7 +323,7 @@ class LoadingSurvex():
            perps = get_people_on_trip(survexblock) # What, you don't know Judge Dredd slang ?
            message = f"! DATE Warning only accurate to the month, setting to 1st '{oline}' ({survexblock}) {survexblock.survexfile.path} {perps}"
            print(self.insp+message)
            DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
            DataIssue.objects.create(parser='survex-date', message=message, url=get_offending_filename(survexblock.survexfile.path))
            survexblock.date = datetime.strptime(line.replace('.','-'), '%Y-%m') # sets to first of month
            setdate(year)
        elif len(line) == 4:
@@ -331,7 +331,7 @@ class LoadingSurvex():
            perps = get_people_on_trip(survexblock)
            message = f"! DATE WARNING only accurate to the YEAR, setting to 1st January '{oline}' ({survexblock}) {survexblock.survexfile.path} {perps}"
            print(self.insp+message)
            DataIssue.objects.create(parser='survex', message=message, url=get_offending_filename(survexblock.survexfile.path))
            DataIssue.objects.create(parser='survex-date', message=message, url=get_offending_filename(survexblock.survexfile.path))
            survexblock.date = datetime.strptime(line, '%Y') # sets to January 1st
            setdate(year)
        else:
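The two hunks above rely on strptime filling unspecified fields with their minimum values, which is standard datetime behaviour; a quick check of that assumption:

from datetime import datetime

# Fields missing from the format default to their minimum, so a month-only or
# year-only date lands on the 1st of the month or on January 1st respectively.
assert datetime.strptime("1999.07".replace('.', '-'), "%Y-%m") == datetime(1999, 7, 1)
assert datetime.strptime("1999", "%Y") == datetime(1999, 1, 1)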
@@ -1546,6 +1546,7 @@ def LoadSurvexBlocks():
    SurvexStation.objects.all().delete()
    print(" - survex Data Issues flushed")
    DataIssue.objects.filter(parser='survex').delete()
    DataIssue.objects.filter(parser='survex-date').delete()
    DataIssue.objects.filter(parser='survexleg').delete()
    DataIssue.objects.filter(parser='survexunits').delete()
    DataIssue.objects.filter(parser='entrances').delete()