troggle/parsers/logbooks.py

import csv
from  datetime import datetime, date, time
import os
import re
#import time
import pickle
import shelve

from django.conf import settings
from django.template.defaultfilters import slugify
from django.utils.timezone import get_current_timezone, make_aware

from troggle.core.models.troggle import DataIssue, Expedition
from troggle.core.utils import TROG, save_carefully
from troggle.core.models.caves import Cave, LogbookEntry, PersonTrip, GetCaveLookup
from parsers.people import GetPersonExpeditionNameLookup

'''
Parses and imports logbooks in all their wonderful confusion

# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc)
'''
todo='''
- Put the object store 'trips' and the 'logdataissues' into TROG global object

- refactor everything with some urgency, esp. LoadLogbookForExpedition()

- Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, 
  or it is broken/incomplete and need hand-editing.

- import/parse/re-export-as-html the 'artisanal-format' old logbooks so that
  we keep only a modern HTML05 format. Then we can retiure the old parsers and reduce the
  volume of code here substantially.

- edit LoadLogbooks() to use coroutines to speed up import substantially,
  but perhaps we had better profile it first?
  
- rewrite to use generators rather than storing everything intermediate in lists - to reduce memory impact.

- the object store will need additional functions to replicate the persontrip calculation 
  and storage. For the moment we leave all that to be done in the django db
  
- We should ensure logbook.html is utf-8 and stop this crap:             
            file_in = open(logbookfile,'rb')
            txt = file_in.read().decode("latin1")
            
- this is a slow and uncertain function:  cave = getCaveByReference(caveRef)
'''

# The entire logbook for one year is a single object: a list of entry tuples.
# It is rebound to a fresh list each time an expedition's logbook is loaded.
logentries = []

# Place names that are not caves: EnterLogIntoDbase() does not look these up
# in the cave lookup table.
noncaveplaces = [
    "QMplaceholder",
    "Journey",
    "Loser Plateau",
    "UNKNOWN",
    'plateau',
    'base camp',
    'basecamp',
    'top camp',
    'topcamp',
]

# In-memory record of parser problems, keyed (mostly) by trip id.
logdataissues = TROG['issues']['logdataissues']

# Object store of parsed trips, keyed by trip id (see EnterLogIntoObjStore()).
trips = {}


#
# the logbook loading section
#
def set_trip_id(year, seq):
    """Return the trip identifier '<year>.s<seq>', with seq zero-padded to at least two digits."""
    return "{}.s{:02d}".format(year, seq)

    
def GetTripPersons(trippeople, expedition, logtime_underground, tid=None):
    """Split the 'people' field of a logbook entry and identify the author.

    `trippeople` is split on ',', '+', '&amp;', a bare '&' and ' and '.
    Names starting with '*' are ignored, and any text in round or square
    brackets is stripped from a name before it is looked up.  A name wrapped
    in <u>...</u> marks the author of the entry.

    Each remaining name is looked up (lower-cased) in
    GetPersonExpeditionNameLookup(expedition).  A name with no match is
    reported as a DataIssue but is still appended to the result with a
    personexpedition of None.

    Returns a tuple (res, author):
      res    - list of (personexpedition, logtime_underground) tuples
      author - the underlined person, or failing that the last person listed
    Returns (None, None) when there is no author and no people were found.
    """
    res = []
    author = None
    round_bracket_regex = re.compile(r"[\(\[].*?[\)\]]")
    # The case-insensitive flag used to be written inline as "(?i)" at the END
    # of the pattern.  Python 3.11+ rejects global inline flags that are not at
    # the start of the expression, so it is passed as a compile flag instead.
    underlined_regex = re.compile(r"<u>(.*?)</u>$", re.IGNORECASE)
    #print(f'# {tid}')

    for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
        tripperson = tripperson.strip()
        if not tid:
            # no good. Should be getting the tid from the caller
            tid = expedition.year + "." + tripperson + datetime.now().strftime("%S%f")
        mul = underlined_regex.match(tripperson)
        if mul:
            tripperson = mul.group(1).strip()
        if tripperson and tripperson[0] != '*':
            tripperson = round_bracket_regex.sub("", tripperson).strip()
            personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
            if not personyear:
                message = f" ! - {expedition.year} No name match for: '{tripperson}' "
                print(message)
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid] = message
            res.append((personyear, logtime_underground))
            if mul:
                author = personyear
    if not author:
        if not res:
            return None, None
        # nobody was underlined: fall back to the last person listed
        author = res[-1][0]
    return res, author

def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki", tid=None):
    """Save a logbook entry and its related persontrips to the database.

    The people on the trip are resolved with GetTripPersons().  If that fails,
    or no author can be identified, a DataIssue is recorded and the entry is
    skipped.  Otherwise a LogbookEntry is saved (looked up by date and title)
    followed by one PersonTrip per person on the trip.

    Does NOT save the expeditionday_id  - all NULLs. why?

    `tid` is the trip id used as the prefix of the entry slug and as the key
    for in-memory issue records; it may be None.
    """
    # Record in-memory issues under the trip id, as the parsers do, so that
    # several issues do not overwrite each other and per-year cleaning can
    # find them.  Fall back to the legacy fixed key when no trip id is known.
    issue_key = tid if tid else "title"

    try:
        trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground, tid=tid)
    except Exception:
        message = f" ! - {expedition.year} Skipping logentry: {title} - GetTripPersons FAIL"
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues[issue_key] = message
        print(message)
        return

    if not author:
        message = f" ! - {expedition.year} Skipping logentry: {title} - - no author for entry in year "
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues[issue_key] = message
        print(message)
        return

    # This needs attention. The slug field is derived from 'title'
    # both GetCaveLookup() and GetTripCave() need to work together better. None of this data is *used* though?
    #tripCave = GetTripCave(place):

    lplace = place.lower()
    cave = None
    # noncaveplaces contains mixed-case names, so compare case-insensitively
    # against the lower-cased place.
    if lplace not in {noncave.lower() for noncave in noncaveplaces}:
        cave = GetCaveLookup().get(lplace)

    # Check for an existing copy of the current entry, and save.
    # The return value is not used here; the call is kept in case it has side
    # effects on the expedition's days - TODO confirm.
    expedition.get_expedition_day(date)
    lookupAttribs = {'date': date, 'title': title}
    # 'cave' is converted to a string doing this, which renders as the cave slug.
    # but it is a db query which we should try to avoid - rewrite this

    # NEW slug for a logbook entry here! Use the unique id, not the title !!!
    # tid may be None (e.g. when called without one), so do not concatenate it blindly.
    slug = (tid or "") + slugify(title)[:50]
    nonLookupAttribs = {'place': place, 'text': text, 'expedition': expedition,
                        'cave_slug': str(cave), 'slug': slug, 'entry_type': entry_type}
    lbo, created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)

    for tripperson, time_underground in trippersons:
        lookupAttribs = {'personexpedition': tripperson, 'logbook_entry': lbo}
        nonLookupAttribs = {'time_underground': time_underground,
                            'is_logbook_entry_author': (tripperson == author)}
        save_carefully(PersonTrip, lookupAttribs, nonLookupAttribs) # PersonTrip also saved in SetDatesFromLogbookEntries

def ParseDate(tripdate, year):
    """Interpret a date from an expo logbook and return a datetime.date object.

    Two formats are recognised at the start of `tripdate`:
      - 'YYYY-MM-DD', where YYYY must equal `year`
      - 'D/M/YY' or 'D/M/YYYY', where a given century must match `year`
        and the century of `year` is used when none is given

    `year` is the expedition year (normally a 4-character string).

    Any date that cannot be interpreted is recorded as a DataIssue and the
    dummy date 1970-01-01 is returned, so that one bad date does not abort
    the import of the whole logbook.
    """
    dummydate = date(1970, 1, 1)
    expo_year = str(year)

    def record_issue(message):
        DataIssue.objects.create(parser='logbooks', message=message)
        logdataissues["tripdate"] = message

    try:
        mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
        # NOTE: the month here is a single digit, optionally zero-padded, so
        # months 10-12 written in this day/month/year form are not matched.
        mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
        if mdatestandard:
            if mdatestandard.group(1) != expo_year:
                record_issue(" ! - Bad date (year) in logbook: " + tripdate + " - " + expo_year)
                return dummydate
            yyyy = int(mdatestandard.group(1))
            month = int(mdatestandard.group(2))
            day = int(mdatestandard.group(3))
        elif mdategoof:
            century = mdategoof.group(3)
            if century and century != expo_year[:2]:
                record_issue(" ! - Bad date mdategoof.group(3) in logbook: " + tripdate + " - " + century)
                return dummydate
            day = int(mdategoof.group(1))
            month = int(mdategoof.group(2))
            yyyy = int(expo_year[:2]) * 100 + int(mdategoof.group(4))
        else:
            record_issue(" ! - Bad date in logbook: " + tripdate + " - " + expo_year)
            return dummydate

        # date() raises ValueError for impossible dates such as month 13
        return date(yyyy, month, day)
    except (TypeError, ValueError):
        record_issue(" ! - Failed to parse date in logbook: " + str(tripdate) + " - " + expo_year)
        return dummydate

# (2006 - not any more), 2008 - 2009
def Parselogwikitxt(year, expedition, txt):
    """Parse a wiki-text format logbook (2008 - 2009) and store every entry.

    Each entry is a header '===date|place|people===' followed by the trip
    text, which runs up to the next '==='.  An entry whose header does not
    have exactly three '|'-separated items is recorded as a DataIssue and
    skipped.

    Every parsed entry is appended to the module-level `logentries` and is
    passed to EnterLogIntoDbase() and EnterLogIntoObjStore().
    """
    global logentries

    trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
    for logbook_entry_count, (triphead, triptext) in enumerate(trippara, start=1):
        tid = set_trip_id(year, logbook_entry_count)

        tripheadp = triphead.split("|")
        if len(tripheadp) != 3:
            # Previously this message was built from a not-yet-assigned
            # variable and a list, which raised instead of reporting.
            message = f" ! - Bad no of items in tripdate in logbook: {tid} - {tripheadp}"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid] = message
            print(message)
            continue

        tripdate, tripplace, trippeople = tripheadp
        tripsplace = tripplace.split(" - ")
        tripcave = tripsplace[0].strip()
        # the part after "cave - " if there is one, otherwise the whole place
        tripsplace = tripsplace[0] if len(tripsplace) == 1 else tripsplace[1]

        #print(f"! LOGBOOK {year} {logbook_entry_count:2}  {len(triptext):4}  '{tripsplace}'")

        # time underground, e.g. "T/U: 3.5 hrs"; only the first occurrence is used
        tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
        tu = tul[0][0] if tul else ""

        ldate = ParseDate(tripdate.strip(), year)
        tripid = ""

        entrytuple = (ldate, tripcave, tripsplace, triptext,
                trippeople, expedition, tu, "wiki", tripid)
        logentries.append(entrytuple)

        EnterLogIntoDbase(date=ldate, place=tripcave, title=tripplace, text=triptext, trippeople=trippeople,
                expedition=expedition, logtime_underground=0, tid=tid)

        EnterLogIntoObjStore(year, ldate, tripcave, tripplace, triptext, trippeople,
                tu, "wiki", tripid, logbook_entry_count, tid=tid)


def EnterLogIntoObjStore(year, date, tripcave, triptitle, text, trippeople, tu, formattype, tripid1, seq, tid=None):
    """Store one logbook entry in the in-memory `trips` object store, keyed by trip id.

    If `tid` is already present in the store, the clash is printed and recorded
    as a DataIssue, and the new entry is stored under a replacement id built
    from the entry's date and `seq` instead.

    The persontrip calculation and storage is not replicated here; for the
    moment that is all left to the django db (see EnterLogIntoDbase(), whose
    checks - e.g. the GetTripPersons() author check, which needs a db query -
    would have to be copied or redesigned for the object store).
    """
    # `trips` should be a singleton in TROG eventually
    if tid in trips:
        clash_date = trips[tid][1]
        msg = f"   ! DUPLICATE on {clash_date} id: '{tid}'"
        print(msg)
        DataIssue.objects.create(parser='logbooks', message=msg)
        # de-duplicate: derive a new id from the date and sequence number
        tid = set_trip_id(str(date), seq)
        logdataissues[tid] = msg

    entry = (year, date, tripcave, triptitle, text, trippeople, tu, formattype)
    trips[tid] = entry
 
# 2002, 2004, 2005, 2007, 2010 - now
# 2006 wiki text is incomplete, but the html all there. So using this parser now.
def Parseloghtmltxt(year, expedition, txt):
    """Parse the <div>-structured HTML logbook format and store every entry.

    The text is cut into entries at '<hr />' tags.  Each entry must carry
    'tripdate', 'trippeople' and 'triptitle' divs, followed by the trip text
    and an optional 'timeug' div.  An entry that does not fit is reported as
    a DataIssue (unless it mentions "Rigging Guide") and skipped.

    Every parsed entry is appended to the module-level `logentries` and is
    passed to EnterLogIntoDbase() and EnterLogIntoObjStore().
    """
    global logentries
    global logdataissues

    entry_pattern = r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
                            \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
                            \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
                            \s*<div\s+class="trippeople">\s*(.*?)</div>
                            \s*<div\s+class="triptitle">\s*(.*?)</div>
                            ([\s\S]*?)
                            \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
                            \s*$
                     '''

    paragraphs = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for logbook_entry_count, trippara in enumerate(paragraphs, start=1):
        tid = set_trip_id(year, logbook_entry_count)

        found = re.match(entry_pattern, trippara)
        if not found:
            # rigging guides are expected not to parse, so they are skipped silently
            if not re.search(r"Rigging Guide", trippara):
                msg = " !- Logbook. Can't parse: {} entry:{}".format(trippara, logbook_entry_count)
                print(msg)
                DataIssue.objects.create(parser='logbooks', message=msg)
                logdataissues[tid] = msg
            continue

        tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = found.groups()
        ldate = ParseDate(tripdate.strip(), year)

        # the cave is the part of the title before " - ", when there is one
        title_parts = triptitle.split(" - ")
        tripcave = title_parts[0] if len(title_parts) >= 2 else "UNKNOWN"

        # flatten the text: drop </p>, join lines, turn <p> into line breaks
        ltriptext = re.sub(r"</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>", "</br></br>", ltriptext).strip()

        logentries.append((ldate, tripcave, triptitle, ltriptext,
                           trippeople, expedition, tu, "html", tripid1))

        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, logtime_underground=0,
                          entry_type="html", tid=tid)

        EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
                             "html", tripid1, logbook_entry_count, tid=tid)

# main parser for 1991 - 2001.  simpler because the data has been hacked so much to fit it
# trying it out for years 1982 - 1990 too. Some logbook editing required by hand..
def Parseloghtml01(year, expedition, txt):
    """Parse the simple '<hr>'-separated HTML logbook format and store every entry.

    Main parser for 1991 - 2001, also tried on 1982 - 1990.  Each entry starts
    with a header paragraph 'date|title|people' followed by the trip text.

    Every parsed entry is appended to the module-level `logentries` and is
    passed to EnterLogIntoDbase() and EnterLogIntoObjStore().  Problems are
    recorded as DataIssues:
      - an entry whose header paragraph cannot be found stops the parse
      - any other exception skips that entry; after more than 5 such
        exceptions the parse of this logbook is abandoned
    """
    global logentries
    errorcount = 0
    # DOTALL and IGNORECASE used to be written inline, with "(?i)" at the END
    # of the pattern.  Python 3.11+ rejects global inline flags that are not at
    # the start of the expression, so they are passed as compile flags instead.
    header_regex = re.compile(r"\s*(?:<p>)?(.*?)</?p>(.*)$", re.DOTALL | re.IGNORECASE)

    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    for logbook_entry_count, trippara in enumerate(tripparas, start=1):
        tid = set_trip_id(year, logbook_entry_count)
        try:
            s = header_regex.match(trippara)
            if not s:
                message = f" ! - Skipping logentry {year} failure to parse header: " + tid + trippara[:300] + "..."
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid] = message
                print(message)
                break
            tripheader, triptext = s.group(1), s.group(2)
            # a missing id is not an error, this is probably just a different year
            mtripid = re.search(r'<a id="(.*?)"', tripheader)
            tripid = mtripid.group(1) if mtripid else ""
            #print(f" # - mtripid: {mtripid}")
            tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)

            tripdate, triptitle, trippeople = tripheader.split("|")
            ldate = ParseDate(tripdate.strip(), year)

            # lift a "T/U ..." (time underground) paragraph out of the text
            mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
            if mtu:
                tu = mtu.group(1)
                triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
            else:
                tu = ""

            triptitles = triptitle.split(" - ")
            tripcave = triptitles[0].strip()

            ltriptext = triptext

            # strip trailing links, separators and "(same day)" / "(n)" markers
            mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
            if mtail:
                ltriptext = ltriptext[:mtail.start(0)]
            # convert the remaining html markup to wiki-style text
            ltriptext = re.sub(r"</p>", "", ltriptext)
            ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
            ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
            ltriptext = re.sub(r"</?u>", "_", ltriptext)
            ltriptext = re.sub(r"</?i>", "''", ltriptext)
            ltriptext = re.sub(r"</?b>", "'''", ltriptext)

            if ltriptext == "":
                message = " ! - Zero content for logbook entry!: " + tid
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid] = message
                print(message)

            entrytuple = (ldate, tripcave, triptitle, ltriptext,
                    trippeople, expedition, tu, "html01", tripid)
            logentries.append(entrytuple)
            try:
                EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                                  trippeople=trippeople, expedition=expedition, logtime_underground=0,
                                  entry_type="html", tid=tid)
            except Exception:
                message = " ! - Enter log entry into database FAIL  exception in: " + tid
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid] = message
                print(message)

            try:
                EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
                                    "html01", tripid, logbook_entry_count, tid=tid)
            except Exception:
                message = " ! - Enter log entry into ObjectStore FAIL  exception in: " + tid
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid] = message
                print(message)

        except Exception:
            message = f" ! - Skipping logentry {year} due to exception in: {tid}"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid] = message
            print(message)
            errorcount += 1
            if errorcount > 5:
                message = f" !!- TOO MANY ERRORS - aborting at '{tid}' logbook: {year}"
                DataIssue.objects.create(parser='logbooks', message=message)
                logdataissues[tid] = message
                print(message)
                return

# parser for 2003
def Parseloghtml03(year, expedition, txt):
    """Parse the 2003 HTML logbook format and store every entry.

    The text is cut into entries at '<hr />' tags.  Each entry starts with a
    header paragraph 'date -- title -- people', optionally followed by a
    further ' -- T/U ...' piece, and then the trip text.

    Every parsed entry is appended to the module-level `logentries` and is
    passed to EnterLogIntoDbase() and EnterLogIntoObjStore().  Problems are
    recorded as DataIssues:
      - an entry with no leading <p>...</p> header stops the parse
      - an entry whose header does not have exactly three pieces is skipped
    """
    global logentries

    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for logbook_entry_count, trippara in enumerate(tripparas, start=1):
        tid = set_trip_id(year, logbook_entry_count)

        s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
        if not s:
            # This used to mix a "{year}" field with positional str.format()
            # arguments, which raised KeyError instead of reporting.
            message = f" ! - Skipping logentry {year} on failure to parse Parseloghtml03: {tid} {s} {trippara[:300]}..."
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid] = message
            print(message)
            break

        tripheader, triptext = s.group(1), s.group(2)
        tripheader = re.sub(r"&nbsp;", " ", tripheader)
        tripheader = re.sub(r"\s+", " ", tripheader).strip()
        sheader = tripheader.split(" -- ")
        tu = ""
        if re.match("T/U|Time underwater", sheader[-1]):
            tu = sheader.pop()
        if len(sheader) != 3:
            # cannot be unpacked into date, title, people: report and skip
            message = f" ! - Skipping logentry {year}, header not three pieces: {tid} {sheader}"
            DataIssue.objects.create(parser='logbooks', message=message)
            logdataissues[tid] = message
            print(message)
            continue
        tripdate, triptitle, trippeople = sheader
        ldate = ParseDate(tripdate.strip(), year)

        # the cave is the part of the title before " , ", when there is one
        triptitles = triptitle.split(" , ")
        tripcave = triptitles[0] if len(triptitles) >= 2 else "UNKNOWN"

        ltriptext = re.sub(r"</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
        # flag every character outside the expected set
        ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)

        entrytuple = (ldate, tripcave, triptitle, ltriptext,
                trippeople, expedition, tu, "html03", tid)
        logentries.append(entrytuple)

        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle,
                          text=ltriptext, trippeople=trippeople, expedition=expedition,
                          logtime_underground=0, entry_type="html", tid=tid)

        EnterLogIntoObjStore(year, ldate, tripcave, triptitle, ltriptext, trippeople, tu,
                            "html03", tid, logbook_entry_count, tid=tid)


def SetDatesFromLogbookEntries(expedition):
    """Chain together each person's trips on this expedition in date order.

    For every personexpedition of `expedition`, the persontrips are taken in
    order of their logbook entry date and each one has persontrip_prev and
    persontrip_next set to its neighbours (None at either end), then is saved.
    """
    # Probably a faster way to do this. This uses a lot of db queries, but we
    # have all this in memory..
    for person_on_expo in expedition.personexpedition_set.all():
        # sequencing is difficult to do
        earlier = None
        for trip in person_on_expo.persontrip_set.order_by('logbook_entry__date'):
            trip.persontrip_prev = earlier
            trip.persontrip_next = None
            if earlier:
                # now that its successor is known, complete the earlier trip
                earlier.persontrip_next = trip
                earlier.save()
            trip.save() # also saved in EnterLogIntoDbase. MERGE these to speed up import.
            earlier = trip

def LoadLogbookForExpedition(expedition, expect):
    """Parse all logbook entries for one expedition.

    If a usable cache file ('<logbookfile>.cache') is found, the entries are
    loaded from it and re-entered into the database and object store.  The
    cache is discarded, and the logbook source file parsed instead, when:
      - the logbook file is more than 2 seconds newer than the cache
      - the cache is more than 30 days old
      - the cache cannot be unpickled, or fails the sanity checks
        (wrong expedition, wrong entry count, or a count other than `expect`)

    The logbook file and parser function for the year are taken from
    settings.LOGBOOK_PARSER_SETTINGS, falling back to the settings defaults.
    After a successful parse the entries are written to the cache file.

    Existing 'logbooks' issues mentioning this expedition's year are cleared
    first.  Returns the number of entries now held in `logentries`.
    """
    # absolutely horrid. REFACTOR THIS (all my fault..)
    global logentries
    global logdataissues
    logbook_parseable = False
    logbook_cached = False
    yearlinks   = settings.LOGBOOK_PARSER_SETTINGS
    expologbase = os.path.join(settings.EXPOWEB, "years")
    logentries = []

    def validcache(year, n):
        """Sanity-check the header of a loaded cache against what we expect."""
        if year != expedition:
            print("   ! year != expedition ", year, expedition)
            return False
        if len(logentries) != n:
            print("   ! len(logentries) != n ", len(logentries), n)
            return False
        if n != expect:
            print("   ! n != expect ", n, expect)
            return False
        return True

    def cleanerrors(year):
        """Delete stored and in-memory logbook issues belonging to `year`."""
        for di in DataIssue.objects.filter(parser='logbooks'):
            if re.search(year, di.message) is not None:
                di.delete()
        # collect the keys first: a dict must not change size while iterated
        for key in [k for k in logdataissues if k.startswith(year)]:
            del logdataissues[key]

    def remove_cache():
        """Delete the cache file, ignoring a failure to do so."""
        try:
            os.remove(cache_filename)
        except OSError:
            pass

    cleanerrors(expedition.year)

    if expedition.year in yearlinks:
        logbookfile = os.path.join(expologbase, yearlinks[expedition.year][0])
        expedition.logbookfile = yearlinks[expedition.year][0]
        parsefunc   = yearlinks[expedition.year][1]
    else:
        logbookfile = os.path.join(expologbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE)
        expedition.logbookfile = settings.DEFAULT_LOGBOOK_FILE
        parsefunc   = settings.DEFAULT_LOGBOOK_PARSER
    cache_filename = logbookfile + ".cache"
    expedition.save()

    # --- try the cache ---
    try:
        cache_t = os.path.getmtime(cache_filename)
        source_t = os.path.getmtime(logbookfile)
    except OSError:
        # no cache found (or no logbook file to compare it with)
        cache_t = None

    if cache_t is not None:
        # `time` in this module is datetime.time, which has no clock function,
        # so the current time is taken from datetime instead.
        now = datetime.now().timestamp()
        stale = source_t - cache_t > 2           # logbook edited at least 2 secs after cacheing
        too_old = now - cache_t > 30*24*60*60    # more than 30 days old
        if stale or too_old:
            print(" - ! Cache is either stale or more than 30 days old. Deleting it.")
            remove_cache()
            print("   ! Removed stale or corrupt cache file")
        else:
            print("   - Reading cache: " + cache_filename, end='')
            try:
                # NOTE: pickle can execute arbitrary code when loading. The cache
                # file must only ever be one written by this function.
                with open(cache_filename, "rb") as f:
                    year, n, logentries = pickle.load(f)
            except Exception:
                print("   ! Failed to load corrupt cache.  (Or I was told to ignore it). Deleting it.")
                remove_cache()
                logentries = []
            else:
                if validcache(year, n):
                    print("  -- Loaded ", len(logentries), " log entries")
                    logbook_cached = True
                else:
                    print("  !- Told to expect ", expect, " but ", len(logentries), " found in cache")
                    print("   ! Failed to load corrupt cache.  (Or I was told to ignore it). Deleting it.")
                    remove_cache()
                    logentries = []

    # --- no usable cache: read the logbook source file ---
    if not logbook_cached:
        try:
            with open(logbookfile, 'rb') as file_in:
                txt = file_in.read().decode("latin1")
        except IOError:
            print(("   ! Couldn't open logbook " + logbookfile))
        else:
            logbook_parseable = True
            print(("   - Using: " + parsefunc + " to parse " + logbookfile))

    if logbook_parseable:
        parser = globals()[parsefunc]

        parser(expedition.year, expedition, txt) # this launches the parser

        SetDatesFromLogbookEntries(expedition)
        if logentries:
            print("   - Cacheing " , len(logentries), " log entries")
            with open(cache_filename, "wb") as fc:
                logbk = (expedition, len(logentries), logentries)
                pickle.dump(logbk, fc, protocol=4)
        else:
            print("   ! NO TRIP entries found in logbook, check the syntax.")

    if logbook_cached: # working on this bit...
        # Sequence numbers start at 1, as in the parsers. The ids may still
        # differ from those of a fresh parse if the parser skipped any entries.
        for seq, entrytuple in enumerate(logentries, start=1):
            (ldate, tripcave, triptitle, text, trippeople, _cached_expedition,
                logtime_underground, entry_type, tripid1) = entrytuple
            tid = set_trip_id(expedition.year, seq)
            # use the live expedition object, not the unpickled copy in the tuple
            EnterLogIntoDbase(ldate, tripcave, triptitle, text, trippeople, expedition, 0,
                    entry_type, tid=tid)
            EnterLogIntoObjStore(expedition.year, ldate, tripcave, triptitle, text, trippeople,
                    logtime_underground, entry_type, tripid1, seq, tid=tid)
        SetDatesFromLogbookEntries(expedition)
    return len(logentries)

def LoadLogbooks():
    """ This is the master function for parsing all logbooks into the Troggle database.
    Parser settings appropriate for each year are set in settings.py LOGBOOK_PARSER_SETTINGS.
    This should be rewritten to use coroutines to load all logbooks from disc in parallel.

    All existing 'logbooks' DataIssues are deleted first.  Then, for every
    expedition whose year is not in the `nologbook` list, the logbook is loaded
    with LoadLogbookForExpedition() and the number of entries found is written,
    next to the expected number, to 'loadlogbk.log'.  Finally a per-year summary
    is printed and the `trips` object store is written to 'logbktrips.shelve'.
    """
    global logdataissues

    logdataissues = {}
    DataIssue.objects.filter(parser='logbooks').delete()
    expos = Expedition.objects.all()
    if len(expos) <= 1:
        print(" ! No expeditions found. Load 'people' first.\n")
    nologbook = ["1976", "1977", "1978", "1979", "1980", "1981",
        "1982", "1983", "1984", "1985", "1987", "1988", "1989",
        "1986", "2020",]
    # the number of entries we expect to find in each year's logbook
    entries = {"2021": 0, "2019": 20, "2018": 74, "2017": 60, "2016": 81, "2015": 79,
        "2014": 65, "2013": 51, "2012": 75, "2011": 68, "2010": 22, "2009": 52,
        "2008": 49, "2007": 111, "2006": 60, "2005": 55, "2004": 76, "2003": 40, "2002": 31,
        "2001": 48, "2000": 54, "1999": 79, "1998": 43, "1997": 53, "1996": 94, "1995": 41,
        "1994": 32, "1993": 41, "1992": 61, "1991": 38, "1990": 87, "1989": 1,"1988": 1,"1987": 1,
        "1985": 1,"1984": 1,"1983": 1,"1982": 42,}
    # Logbooks log.htm exist for 1983, 84, 85, 87, 88, 89 but have no full-working parser, or need hand-editing.
    nlbe = {}  # entries loaded, per year
    expd = {}  # entries found in the object store, per year
    # "w" starts the log afresh on every run
    with open("loadlogbk.log", "w") as log:
        for expo in expos:
            TROG['pagecache']['expedition'][expo.year] = None # clear cache
            if expo.year in nologbook:
                continue
            print((" - Logbook for: " + expo.year))
            # a year missing from the table must not abort the whole import
            expected = entries.get(expo.year, 0)
            numentries = LoadLogbookForExpedition(expo, expected)  # this actually loads the logbook for one year
            log.write("{} {:5d} should be {}\n".format(expo.year, numentries, expected))
            nlbe[expo.year] = numentries
            expd[expo.year] = 0
    print("** total trips in ObjStore:", len(trips))
    #for i in logdataissues:
    #    print("{:15s}:   {}".format(i, logdataissues[i]))

    for trip in trips.values():
        year = trip[0]
        expd[year] += 1
    yt = 0
    for y in expd:
        print("{} {}".format(y, expd[y]), nlbe[y])
        yt += expd[y]
    print("total {} log entries in all expeditions".format(yt))

    # the with-statement syncs and closes the shelf on exit
    with shelve.open('logbktrips.shelve', writeback=True) as odb:
        for lbe in trips:
            odb[lbe] = trips[lbe]

# dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
# expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
# titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
# reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
# personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
# nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
# TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
# locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
# caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)