troggle/parsers/logbooks.py

#.-*- coding: utf-8 -*-

from django.conf import settings
import troggle.core.models as models

from parsers.people import GetPersonExpeditionNameLookup
from parsers.cavetab import GetCaveLookup

from django.template.defaultfilters import slugify
from django.utils.timezone import get_current_timezone
from django.utils.timezone import make_aware

import csv
import re
import datetime
import os
from fuzzywuzzy import fuzz

from utils import save_carefully

# 
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
# it can be checked later against the hard copy if necessary, or that it is not possible to determine (name, trip place, etc.)
#

#
# the logbook loading section
#
def GetTripPersons(trippeople, expedition, logtime_underground):
    """Resolve the names in a logbook entry's people field for one expedition.

    trippeople is split on ",", "+", "&amp;", a bare "&" (one that does not
    start another HTML entity) and " and ".  Empty names and names starting
    with "*" are ignored, and any "(...)" or "[...]" annotation is removed
    before the lower-cased name is looked up in
    GetPersonExpeditionNameLookup(expedition).  A name wrapped in <u>...</u>
    (in any letter case) marks the author of the entry.

    Names without a match are reported on stdout and as a DataIssue, followed
    by a printed list of fuzzy candidates (fuzz.ratio above 50).  They are
    still included in the result, with None as their lookup value.

    Returns a tuple (res, author):
      res    -- list of (lookup value, logtime_underground) in input order
      author -- lookup value of the underlined person, or of the last person
                in res when no usable underlined name was found
    Returns (None, None) when there is neither an author nor any person.
    """
    res = []
    author = None
    round_bracket_regex = re.compile(r"[\(\[].*?[\)\]]")
    # Fetched on first use only, then reused for every name in this entry.
    name_lookup = None
    for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
        tripperson = tripperson.strip()
        # The case-insensitivity flag is passed as an argument: an inline
        # "(?i)" anywhere but the start of a pattern is rejected by Python 3.11+.
        mul = re.match(r"<u>(.*?)</u>$", tripperson, re.IGNORECASE)
        if mul:
            tripperson = mul.group(1).strip()
        if not tripperson or tripperson[0] == '*':
            continue

        tripperson = round_bracket_regex.sub("", tripperson).strip()
        if name_lookup is None:
            name_lookup = GetPersonExpeditionNameLookup(expedition)
        personyear = name_lookup.get(tripperson.lower())
        if not personyear:
            print("   - No name match for: '%s'" % tripperson)
            message = "No name match for: '%s' in year '%s'" % (tripperson, expedition.year)
            models.DataIssue.objects.create(parser='logbooks', message=message)
            print('   - Lets try something fuzzy')
            fuzzy_matches = {}
            for person in name_lookup:
                fuzz_num = fuzz.ratio(tripperson.lower(), person)
                if fuzz_num > 50:
                    fuzzy_matches[person] = fuzz_num
            # Best candidates first.
            for person, fuzz_num in sorted(fuzzy_matches.items(), key=lambda kv: kv[1], reverse=True):
                print('    - %s -> %s' % (person, fuzz_num))
        # NOTE(review): unmatched names are appended with personyear None, as before;
        # confirm that callers cope with None entries.
        res.append((personyear, logtime_underground))
        if mul:
            author = personyear
    if not author:
        if not res:
            return None, None
        author = res[-1][0]
    return res, author

def GetTripCave(place):  #need to be fuzzier about matching here. Already a very slow function...
    """Find the Cave record that a logbook "place" string refers to.

    Candidates are the caves whose official_name equals place plus, when place
    can be read as an integer, the caves with that kataster_number.

    Resolution order:
      1. exactly one candidate: return it;
      2. an OtherCaveName with exactly this name exists: return the cave of
         the first OtherCaveName whose name contains place (case-insensitive);
      3. several candidates: list them and ask on stdin for the list index;
      4. otherwise print a message and return None.
    """
    try:
        katastNumRes = list(models.Cave.objects.filter(kataster_number=int(place)))
    except (TypeError, ValueError):
        # place is not a number (or is None), so there is no kataster match.
        katastNumRes = []
    officialNameRes = list(models.Cave.objects.filter(official_name=place))
    tripCaveRes = officialNameRes + katastNumRes

    if len(tripCaveRes) == 1:
        return tripCaveRes[0]

    elif models.OtherCaveName.objects.filter(name=place):
        # NOTE(review): the existence test is an exact match but the record
        # fetched is the first substring match -- confirm this is intended.
        return models.OtherCaveName.objects.filter(name__icontains=place)[0].cave

    elif len(tripCaveRes) > 1:
        print("Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes))
        # input() returns a str on Python 3; a list index has to be an int.
        correctIndex = int(input("type list index of correct cave"))
        return tripCaveRes[correctIndex]
    else:
        print("No cave found for place " , place)
        return None


noncaveplaces = [ "Journey", "Loser Plateau" ]
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground, entry_type="wiki"):
    """Save one logbook entry and a PersonTrip for each person on it.

    The people are resolved with GetTripPersons(); when that yields no author
    the entry is skipped, which is reported on stdout and as a DataIssue.

    The cave is looked up in GetCaveLookup() by the lower-cased place, unless
    the place is one of noncaveplaces (compared case-insensitively), in which
    case the entry is stored with cave None.

    Records are written through save_carefully(): the LogbookEntry is keyed on
    date and title, each PersonTrip on its person and the entry.  The slug is
    the slugified title cut to 50 characters.

    Returns None.
    """
    trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
    if not author:
        print("   - Skipping logentry: " + title + " - no author for entry")
        message = "Skipping logentry: %s - no author for entry in year '%s'" % (title, expedition.year)
        models.DataIssue.objects.create(parser='logbooks', message=message)
        return

    # cave must be bound on every path; place is compared in lower case, so
    # the exclusion list has to be lower-cased too or it can never match.
    cave = None
    lplace = place.lower()
    if lplace not in [noncave.lower() for noncave in noncaveplaces]:
        cave = GetCaveLookup().get(lplace)

    # Return value is not needed here; the call is kept in case it has side
    # effects (presumably it creates the expedition day -- TODO confirm).
    expedition.get_expedition_day(date)

    #Check for an existing copy of the current entry, and save
    lookupAttribs = {'date': date, 'title': title}
    nonLookupAttribs = {'place': place, 'text': text, 'expedition': expedition, 'cave': cave,
                        'slug': slugify(title)[:50], 'entry_type': entry_type}
    lbo, created = save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)

    for tripperson, time_underground in trippersons:
        lookupAttribs = {'personexpedition': tripperson, 'logbook_entry': lbo}
        nonLookupAttribs = {'time_underground': time_underground,
                            'is_logbook_entry_author': (tripperson == author)}
        save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)


def ParseDate(tripdate, year):
    """Interpret a date from an expo logbook and return a timezone-aware datetime.

    tripdate -- text starting with either "YYYY-MM-DD" or "D/M/YY" / "D/M/YYYY"
                (one or two digits for day and month)
    year     -- the expedition year as a four-character string

    For the ISO form the year must equal ``year``.  For the d/m/y form an
    explicit century ("19" or "20") must equal the first two characters of
    ``year``, and the century of ``year`` is always the one applied.

    Returns midnight of that day made aware in Django's current timezone.
    Raises AssertionError when the text matches neither form or disagrees with
    ``year``; datetime raises ValueError for impossible day/month values.
    """
    mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
    # Month accepts one or two digits so that October-December parse as well.
    mdategoof = re.match(r"(\d\d?)/(\d\d?)/(20|19)?(\d\d)", tripdate)
    # Explicit raises rather than assert statements: asserts vanish under -O,
    # which would let bad input fall through to unbound names below.
    if mdatestandard:
        if mdatestandard.group(1) != year:
            raise AssertionError((tripdate, year))
        fullyear, month, day = (int(part) for part in mdatestandard.groups())
    elif mdategoof:
        century = mdategoof.group(3)
        if century and century != year[:2]:
            raise AssertionError(mdategoof.groups())
        day, month = int(mdategoof.group(1)), int(mdategoof.group(2))
        fullyear = int(year[:2]) * 100 + int(mdategoof.group(4))
    else:
        raise AssertionError(tripdate)
    return make_aware(datetime.datetime(fullyear, month, day), get_current_timezone())

# 2006, 2008 - 2010
def Parselogwikitxt(year, expedition, txt):
    """Parse a wiki-format logbook and store each trip it contains.

    A trip is a "===date|place|people===" heading and the text up to the next
    "===".  The heading must have exactly three "|"-separated fields.  The
    cave is the part of the place field before the first " - ".  Each trip is
    handed to EnterLogIntoDbase() with logtime_underground fixed at 0.
    """
    for heading, body in re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt):
        fields = heading.split("|")
        assert len(fields) == 3, (fields, body)
        tripdate, tripplace, trippeople = fields
        tripcave = tripplace.split(" - ")[0].strip()

        # The T/U figure is extracted but not used: the entry is stored with 0.
        tu_hits = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", body)
        tu = tu_hits[0][0] if tu_hits else ""

        EnterLogIntoDbase(
            date=ParseDate(tripdate.strip(), year),
            place=tripcave,
            title=tripplace,
            text=body,
            trippeople=trippeople,
            expedition=expedition,
            logtime_underground=0,
        )

# 2002, 2004, 2005, 2007, 2011 - 2018
def Parseloghtmltxt(year, expedition, txt):
    """Parse an HTML logbook built from tripdate/trippeople/triptitle divs.

    The text is cut into sections at "<hr />" tags.  A section that does not
    fit the expected div layout is skipped, and printed unless it mentions
    "Rigging Guide".  For the others the cave is the title text before the
    first " - " ("UNKNOWN" when the title has no " - "), paragraph tags are
    turned into blank-line separated text, and the trip goes to
    EnterLogIntoDbase() as entry_type "html" with logtime_underground 0.
    """
    sections = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for section in sections:
        parsed = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
                            \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
                            \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
                            \s*<div\s+class="trippeople">\s*(.*?)</div>
                            \s*<div\s+class="triptitle">\s*(.*?)</div>
                            ([\s\S]*?)
                            \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
                            \s*$
                     ''', section)
        if parsed is None:
            if re.search(r"Rigging Guide", section) is None:
                print("can't parse: ", section)  # this is 2007 which needs editing
            continue

        # tripid, tripid1 and tu are captured by the pattern but not used below.
        tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = parsed.groups()
        ldate = ParseDate(tripdate.strip(), year)

        title_parts = triptitle.split(" - ")
        tripcave = title_parts[0] if len(title_parts) >= 2 else "UNKNOWN"

        # Flatten the HTML paragraphs: closing tags go, line breaks collapse
        # to one space, and each opening tag becomes a blank line.
        body = triptext.replace("</p>", "")
        body = re.sub(r"\s*?\n\s*", " ", body)
        body = body.replace("<p>", "\n\n").strip()

        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=body,
                          trippeople=trippeople, expedition=expedition,
                          logtime_underground=0, entry_type="html")

    if not sections:
        print(" - No trip entrys found in logbook, check the syntax matches htmltxt format")


# main parser for 1991 - 2001.  simpler because the data has been hacked so much to fit it
def Parseloghtml01(year, expedition, txt):
    """Parse the HTML logbook layout used for 1991 - 2001.

    The text is cut into sections at <hr> tags.  In each section the text up
    to the first <p> or </p> tag (after an optional leading <p>) is the
    header, "date|title|people", and the rest is the trip report.  Anchor,
    bold and span tags are removed from the header before it is split.

    In the report, a paragraph starting with "T/U" or "TU" is cut out (up to
    the end of its line), a trailing run of links, whitespace, "/", "-",
    "&amp;", paragraph tags and "(same day)" / "(<number>)" notes is dropped,
    paragraphs and <br> become blank lines, and <u>, <i>, <b> tags become
    "_", "''" and "'''".  The cave is the title text before the first " - ".
    Each trip goes to EnterLogIntoDbase() as entry_type "html" with
    logtime_underground 0.

    Raises AssertionError (carrying the first 300 characters) for a section
    without a header, and ValueError when a header does not have exactly
    three "|"-separated fields.
    """
    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        # Raw string with both flags up front: an inline flag at the end of
        # the pattern is an error on Python 3.11+, and "\s" in a non-raw
        # literal is an invalid escape sequence.
        s = re.match(r"(?is)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
        if not s:
            raise AssertionError(trippara[:300])
        tripheader, triptext = s.group(1), s.group(2)
        tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)

        tripdate, triptitle, trippeople = tripheader.split("|")
        ldate = ParseDate(tripdate.strip(), year)

        # The T/U paragraph is removed from the report; its value is not
        # stored because the entry is saved with logtime_underground=0.
        mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
        if mtu:
            triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]

        tripcave = triptitle.split(" - ")[0].strip()

        ltriptext = triptext
        mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
        if mtail:
            ltriptext = ltriptext[:mtail.start(0)]
        ltriptext = re.sub(r"</p>", "", ltriptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
        ltriptext = re.sub(r"</?u>", "_", ltriptext)
        ltriptext = re.sub(r"</?i>", "''", ltriptext)
        ltriptext = re.sub(r"</?b>", "'''", ltriptext)

        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, logtime_underground=0,
                          entry_type="html")

# parser for 2003
def Parseloghtml03(year, expedition, txt):
    """Parse the HTML logbook layout used for 2003.

    The text is cut into sections at "<hr />" tags.  Each section must start
    with a <p>...</p> header whose pieces are separated by " -- ":
    date, title, people, optionally followed by a piece starting with "T/U"
    or "Time underwater", which is removed.  The cave is the title text
    before the first " , " ("UNKNOWN" when the title has no " , ").

    In the report paragraph tags become blank lines and every character
    outside a fixed set of ASCII characters is replaced by "_NONASCII_".
    Each trip goes to EnterLogIntoDbase() as entry_type "html" with
    logtime_underground 0.

    Raises AssertionError (carrying the section) when the header paragraph is
    missing.  A header that is not three pieces is printed and then fails
    with ValueError when it is unpacked.
    """
    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
        if not s:
            raise AssertionError(trippara)
        tripheader, triptext = s.group(1), s.group(2)
        tripheader = re.sub(r"&nbsp;", " ", tripheader)
        tripheader = re.sub(r"\s+", " ", tripheader).strip()
        sheader = tripheader.split(" -- ")
        if re.match("T/U|Time underwater", sheader[-1]):
            # The time-underground piece is dropped; it is not stored.
            sheader.pop()
        if len(sheader) != 3:
            print("header not three pieces", sheader)
        tripdate, triptitle, trippeople = sheader
        ldate = ParseDate(tripdate.strip(), year)
        triptitles = triptitle.split(" , ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"
        ltriptext = re.sub(r"</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
        ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle,
                          text=ltriptext, trippeople=trippeople, expedition=expedition,
                          logtime_underground=0, entry_type="html")


def SetDatesFromLogbookEntries(expedition):
    """
    Chains together each person's trips on this expedition.

    For every personexpedition of the expedition, its persontrips are taken
    in order of logbook entry date and each one gets persontrip_prev and
    persontrip_next pointing at its neighbours (None at either end).
    Despite the function's name, no date_from / date_to field is written here.
    """
    for member in expedition.personexpedition_set.all():
        earlier = None
        for trip in member.persontrip_set.order_by('logbook_entry__date'):
            trip.persontrip_prev = earlier
            trip.persontrip_next = None
            if earlier:
                # The previous trip only learns its successor now, so it is
                # saved a second time.
                earlier.persontrip_next = trip
                earlier.save()
            trip.save()
            earlier = trip


def _read_logbook_text(path):
    """Return the content of the file at path, decoded as latin1."""
    # Read bytes and decode explicitly: a text-mode read has no .decode() on
    # Python 3, and the with-block closes the file even if reading fails.
    with open(path, "rb") as file_in:
        return file_in.read().decode("latin1")


def LoadLogbookForExpedition(expedition):
    """Parse all logbook entries for one expedition.

    When settings.LOGBOOK_PARSER_SETTINGS has an entry for expedition.year it
    supplies the logbook file (relative to EXPOWEB/years) and the name of the
    parser function.  Otherwise settings.DEFAULT_LOGBOOK_FILE in the year's
    own directory is read and settings.DEFAULT_LOGBOOK_PARSER is used; if that
    file cannot be opened, a message is printed and nothing is parsed.

    The parser is looked up by name among this module's globals and called
    with (expedition.year, expedition, text); afterwards
    SetDatesFromLogbookEntries(expedition) is run.

    An IOError for a file named in LOGBOOK_PARSER_SETTINGS is not caught.
    """
    expowebbase = os.path.join(settings.EXPOWEB, "years")
    yearlinks = settings.LOGBOOK_PARSER_SETTINGS

    logbook_parseable = False

    if expedition.year in yearlinks:
        year_settings = yearlinks[expedition.year]
        txt = _read_logbook_text(os.path.join(expowebbase, year_settings[0]))
        parsefunc = year_settings[1]
        logbook_parseable = True
        print(" - Parsing logbook: " + year_settings[0] + "\n - Using parser: " + year_settings[1])
    else:
        try:
            txt = _read_logbook_text(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE))
        except IOError:
            print("Couldn't open default logbook file and nothing in settings for expo " + expedition.year)
        else:
            logbook_parseable = True
            print("No set parser found using default")
            parsefunc = settings.DEFAULT_LOGBOOK_PARSER

    if logbook_parseable:
        parser = globals()[parsefunc]
        parser(expedition.year, expedition, txt)
        SetDatesFromLogbookEntries(expedition)


def LoadLogbooks():
    """ Master entry point: reparses the logbook of every expedition into the Troggle database. """

    # Issues recorded by an earlier logbook run are dropped before reloading.
    previous_issues = models.DataIssue.objects.filter(parser='logbooks')
    previous_issues.delete()

    for expedition in models.Expedition.objects.all():
        print("\nLoading Logbook for: " + expedition.year)
        LoadLogbookForExpedition(expedition)


# Patterns for the pieces of an automatically generated logbook entry page.
# All are compiled with DOTALL so that "." also matches across line breaks.

# Entry date as three groups: year, month, day.
dateRegex = re.compile(
    r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', flags=re.DOTALL)
expeditionYearRegex = re.compile(
    r'<span\s+class="expeditionyear">(.*?)</span>', flags=re.DOTALL)
# Only an upper-case <H1> element is recognised as the title.
titleRegex = re.compile(
    r'<H1>(.*?)</H1>', flags=re.DOTALL)
# Greedy: the report runs to the last </div> that precedes </body>.
reportRegex = re.compile(
    r'<div\s+class="report">(.*)</div>\s*</body>', flags=re.DOTALL)
personRegex = re.compile(
    r'<div\s+class="person">(.*?)</div>', flags=re.DOTALL)
# Group 1 is ",author" or empty; group 2 is the person's name.
nameAuthorRegex = re.compile(
    r'<span\s+class="name(,author|)">(.*?)</span>', flags=re.DOTALL)
TURegex = re.compile(
    r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', flags=re.DOTALL)
locationRegex = re.compile(
    r'<span\s+class="location">(.*?)</span>', flags=re.DOTALL)
caveRegex = re.compile(
    r'<span\s+class="cave">(.*?)</span>', flags=re.DOTALL)

def parseAutoLogBookEntry(filename):
    """Load one automatically generated logbook entry page into the database.

    The page is searched with the module-level patterns for its date,
    expedition year, title, optional cave reference, optional location,
    report text and one <div class="person"> block per person, each holding
    that person's name (optionally flagged as author) and TU value.

    Nothing is written unless the whole page parses: every problem found is
    collected as a message, and the list of messages is returned instead.
    On success a LogbookEntry and one PersonTrip per person are saved, the
    entry is printed, and None is returned.
    """
    errors = []
    with open(filename, "r") as f:
        contents = f.read()

    dateMatch = dateRegex.search(contents)
    if dateMatch:
        year, month, day = [int(x) for x in dateMatch.groups()]
        date = datetime.date(year, month, day)
    else:
        errors.append("Date could not be found")

    expeditionYearMatch = expeditionYearRegex.search(contents)
    if expeditionYearMatch:
        try:
            expedition = models.Expedition.objects.get(year = expeditionYearMatch.groups()[0])
            personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
        except models.Expedition.DoesNotExist:
            errors.append("Expedition not in database")
    else:
        errors.append("Expediton Year could not be parsed")

    titleMatch = titleRegex.search(contents)
    if titleMatch:
        title, = titleMatch.groups()
        if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
            errors.append("Title too long")
    else:
        errors.append("Title could not be found")

    caveMatch = caveRegex.search(contents)
    if caveMatch:
        caveRef, = caveMatch.groups()
        try:
            cave = models.getCaveByReference(caveRef)
        except AssertionError:
            cave = None
            errors.append("Cave not found in database")
    else:
        cave = None

    locationMatch = locationRegex.search(contents)
    if locationMatch:
        location, = locationMatch.groups()
    else:
        location = None

    if cave is None and location is None:
        errors.append("Location nor cave could not be found")

    reportMatch = reportRegex.search(contents)
    if reportMatch:
        report, = reportMatch.groups()
    else:
        errors.append("Contents could not be found")
    if errors:
        return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.

    people = []
    for personMatch in personRegex.findall(contents):
        # Search inside this person's own block. Searching the whole page
        # would return the first person's name and TU for everybody.
        nameAuthorMatch = nameAuthorRegex.search(personMatch)
        if nameAuthorMatch:
            author, name = nameAuthorMatch.groups()
            if name.lower() in personExpeditionNameLookup:
                personExpo = personExpeditionNameLookup[name.lower()]
            else:
                errors.append("Person could not be found in database")
            author = bool(author)
        else:
            errors.append("Persons name could not be found")

        TUMatch = TURegex.search(personMatch)
        if TUMatch:
            TU, = TUMatch.groups()
        else:
            errors.append("TU could not be found")
        if not errors:
            # Keep the resolved person with the row, so that each PersonTrip
            # below is saved against its own person rather than the last one.
            people.append((personExpo, author, TU))
    if errors:
        return errors # Bail out before commiting to the database

    logbookEntry = models.LogbookEntry(date = date,
                                       expedition  = expedition,
                                       title = title, cave = cave, place = location,
                                       text = report, slug = slugify(title)[:50],
                                       filename = filename)
    logbookEntry.save()
    for personExpo, author, TU in people:
        models.PersonTrip(personexpedition = personExpo,
                          time_underground = TU,
                          logbook_entry = logbookEntry,
                          is_logbook_entry_author = author).save()
    print(logbookEntry)