# troggle/parsers/people.py

import csv, re, datetime, os, shutil
from html.parser import HTMLParser
from unidecode import unidecode
from pathlib import Path

from django.conf import settings

from troggle.core.models.troggle import Expedition, Person, PersonExpedition
from troggle.core.models.troggle import DataIssue
from troggle.core.utils import save_carefully, TROG

'''These functions do not match how the stand-alone folk script works, so the script produces an HTML file which has
href links to pages in troggle which troggle does not think are right.
The standalone script needs to be rendered defunct, and all the parsing needs to be in troggle. Either that,
or they should use the same code by importing a module.
'''

def parse_blurb(personline, header, person):
    """Fill in person.mug_shot or person.blurb from the "Mugshot" column of one folk.csv row.

    The field names a file relative to EXPOWEB/folk and is handled by prefix:

    ''      nothing to set; the person is just saved.
    'i/...' an image, e.g. 'i/adama2018.jpg': person.mug_shot becomes its
            '/folk/...' path and person.blurb is cleared.
    'l/...' an HTML page, e.g. 'l/ollybetts.htm': the text between '<body>'
            and the last '<hr' becomes person.blurb (relative '../i/' image
            links rewritten to '/folk/i/', heading elements removed) and
            person.mug_shot is cleared.

    Anything else, a missing file, or a page without the '<body>...<hr'
    structure is printed and recorded as a DataIssue.

    personline -- list of field values for one row of folk.csv
    header     -- dict mapping column title to its index in personline
    person     -- the Person to update; it is saved on return, except when
                  the named file does not exist, in which case the function
                  returns early without saving.
    """
    ms_filename = personline[header["Mugshot"]]
    ms_path = Path(settings.EXPOWEB, "folk", ms_filename)

    if ms_filename and not ms_path.is_file():
        message = f"! INVALID mug_shot field '{ms_filename}' for {person.fullname}"
        print(message)
        DataIssue.objects.create(parser='people', message=message, url=f"/person/{person.fullname}")
        return

    if ms_filename.startswith('i/'):
        # if person just has an image, add it. It has format 'i/adama2018.jpg'
        person.mug_shot = str(Path("/folk", ms_filename))
        person.blurb = None

    elif ms_filename.startswith('l/'):
        # it has the format 'l/ollybetts.htm' the file may contain <img src="../i/mymug.jpg"> images
        with open(ms_path, 'r') as blurbfile:
            blrb = blurbfile.read()
        # Search once and reuse the match; '.*' is greedy so the fragment runs to the LAST '<hr'.
        pblurb = re.search(r'<body>(.*)<hr', blrb, re.DOTALL)
        if pblurb:
            person.mug_shot = None
            fragment = pblurb.group(1)
            # replace src="../i/ with src="/folk/i/ so the images resolve from any page
            fragment = fragment.replace('src="../i/', 'src="/folk/i/')
            fragment = fragment.replace("src='../i/", "src='/folk/i/")
            # Strip <h1>..<h6> elements. The match is non-greedy and tied to the
            # matching close tag, so text lying between two headings is kept
            # (a greedy '<h.*>' swallowed everything up to the last heading on the line).
            fragment = re.sub(r'<h(\d)[^>]*>.*?</h\1>', '', fragment, flags=re.DOTALL)
            person.blurb = fragment
        else:
            message = f"! Blurb parse error in {ms_filename}"
            print(message)
            DataIssue.objects.create(parser='people', message=message, url="/folk/")

    elif ms_filename == '':
        pass
    else:
        message = f"! Unrecognised type of file at mug_shot field '{ms_filename}' for {person.fullname}"
        print(message)
        DataIssue.objects.create(parser='people', message=message, url="/folk/")

    person.save()

def load_people_expos():
    '''Load expeditions, people and who-went-when from EXPOWEB/folk/folk.csv.

    This is where the folk.csv file is parsed to read people's names.
    Which it gets wrong for people like Lydia-Clare Leather and various 'von' and 'de' middle 'names'
    and McLean and Mclean and McAdam - interaction with the url parser in urls.py too

    Steps:
    1. delete all existing DataIssue rows for the 'people' parser;
    2. create an Expedition for every column from the sixth onwards, the
       column title being used as the year;
    3. for each row create the Person, call parse_blurb() for the mugshot
       or blurb, and create a PersonExpedition for every year column
       holding "1" or "-1".

    Blank rows are skipped. A row whose Name cannot be parsed is recorded
    as a DataIssue and skipped instead of aborting the whole import.
    '''
    DataIssue.objects.filter(parser='people').delete()

    # should really be EXPOFOLK I guess
    folkcsv = os.path.join(settings.EXPOWEB, "folk", "folk.csv")
    # Read everything up front inside 'with' so the file is always closed;
    # newline='' is what the csv module asks for on files handed to csv.reader.
    with open(folkcsv, newline='') as persontab:
        personreader = csv.reader(persontab)
        headers = next(personreader)
        personlines = list(personreader)
    header = {title: index for index, title in enumerate(headers)}

    # make expeditions
    print(" - Loading expeditions")
    years = headers[5:]

    for year in years:
        save_carefully(Expedition, {'year': year}, {'name': f"CUCC expo {year}"})

    # Fetch each expedition once, not once per person per year.
    expeditions = {year: Expedition.objects.get(year=year) for year in years}

    # make persons
    print(" - Loading personexpeditions")

    # leading run of name characters (HTML entities allowed), then an optional "(nickname)"
    namepattern = re.compile(r"^([\w&;\s]+)(?:\(([^)]*)\))?")

    for personline in personlines:
        if not personline:
            # csv.reader yields an empty list for a blank line
            continue

        name = re.sub(r"<.*?>", "", personline[header["Name"]])
        splitnick = namepattern.match(name)
        if splitnick is None:
            message = f"! Unparseable name '{name}' in folk.csv"
            print(message)
            DataIssue.objects.create(parser='people', message=message, url="/folk/")
            continue

        fullname = splitnick.group(1).strip()
        nickname = splitnick.group(2) or ""

        rawlastname = personline[header["Lastname"]].strip()
        matchlastname = namepattern.match(rawlastname)
        # an empty Lastname field does not match at all
        lastname = matchlastname.group(1).strip() if matchlastname else ""

        names = fullname.split(' ')
        firstname = names[0]
        if len(names) == 1:
            # a single-word name is stored as first name only
            lastname = ""

        vfho = personline[header["VfHO member"]] != ''

        lookupAttribs = {'first_name': firstname, 'last_name': lastname}
        nonLookupAttribs = {'is_vfho': vfho, 'fullname': fullname}
        person, _ = save_carefully(Person, lookupAttribs, nonLookupAttribs)

        parse_blurb(personline=personline, header=header, person=person)

        # make person expedition from table
        for year, attended in zip(years, personline[5:]):
            if attended in ("1", "-1"):
                lookupAttribs = {'person': person, 'expedition': expeditions[year]}
                nonLookupAttribs = {'nickname': nickname, 'is_guest': (personline[header["Guest"]] == "1")}
                save_carefully(PersonExpedition, lookupAttribs, nonLookupAttribs)
    print("", flush=True)


# used in other referencing parser functions
# expedition name lookup cached for speed (it's a very big list)
Gpersonexpeditionnamelookup = { }


def _possible_names(first, last, full, nickname):
    """Return the distinct names one person may be referred to by, in the order generated.

    All arguments are expected to be lower-cased already. Empty strings are
    never added, and no name is listed twice, so a person can never collide
    with themselves when the lookup is built.
    """
    possnames = []

    def add(candidate):
        if candidate and candidate not in possnames:
            possnames.append(candidate)

    if last:
        add(first + " " + last)
        add(first + " " + last[0])
        add(first + last[0])
        if first:
            add(first[0] + " " + last)
    add(first)
    add(full)
    if nickname:
        add(nickname)
        if last:
            # This allows for nickname to be used for short name eg Phil
            # adding Phil Sargent to the list
            add(nickname + " " + last)
            add(nickname + " " + last[0])
            add(nickname + last[0])
    return possnames


def GetPersonExpeditionNameLookup(expedition):
    """Return a dict mapping lower-case name variants to PersonExpedition for one expedition.

    For every PersonExpedition of `expedition` the variants are built from the
    person's first name, last name, full name and the nickname: "first last",
    "first l", "firstl", "f last", "first", the full name, and the same
    combinations using the nickname in place of the first name. Names from the
    Person record are lower-cased, HTML-unescaped and passed through unidecode;
    the nickname is only lower-cased.

    A variant that would match more than one person on the expedition is
    ambiguous and is left out of the result altogether.

    A non-empty result is cached in Gpersonexpeditionnamelookup under
    expedition.name and returned directly on later calls.
    """
    # HTMLParser.unescape() was removed in Python 3.9; html.unescape() is its replacement.
    from html import unescape

    res = Gpersonexpeditionnamelookup.get(expedition.name)
    if res:
        return res

    res = { }
    duplicates = set()

    def normalise(text):
        return unidecode(unescape(text.lower()))

    for personexpedition in PersonExpedition.objects.filter(expedition=expedition):
        person = personexpedition.person
        possnames = _possible_names(
            normalise(person.first_name),
            normalise(person.last_name),
            normalise(person.fullname),
            (personexpedition.nickname or "").lower(),
        )

        for possname in possnames:
            if possname in res:
                duplicates.add(possname)
            else:
                res[possname] = personexpedition

    # drop every name claimed by more than one person
    for possname in duplicates:
        del res[possname]

    Gpersonexpeditionnamelookup[expedition.name] = res
    return res