troggle/parsers/people.py

import csv
import os
import re
from html import unescape
from pathlib import Path

from django.conf import settings
from unidecode import unidecode

from troggle.core.models.troggle import DataIssue, Expedition, Person, PersonExpedition

"""These functions do not match how the stand-alone folk script works. So the script produces an HTML file which has
href links to pages in troggle which troggle does not think are right.
The standalone script needs to be rendered defunct, and all the parsing needs to be in troggle. Either that,
or they should use the same code by importing a module.
"""


def parse_blurb(personline, header, person):
    """Attach a mugshot image or an HTML blurb to ``person`` from the "Mugshot" column.

    The "Mugshot" field of a folk.csv row names a file below the ``folk``
    directory of ``settings.EXPOWEB``:

    * ``i/...`` - an image: ``person.mug_shot`` is pointed at it under ``/folk``
      and ``person.blurb`` is cleared.
    * ``l/...`` - an HTML page: everything between ``<body>`` and the last ``<hr``
      becomes ``person.blurb`` (image links re-rooted at ``/folk/i/``, headings
      removed) and ``person.mug_shot`` is cleared.
    * empty - nothing to attach.

    A missing file, an unparsable page or any other kind of value is printed and
    recorded as a "people" DataIssue.

    Args:
        personline: one row of folk.csv as a list of strings.
        header: dict mapping a column title to its index in ``personline``.
        person: the Person to update. It is saved at the end, except when the
            named file does not exist (the function then returns early).
    """
    ms_filename = personline[header["Mugshot"]]
    ms_path = Path(settings.EXPOWEB, "folk", ms_filename)

    if ms_filename and not ms_path.is_file():
        message = f"! INVALID mug_shot field '{ms_filename}' for {person.fullname}"
        print(message)
        DataIssue.objects.create(parser="people", message=message, url=f"/person/{person.fullname}")
        return

    if ms_filename.startswith("i/"):
        # if person just has an image, add it. It has format 'i/adama2018.jpg'
        person.mug_shot = str(Path("/folk", ms_filename))
        person.blurb = None

    elif ms_filename.startswith("l/"):
        # it has the format 'l/ollybetts.htm' the file may contain <img src="../i/mymug.jpg"> images
        # NOTE(review): the page is read with the platform default encoding - confirm that suits old pages.
        with open(ms_path, "r") as blurbfile:
            page = blurbfile.read()
        # One search is enough: the capture group is the blurb itself.
        body = re.search("<body>(.*)<hr", page, re.DOTALL)
        if body:
            person.mug_shot = None
            fragment = body.group(1)
            # replace src="../i/ with src="/folk/i/ (both quoting styles)
            fragment = fragment.replace('src="../i/', 'src="/folk/i/')
            fragment = fragment.replace("src='../i/", "src='/folk/i/")
            # drop headings; the person page supplies its own
            fragment = re.sub(r"<h.*>[^<]*</h.>", "", fragment)
            person.blurb = fragment
        else:
            message = f"! Blurb parse error in {ms_filename}"
            print(message)
            DataIssue.objects.create(parser="people", message=message, url="/folk/")

    elif ms_filename:
        message = f"! Unrecognised type of file at mug_shot field '{ms_filename}' for {person.fullname}"
        print(message)
        DataIssue.objects.create(parser="people", message=message, url="/folk/")

    person.save()


def load_people_expos():
    """Read folk.csv and create the Person and PersonExpedition records it describes.

    All existing "people" DataIssues are deleted first. If there are no
    Expedition records yet, one is created for every year column (the sixth
    column onwards). Each following row then yields one Person, their mugshot
    or blurb (see parse_blurb), and one PersonExpedition per year whose cell is
    "1" or "-1".

    A row whose Name cannot be parsed is reported as a DataIssue and skipped;
    an unparsable Lastname is reported and treated as empty.

    Name parsing is known to go wrong for people like Lydia-Clare Leather, for
    various 'von' and 'de' middle 'names', and for McLean / Mclean / McAdam -
    there is an interaction with the url parser in urls.py too.
    """
    DataIssue.objects.filter(parser="people").delete()

    # "Some Name (nick)": group 1 is the name, group 2 the optional bracketed nickname
    name_pattern = re.compile(r"^([\w&;\s]+)(?:\(([^)]*)\))?")

    folk_csv = os.path.join(settings.EXPOWEB, "folk", "folk.csv")  # should really be EXPOFOLK I guess
    # The with-block guarantees the file is closed; newline="" is what the csv module asks for.
    with open(folk_csv, newline="") as persontab:
        personreader = csv.reader(persontab)  # this is an iterator
        headers = next(personreader)
        header = {title: column for column, title in enumerate(headers)}

        years = headers[5:]
        if Expedition.objects.count() <= 0:
            print(" - Creating expeditions")
            for year in years:
                Expedition.objects.create(name=f"CUCC expo {year}", year=year)

        print(" - Loading personexpeditions")

        expeditions = {}  # year -> Expedition, fetched once on first use
        for personline in personreader:
            name = re.sub(r"<.*?>", "", personline[header["Name"]])

            rawlastname = personline[header["Lastname"]].strip()
            matchlastname = name_pattern.match(rawlastname)
            if matchlastname:
                lastname = matchlastname.group(1).strip()
            else:
                # an empty or odd Lastname used to abort the whole import with AttributeError
                lastname = ""
                if rawlastname:
                    message = f"! Unparsable Lastname '{rawlastname}' for '{name}' in folk.csv"
                    print(message)
                    DataIssue.objects.create(parser="people", message=message, url="/folk/")

            splitnick = name_pattern.match(name)
            if splitnick is None:
                message = f"! Unparsable Name '{name}' in folk.csv, line skipped"
                print(message)
                DataIssue.objects.create(parser="people", message=message, url="/folk/")
                continue

            fullname = splitnick.group(1).strip()
            nick = splitnick.group(2) or ""

            names = fullname.split(" ")
            firstname = names[0]
            if len(names) == 1:
                # a single-word name is all first name, whatever the Lastname column says
                lastname = ""

            vfho = personline[header["VfHO member"]] != ""

            person = Person.objects.create(
                is_vfho=vfho,
                fullname=fullname,
                nickname=nick,
                first_name=firstname,
                last_name=lastname,
            )

            parse_blurb(personline=personline, header=header, person=person)

            # make person expedition from table
            for year, attended in zip(years, personline[5:]):
                expedition = expeditions.get(year)
                if expedition is None:
                    expedition = expeditions[year] = Expedition.objects.get(year=year)
                if attended in ("1", "-1"):
                    PersonExpedition.objects.create(
                        is_guest=(personline[header["Guest"]] == "1"),
                        person=person,
                        expedition=expedition,
                    )
    print("", flush=True)


def who_is_this(year, possibleid):
    """Return the Person who went on the ``year`` expedition under the name ``possibleid``.

    Args:
        year: the expedition year, matched against ``Expedition.year``.
        possibleid: a name, nickname or abbreviation. It is lower-cased and
            looked up in the table built by GetPersonExpeditionNameLookup().

    Returns:
        The matching Person, or None when there is no expedition for that year
        or the name is not in the lookup table (unknown, or dropped from it as
        ambiguous).
    """
    # filter() alone yields a QuerySet, which has no .name for the lookup builder
    # to read; .first() gives one Expedition, or None if the year is unknown.
    expo = Expedition.objects.filter(year=year).first()
    if expo is None:
        return None

    # .get() rather than [...] so that an unknown name gives None, not KeyError
    personexpedition = GetPersonExpeditionNameLookup(expo).get(possibleid.lower())
    if personexpedition:
        return personexpedition.person
    return None


# Names accepted by known_foreigner(): people from ARGE or other known Austrians.
# Matching there is exact, so every spelling that should be recognised is listed.
# (No `global` statement is needed: a name assigned at module level is already global.)
foreign_friends = [
    "Aiko",
    "Arndt Karger",
    "Dominik Jauch",
    "Florian Gruner",
    "Fritz Mammel",
    "Gunter Graf",
    "Helmut Stopka-Ebeler",
    "K. Jäger",
    "Kai Schwekend",
    "Karl Gaisberger",
    "Marcus Scheuerman",
    "Mark Morgan",
    "P. Jeutter",
    "R. Seebacher",
    "Regina Kaiser",
    "Robert Seebacher",
    "S. Steinberger",
    "Sepp Steinberger",
    "Thilo Müller",
    "Uli Schütz",
    "Wieland Scheuerle",
]

def known_foreigner(id):
    """Is this someone from ARGE or a known Austrian?

    The name has to be exact: it is compared as-is with the entries of
    foreign_friends, with no soft matching.
    """
    return id in foreign_friends


# Refactor. The dict GetPersonExpeditionNameLookup(expo) indexes by name and has values of personexpedition
# This is convoluted, the whole personexpedition concept is unnecessary?

# Module-level cache used by GetPersonExpeditionNameLookup(): the key is expedition.name,
# the value is the {name variation: personexpedition} dict built for that expedition.
Gpersonexpeditionnamelookup = {}


# Alternative first names. When a person's (lower-cased) first name is a key, each
# name in the value is also combined with their last name by apply_variations().
# The table is not symmetric everywhere: "michael" offers "mike" and "becka" offers
# "rebecca", but not the other way round.
_FIRST_NAME_ALTERNATIVES = {
    "robert": ("bob",),
    "rob": ("robert",),
    "thomas": ("tom",),
    "tom": ("thomas",),
    "lizzy": ("lizzie",),
    "lizzie": ("lizzy",),
    # needed when Phil is used with a surname initial, so default short-form does not work.
    "phil": ("philip",),
    "philip": ("phil",),
    "andrew": ("andy",),
    "andy": ("andrew",),
    "michael": ("mike",),
    "david": ("dave",),
    "dave": ("david",),
    "peter": ("pete",),
    "pete": ("peter",),
    "olly": ("oliver",),
    "ollie": ("oliver",),
    "oliver": ("olly", "ollie"),
    "becka": ("rebecca",),
}

# Other (first, last) pairs a particular person is known by, keyed by their
# lower-cased "first last". Each pair is expanded by apply_variations().
_FULL_NAME_VARIATIONS = {
    "andy waddington": (("aer", "waddington"),),
    "phil underwood": (("phil", "underpants"),),
    "naomi griffiths": (("naomi", "makins"),),
    "tina white": (("tina", "richardson"),),
    "cat hulse": (("catherine", "hulse"), ("cat", "henry")),
    "jess stirrups": (("jessica", "stirrups"),),
    "nat dalton": (("nathanael", "dalton"),),  # correct. He has a weird spelling.
}

# Literal aliases for a particular person, keyed by their lower-cased "first last".
# Keys in the finished lookup are otherwise all lower case, so the two mixed-case
# aliases are listed in lower case as well (the originals are kept for compatibility).
_FULL_NAME_ALIASES = {
    "mike richardson": ("mta", "miketa", "mike the animal", "animal"),
    "eric landgraf": ("eric c.landgraf", "eric c. landgraf", "eric c landgraf"),
    "nadia raeburn": ("tinywoman", "nadia rc", "nadia raeburn-cherradi"),
    "phil wigglesworth": ("wiggy",),
    "philip banister": ("crofton",),
    "elaine oliver": ("cavingpig",),
    "tom crossley": ("tcacrossley",),
    "rob watson": ("nobrotson",),
    "todd rye": ("samouse1",),
    "jono lester": ("ILoveCaves", "ilovecaves"),
    "joel stobbart": ("El Stobbarto", "el stobbarto"),
}


def GetPersonExpeditionNameLookup(expedition):
    """Return a dict mapping each way a name might be written to a PersonExpedition.

    Yes this should all be in an editable text file, not in the body of the code. Sorry.

    For every PersonExpedition of ``expedition`` the candidate keys are the
    person's full name, nickname, the combinations of first name (or nickname,
    or a listed alternative first name) with last name made by
    apply_variations(), and any personal aliases from the tables above. Names
    are lower-cased, HTML-unescaped and transliterated with unidecode first. A
    key claimed by more than one person is removed. Finally the first 3 to 6
    letters of each first name are added as short forms, unless two
    PersonExpeditions (or a first name that is itself shorter) produce the
    same prefix.

    The result is cached in Gpersonexpeditionnamelookup under
    ``expedition.name``; a cached empty dict is not reused but rebuilt.

    Args:
        expedition: the Expedition whose members are to be indexed.

    Returns:
        dict of name string -> PersonExpedition.
    """

    def apply_variations(f, l):
        """Be generous in guessing possible matches. Any duplicates will be ruled as invalid."""
        f = f.lower()
        l = l.lower()
        return [
            f,
            l,
            f + l,
            f + " " + l,
            f + " " + l[0],
            f + l[0],
            f + " " + l[0] + ".",
            f[0] + " " + l,
            f[0] + ". " + l,
            f[0] + l,
            f[0] + l[0],  # initials e.g. gb or bl
        ]

    res = Gpersonexpeditionnamelookup.get(expedition.name)
    if res:
        return res

    res = {}
    duplicates = set()  # keys claimed by more than one personexpedition
    short = {}  # first-name prefix -> personexpedition
    ambiguous_short = set()  # prefixes produced more than once

    for personexpedition in PersonExpedition.objects.filter(expedition=expedition):
        person = personexpedition.person
        f = unidecode(unescape(person.first_name.lower()))
        l = unidecode(unescape(person.last_name.lower()))
        full = unidecode(unescape(person.fullname.lower()))
        n = unidecode(unescape(person.nickname.lower()))

        # A set, so repeats from overlapping rules vanish; an empty nickname
        # (or name) is not a usable key and is left out.
        possnames = {candidate for candidate in (full, n) if candidate}

        if l:
            possnames.update(apply_variations(f, l))
            if n:
                possnames.update(apply_variations(n, l))
            for alternative in _FIRST_NAME_ALTERNATIVES.get(f, ()):
                possnames.update(apply_variations(alternative, l))

            first_last = f"{f} {l}"
            for other_first, other_last in _FULL_NAME_VARIATIONS.get(first_last, ()):
                possnames.update(apply_variations(other_first, other_last))
            possnames.update(_FULL_NAME_ALIASES.get(first_last, ()))

        # short form, e.g. Dan for Daniel. Slicing past the end just returns the
        # whole name, so a first name under 6 letters repeats itself here and is
        # dropped from the short forms.
        for width in (3, 4, 5, 6):
            prefix = f[:width]
            if prefix in short:
                ambiguous_short.add(prefix)
            else:
                short[prefix] = personexpedition

        for possname in possnames:
            if possname in res:
                duplicates.add(possname)
            else:
                res[possname] = personexpedition

    for possname in duplicates:
        del res[possname]

    for prefix in ambiguous_short:
        short.pop(prefix, None)
    # NOTE(review): short forms are merged last and overwrite any existing key,
    # including ones just removed as duplicates - confirm that this is intended.
    res.update(short)

    Gpersonexpeditionnamelookup[expedition.name] = res
    return res