mirror of
https://expo.survex.com/repositories/troggle/.git
synced 2025-12-15 13:57:12 +00:00
533 lines
20 KiB
Python
533 lines
20 KiB
Python
import csv
|
|
import os
|
|
import re
|
|
import time
|
|
from html import unescape
|
|
from pathlib import Path
|
|
|
|
from django.conf import settings
|
|
from django.contrib.auth.models import User
|
|
from django.db import models
|
|
from unidecode import unidecode
|
|
|
|
from troggle.core.models.troggle import DataIssue, Expedition, Person, PersonExpedition
|
|
from troggle.core.views.signup import read_signups
|
|
|
|
"""These functions do not match how the stand-alone folk script works, so that script produces an HTML file which has
href links to pages in troggle which troggle does not think are right.
The standalone script needs to be rendered defunct, and all the parsing needs to be done in troggle. Either that,
or they should use the same code by importing a shared module.
"""
|
|
|
|
todo = """
|
|
- [copy these from paper notes]
|
|
"""
|
|
|
|
def parse_blurb(personline, header):
    """Read the mugshot and blurbfile fields for one folk.csv row and check that
    they correspond to real files under EXPOWEB/folk/.

    Fixes the format of <img> tags inside a blurbfile so that they resolve
    from troggle pages, and returns a dict of Person constructor fields
    ('mugshot', 'blurbfile', 'blurb' - all optional).

    Always returns a dict (possibly empty), never None, so the caller can
    safely splat the result with **.
    """
    person = {}

    for folk_field in ["Mugshot", "Blurbfile"]:
        ms_filename = personline[header[folk_field]]
        ms_path = settings.EXPOWEB / "folk" / ms_filename

        if ms_filename:
            if not ms_path.is_file():
                message = f"! INVALID {folk_field} field '{ms_filename}' for {personline[header[folk_field]]}"
                print(message)
                DataIssue.objects.create(parser="people", message=message, url=f"/person/{personline[header['Name']]}")
                # BUG FIX: this was a bare 'return' (i.e. None), which made the
                # caller's Person(**attribs, **blurb_fields) raise TypeError.
                # Skip just this bad field and carry on with the rest of the row.
                continue

            person[folk_field.lower()] = str(Path("/folk", ms_filename))

        if (folk_field == "Blurbfile") and ms_filename:
            # it has the format 'l/ollybetts.htm'; the file may contain <img src="../i/mymug.jpg"> images
            with open(ms_path, "r") as blurbfile:
                blrb = blurbfile.read()
            # Capture everything between <body> and the first <hr in one pass
            # (previously the same pattern was searched twice).
            pblurb = re.search(r"<body>(.*)<hr", blrb, re.DOTALL)
            if pblurb:
                fragment = pblurb.group(1)
                # rewrite relative image paths so they work from /person/ pages
                fragment = fragment.replace('src="../i/', 'src="/folk/i/')
                fragment = fragment.replace("src='../i/", "src='/folk/i/")
                # drop any heading elements - the page template supplies its own
                fragment = re.sub(r"<h.*>[^<]*</h.>", "", fragment)
                person["blurb"] = fragment
            else:
                message = f"! Blurb parse error in {ms_filename}"
                print(message)
                DataIssue.objects.create(parser="people", message=message, url=f"/person/{personline[header['Name']]}")

    return person
|
|
|
|
|
|
# Remembers every slug handed out so far, so that duplicate names get
# numbered suffixes (dave-johnson, dave-johnson_2, ...).
slug_cache = {}

def troggle_slugify(longname):
    """Build a URL-safe slug from a person's display name, with uniqueness
    enforcement. Yes we have had two "Dave Johnson"s.
    This function copied intact to expoweb/scripts/make-folklist.py
    """
    slug = longname.strip().lower().replace(" ", "-")
    slug = re.sub(r'\([^\)]*\)', '', slug)  # drop any bracketed nickname

    # Fold the handful of accented letters we actually see, and strip
    # punctuation that would be illegal or ugly in a slug.
    for unwanted, substitute in (
        ("é", "e"),
        ("á", "a"),
        ("ä", "a"),
        ("&", ""),   # otherwise just remove the &
        (";", ""),   # otherwise just remove the ;
        ("'", ""),   # otherwise just remove the ', no O'Reilly problem
    ):
        slug = slug.replace(unwanted, substitute)

    slug = re.sub(r'<[^>]*>', '', slug)  # remove <span lang="hu"> and any HTML tags
    slug = slug.strip("-")  # remove spare hyphens at either end

    if len(slug) > 40:  # slugfield is 50 chars; leave room for a _N suffix
        slug = slug[:40]

    if slug in slug_cache:
        # Seen before: bump the counter and append it to make this one unique.
        slug_cache[slug] += 1
        slug = f"{slug}_{slug_cache[slug]}"
    slug_cache[slug] = 1

    return slug
|
|
|
|
def load_people_expos():
    """This is where the folk.csv file is parsed to read people's names.

    This is ALSO where all the Expedition objects get created. So this is the point at which troggle
    gets told what expeditions exist.

    Given that we need to do stuff for the coming expo, well before we update the folk list,
    the Expedition object for the coming expo is created elsewhere - in addition to
    those created here, if it does not exist.

    Refactored to separate out the creation of objects in the database to use bulk_create to
    speed things up. Made little difference sadly.
    """
    # import cProfile
    # import pstats
    # from pstats import SortKey

    # pr = cProfile.Profile()
    # pr.enable()
    start = time.time()
    # Full reset: wipe everything this parser owns before re-importing.
    DataIssue.objects.filter(parser="people").delete()
    Person.objects.all().delete()
    PersonExpedition.objects.all().delete()

    # folk.csv layout: the first 6 columns are per-person fields
    # (Name, Lastname, Guest, VfHO member, Mugshot, Blurbfile); every column
    # from index 6 onward is an expedition year.
    years_begin = 6

    splitnick_pattern = re.compile(r"^([\w&;\s\-]+)(?:\(([^)]*)\))?")
    displayname_pattern = re.compile(r"^([^(]*)(\(([^)]*)\))?") # removes nickname in brackets
    rawlastname_pattern = re.compile(r"^([\w&;\s]+)(?:\(([^)]*)\))?")

    with open (settings.EXPOWEB / "folk" / "folk.csv", "r") as folkfile:
        folklines = folkfile.readlines() # list of the lines, read all into memory
    personreader = csv.reader(folklines) # this is an iterator
    headers = next(personreader)
    # header: column name -> column index
    header = dict(list(zip(headers, list(range(len(headers))))))

    years = headers[years_begin:]
    Expedition.objects.all().delete()
    expos = {}
    nexpos = Expedition.objects.count()
    # NOTE(review): because of the unconditional delete() just above, nexpos is
    # always 0 here, so the else branch below appears to be dead code - confirm.
    if nexpos <= 0:
        print(" - Creating expeditions")
        for year in years:
            iy = int(year)  # NOTE(review): unused; presumably a validity check that the column is numeric
            year = year.strip()
            coUniqueAttribs = {"year": year}
            otherAttribs = {"name": f"CUCC expo {year}"}
            e = Expedition.objects.create(**otherAttribs, **coUniqueAttribs)
            expos[year] = e
    else: # re-running a folk import without a complete reset
        print(" - Cacheing expeditions")
        for year in years:
            iy = int(year)  # NOTE(review): unused here too
            year = year.strip()
            e = Expedition.objects.get(year=year)
            expos[year] = e
    print("", flush=True)

    print(" - Loading persons and personexpeditions")
    print(" - Reading folk file")

    pe_list = []     # PersonExpedition objects for bulk_create
    prep_list = []   # (slug, expedition, noncaver) tuples collected during parse

    p_list =[]       # Person objects for bulk_create
    for personline in personreader:
        # This is all horrible: refactor it.
        # CSV: Name,Lastname,Guest,VfHO member,Mugshot,..
        # e.g: Olly Betts (Ol),Betts,,,l/ollybetts.htm,
        name = personline[header["Name"]]
        plainname = re.sub(r"<.*?>", "", name) # now in slugify

        # display name = name with any "(Nickname)" part removed
        displayname = displayname_pattern.match(name).group(1).strip()
        input_name = displayname
        slug = troggle_slugify(displayname)

        firstname = ""
        nick = ""

        rawlastname = personline[header["Lastname"]].strip()
        input_surname = rawlastname

        if rawlastname == "":
            print(f"MISSING SURNAME FIELD for {name} - check against similar names in the list to see what you have done.")
        if matchlastname := rawlastname_pattern.match(rawlastname):
            #re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?",
            lastname = matchlastname.group(1).strip()
        else:
            print(f"MATCH FAIL {personline=}\n {slug=}\n {name=}\n {rawlastname=}")
            # NOTE(review): exit(1) aborts the whole import process on one bad
            # row - harsh for library code; consider DataIssue + skip instead.
            exit(1)

        splitnick = splitnick_pattern.match(plainname)
        fullname = splitnick.group(1) # removes Nickname in brackets, but also cuts hyphenated names
        nick = splitnick.group(2) or ""
        if nick:
            nick = nick.strip()

        fullname = fullname.strip()

        names = fullname.split(" ") # This may have more than one, e.g. "Adeleide de Diesback"
        firstname = names[0]
        if len(names) == 1:
            lastname = "" # wookey special code

        #restore fullname to be the whole string
        fullname = displayname

        # any non-empty value in the "VfHO member" column counts as membership
        if personline[header["VfHO member"]] == "":
            vfho = False
        else:
            vfho = True

        blurb_fields = parse_blurb(personline=personline, header=header)

        attribs = {"slug": slug, "first_name": firstname, "last_name": (lastname or ""),
            "is_vfho": vfho, "fullname": fullname, "nickname": nick, "input_name": input_name, "input_surname": input_surname,
            "is_guest": (personline[header["Guest"]] == "1")}
        p_list.append(Person(**attribs, **blurb_fields))

        # make person expedition
        # Year columns: "1" = attended, "-1" = attended as a non-caver.
        for year, attended in list(zip(headers, personline))[years_begin:]:
            expedition = expos[str(year)]

            if attended in ("1", "-1"):
                prep_list.append((slug, expedition, (attended == "-1")))
                # pe_list.append(PersonExpedition(person=person, expedition=expedition, noncaver=(attended == "-1")))
    duration = time.time() - start
    print(f" - duration: {duration:5.1f} s")
    print(" - Loading persons into db")
    persons = Person.objects.bulk_create(p_list)

    print(" - Loading personexpeditions into db", flush=True)
    # NOTE(review): this is an N+1 query pattern - one Person.objects.get()
    # per attendance record; a {slug: person} dict built from p_list would
    # avoid the repeated lookups.
    for pe in prep_list:
        slug, expedition, noncaver = pe
        p = Person.objects.get(slug=slug)
        pe_list.append(PersonExpedition(person=p, expedition=expedition, noncaver=noncaver))
    PersonExpedition.objects.bulk_create(pe_list)

    ensure_users_are_persons()
    # assumes default Expedition ordering puts the most recent year first - TODO confirm
    most_recent = Expedition.objects.all().first()
    print(most_recent)
    check_new_signups(most_recent)
    # pr.disable()
    # with open("folk_reader.prof", "w") as f:
    #     ps = pstats.Stats(pr, stream=f)
    #     ps.sort_stats(SortKey.CUMULATIVE)
    #     ps.print_stats()
|
|
|
|
|
|
def check_new_signups(expedition):
    """Ensure everyone who has signed up for the given (most recent) expedition
    has a PersonExpedition record for it.

    Signups come from the signup system (read_signups()) keyed by person slug.
    """
    signups_clear = read_signups()
    # print(signups_clear)
    for slug in signups_clear:
        print(f" - Checking signups {slug}")
        try:
            p = Person.objects.get(slug=slug)
        except Person.DoesNotExist:
            # ROBUSTNESS FIX: a signup for someone not (yet) in folk.csv used
            # to crash the whole import; record it and carry on instead.
            message = f"! Signup for unknown person '{slug}' - not in folk.csv?"
            print(message)
            DataIssue.objects.create(parser="people", message=message, url=f"/person/{slug}")
            continue
        # update_or_create returns an (object, created) tuple; the single-name
        # assignment in the old code silently held the tuple, not the object.
        pe, created = PersonExpedition.objects.update_or_create(person=p, expedition=expedition)
        # print("ADDING ", pe, expedition)
|
|
|
|
|
|
def ensure_users_are_persons():
    """Link each Django User account to the Person with the same slug, if any.

    Users without a matching Person (e.g. the 'expo' and 'expoadmin' accounts)
    are simply left unlinked. Just ensures this is up to date.
    """
    # (was an f-string with no placeholders)
    print(" - Ensure_users_are_persons() - except for expo and expoadmin of course")
    for u in User.objects.all():
        # .first() returns None when there is no match - one query, and no
        # need to materialise the whole filtered list as before.
        p = Person.objects.filter(slug=u.username).first()
        if p:
            p.user = u
            p.save()
            # print(f"   {p.user} {u=}")
|
|
|
|
def who_is_this(year, possibleid):
    """Return the Person known by 'possibleid' on the given year's expedition,
    or None if the name is not recognised.
    """
    # BUG FIX: this was Expedition.objects.filter(year=year), which returns a
    # QuerySet - but GetPersonExpeditionNameLookup() needs a single Expedition
    # object (it reads expedition.name).
    expo = Expedition.objects.get(year=year)
    # BUG FIX: use dict.get() - plain indexing raised KeyError for an unknown
    # name, which made the 'return None' branch below unreachable.
    personexpedition = GetPersonExpeditionNameLookup(expo).get(possibleid.lower())
    if personexpedition:
        return personexpedition.person
    else:
        return None
|
|
|
|
def when_on_expo(name):
    """Returns a list of PersonExpedition objects for the string, if recognised as a name
    """
    matches = []
    for expedition in Expedition.objects.all():
        # alias -> PersonExpedition lookup table for this expedition
        lookup = GetPersonExpeditionNameLookup(expedition)
        if name in lookup:
            matches.append(lookup[name])
            print(f"{name} => {lookup[name]}")

    return matches
|
|
|
|
|
|
# ARGE members and other known Austrian/German cavers who appear in survex
# files and logbooks but are not CUCC expo members. Matching is exact (see
# known_foreigner()). Some entries are deliberate spelling variants of the
# same person, e.g. "Marcus Scheuermann" / "Marcus Scheuerman".
# (A module-level 'global foreign_friends' statement was removed here:
# 'global' is a no-op at module scope.)
foreign_friends = [
    "Aiko",
    "Arndt Karger",
    "Dominik Jauch",
    "Florian Gruner",
    "Fritz Mammel",
    "Gunter Graf",
    "Helmut Stopka-Ebeler",
    "K. Jäger",
    "Kai Schwekend",
    "Karl Gaisberger",
    "Marcus Scheuermann",
    "Marcus Scheuerman",
    "Mark Morgan",
    "P. Jeutter",
    "R. Seebacher",
    "Regina Kaiser",
    "Robert Seebacher",
    "S. Steinberger",
    "Sepp Steinberger",
    "Thilo Müller",
    "Uli Schütz",
    "Wieland Scheuerle",
]
|
|
|
|
def known_foreigner(id):
    """Is this someone from ARGE or a known Austrian/German caver?
    The name has to match exactly - no soft matching.
    """
    # NOTE: 'id' shadows the builtin, but the parameter name is kept for
    # backward compatibility with any keyword callers.
    # Read-only access needs no 'global' declaration; and a membership test
    # replaces the if/return True/else/return False boilerplate.
    return id in foreign_friends
|
|
|
|
|
|
# Refactor. The dict GetPersonExpeditionNameLookup(expo) indexes by name and has values of personexpedition
# This is convoluted, the personexpedition concept is unnecessary, should it just return person??
# Or better, query with a string and return a list of personexpeditions

# Module-level cache: expedition.name -> {alias: PersonExpedition}
Gpersonexpeditionnamelookup = {}


def GetPersonExpeditionNameLookup(expedition):
    """Yes this should all be in an editable text file, not in the body of the code. Sorry.

    This uses the existing database records of everyone on an expedition to construct a dictionary
    indexed by every possible pseudonym or alias that the person might be known by.

    This dictionary is used when parsing logbooks and survex files to identify who is being
    referred to, when the name written in the logbook is e.g. "Mike TA" == "Mike The Animal"
    == "Mike Rickardson".
    """
    global Gpersonexpeditionnamelookup

    def apply_initials(variations, a, l):
        # Append the common "first + last" abbreviation patterns for the
        # pair (a, l), e.g. "fredsmith", "fred smith", "fred s", "f smith", "fs".
        variations.append(a + l)
        variations.append(a + " " + l)
        variations.append(a + " " + l[0])
        variations.append(a + l[0])
        variations.append(a + " " + l[0] + ".")
        variations.append(a[0] + " " + l)
        variations.append(a[0] + ". " + l)
        variations.append(a[0] + l)
        variations.append(a[0] + l[0]) # initials e.g. gb or bl
        return variations

    def apply_variations(f, l, n=""):
        """Be generous in guessing possible matches. Any duplicates will be ruled as invalid."""
        f = f.lower()
        l = l.lower()
        variations = []
        variations.append(f)
        variations.append(l)
        variations = apply_initials(variations, f, l)
        if n:
            # also generate combinations using the nickname
            variations.append(n)
            variations = apply_initials(variations, n, f)
            variations = apply_initials(variations, n, l)
        return variations

    # Return the memoised table if we have already built it for this expedition.
    res = Gpersonexpeditionnamelookup.get(expedition.name)

    if res:
        return res

    res = {}
    duplicates = set()

    # print("Calculating GetPersonExpeditionNameLookup for " + expedition.year)
    personexpeditions = PersonExpedition.objects.filter(expedition=expedition)
    short = {}     # short-form prefixes (e.g. "dan" for "daniel") -> PersonExpedition
    dellist = []   # short forms claimed by more than one person; to be discarded
    for personexpedition in personexpeditions:
        possnames = []
        # normalise to lowercase ASCII with HTML entities resolved
        f = unidecode(unescape(personexpedition.person.first_name.lower().strip()))
        l = unidecode(unescape(personexpedition.person.last_name.lower().strip()))
        full = unidecode(unescape(personexpedition.person.fullname.lower().strip()))
        n = unidecode(unescape(personexpedition.person.nickname.lower().strip()))
        if full not in possnames:
            possnames.append(full)
        if n not in possnames:
            possnames.append(n)

        if l:
            possnames += apply_variations(f, l, n)

        # Hand-maintained first-name aliases, both directions of each pair:
        if f == "Adeleide".lower():
            possnames += apply_variations("Adelaide", l)
        if f == "Adelaide".lower():
            possnames += apply_variations("Adeleide", l)

        if f == "Robert".lower():
            possnames += apply_variations("Bob", l)
        if f == "Rob".lower():
            possnames += apply_variations("Robert", l)

        if f == "Thomas".lower():
            possnames += apply_variations("Tom", l)
        if f == "Tom".lower():
            possnames += apply_variations("Thomas", l)

        if f == "Lizzy".lower():
            possnames += apply_variations("Lizzie", l)
        if f == "Lizzie".lower():
            possnames += apply_variations("Lizzy", l)

        if f == "Phil".lower(): # needed when Phil is used with a surname initial, so default short-form does not work.
            possnames += apply_variations("Philip", l)
        if f == "Philip".lower():
            possnames += apply_variations("Phil", l)

        if f == "Andrew".lower():
            possnames += apply_variations("Andy", l)
        if f == "Andy".lower():
            possnames += apply_variations("Andrew", l)

        if f == "Michael".lower():
            possnames += apply_variations("Mike", l)
        if f == "Mike".lower():
            possnames += apply_variations("Michael", l)

        if f == "David".lower():
            possnames += apply_variations("Dave", l)
        if f == "Dave".lower():
            possnames += apply_variations("David", l)

        if f == "Peter".lower():
            possnames += apply_variations("Pete", l)
        if f == "Pete".lower():
            possnames += apply_variations("Peter", l)

        if f == "Tobias".lower():
            possnames += apply_variations("Toby", l)
        if f == "Toby".lower():
            possnames += apply_variations("Tobias", l)

        if f == "Olly".lower():
            possnames += apply_variations("Oliver", l)
        if f == "Oliver".lower():
            possnames += apply_variations("Olly", l)

        if f == "Ollie".lower():
            possnames += apply_variations("Oliver", l)
        if f == "Oliver".lower():
            possnames += apply_variations("Ollie", l)

        if f == "Becka".lower():
            possnames += apply_variations("Rebecca", l)

        # Per-person special cases: alternative surnames, married names, handles.
        if f"{f} {l}" == "Andy Waddington".lower():
            possnames += apply_variations("aer", "waddington")
        if f"{f} {l}" == "Phil Underwood".lower():
            possnames += apply_variations("phil", "underpants")
        if f"{f} {l}" == "Naomi Griffiths".lower():
            possnames += apply_variations("naomi", "makins")
        if f"{f} {l}" == "Tina White".lower():
            possnames += apply_variations("tina", "richardson")
        if f"{f} {l}" == "Cat Hulse".lower():
            possnames += apply_variations("catherine", "hulse")
            possnames += apply_variations("cat", "henry")
        if f"{f} {l}" == "Jess Stirrups".lower():
            possnames += apply_variations("jessica", "stirrups")
        if f"{f} {l}" == "Nat Dalton".lower():
            possnames += apply_variations("nathanael", "dalton") # correct. He has a weird spelling.
        if f"{f} {l}" == "Mike Richardson".lower():
            possnames.append("mta")
            possnames.append("miketa")
            possnames.append("mike the animal")
            possnames.append("animal")
        if f"{f} {l}" == "Eric Landgraf".lower():
            possnames.append("eric c.landgraf")
            possnames.append("eric c. landgraf")
            possnames.append("eric c landgraf")
        if f"{f} {l}" == "Nadia Raeburn".lower():
            possnames.append("tinywoman")
            possnames.append("nadia rc")
            possnames.append("nadia raeburn-cherradi")

        if f"{f} {l}" == "Phil Wigglesworth".lower():
            possnames.append("wiggy")
        if f"{f} {l}" == "Philip Banister".lower():
            possnames.append("crofton")
        if f"{f} {l}" == "Elaine Oliver".lower():
            possnames.append("cavingpig")
        if f"{f} {l}" == "Tom Crossley".lower():
            possnames.append("tcacrossley")
        if f"{f} {l}" == "Rob Watson".lower():
            possnames.append("nobrotson")
        if f"{f} {l}" == "Todd Rye".lower():
            possnames.append("samouse1")
        if f"{f} {l}" == "Jono Lester".lower():
            possnames.append("ILoveCaves")
        if f"{f} {l}" == "Joel Stobbart".lower():
            possnames.append("El Stobbarto")
        # NOTE(review): duplicate of the "Rob Watson" block above - harmless
        # (possnames is de-duplicated below) but could be removed.
        if f"{f} {l}" == "Rob Watson".lower():
            possnames.append("nobrotson")
        if f"{f} {l}" == "hannah urquhart".lower():
            possnames.append("hannah ug")

        # Collect 3-6 character prefixes of the first name as candidate
        # short forms; any prefix claimed by two people goes on dellist.
        for i in [3, 4, 5, 6]:
            lim = min(i, len(f) + 1) # short form, e.g. Dan for Daniel.
            if f[:lim] not in short:
                short[f[:lim]] = personexpedition
            else:
                dellist.append(f[:lim])

        possnames = set(possnames) # remove duplicates
        for possname in possnames:
            if possname in res:
                # alias claimed by two different people on this expo: ambiguous
                duplicates.add(possname)
            else:
                res[possname] = personexpedition

    # Ambiguous aliases are removed entirely rather than guessed at.
    for possname in duplicates:
        del res[possname]

    # Remove contested short forms, then merge the unambiguous ones in.
    for possname in dellist:
        if possname in short: # always true ?
            del short[possname]
    for shortname in short:
        res[shortname] = short[shortname]

    Gpersonexpeditionnamelookup[expedition.name] = res
    return res
|