troggle-unchained/parsers/people.py

304 lines
12 KiB
Python
Raw Normal View History

2023-01-19 18:33:04 +00:00
import csv
import datetime
import os
import re
import shutil
from html import unescape
from pathlib import Path
2021-04-13 00:11:08 +01:00
from django.conf import settings
2023-01-19 18:33:04 +00:00
from unidecode import unidecode
2021-04-13 00:11:08 +01:00
2023-01-19 18:33:04 +00:00
from troggle.core.models.troggle import (DataIssue, Expedition, Person,
PersonExpedition)
from troggle.core.utils import TROG, save_carefully
2021-04-13 00:11:08 +01:00
'''These functions do not match how the stand-alone folk script works. So the script produces an HTML file which has
2021-02-06 00:18:48 +00:00
href links to pages in troggle which troggle does not think are right.
The standalone script needs to be rendered defunct, and all the parsing needs to be in troggle. Either that,
or they should use the same code by importing a module.
'''
def parse_blurb(personline, header, person):
    """Set the mugshot image or HTML blurb on `person` from the "Mugshot" column, then save.

    personline -- one row of folk.csv, as a list of strings
    header     -- dict mapping column title to its index in `personline`
    person     -- the Person to update; `mug_shot` and `blurb` are assigned and `save()` is called

    The "Mugshot" field is a path relative to EXPOWEB/folk:
      'i/...' is an image file: it becomes `mug_shot`, and `blurb` is cleared.
      'l/...' is an HTML page: the text between <body> and the last '<hr' becomes `blurb`
              (image links rewritten to /folk/i/, headings removed), and `mug_shot` is cleared.
    An empty field leaves both attributes alone. A missing file is recorded as a DataIssue and
    the function returns without saving; any other problem is recorded as a DataIssue and the
    person is still saved.
    """
    ms_filename = personline[header["Mugshot"]]

    if ms_filename:
        ms_path = Path(settings.EXPOWEB, "folk", ms_filename)
        if not ms_path.is_file():
            message = f"! INVALID mug_shot field '{ms_filename}' for {person.fullname}"
            print(message)
            DataIssue.objects.create(parser='people', message=message, url=f"/person/{person.fullname}")
            return

        if ms_filename.startswith('i/'):
            # if person just has an image, add it. It has format 'i/adama2018.jpg'
            person.mug_shot = str(Path("/folk", ms_filename))
            person.blurb = None
        elif ms_filename.startswith('l/'):
            # it has the format 'l/ollybetts.htm' the file may contain <img src="../i/mymug.jpg"> images
            with open(ms_path, 'r') as blurbfile:
                blrb = blurbfile.read()
            # Search once and reuse the match.
            pblurb = re.search(r'<body>(.*)<hr', blrb, re.DOTALL)
            if pblurb:
                person.mug_shot = None
                fragment = pblurb.group(1)
                # replace src="../i/ with src="/folk/i
                fragment = fragment.replace('src="../i/', 'src="/folk/i/')
                fragment = fragment.replace("src='../i/", "src='/folk/i/")
                # Strip headings. The opening tag is matched with [^>]* rather than a greedy .*
                # so that two headings on one line do not swallow the markup between them.
                fragment = re.sub(r'<h[^>]*>[^<]*</h.>', '', fragment)
                person.blurb = fragment
            else:
                message = f"! Blurb parse error in {ms_filename}"
                print(message)
                DataIssue.objects.create(parser='people', message=message, url="/folk/")
        else:
            message = f"! Unrecognised type of file at mug_shot field '{ms_filename}' for {person.fullname}"
            print(message)
            DataIssue.objects.create(parser='people', message=message, url="/folk/")

    person.save()
def load_people_expos():
    '''Parse folk.csv and create the Expedition, Person and PersonExpedition records.

    This is where the folk.csv file is parsed to read people's names.
    Which it gets wrong for people like Lydia-Clare Leather and various 'von' and 'de' middle 'names'
    and McLean and Mclean and McAdam - interaction with the url parser in urls.py too

    The first row of the file is the header. Columns from index 5 onwards are expedition years;
    a cell value of "1" or "-1" in such a column means the person attended that expedition.
    All existing DataIssues for the 'people' parser are deleted before loading starts.
    '''
    DataIssue.objects.filter(parser='people').delete()

    first_year_col = 5  # columns before this hold per-person fields, the rest are years
    # "Some Name (nick)" -> group 1 is the name part, group 2 the optional bracketed nickname
    name_re = re.compile(r"^([\w&;\s]+)(?:\(([^)]*)\))?")

    folkpath = os.path.join(settings.EXPOWEB, "folk", "folk.csv")  # should really be EXPOFOLK I guess
    # newline="" is what the csv module asks for; the with-block closes the file again.
    with open(folkpath, newline="") as persontab:
        personreader = csv.reader(persontab)  # this is an iterator
        headers = next(personreader)
        personlines = list(personreader)
    header = {title: column for column, title in enumerate(headers)}

    # make expeditions
    print(" - Loading expeditions")
    expeditions = {}
    for year in headers[first_year_col:]:
        lookupAttribs = {'year': year}
        nonLookupAttribs = {'name': f"CUCC expo {year}"}
        # Keep the instance so the per-person loop does not need a query per cell.
        expeditions[year], _ = save_carefully(Expedition, lookupAttribs, nonLookupAttribs)

    # make persons
    print(" - Loading personexpeditions")
    for personline in personlines:
        if not personline:
            continue  # csv.reader yields [] for a completely blank line

        name = re.sub(r"<.*?>", "", personline[header["Name"]])

        rawlastname = personline[header["Lastname"]].strip()
        matchlastname = name_re.match(rawlastname)
        if matchlastname:
            lastname = matchlastname.group(1).strip()
        else:
            # Blank (or unparseable) Lastname cell: carry on with no last name instead of crashing.
            lastname = ""
            if rawlastname:
                message = f"! Unparseable Lastname '{rawlastname}' for '{name}' in folk.csv"
                print(message)
                DataIssue.objects.create(parser='people', message=message, url="/folk/")

        splitnick = name_re.match(name)
        if splitnick is None:
            message = f"! Unparseable Name '{name}' in folk.csv, line skipped"
            print(message)
            DataIssue.objects.create(parser='people', message=message, url="/folk/")
            continue
        fullname = splitnick.group(1).strip()
        nickname = splitnick.group(2) or ""

        names = fullname.split(' ')
        firstname = names[0]
        if len(names) == 1:
            lastname = ""

        vfho = personline[header["VfHO member"]] != ''

        lookupAttribs = {'first_name': firstname, 'last_name': lastname}
        nonLookupAttribs = {'is_vfho': vfho, 'fullname': fullname, 'nickname': nickname}
        person, _ = save_carefully(Person, lookupAttribs, nonLookupAttribs)

        parse_blurb(personline=personline, header=header, person=person)

        # make person expedition from table
        for year, attended in zip(headers[first_year_col:], personline[first_year_col:]):
            if attended in ("1", "-1"):
                lookupAttribs = {'person': person, 'expedition': expeditions[year]}
                nonLookupAttribs = {'nickname': nickname, 'is_guest': (personline[header["Guest"]] == "1")}
                save_carefully(PersonExpedition, lookupAttribs, nonLookupAttribs)
    print("", flush=True)
2022-10-08 22:17:53 +01:00
def who_is_this(year, possibleid):
    '''Return the Person that `possibleid` identifies on the expedition of `year`, or None.

    year       -- value matched against Expedition.year
    possibleid -- a name, nickname or initials; it is lower-cased before the lookup

    None is returned when there is no expedition for that year or when the name is not a key
    of GetPersonExpeditionNameLookup() for it.
    '''
    # .filter() yields a QuerySet, but the lookup function needs a single Expedition
    # (it reads expedition.name), so take the first match.
    expo = Expedition.objects.filter(year=year).first()
    if expo is None:
        return None
    # .get() so that an unknown name gives None rather than a KeyError
    personexpedition = GetPersonExpeditionNameLookup(expo).get(possibleid.lower())
    if personexpedition:
        return personexpedition.person
    return None
# Exact names recognised by known_foreigner(). Each name is listed once.
# (A 'global' statement at module level has no effect, so none is used here.)
foreign_friends = [
    "P. Jeutter",
    "K. Jäger",
    "S. Steinberger",
    "R. Seebacher",
    "Dominik Jauch",
    "Fritz Mammel",
    "Marcus Scheuerman",
    "Uli Schütz",
    "Wieland Scheuerle",
    "Arndt Karger",
    "Kai Schwekend",
    "Regina Kaiser",
    "Thilo Müller",
    "Florian Gruner",
    "Helmut Stopka-Ebeler",
    "Aiko",
    "Mark Morgan",
]
def known_foreigner(id):
    '''Is this someone from ARGE or a known Austrian?

    True only when `id` is exactly one of the entries of the module-level
    foreign_friends list; there is no soft matching.
    '''
    return id in foreign_friends
2022-10-08 22:17:53 +01:00
2022-10-07 21:47:05 +01:00
# Refactor. The dict GetPersonExpeditionNameLookup(expo) indexes by name and has values of personexpedition
# This is convoluted, the whole personexpedition concept is unnecessary?
2022-10-07 21:47:05 +01:00
# Cache used by GetPersonExpeditionNameLookup(): expedition name -> lookup dict built for it.
Gpersonexpeditionnamelookup = dict()
# Alternative first names to try as well, keyed by the (lower-case) recorded first name.
# The pairs are deliberately not all symmetric: only the directions listed are tried.
_FIRST_NAME_ALIASES = {
    "robert": ("bob",),
    "rob": ("robert",),
    "andrew": ("andy",),
    "andy": ("andrew",),
    "michael": ("mike",),
    "david": ("dave",),
    "dave": ("david",),
    "peter": ("pete",),
    "pete": ("peter",),
    "olly": ("oliver",),
    "oliver": ("olly", "ollie"),
    "ollie": ("oliver",),
    "becka": ("rebecca",),
}

# Extra (first, last) pairs to expand for particular people, keyed by "first last".
_PERSON_NAME_VARIANTS = {
    "andy waddington": (("aer", "waddington"),),
    "phil underwood": (("phil", "underpants"),),
    "naomi griffiths": (("naomi", "makins"),),
    "tina white": (("tina", "richardson"),),
    "cat hulse": (("catherine", "hulse"), ("cat", "henry")),
    "jess stirrups": (("jessica", "stirrups"),),
    "nat dalton": (("nathanael", "dalton"),),  # correct. He has a weird spelling.
}

# Extra identifiers added verbatim for particular people, keyed by "first last".
_PERSON_LITERAL_NAMES = {
    "mike richardson": ("mta", "miketa", "mike the animal", "animal"),
    "eric landgraf": ("eric c.landgraf", "eric c. landgraf", "eric c landgraf"),
    "nadia raeburn": ("nadia rc", "nadia raeburn-cherradi"),
}


def GetPersonExpeditionNameLookup(expedition):
    '''Return a dict mapping lower-case name variants to the PersonExpedition they identify.

    expedition -- the Expedition whose PersonExpedition records are indexed

    For every person on the expedition a generous set of candidate identifiers is generated:
    full name, nickname, combinations of first name / last name / initials, common alternative
    first names and some hand-written per-person aliases. All are lower-cased, HTML-unescaped
    and transliterated to ASCII with unidecode. A candidate generated for more than one person
    is ambiguous and is removed. Finally, 3 to 6 character prefixes of first names that belong
    to exactly one person are added, overriding any entry with the same key.

    The result is cached in the module-level dict Gpersonexpeditionnamelookup under
    expedition.name and the cached dict itself is returned on later calls.
    '''

    def apply_variations(f, l):
        '''Be generous in guessing possible matches. Any duplicates will be ruled as invalid.
        '''
        f = f.lower()
        l = l.lower()
        if not f or not l:
            # The initial-based forms below need a first character from both parts.
            return [part for part in (f, l) if part]
        return [
            f,
            l,
            f + l,
            f + " " + l,
            f + " " + l[0],
            f + l[0],
            f + " " + l[0] + '.',
            f[0] + " " + l,
            f[0] + ". " + l,
            f[0] + l,
            f[0] + l[0],  # initials e.g. gb or bl
        ]

    # 'is not None' so that an expedition with an empty lookup is also served from the cache
    cached = Gpersonexpeditionnamelookup.get(expedition.name)
    if cached is not None:
        return cached

    res = {}
    duplicates = set()
    short = {}
    dellist = []

    # print("Calculating GetPersonExpeditionNameLookup for " + expedition.year)
    personexpeditions = PersonExpedition.objects.filter(expedition=expedition)
    for personexpedition in personexpeditions:
        person = personexpedition.person
        f = unidecode(unescape(person.first_name.lower()))
        l = unidecode(unescape(person.last_name.lower()))
        full = unidecode(unescape(person.fullname.lower()))
        n = unidecode(unescape(personexpedition.nickname.lower()))

        possnames = {full, n}  # a set, so repeated candidates collapse
        if l:
            # Everything that combines a first name with the last name needs a last name.
            possnames.update(apply_variations(f, l))
            if n:
                possnames.update(apply_variations(n, l))
            for alias in _FIRST_NAME_ALIASES.get(f, ()):
                possnames.update(apply_variations(alias, l))
            whole = f'{f} {l}'
            for altfirst, altlast in _PERSON_NAME_VARIANTS.get(whole, ()):
                possnames.update(apply_variations(altfirst, altlast))
            possnames.update(_PERSON_LITERAL_NAMES.get(whole, ()))
        # A missing nickname must not make the empty string a valid identifier.
        possnames.discard("")

        for i in (3, 4, 5, 6):
            # short form, e.g. Dan for Daniel. Slicing clamps at the end of the string.
            # NOTE(review): a first name shorter than 6 characters yields the same prefix
            # more than once, so it lands in dellist even when nobody else shares it.
            # That behaviour is kept as it was.
            prefix = f[:i]
            if prefix not in short:
                short[prefix] = personexpedition
            else:
                dellist.append(prefix)

        for possname in possnames:
            if possname in res:
                duplicates.add(possname)
            else:
                res[possname] = personexpedition

    for possname in duplicates:
        del res[possname]

    for possname in dellist:
        short.pop(possname, None)  # may already have been removed by an earlier entry
    res.update(short)

    Gpersonexpeditionnamelookup[expedition.name] = res
    return res