2023-01-19 18:33:04 +00:00
|
|
|
import csv
|
|
|
|
import os
|
|
|
|
import re
|
2022-03-02 23:19:48 +00:00
|
|
|
from html import unescape
|
2021-04-15 17:51:01 +01:00
|
|
|
from pathlib import Path
|
2009-05-13 05:53:37 +01:00
|
|
|
|
2021-04-13 00:11:08 +01:00
|
|
|
from django.conf import settings
|
2023-01-19 18:33:04 +00:00
|
|
|
from unidecode import unidecode
|
2021-04-13 00:11:08 +01:00
|
|
|
|
2023-01-19 21:18:42 +00:00
|
|
|
from troggle.core.models.troggle import DataIssue, Expedition, Person, PersonExpedition
|
2021-04-13 00:11:08 +01:00
|
|
|
|
2023-01-19 21:18:42 +00:00
|
|
|
"""These functions do not match how the stand-alone folk script works. So the script produces an HTML file which has
|
2021-02-06 00:18:48 +00:00
|
|
|
href links to pages in troggle which troggle does not think are right.
|
|
|
|
The standalone script needs to be rendered defunct, and all the parsing needs to be in troggle. Either that,
|
|
|
|
or they should use the same code by importing a module.
|
2023-01-19 21:18:42 +00:00
|
|
|
"""
|
|
|
|
|
2021-02-06 00:18:48 +00:00
|
|
|
|
2021-04-15 17:51:01 +01:00
|
|
|
def parse_blurb(personline, header, person):
    """Set the mugshot image and/or HTML blurb fields on a Person from its folk.csv row.

    personline: one row of folk.csv (list of strings)
    header: dict mapping column name -> column index
    person: the Person instance to update; person.save() is called before returning

    The 'Mugshot' column is either empty, an image path like 'i/adama2018.jpg',
    or an HTML blurb file like 'l/ollybetts.htm'. Anything else (or a path that
    is not an existing file) is reported as a DataIssue.
    """
    ms_filename = personline[header["Mugshot"]]
    ms_path = Path(settings.EXPOWEB, "folk", ms_filename)

    if ms_filename:
        if not ms_path.is_file():
            message = f"! INVALID mug_shot field '{ms_filename}' for {person.fullname}"
            print(message)
            DataIssue.objects.create(parser="people", message=message, url=f"/person/{person.fullname}")
            return

        if ms_filename.startswith("i/"):
            # if person just has an image, add it. It has format 'i/adama2018.jpg'
            person.mug_shot = str(Path("/folk", ms_filename))
            person.blurb = None

        elif ms_filename.startswith("l/"):
            # it has the format 'l/ollybetts.htm' the file may contain <img src="../i/mymug.jpg"> images
            with open(ms_path, "r") as blurbfile:
                blrb = blurbfile.read()
            # Capture everything between <body> and the first following <hr.
            # (Previously this regex was evaluated twice: once without a group
            # to test, then again with a group to extract the fragment.)
            pblurb = re.search(r"<body>(.*)<hr", blrb, re.DOTALL)
            if pblurb:
                person.mug_shot = None
                fragment = pblurb.group(1)
                # rewrite relative image paths so they resolve under /folk/ when served by troggle
                fragment = fragment.replace('src="../i/', 'src="/folk/i/')
                fragment = fragment.replace("src='../i/", "src='/folk/i/")
                # strip <h1>...</h1>-style headings from the fragment
                fragment = re.sub(r"<h.*>[^<]*</h.>", "", fragment)
                person.blurb = fragment
            else:
                message = f"! Blurb parse error in {ms_filename}"
                print(message)
                DataIssue.objects.create(parser="people", message=message, url="/folk/")

        # NOTE: the original also had `elif ms_filename == "": pass`, which is
        # unreachable inside `if ms_filename:` and has been removed.
        else:
            message = f"! Unrecognised type of file at mug_shot field '{ms_filename}' for {person.fullname}"
            print(message)
            DataIssue.objects.create(parser="people", message=message, url="/folk/")

    person.save()
|
2009-05-13 05:35:59 +01:00
|
|
|
|
2023-01-19 21:18:42 +00:00
|
|
|
|
2021-04-15 17:51:01 +01:00
|
|
|
def load_people_expos():
    """Parse folk.csv, creating Person, Expedition and PersonExpedition records.

    This is where the folk.csv file is parsed to read people's names.
    Which it gets wrong for people like Lydia-Clare Leather and various 'von' and 'de' middle 'names'
    and McLean and Mclean and McAdam - interaction with the url parser in urls.py too
    """
    DataIssue.objects.filter(parser="people").delete()

    # 'with' guarantees the CSV handle is closed (it was previously leaked).
    with open(os.path.join(settings.EXPOWEB, "folk", "folk.csv")) as persontab:  # should really be EXPOFOLK I guess
        personreader = csv.reader(persontab)  # this is an iterator
        headers = next(personreader)
        header = {h: i for i, h in enumerate(headers)}  # column name -> column index

        # Columns 0-4 are name/metadata; the remaining columns are one per expedition year.
        years = headers[5:]
        nexpos = Expedition.objects.count()
        if nexpos <= 0:
            print(" - Creating expeditions")
            for year in years:
                lookupAttribs = {"year": year}
                nonLookupAttribs = {"name": f"CUCC expo {year}"}
                Expedition.objects.create(**nonLookupAttribs, **lookupAttribs)

        print(" - Loading personexpeditions")

        for personline in personreader:
            name = personline[header["Name"]]
            name = re.sub(r"<.*?>", "", name)  # strip any HTML markup from the name

            rawlastname = personline[header["Lastname"]].strip()
            matchlastname = re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?", rawlastname)
            lastname = matchlastname.group(1).strip()

            # 'Fred Smith (Freddo)' -> fullname 'Fred Smith', nickname 'Freddo'
            splitnick = re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?", name)
            fullname = splitnick.group(1).strip()
            nickname = splitnick.group(2) or ""

            names = fullname.split(" ")
            firstname = names[0]
            if len(names) == 1:
                lastname = ""  # single-word names have no surname

            # any non-empty value in the 'VfHO member' column counts as membership
            vfho = personline[header["VfHO member"]] != ""

            lookupAttribs = {"first_name": firstname, "last_name": (lastname or "")}
            nonLookupAttribs = {"is_vfho": vfho, "fullname": fullname, "nickname": nickname}
            person = Person.objects.create(**nonLookupAttribs, **lookupAttribs)

            parse_blurb(personline=personline, header=header, person=person)

            # make person expedition from table
            for year, attended in list(zip(headers, personline))[5:]:
                if attended == "1" or attended == "-1":
                    # Only query the Expedition for years actually attended
                    # (previously fetched for every column on every row).
                    expedition = Expedition.objects.get(year=year)
                    lookupAttribs = {"person": person, "expedition": expedition}
                    nonLookupAttribs = {"nickname": nickname, "is_guest": (personline[header["Guest"]] == "1")}
                    PersonExpedition.objects.create(**nonLookupAttribs, **lookupAttribs)
    print("", flush=True)
|
2009-05-13 05:35:59 +01:00
|
|
|
|
2023-01-19 21:18:42 +00:00
|
|
|
|
|
|
|
def who_is_this(year, possibleid):
    """Return the Person matching 'possibleid' on the 'year' expedition, or None.

    possibleid is matched (lowercased) against the name-variant lookup built by
    GetPersonExpeditionNameLookup().
    """
    # BUG FIX: .filter() returns a QuerySet, which has no .name attribute and
    # so broke GetPersonExpeditionNameLookup(); .get() returns the Expedition.
    expo = Expedition.objects.get(year=year)
    # dict.get() so an unknown id falls through to the None branch instead of
    # raising KeyError (which the if/else below clearly intends to handle).
    personexpedition = GetPersonExpeditionNameLookup(expo).get(possibleid.lower())
    if personexpedition:
        return personexpedition.person
    else:
        return None
|
2023-01-19 21:18:42 +00:00
|
|
|
|
|
|
|
|
2022-10-12 21:10:48 +01:00
|
|
|
# Names of ARGE members and other known Austrian cavers, checked by exact match
# in known_foreigner(). The redundant module-level 'global' statement has been
# dropped (it has no effect at module scope) and duplicate entries
# ("Wieland Scheuerle", "Arndt Karger") removed — membership is unchanged.
foreign_friends = [
    "P. Jeutter",
    "K. Jäger",
    "S. Steinberger",
    "R. Seebacher",
    "Dominik Jauch",
    "Fritz Mammel",
    "Marcus Scheuerman",
    "Uli Schütz",
    "Wieland Scheuerle",
    "Arndt Karger",
    "Kai Schwekend",
    "Regina Kaiser",
    "Thilo Müller",
    "Florian Gruner",
    "Helmut Stopka-Ebeler",
    "Aiko",
    "Mark Morgan",
]
|
|
|
|
|
|
|
|
|
2022-10-12 21:10:48 +01:00
|
|
|
def known_foreigner(id):
    """Is this someone from ARGE or a known Austrian? Name has to be exact, no soft matching."""
    # Reading a module-level name needs no 'global' declaration, and the
    # membership test is already a boolean — return it directly.
    return id in foreign_friends
|
|
|
|
|
2023-01-19 21:18:42 +00:00
|
|
|
|
2022-10-07 21:47:05 +01:00
|
|
|
# Refactor. The dict GetPersonExpeditionNameLookup(expo) indexes by name and has values of personexpedition
|
2022-10-10 13:40:21 +01:00
|
|
|
# This is convoluted, the whole personexpedition concept is unnecessary?
|
2022-10-07 21:47:05 +01:00
|
|
|
|
2023-01-19 21:18:42 +00:00
|
|
|
# Per-expedition cache: expedition.name -> {lowercase name variant: PersonExpedition},
# filled lazily by GetPersonExpeditionNameLookup().
Gpersonexpeditionnamelookup = {}
|
|
|
|
|
|
|
|
|
2009-05-13 05:39:52 +01:00
|
|
|
def GetPersonExpeditionNameLookup(expedition):
    """Return a dict mapping lowercase name variants to PersonExpedition for one expedition.

    Results are cached per expedition.name in the module-level
    Gpersonexpeditionnamelookup. Variants include full name, nickname,
    first/last-name combinations and initials, common alternate first names
    (Bob/Robert etc.), hard-coded aliases for particular people, and short
    prefixes of first names (e.g. 'dan' for 'daniel'). Any variant that would
    match more than one person on the expedition is discarded as ambiguous.
    """
    global Gpersonexpeditionnamelookup

    def apply_variations(f, l):
        """Be generous in guessing possible matches. Any duplicates will be ruled as invalid."""
        f = f.lower()
        l = l.lower()
        variations = []
        variations.append(f)
        variations.append(l)
        variations.append(f + l)
        variations.append(f + " " + l)
        variations.append(f + " " + l[0])
        variations.append(f + l[0])
        variations.append(f + " " + l[0] + ".")
        variations.append(f[0] + " " + l)
        variations.append(f[0] + ". " + l)
        variations.append(f[0] + l)
        variations.append(f[0] + l[0])  # initials e.g. gb or bl
        return variations

    # Serve from the cache if this expedition has already been computed.
    res = Gpersonexpeditionnamelookup.get(expedition.name)

    if res:
        return res

    res = {}
    duplicates = set()  # variants claimed by more than one person; removed at the end

    # print("Calculating GetPersonExpeditionNameLookup for " + expedition.year)
    personexpeditions = PersonExpedition.objects.filter(expedition=expedition)
    short = {}  # first-name prefix -> PersonExpedition (kept only if unambiguous)
    dellist = []  # prefixes seen for more than one person, queued for removal from 'short'
    for personexpedition in personexpeditions:
        possnames = []
        # unescape HTML entities, then strip accents so all lookups are plain ASCII
        f = unidecode(unescape(personexpedition.person.first_name.lower()))
        l = unidecode(unescape(personexpedition.person.last_name.lower()))
        full = unidecode(unescape(personexpedition.person.fullname.lower()))
        n = unidecode(unescape(personexpedition.nickname.lower()))
        if full not in possnames:
            possnames.append(full)
        if n not in possnames:
            possnames.append(n)

        if l:
            possnames += apply_variations(f, l)

            if n:
                possnames += apply_variations(n, l)

        # Common alternate first names, applied in both directions.
        if f == "Robert".lower():
            possnames += apply_variations("Bob", l)
        if f == "Rob".lower():
            possnames += apply_variations("Robert", l)

        if f == "Andrew".lower():
            possnames += apply_variations("Andy", l)
        if f == "Andy".lower():
            possnames += apply_variations("Andrew", l)
        if f == "Michael".lower():
            possnames += apply_variations("Mike", l)

        if f == "David".lower():
            possnames += apply_variations("Dave", l)
        if f == "Dave".lower():
            possnames += apply_variations("David", l)

        if f == "Peter".lower():
            possnames += apply_variations("Pete", l)
        if f == "Pete".lower():
            possnames += apply_variations("Peter", l)

        if f == "Olly".lower():
            possnames += apply_variations("Oliver", l)
        if f == "Oliver".lower():
            possnames += apply_variations("Olly", l)

        if f == "Ollie".lower():
            possnames += apply_variations("Oliver", l)
        if f == "Oliver".lower():
            possnames += apply_variations("Ollie", l)

        if f == "Becka".lower():
            possnames += apply_variations("Rebecca", l)

        # Hard-coded aliases for particular people.
        if f"{f} {l}" == "Andy Waddington".lower():
            possnames += apply_variations("aer", "waddington")
        if f"{f} {l}" == "Phil Underwood".lower():
            possnames += apply_variations("phil", "underpants")
        if f"{f} {l}" == "Naomi Griffiths".lower():
            possnames += apply_variations("naomi", "makins")
        if f"{f} {l}" == "Tina White".lower():
            possnames += apply_variations("tina", "richardson")
        if f"{f} {l}" == "Cat Hulse".lower():
            possnames += apply_variations("catherine", "hulse")
            possnames += apply_variations("cat", "henry")
        if f"{f} {l}" == "Jess Stirrups".lower():
            possnames += apply_variations("jessica", "stirrups")
        if f"{f} {l}" == "Nat Dalton".lower():
            possnames += apply_variations("nathanael", "dalton")  # correct. He has a weird spelling.
        if f"{f} {l}" == "Mike Richardson".lower():
            possnames.append("mta")
            possnames.append("miketa")
            possnames.append("mike the animal")
            possnames.append("animal")
        if f"{f} {l}" == "Eric Landgraf".lower():
            possnames.append("eric c.landgraf")
            possnames.append("eric c. landgraf")
            possnames.append("eric c landgraf")
        if f"{f} {l}" == "Nadia Raeburn".lower():
            possnames.append("tinywoman")
            possnames.append("nadia rc")
            possnames.append("nadia raeburn-cherradi")
        if f"{f} {l}" == "Phil Wigglesworth".lower():
            possnames.append("wiggy")
        if f"{f} {l}" == "Elaine Oliver".lower():
            possnames.append("cavingpig")
        if f"{f} {l}" == "Tom Crossley".lower():
            possnames.append("tcacrossley")
        if f"{f} {l}" == "Rob Watson".lower():
            possnames.append("nobrotson")
        if f"{f} {l}" == "Todd Rye".lower():
            possnames.append("samouse1")

        # Record 3..6-character prefixes of the first name; a prefix shared by
        # two people is queued on dellist and removed below.
        for i in [3, 4, 5, 6]:
            lim = min(i, len(f) + 1)  # short form, e.g. Dan for Daniel.
            if f[:lim] not in short:
                short[f[:lim]] = personexpedition
            else:
                dellist.append(f[:lim])

        possnames = set(possnames)  # remove duplicates
        for possname in possnames:
            if possname in res:
                duplicates.add(possname)
            else:
                res[possname] = personexpedition

    # Drop every variant that matched more than one person.
    for possname in duplicates:
        del res[possname]

    # Drop ambiguous short prefixes, then merge the unambiguous ones.
    for possname in dellist:
        if possname in short:  # always true ?
            del short[possname]
    # NOTE(review): a surviving short prefix overwrites any same-keyed entry
    # already in res (including slots vacated by duplicate removal) —
    # presumably intentional, but worth confirming.
    for shortname in short:
        res[shortname] = short[shortname]

    Gpersonexpeditionnamelookup[expedition.name] = res
    return res
|