2
0
mirror of https://expo.survex.com/repositories/troggle/.git synced 2024-11-22 07:11:52 +00:00
troggle/parsers/people.py

434 lines
17 KiB
Python
Raw Normal View History

2023-01-19 18:33:04 +00:00
import csv
import os
import re
from html import unescape
from pathlib import Path
from unidecode import unidecode
2021-04-13 00:11:08 +01:00
from django.conf import settings
2023-01-19 21:18:42 +00:00
from troggle.core.models.troggle import DataIssue, Expedition, Person, PersonExpedition
2021-04-13 00:11:08 +01:00
2023-01-19 21:18:42 +00:00
"""These functions do not match how the stand-alone folk script works. So the script produces an HTML file which has
2021-02-06 00:18:48 +00:00
href links to pages in troggle which troggle does not think are right.
The standalone script needs to be rendered defunct, and all the parsing needs to be in troggle. Either that,
or they should use the same code by importing a module.
2023-01-19 21:18:42 +00:00
"""
2021-02-06 00:18:48 +00:00
def parse_blurb(personline, header, person):
    """Attach the mugshot image or HTML blurb named in folk.csv to a Person, then save it.

    The "Mugshot" column is interpreted relative to EXPOWEB/folk:
      - 'i/...'  (e.g. 'i/adama2018.jpg'): an image; stored in person.mug_shot as '/folk/i/...'
        and person.blurb is cleared.
      - 'l/...'  (e.g. 'l/ollybetts.htm'): an HTML page; the text between <body> and the last
        '<hr' becomes person.blurb and person.mug_shot is cleared.
      - empty: nothing to attach.
      - anything else: recorded as a DataIssue.

    personline: one row of folk.csv, as a list of strings
    header: dict mapping a column title to its index within personline
    person: the Person to update. It is saved on every path except when the named file does
        not exist, in which case a DataIssue is recorded and the function returns without saving.

    Would be better if all this was done before the Person object was created in the db, then it
    would not need re-saving (which is slow).
    """
    ms_filename = personline[header["Mugshot"]]

    if ms_filename:
        ms_path = Path(settings.EXPOWEB, "folk", ms_filename)
        if not ms_path.is_file():
            message = f"! INVALID mug_shot field '{ms_filename}' for {person.fullname}"
            print(message)
            DataIssue.objects.create(parser="people", message=message, url=f"/person/{person.fullname}")
            return

        if ms_filename.startswith("i/"):
            # if person just has an image, add it. It has format 'i/adama2018.jpg'
            person.mug_shot = str(Path("/folk", ms_filename))
            person.blurb = None

        elif ms_filename.startswith("l/"):
            # it has the format 'l/ollybetts.htm' the file may contain <img src="../i/mymug.jpg"> images
            with open(ms_path, "r") as blurbfile:
                blrb = blurbfile.read()

            # One search does both jobs: test that the page has the expected shape and
            # capture the body text. The greedy '.*' runs up to the LAST '<hr' in the file.
            pblurb = re.search(r"<body>(.*)<hr", blrb, re.DOTALL)
            if pblurb:
                person.mug_shot = None
                fragment = pblurb.group(1)
                # Image links in the page are relative to folk/l/; make them site-absolute,
                # i.e. replace src="../i/ with src="/folk/i/ for either quote style.
                fragment = fragment.replace('src="../i/', 'src="/folk/i/')
                fragment = fragment.replace("src='../i/", "src='/folk/i/")
                # Strip headings from the blurb.
                # NOTE(review): '<h.*>' is greedy and matches any tag starting with 'h',
                # not just <h1>..<h6> - confirm this is what is wanted.
                fragment = re.sub(r"<h.*>[^<]*</h.>", "", fragment)
                person.blurb = fragment
            else:
                message = f"! Blurb parse error in {ms_filename}"
                print(message)
                DataIssue.objects.create(parser="people", message=message, url="/folk/")

        else:
            message = f"! Unrecognised type of file at mug_shot field '{ms_filename}' for {person.fullname}"
            print(message)
            DataIssue.objects.create(parser="people", message=message, url="/folk/")

    person.save()
# Slugs handed out so far. The value is how many times that slug has been requested,
# which is what supplies the "_2", "_3", ... suffixes below.
slug_cache = {}


def troggle_slugify(longname):
    """Turn a person's name into a unique slug of lowercase words joined by hyphens.

    Uniqueness enforcement too. Yes we have had two "Dave Johnson"s: the second and later
    requests for the same slug get "_2", "_3", ... appended.

    A copy of this function lives in expoweb/scripts/make-folklist.py; the two must keep
    producing the same slugs.
    """
    hyphenated = longname.strip().lower().replace(" ", "-")
    # remove nickname in brackets
    cleaned = re.sub(r'\([^\)]*\)', '', hyphenated)
    # The three known accented-letter entities become plain letters; after that any
    # remaining '&' or ';' is simply removed.
    for old, new in (
        ('&eacute;', 'e'),
        ('&aacute;', 'a'),
        ('&auml;', 'a'),
        ('&', ''),
        (';', ''),
    ):
        cleaned = cleaned.replace(old, new)
    # remove tags, e.g. <span-lang = "hu"> (spaces have already become hyphens)
    cleaned = re.sub(r'<[^>]*>', '', cleaned)
    # remove spare hyphens, then cut to 40 characters: slugfield is 50 chars
    base = cleaned.strip("-")[:40]

    times_seen = slug_cache.get(base)
    if times_seen is None:
        slug = base
    else:
        times_seen += 1
        slug_cache[base] = times_seen
        slug = f"{base}_{times_seen}"
    slug_cache[slug] = 1
    return slug
def load_people_expos():
    """Parse EXPOWEB/folk/folk.csv into Person, Expedition and PersonExpedition records.

    This is where the folk.csv file is parsed to read people's names.
    Which it gets wrong for people like Lydia-Clare Leather and various 'von' and 'de' middle 'names'
    and McLean and Mclean and McAdam - interaction with the url parser in urls.py too

    This is ALSO where all the Expedition objects get created. So this is the point at which troggle
    gets told what expeditions exist: if the database holds no Expedition at all, one is created
    for every year column of the file (the columns from index 5 onwards).

    Given that we need to do stuff for the coming expo, well before we update the folk list,
    the Expedition object for the coming expo is created elsewhere - in addition to
    those created here, if it does not exist.

    A person is linked to an expedition when their cell in that year's column is "1" or "-1".
    All DataIssues previously recorded by the "people" parser are deleted first.
    """
    DataIssue.objects.filter(parser="people").delete()

    # should really be EXPOFOLK I guess
    folk_csv = os.path.join(settings.EXPOWEB, "folk", "folk.csv")
    # Read every row up front so that the file is closed again before the (slow)
    # database work starts, instead of leaving the handle open for ever.
    with open(folk_csv) as persontab:
        personreader = csv.reader(persontab)  # this is an iterator
        headers = next(personreader)
        personlines = list(personreader)
    header = {title: index for index, title in enumerate(headers)}

    years = headers[5:]
    if Expedition.objects.count() <= 0:
        print(" - Creating expeditions")
        for year in years:
            Expedition.objects.create(name=f"CUCC expo {year}", year=year)

    print(" - Loading personexpeditions")

    # A name made of word characters, '&', ';' and spaces, optionally followed by "(nickname)".
    name_and_nick = r"^([\w&;\s]+)(?:\(([^)]*)\))?"
    # year -> Expedition, so each expedition is fetched once rather than once per person.
    expedition_for_year = {}

    for personline in personlines:
        # This is all horrible: refactor it.
        name = personline[header["Name"]]
        plainname = re.sub(r"<.*?>", "", name)  # now in slugify
        # everything before the first "(" i.e. without the nickname in brackets
        displayname = re.match(r"^([^(]*)(\(([^)]*)\))?", name).group(1)
        slug = troggle_slugify(displayname)

        rawlastname = personline[header["Lastname"]].strip()
        lastname = re.match(name_and_nick, rawlastname).group(1).strip()

        # group(1) removes Nickname in brackets, but also cuts hyphenated names
        splitnick = re.match(name_and_nick, plainname)
        nick = splitnick.group(2) or ""
        names = splitnick.group(1).strip().split(" ")  # This may have more than one, e.g. "Adeleide de Diesback"
        firstname = names[0]
        if len(names) == 1:
            lastname = ""  # wookey special code

        # the cut-down name was only needed to find firstname; fullname is the whole string
        fullname = displayname

        # would be better to just create the python object, and only commit to db once all done inc blurb
        # and better to save all the Persons in a bulk update, then do all the PersonExpeditions
        person = Person.objects.create(
            slug=slug,
            first_name=firstname,
            last_name=lastname,
            is_vfho=personline[header["VfHO member"]] != "",
            fullname=fullname,
            nickname=nick,
            is_guest=personline[header["Guest"]] == "1",
        )
        parse_blurb(personline=personline, header=header, person=person)  # saves to db too

        # make person expedition from table
        for year, attended in zip(years, personline[5:]):
            # Looked up for every year column, attended or not, so a missing Expedition
            # is still reported by Expedition.DoesNotExist on the first row.
            if year not in expedition_for_year:
                expedition_for_year[year] = Expedition.objects.get(year=year)
            if attended in ("1", "-1"):
                PersonExpedition.objects.create(person=person, expedition=expedition_for_year[year])
    print("", flush=True)
2023-01-19 21:18:42 +00:00
def who_is_this(year, possibleid):
    """Return the Person known as possibleid on the expedition of the given year, or None.

    year: the value matched against Expedition.year
    possibleid: a name, nickname or alias; it is lowercased before being looked up in the
        dictionary built by GetPersonExpeditionNameLookup()

    Returns None when there is no expedition for that year or the name is not recognised
    (which includes names that are ambiguous within that expedition).
    """
    # GetPersonExpeditionNameLookup() needs a single Expedition object, not a QuerySet.
    expo = Expedition.objects.filter(year=year).first()
    if expo is None:
        return None

    # .get() so that an unknown name gives None instead of raising KeyError.
    personexpedition = GetPersonExpeditionNameLookup(expo).get(possibleid.lower())
    if personexpedition:
        return personexpedition.person
    return None
2023-01-19 21:18:42 +00:00
def when_on_expo(name):
    """Return a list of PersonExpedition objects for the string, if recognised as a name.

    Every expedition is checked in turn; each one whose name lookup contains `name`
    (compared exactly as given) contributes its PersonExpedition and prints a line
    reporting the match.
    """
    attended = []
    for expedition in Expedition.objects.all():
        lookup = GetPersonExpeditionNameLookup(expedition)
        if name not in lookup:
            continue
        attended.append(lookup[name])
        print(f"{name} => {lookup[name]}")
    return attended
# Names accepted by known_foreigner(). Matching is exact, so alternative spellings and
# initialled forms of the same person are listed separately.
foreign_friends = [
    "Aiko",
    "Arndt Karger",
    "Dominik Jauch",
    "Florian Gruner",
    "Fritz Mammel",
    "Gunter Graf",
    "Helmut Stopka-Ebeler",
    "K. Jäger",
    "Kai Schwekend",
    "Karl Gaisberger",
    "Marcus Scheuermann",
    "Marcus Scheuerman",
    "Mark Morgan",
    "P. Jeutter",
    "R. Seebacher",
    "Regina Kaiser",
    "Robert Seebacher",
    "S. Steinberger",
    "Sepp Steinberger",
    "Thilo Müller",
    "Uli Schütz",
    "Wieland Scheuerle",
]


def known_foreigner(id):
    """Is this someone from ARGE or a known Austrian? Name has to be exact, no soft matching.

    Returns True when `id` is one of the strings in foreign_friends, otherwise False.
    """
    return id in foreign_friends
2023-01-19 21:18:42 +00:00
2022-10-07 21:47:05 +01:00
# Refactor. The dict GetPersonExpeditionNameLookup(expo) indexes by name and has values of personexpedition
# This is convoluted, the personexpedition concept is unnecessary, should it just return person??
# Or better, query with a string and return a list of personexpeditions
2022-10-07 21:47:05 +01:00
2023-01-19 21:18:42 +00:00
# Cache of the dictionaries built by GetPersonExpeditionNameLookup(), keyed by expedition.name.
Gpersonexpeditionnamelookup = {}

# Yes this should all be in an editable text file, not in the body of the code. Sorry.
# Until then, the three tables below hold all the hand-maintained aliases.

# First name as recorded (lowercase) -> other first names the same person may be written as.
# Each alternative is expanded with the person's last name by apply_variations().
# Entries are one-directional: a reverse mapping exists only where it is listed.
_FIRST_NAME_ALIASES = {
    "adeleide": ("adelaide",),
    "adelaide": ("adeleide",),
    "robert": ("bob",),
    "rob": ("robert",),
    "thomas": ("tom",),
    "tom": ("thomas",),
    "lizzy": ("lizzie",),
    "lizzie": ("lizzy",),
    # needed when Phil is used with a surname initial, so default short-form does not work.
    "phil": ("philip",),
    "philip": ("phil",),
    "andrew": ("andy",),
    "andy": ("andrew",),
    "michael": ("mike",),
    "mike": ("michael",),
    "david": ("dave",),
    "dave": ("david",),
    "peter": ("pete",),
    "pete": ("peter",),
    "tobias": ("toby",),
    "toby": ("tobias",),
    "olly": ("oliver",),
    "oliver": ("olly", "ollie"),
    "ollie": ("oliver",),
    "becka": ("rebecca",),
}

# "first last" as recorded (lowercase) -> alternative (first, last) pairs for that one person,
# each expanded by apply_variations().
_FULL_NAME_VARIANTS = {
    "andy waddington": (("aer", "waddington"),),
    "phil underwood": (("phil", "underpants"),),
    "naomi griffiths": (("naomi", "makins"),),
    "tina white": (("tina", "richardson"),),
    "cat hulse": (("catherine", "hulse"), ("cat", "henry")),
    "jess stirrups": (("jessica", "stirrups"),),
    "nat dalton": (("nathanael", "dalton"),),  # correct. He has a weird spelling.
}

# "first last" as recorded (lowercase) -> nicknames and handles used verbatim as extra keys.
_FULL_NAME_NICKNAMES = {
    "mike richardson": ("mta", "miketa", "mike the animal", "animal"),
    "eric landgraf": ("eric c.landgraf", "eric c. landgraf", "eric c landgraf"),
    "nadia raeburn": ("tinywoman", "nadia rc", "nadia raeburn-cherradi"),
    "phil wigglesworth": ("wiggy",),
    "philip banister": ("crofton",),
    "elaine oliver": ("cavingpig",),
    "tom crossley": ("tcacrossley",),
    "rob watson": ("nobrotson",),
    "todd rye": ("samouse1",),
    "jono lester": ("ILoveCaves",),
    "joel stobbart": ("El Stobbarto",),
}


def GetPersonExpeditionNameLookup(expedition):
    """Return a dict mapping every name a person might be known by to their PersonExpedition.

    This uses the existing database records of everyone on an expedition to construct a dictionary
    indexed by every possible pseudonym or alias that the person might be known by.
    This dictionary is used when parsing logbooks and survex files to identify who is being
    referred to, when the name written in the logbook is e.g. "Mike TA" == "Mike The Animal"
    == "Mike Richardson".

    Keys are lowercase, with HTML entities unescaped and accents transliterated by unidecode.
    (The two mixed-case handles in _FULL_NAME_NICKNAMES are also kept in their original case.)
    A name that could refer to more than one person on the expedition is left out, so a
    lookup never guesses between two people.

    The result is cached in Gpersonexpeditionnamelookup under expedition.name. An empty
    result is not treated as cached and is rebuilt on the next call.
    """

    def apply_variations(f, l):
        """Be generous in guessing possible matches. Any duplicates will be ruled as invalid.

        Returns the lowercase combinations of first name f and last name l: each alone,
        joined with and without a space, and with either part cut to its initial.
        """
        f = f.lower()
        l = l.lower()
        if not f or not l:
            # Nothing to combine, and the initials below would raise IndexError.
            return [part for part in (f, l) if part]
        return [
            f,
            l,
            f + l,
            f + " " + l,
            f + " " + l[0],
            f + l[0],
            f + " " + l[0] + ".",
            f[0] + " " + l,
            f[0] + ". " + l,
            f[0] + l,
            f[0] + l[0],  # initials e.g. gb or bl
        ]

    res = Gpersonexpeditionnamelookup.get(expedition.name)
    if res:
        return res

    res = {}
    duplicates = set()  # names claimed by more than one person
    short = {}  # first-name prefix -> the first PersonExpedition seen with it
    dellist = []  # prefixes seen more than once, to be dropped from short

    # print("Calculating GetPersonExpeditionNameLookup for " + expedition.year)
    for personexpedition in PersonExpedition.objects.filter(expedition=expedition):
        person = personexpedition.person
        f = unidecode(unescape(person.first_name.lower().strip()))
        l = unidecode(unescape(person.last_name.lower().strip()))
        full = unidecode(unescape(person.fullname.lower().strip()))
        n = unidecode(unescape(person.nickname.lower().strip()))

        possnames = {full, n}  # a set, so repeats collapse by themselves
        if l:
            possnames.update(apply_variations(f, l))
            if n:
                possnames.update(apply_variations(n, l))
            for other_first in _FIRST_NAME_ALIASES.get(f, ()):
                possnames.update(apply_variations(other_first, l))

        whole_name = f"{f} {l}"
        for other_first, other_last in _FULL_NAME_VARIANTS.get(whole_name, ()):
            possnames.update(apply_variations(other_first, other_last))
        for handle in _FULL_NAME_NICKNAMES.get(whole_name, ()):
            possnames.add(handle)
            # every other key is lowercase, so make sure a lowercased query finds these too
            possnames.add(handle.lower())

        # Most people have no nickname: never register the empty string as somebody's name.
        possnames.discard("")

        for i in [3, 4, 5, 6]:
            lim = min(i, len(f) + 1)  # short form, e.g. Dan for Daniel.
            # NOTE(review): for a first name of fewer than 6 letters the whole name comes up
            # more than once here, so it lands in dellist and is not kept as a short form
            # (it is still a key via apply_variations) - confirm this is intended.
            if f[:lim] not in short:
                short[f[:lim]] = personexpedition
            else:
                dellist.append(f[:lim])

        for possname in possnames:
            if possname in res:
                duplicates.add(possname)
            else:
                res[possname] = personexpedition

    for possname in duplicates:
        del res[possname]

    for possname in dellist:
        short.pop(possname, None)  # dellist can name the same prefix several times
    # Surviving short forms go in last and so take precedence over any equal key in res.
    res.update(short)

    Gpersonexpeditionnamelookup[expedition.name] = res
    return res