# troggle-unchained/parsers/people.py
import csv
import datetime
import html
import os
import re
import shutil
from html.parser import HTMLParser

from unidecode import unidecode

from django.conf import settings

import troggle.core.models as models
from troggle.core.utils import save_carefully
'''These functions do not match how the stand-alone folk script works, so that script produces an HTML
file with href links to pages in troggle which troggle does not think are right.
The standalone script needs to be rendered defunct, and all the parsing needs to be done in troggle.
Either that, or they should share the same code by importing a common module.
'''
def parseMugShotAndBlurb(personline, header, person):
    """Attach a mugshot/blurb to a Person from their folk.csv row.

    personline -- one row of folk.csv (list of column values)
    header     -- dict mapping column name -> column index
    person     -- models.Person instance to update and save

    If the Mugshot column names a .jpg, nothing is currently done (mugshot
    saving is disabled).  If it names a .htm page, the page's text up to the
    first <hr> is stored as the person's blurb (only if no blurb is set yet).
    """
    mugShotFilename = personline[header["Mugshot"]]
    mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)
    if mugShotPath[-3:] == 'jpg':  # if person just has an image, add it
        #saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person)
        pass
    elif mugShotPath[-3:] == 'htm':
        # Person has an html page: find the image(s) and add the page text to
        # the "blurb" field.  Use a context manager so the file handle is
        # closed deterministically (the original leaked it).
        with open(mugShotPath, 'r') as personPage:
            personPageOld = personPage.read()
        if not person.blurb:
            # This needs to be refined: take care of the HTML and make sure it
            # doesn't match beyond the blurb.  Only finds the first image.
            pblurb = re.search('<body>.*<hr', personPageOld, re.DOTALL)
            if pblurb:
                # Reuse the match instead of re-running the same search.
                person.blurb = pblurb.group()
            else:
                print("ERROR: --------------- Broken link or Blurb parse error in ", mugShotFilename)
        #for mugShotFilename in re.findall('i/.*?jpg',personPageOld,re.DOTALL):
        #    mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)
        #    saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person)
    person.save()
def LoadPersonsExpos():
    """Load expeditions and people from folk/folk.csv.

    Creates an Expedition for every year column (columns 6 onward), a Person
    for each data row, and a PersonExpedition for every year the person
    attended (cell value "1" or "-1").  All records are created via
    save_carefully so re-running the import is idempotent.
    """
    # Context manager closes the csv file handle (the original leaked it).
    with open(os.path.join(settings.EXPOWEB, "folk", "folk.csv")) as persontab:
        personreader = csv.reader(persontab)
        headers = next(personreader)
        # Column name -> column index (no need for the list() wrappers).
        header = dict(zip(headers, range(len(headers))))

        # make expeditions: columns after the first five are year columns
        print(" - Loading expeditions")
        years = headers[5:]
        for year in years:
            lookupAttribs = {'year': year}
            nonLookupAttribs = {'name': "CUCC expo %s" % year}
            save_carefully(models.Expedition, lookupAttribs, nonLookupAttribs)

        # make persons
        print(" - Loading personexpeditions")
        for personline in personreader:
            name = personline[header["Name"]]
            name = re.sub(r"<.*?>", "", name)  # strip any html tags from the name
            firstname = ""
            nickname = ""
            rawlastname = personline[header["Lastname"]].strip()
            # The last name may carry a parenthesised alternative form.
            matchlastname = re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?", rawlastname)
            lastname = matchlastname.group(1).strip()
            # The name may carry a parenthesised nickname, e.g. "Philip (Phil)".
            splitnick = re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?", name)
            fullname = splitnick.group(1)
            nickname = splitnick.group(2) or ""
            fullname = fullname.strip()
            names = fullname.split(' ')
            firstname = names[0]
            if len(names) == 1:
                # Single-word names (e.g. just a first name) have no last name.
                lastname = ""

            # Boolean directly from the emptiness test (was a 4-line if/else).
            vfho = personline[header["VfHO member"]] != ''

            lookupAttribs = {'first_name': firstname, 'last_name': (lastname or "")}
            nonLookupAttribs = {'is_vfho': vfho, 'fullname': fullname}
            person, created = save_carefully(models.Person, lookupAttribs, nonLookupAttribs)
            parseMugShotAndBlurb(personline=personline, header=header, person=person)

            # make person expedition from table: year columns start at index 5
            for year, attended in list(zip(headers, personline))[5:]:
                if attended == "1" or attended == "-1":
                    # Only hit the database for years actually attended
                    # (the original queried for every year column).
                    expedition = models.Expedition.objects.get(year=year)
                    lookupAttribs = {'person': person, 'expedition': expedition}
                    nonLookupAttribs = {'nickname': nickname, 'is_guest': (personline[header["Guest"]] == "1")}
                    save_carefully(models.PersonExpedition, lookupAttribs, nonLookupAttribs)
# used in other referencing parser functions
# Module-level cache for GetPersonExpeditionNameLookup, keyed by expedition
# name; the name lookup is cached for speed (it's a very big list).
Gpersonexpeditionnamelookup = { }
def GetPersonExpeditionNameLookup(expedition):
    """Return a dict mapping possible name strings -> PersonExpedition
    for everyone on the given expedition.

    Builds every plausible spelling (first+last, first+initial, nickname
    combinations, etc.), all lower-cased and unidecoded.  Names that match
    more than one person are dropped as ambiguous.  The result is cached in
    Gpersonexpeditionnamelookup by expedition name for speed.
    """
    global Gpersonexpeditionnamelookup
    res = Gpersonexpeditionnamelookup.get(expedition.name)
    if res:
        return res
    res = { }
    duplicates = set()
    #print("Calculating GetPersonExpeditionNameLookup for " + expedition.year)
    personexpeditions = models.PersonExpedition.objects.filter(expedition=expedition)
    for personexpedition in personexpeditions:
        possnames = [ ]
        # html.unescape replaces HTMLParser().unescape, which was deprecated
        # since Python 3.4 and removed in Python 3.9.
        f = unidecode(html.unescape(personexpedition.person.first_name.lower()))
        l = unidecode(html.unescape(personexpedition.person.last_name.lower()))
        full = unidecode(html.unescape(personexpedition.person.fullname.lower()))
        nick = personexpedition.nickname.lower()
        if l:
            possnames.append(f + " " + l)
            possnames.append(f + " " + l[0])
            possnames.append(f + l[0])
            possnames.append(f[0] + " " + l)
        possnames.append(f)
        if full not in possnames:
            possnames.append(full)
        # The original tested the un-lowered nickname for membership but
        # appended the lowered form, so it could add duplicates; compare
        # the lowered form consistently.
        if nick not in possnames:
            possnames.append(nick)
        if l:
            # This allows for nickname to be used for short name eg Phil
            # adding Phil Sargent to the list
            if (nick + " " + l) not in possnames:
                possnames.append(nick + " " + l)
            if (nick + " " + l[0]) not in possnames:
                possnames.append(nick + " " + l[0])
            if (nick + l[0]) not in possnames:
                possnames.append(nick + l[0])
        for possname in possnames:
            if possname in res:
                duplicates.add(possname)
            else:
                res[possname] = personexpedition
    # Ambiguous names can't be resolved to one person: drop them entirely.
    for possname in duplicates:
        del res[possname]
    Gpersonexpeditionnamelookup[expedition.name] = res
    return res