forked from expo/troggle
enabled mugshots & blurb in people pages
This commit is contained in:
@@ -1,11 +1,13 @@
|
||||
import csv, re, datetime, os, shutil
|
||||
from html.parser import HTMLParser
|
||||
from unidecode import unidecode
|
||||
from pathlib import Path
|
||||
|
||||
from django.conf import settings
|
||||
|
||||
from troggle.core.models.troggle import Expedition, Person, PersonExpedition
|
||||
from troggle.core.utils import save_carefully
|
||||
from troggle.core.models.troggle import DataIssue
|
||||
from troggle.core.utils import save_carefully, TROG
|
||||
|
||||
'''These functions do not match how the stand-alone folk script works. So the script produces an HTML file which has
|
||||
href links to pages in troggle which troggle does not think are right.
|
||||
@@ -13,32 +15,59 @@ The standalone script needs to be rendered defunct, and all the parsing needs to
|
||||
or they should use the same code by importing a module.
|
||||
'''
|
||||
|
||||
def parseMugShotAndBlurb(personline, header, person):
|
||||
def parse_blurb(personline, header, person):
|
||||
"""create mugshot Photo instance"""
|
||||
mugShotFilename=personline[header["Mugshot"]]
|
||||
mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)
|
||||
if mugShotPath[-3:]=='jpg': #if person just has an image, add it
|
||||
#saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person)
|
||||
ms_filename = personline[header["Mugshot"]]
|
||||
ms_path = Path(settings.EXPOWEB, "folk", ms_filename)
|
||||
|
||||
if ms_filename:
|
||||
if not ms_path.is_file():
|
||||
message = f"! INVALID mug_shot field '{ms_filename}' for {person.fullname}"
|
||||
print(message)
|
||||
DataIssue.objects.create(parser='people', message=message, url=f"/person/{person.fullname}")
|
||||
return
|
||||
|
||||
if ms_filename.startswith('i/'):
|
||||
#if person just has an image, add it. It has format 'i/adama2018.jpg'
|
||||
person.mug_shot = str(Path("/folk", ms_filename))
|
||||
person.blurb = None
|
||||
|
||||
elif ms_filename.startswith('l/'):
|
||||
# it has the format 'l/ollybetts.htm' the file may contain <img src="../i/mymug.jpg"> images
|
||||
with open(ms_path,'r') as blurbfile:
|
||||
blrb = blurbfile.read()
|
||||
pblurb=re.search(r'<body>.*<hr',blrb,re.DOTALL)
|
||||
if pblurb:
|
||||
person.mug_shot = None
|
||||
fragment= re.search('<body>(.*)<hr',blrb,re.DOTALL).group(1)
|
||||
fragment = fragment.replace('src="../i/', 'src="/folk/i/')
|
||||
fragment = fragment.replace("src='../i/", "src='/folk/i/")
|
||||
fragment = re.sub(r'<h.*>[^<]*</h.>', '', fragment)
|
||||
# replace src="../i/ with src="/folk/i
|
||||
person.blurb = fragment
|
||||
else:
|
||||
message = f"! Blurb parse error in {ms_filename}"
|
||||
print(message)
|
||||
DataIssue.objects.create(parser='people', message=message, url="/folk/")
|
||||
|
||||
elif ms_filename == '':
|
||||
pass
|
||||
elif mugShotPath[-3:]=='htm': #if person has an html page, find the image(s) and add it. Also, add the text from the html page to the "blurb" field in his model instance.
|
||||
personPageOld=open(mugShotPath,'r').read()
|
||||
if not person.blurb:
|
||||
pblurb=re.search('<body>.*<hr',personPageOld,re.DOTALL)
|
||||
if pblurb:
|
||||
#this needs to be refined, take care of the HTML and make sure it doesn't match beyond the blurb.
|
||||
#Only finds the first image, not all of them
|
||||
person.blurb=re.search('<body>.*<hr',personPageOld,re.DOTALL).group()
|
||||
else:
|
||||
print("ERROR: --------------- Broken link or Blurb parse error in ", mugShotFilename)
|
||||
#for mugShotFilename in re.findall('i/.*?jpg',personPageOld,re.DOTALL):
|
||||
# mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)
|
||||
# saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person)
|
||||
else:
|
||||
message = f"! Unrecognised type of file at mug_shot field '{ms_filename}' for {person.fullname}"
|
||||
print(message)
|
||||
DataIssue.objects.create(parser='people', message=message, url="/folk/")
|
||||
|
||||
person.save()
|
||||
|
||||
def LoadPersonsExpos():
|
||||
def load_people_expos():
|
||||
'''This is where the folk.csv file is parsed to read people's names.
|
||||
Which it gets wrong for people like Lydia-Clare Leather and various 'von' and 'de' middle 'names'
|
||||
and McLean and Mclean and McAdam - interaction with the url parser in urls.py too
|
||||
'''
|
||||
DataIssue.objects.filter(parser='people').delete()
|
||||
|
||||
persontab = open(os.path.join(settings.EXPOWEB, "folk", "folk.csv"))
|
||||
personreader = csv.reader(persontab)
|
||||
persontab = open(os.path.join(settings.EXPOWEB, "folk", "folk.csv")) # should really be EXPOFOLK I guess
|
||||
personreader = csv.reader(persontab) # this is an iterator
|
||||
headers = next(personreader)
|
||||
header = dict(list(zip(headers, list(range(len(headers))))))
|
||||
|
||||
@@ -86,7 +115,7 @@ def LoadPersonsExpos():
|
||||
nonLookupAttribs={'is_vfho':vfho, 'fullname':fullname}
|
||||
person, created = save_carefully(Person, lookupAttribs, nonLookupAttribs)
|
||||
|
||||
parseMugShotAndBlurb(personline=personline, header=header, person=person)
|
||||
parse_blurb(personline=personline, header=header, person=person)
|
||||
|
||||
# make person expedition from table
|
||||
for year, attended in list(zip(headers, personline))[5:]:
|
||||
|
||||
Reference in New Issue
Block a user