2009-07-02 22:31:28 +01:00
import csv , re , datetime , os , shutil
2020-05-24 01:57:06 +01:00
from html . parser import HTMLParser
2019-07-11 12:29:38 +01:00
from unidecode import unidecode
2009-05-13 05:53:37 +01:00
2021-04-13 00:11:08 +01:00
from django . conf import settings
import troggle . core . models as models
from troggle . core . utils import save_carefully
''' These functions do not match how the stand-alone folk script works. So the script produces an HTML file which has
2021-02-06 00:18:48 +00:00
href links to pages in troggle which troggle does not think are right .
The standalone script needs to be rendered defunct, and all the parsing needs to be in troggle. Either that,
or they should use the same code by importing a module .
'''
2009-05-13 05:53:37 +01:00
def parseMugShotAndBlurb(personline, header, person):
    """Fill in ``person.blurb`` from the person's folk page, then save the person.

    The "Mugshot" column of the row names a file below
    ``settings.EXPOWEB/folk``:

    * a name ending in ``jpg`` is a bare image; nothing is done with it
      (the mugshot-saving call is commented out);
    * a name ending in ``htm`` is an HTML page. The page is read and, if the
      person has no blurb yet, the text from ``<body>`` up to the last
      ``<hr`` in the page is stored as the blurb. If that pattern is not
      found an error line is printed and the blurb is left unset.

    Args:
        personline: one row of the folk table, indexable by column number.
        header: mapping from column name to its index in ``personline``.
        person: object with a ``blurb`` attribute and a ``save()`` method;
            ``save()`` is called on it in every case.

    Raises:
        OSError: if the ``htm`` page named in the row cannot be opened.
    """
    mugShotFilename = personline[header["Mugshot"]]
    mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)

    if mugShotPath.endswith('jpg'):
        # Person just has an image. Saving it is currently disabled:
        # saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person)
        pass
    elif mugShotPath.endswith('htm'):
        # Person has an html page: use its text for the "blurb" field.
        # The context manager closes the file even if read() fails.
        with open(mugShotPath, 'r') as personPage:
            personPageOld = personPage.read()

        if not person.blurb:
            # This needs to be refined: take care of the HTML and make sure it
            # doesn't match beyond the blurb (".*" is greedy, so the match
            # runs to the LAST "<hr" in the page).
            pblurb = re.search('<body>.*<hr', personPageOld, re.DOTALL)
            if pblurb:
                person.blurb = pblurb.group()
            else:
                print(" ERROR: --------------- Broken link or Blurb parse error in ", mugShotFilename)

        # Disabled: collect the image(s) referenced from the page.
        # for mugShotFilename in re.findall('i/.*?jpg', personPageOld, re.DOTALL):
        #     mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)
        #     saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person)

    person.save()
2009-05-13 05:35:59 +01:00
def LoadPersonsExpos():
    """Load expeditions, people and their attendance from ``folk/folk.csv``.

    The first CSV row is the header. Columns from index 5 onwards are treated
    as expedition years: one ``Expedition`` is saved per year column. Every
    following row describes one person:

    * "Name" (with any ``<...>`` markup removed) gives the full name and an
      optional nickname in parentheses; the first word is the first name.
    * "Lastname" gives the last name (text before any parenthesised part).
      A person whose name is a single word gets an empty last name.
    * "VfHO member" non-empty marks the person as ``is_vfho``.
    * "Mugshot" is handled by ``parseMugShotAndBlurb``.
    * A year cell equal to "1" or "-1" creates a ``PersonExpedition`` for
      that year, with ``is_guest`` true when the "Guest" cell is "1".

    Blank rows are skipped. A row whose name cannot be parsed is reported
    with a printed error and skipped.
    """
    folk_csv = os.path.join(settings.EXPOWEB, "folk", "folk.csv")
    # The csv module documentation asks for newline="" on files given to
    # csv.reader. Rows are read up front so the file is closed promptly.
    with open(folk_csv, newline="") as persontab:
        personreader = csv.reader(persontab)
        headers = next(personreader)
        personlines = list(personreader)

    header = {column: index for index, column in enumerate(headers)}

    # make expeditions
    print(" - Loading expeditions ")
    years = headers[5:]
    for year in years:
        lookupAttribs = {'year': year}
        nonLookupAttribs = {'name': "CUCC expo %s" % year}
        save_carefully(models.Expedition, lookupAttribs, nonLookupAttribs)

    # make persons
    print(" - Loading personexpeditions ")

    # "Some Name (nick)" -> group 1 = "Some Name ", group 2 = "nick" (optional)
    name_pattern = re.compile(r"^([\w&;\s]+)(?:\(([^)]*)\))?")
    # Expedition objects fetched so far, keyed by year, so each year is
    # queried at most once instead of once per person per year column.
    expeditions = {}

    for personline in personlines:
        if not personline:
            # csv.reader yields an empty list for a blank line
            continue

        name = re.sub(r"<.*?>", "", personline[header["Name"]])
        splitnick = name_pattern.match(name)
        if splitnick is None:
            print(" ERROR: --------------- Unparseable name in folk.csv row ", personline)
            continue

        fullname = splitnick.group(1).strip()
        nickname = splitnick.group(2) or ""
        names = fullname.split(' ')
        firstname = names[0]

        rawlastname = personline[header["Lastname"]].strip()
        matchlastname = name_pattern.match(rawlastname)
        # An empty "Lastname" cell does not match the pattern; treat it as
        # "no last name" rather than failing on None.group().
        lastname = matchlastname.group(1).strip() if matchlastname else ""
        if len(names) == 1:
            lastname = ""

        vfho = personline[header["VfHO member"]] != ''

        lookupAttribs = {'first_name': firstname, 'last_name': (lastname or "")}
        nonLookupAttribs = {'is_vfho': vfho, 'fullname': fullname}
        person, created = save_carefully(models.Person, lookupAttribs, nonLookupAttribs)

        parseMugShotAndBlurb(personline=personline, header=header, person=person)

        # make person expedition from table
        for year, attended in zip(years, personline[5:]):
            if attended not in ("1", "-1"):
                continue
            if year not in expeditions:
                expeditions[year] = models.Expedition.objects.get(year=year)
            lookupAttribs = {'person': person, 'expedition': expeditions[year]}
            nonLookupAttribs = {
                'nickname': nickname,
                'is_guest': (personline[header["Guest"]] == "1"),
            }
            save_carefully(models.PersonExpedition, lookupAttribs, nonLookupAttribs)
2009-05-13 05:35:59 +01:00
2009-05-13 05:53:37 +01:00
2009-05-13 05:48:47 +01:00
# used in other referencing parser functions
2009-05-13 05:39:52 +01:00
# expedition name lookup cached for speed (it's a very big list)
# Cache of name lookups, keyed by expedition.name; filled by
# GetPersonExpeditionNameLookup().
Gpersonexpeditionnamelookup = {}


def GetPersonExpeditionNameLookup(expedition):
    """Return a dict mapping lower-case name variants to PersonExpedition.

    For every PersonExpedition of ``expedition`` the following variants are
    generated from the person's first name, last name, full name (each
    lower-cased, HTML-unescaped and passed through ``unidecode``) and from
    the lower-cased nickname:

    * "first last", "first l", "firstl", "f last" (only with a last name)
    * "first", the full name
    * nickname, "nick last", "nick l", "nickl" (only with a nickname)

    A variant produced by more than one PersonExpedition is ambiguous and is
    left out of the result. Variants repeated for the *same* person (for
    example a nickname equal to the first name) are counted once, so they do
    not make that person ambiguous with themselves. Empty variants are
    dropped.

    The result is cached in ``Gpersonexpeditionnamelookup`` under
    ``expedition.name`` and the cached dict is returned on later calls.

    Args:
        expedition: object with a ``name`` attribute, usable as the
            ``expedition`` filter value for ``models.PersonExpedition``.

    Returns:
        dict of name variant (str) -> PersonExpedition.
    """
    # HTMLParser().unescape() was removed in Python 3.9; html.unescape is the
    # documented replacement. Imported here so the block stands on its own.
    from html import unescape

    res = Gpersonexpeditionnamelookup.get(expedition.name)
    # "is not None" so that an empty (but already computed) lookup is reused.
    if res is not None:
        return res

    res = {}
    duplicates = set()

    personexpeditions = models.PersonExpedition.objects.filter(expedition=expedition)
    for personexpedition in personexpeditions:
        person = personexpedition.person
        first = unidecode(unescape(person.first_name.lower()))
        last = unidecode(unescape(person.last_name.lower()))
        full = unidecode(unescape(person.fullname.lower()))
        nickname = (personexpedition.nickname or "").lower()

        possnames = []
        if last:
            possnames.append(first + " " + last)
            possnames.append(first + " " + last[0])
            possnames.append(first + last[0])
            if first:
                possnames.append(first[0] + " " + last)
        possnames.append(first)
        possnames.append(full)
        if nickname:
            possnames.append(nickname)
            if last:
                # This allows for nickname to be used for short name eg Phil
                # adding Phil Sargent to the list
                possnames.append(nickname + " " + last)
                possnames.append(nickname + " " + last[0])
                possnames.append(nickname + last[0])

        # dict.fromkeys() removes repeats within this person's own variants
        # while keeping order; filter(None, ...) drops empty strings.
        for possname in dict.fromkeys(filter(None, possnames)):
            if possname in res:
                duplicates.add(possname)
            else:
                res[possname] = personexpedition

    for possname in duplicates:
        del res[possname]

    Gpersonexpeditionnamelookup[expedition.name] = res
    return res