troggle/parsers/people.py

from django.conf import settings
import troggle.core.models as models
import csv, re, datetime, os, shutil
from utils import save_carefully
from html.parser import HTMLParser
from unidecode import unidecode

def parseMugShotAndBlurb(personline, header, person):
    """Fill in a person's blurb from their old HTML page, then save the person.

    The "Mugshot" column of the CSV row names a file below
    ``<settings.EXPOWEB>/folk``.  If that name ends in ``htm`` and the person
    has no blurb yet, everything from ``<body>`` up to the last ``<hr`` in
    the page is stored, markup included, in ``person.blurb``.  Names ending
    in ``jpg`` (or anything else) are ignored at present; the photo import
    that used to handle them is disabled.

    Args:
        personline: one CSV row, as a list of strings.
        header: mapping from column title to index into ``personline``.
        person: the model instance to update; it is saved unconditionally.

    A page that cannot be opened, or that has no ``<body>...<hr`` section,
    is reported on stdout and otherwise ignored.
    """
    mugShotFilename = personline[header["Mugshot"]]
    mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)

    # Only read the page when its content would actually be used.
    if mugShotPath.endswith('htm') and not person.blurb:
        try:
            with open(mugShotPath, 'r') as personPage:
                personPageOld = personPage.read()
        except OSError:
            # Broken link: fall through to the error report below rather
            # than aborting the whole import.
            personPageOld = ""

        # TODO: this needs to be refined: take care of the HTML and make sure
        # the (greedy) match does not run beyond the blurb.
        pblurb = re.search('<body>.*<hr', personPageOld, re.DOTALL)
        if pblurb:
            person.blurb = pblurb.group()
        else:
            print("ERROR: --------------- Broken link or Blurb parse error in ", mugShotFilename)

    person.save()

def LoadPersonsExpos():
    """Load expeditions, people and attendance from ``folk/folk.csv``.

    The first CSV row is the header.  Every column from index 5 onwards is
    taken to be an expedition year, and one Expedition named
    ``"CUCC expo <year>"`` is saved per column.  Each following row
    describes one person:

    * "Name" holds the full name, optionally followed by a nickname in
      brackets; HTML tags are stripped from it first.
    * "Lastname" holds the last name; it is discarded when the full name is
      a single word.
    * A non-empty "VfHO member" cell sets ``is_vfho``.
    * A year cell of "1" or "-1" records attendance as a PersonExpedition,
      flagged ``is_guest`` when the "Guest" cell is "1".

    Blank rows are skipped.  A row whose name cannot be parsed is reported
    on stdout and skipped.
    """
    # Leading run of name characters, optionally followed by "(nickname)".
    name_re = re.compile(r"^([\w&;\s]+)(?:\(([^)]*)\))?")

    # newline="" is what the csv module asks for so that quoted fields
    # containing line breaks are read correctly.
    with open(os.path.join(settings.EXPOWEB, "folk", "folk.csv"), newline="") as persontab:
        personreader = csv.reader(persontab)
        headers = next(personreader)
        header = {title: index for index, title in enumerate(headers)}

        # make expeditions
        print(" - Loading expeditions")
        years = headers[5:]
        expeditions = {}
        for year in years:
            lookupAttribs = {'year': year}
            nonLookupAttribs = {'name': "CUCC expo %s" % year}
            # Keep the instance so the attendance loop below does not need a
            # database query per person per year.
            expeditions[year], _ = save_carefully(models.Expedition, lookupAttribs, nonLookupAttribs)

        # make persons
        print(" - Loading personexpeditions")
        for personline in personreader:
            if not personline:
                # csv.reader yields an empty list for a blank line
                continue

            name = re.sub(r"<.*?>", "", personline[header["Name"]])
            splitnick = name_re.match(name)
            if splitnick is None:
                print("ERROR: --------------- Unparseable name in folk.csv: ", name)
                continue
            fullname = splitnick.group(1).strip()
            nickname = splitnick.group(2) or ""

            # An empty or unparseable Lastname cell means "no last name".
            matchlastname = name_re.match(personline[header["Lastname"]].strip())
            lastname = matchlastname.group(1).strip() if matchlastname else ""

            names = fullname.split(' ')
            firstname = names[0]
            if len(names) == 1:
                # single-word names have no last name, whatever the CSV says
                lastname = ""

            vfho = personline[header["VfHO member"]] != ''

            lookupAttribs = {'first_name': firstname, 'last_name': lastname}
            nonLookupAttribs = {'is_vfho': vfho, 'fullname': fullname}
            person, _ = save_carefully(models.Person, lookupAttribs, nonLookupAttribs)

            parseMugShotAndBlurb(personline=personline, header=header, person=person)

            # make person expedition from table
            for year, attended in zip(years, personline[5:]):
                if attended == "1" or attended == "-1":
                    lookupAttribs = {'person': person, 'expedition': expeditions[year]}
                    nonLookupAttribs = {'nickname': nickname, 'is_guest': (personline[header["Guest"]] == "1")}
                    save_carefully(models.PersonExpedition, lookupAttribs, nonLookupAttribs)


# Used by the other parsers to resolve a written name to a PersonExpedition.
# The per-expedition lookup is cached for speed (it's a very big list);
# keys are expedition names.
Gpersonexpeditionnamelookup = { }
def GetPersonExpeditionNameLookup(expedition):
    """Return a mapping from lower-case name variants to PersonExpedition.

    For everyone on ``expedition`` the following variants are generated from
    the first name, last name, full name (HTML entities decoded and accents
    reduced to plain characters) and nickname:

    * "first last", "first l", "firstl", "f last"
    * first name alone, full name, nickname alone
    * "nick last", "nick l", "nickl"

    A variant produced for more than one person on the expedition is
    ambiguous and is left out of the result.  The result is cached in
    ``Gpersonexpeditionnamelookup`` under ``expedition.name`` and the same
    dict object is returned on later calls.
    """
    res = Gpersonexpeditionnamelookup.get(expedition.name)
    if res is not None:
        # An empty lookup is a valid cached result too.
        return res

    # html.unescape replaces HTMLParser.unescape, which was removed in
    # Python 3.9.
    from html import unescape

    res = { }
    duplicates = set()

    personexpeditions = models.PersonExpedition.objects.filter(expedition=expedition)
    for personexpedition in personexpeditions:
        person = personexpedition.person
        f = unidecode(unescape(person.first_name.lower()))
        l = unidecode(unescape(person.last_name.lower()))
        full = unidecode(unescape(person.fullname.lower()))
        nick = (personexpedition.nickname or "").lower()

        possnames = [ ]
        if l:
            possnames.append(f + " " + l)
            possnames.append(f + " " + l[0])
            possnames.append(f + l[0])
            if f:
                possnames.append(f[0] + " " + l)
        possnames.append(f)
        possnames.append(full)
        if nick:
            possnames.append(nick)
            if l:
                # This allows for nickname to be used for short name eg Phil
                # adding Phil Sargent to the list
                possnames.append(nick + " " + l)
                possnames.append(nick + " " + l[0])
                possnames.append(nick + l[0])

        # De-duplicate within this one person first (order preserved), so a
        # variant that merely repeats for the same person - e.g. a nickname
        # equal to the first name - is not mistaken for a clash between two
        # people.  Empty strings are never useful as a name.
        for possname in dict.fromkeys(possnames):
            if not possname:
                continue
            if possname in res:
                duplicates.add(possname)
            else:
                res[possname] = personexpedition

    for possname in duplicates:
        del res[possname]

    Gpersonexpeditionnamelookup[expedition.name] = res
    return res
[svn] Fix leftover from expo -> core rename, and add databaseReset.py to README.txt 2009-07-02 22:31:28 +01:00			`from django.conf import settings`
Django 1.7 mostly working. Big refactor so probably bugs 2018-04-15 16:28:13 +01:00			`import troggle.core.models as models`
[svn] Fix leftover from expo -> core rename, and add databaseReset.py to README.txt 2009-07-02 22:31:28 +01:00			`import csv, re, datetime, os, shutil`
[svn] Brief code cleanup. 2009-07-03 05:31:49 +01:00			`from utils import save_carefully`
Convert codebase for python3 usage 2020-05-24 01:57:06 +01:00			`from html.parser import HTMLParser`
Allow comments against names in logbooks in brackets Convert accent chars in names into simple chars as this is what people enter in the logbook 2019-07-11 12:29:38 +01:00			`from unidecode import unidecode`
[svn] Photo file handling and mugshots parsing sorted. Made URL settings more relative, less redundant. Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8246 by aaron @ 2/18/2009 6:45 AM 2009-05-13 05:53:37 +01:00
			`def parseMugShotAndBlurb(personline, header, person):`
[svn] Brief code cleanup. 2009-07-03 05:31:49 +01:00			`"""create mugshot Photo instance"""`
[svn] Photo file handling and mugshots parsing sorted. Made URL settings more relative, less redundant. Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8246 by aaron @ 2/18/2009 6:45 AM 2009-05-13 05:53:37 +01:00			`mugShotFilename=personline[header["Mugshot"]]`
			`mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)`
[svn] Julian playing with the logbooks and expoyears Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8180 by julian @ 1/18/2009 3:59 PM 2009-05-13 05:35:59 +01:00			`if mugShotPath[-3:]=='jpg': #if person just has an image, add it`
Remove PHOTOS_ROOT and DPhoto class 2020-05-15 21:32:55 +01:00			`#saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person)`
			`pass`
[svn] Julian playing with the logbooks and expoyears Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8180 by julian @ 1/18/2009 3:59 PM 2009-05-13 05:35:59 +01:00			`elif mugShotPath[-3:]=='htm': #if person has an html page, find the image(s) and add it. Also, add the text from the html page to the "blurb" field in his model instance.`
			`personPageOld=open(mugShotPath,'r').read()`
[svn] 2009-05-19 06:32:42 +01:00			`if not person.blurb:`
Validation of mugshot or blrub file added 2020-04-01 19:58:31 +01:00			`pblurb=re.search('<body>.*<hr',personPageOld,re.DOTALL)`
			`if pblurb:`
			`#this needs to be refined, take care of the HTML and make sure it doesn't match beyond the blurb.`
			`#Only finds the first image, not all of them`
			`person.blurb=re.search('<body>.*<hr',personPageOld,re.DOTALL).group()`
			`else:`
Convert codebase for python3 usage 2020-05-24 01:57:06 +01:00			`print("ERROR: --------------- Broken link or Blurb parse error in ", mugShotFilename)`
Remove PHOTOS_ROOT and DPhoto class 2020-05-15 21:32:55 +01:00			`#for mugShotFilename in re.findall('i/.*?jpg',personPageOld,re.DOTALL):`
			`# mugShotPath = os.path.join(settings.EXPOWEB, "folk", mugShotFilename)`
			`# saveMugShot(mugShotPath=mugShotPath, mugShotFilename=mugShotFilename, person=person)`
[svn] Photo file handling and mugshots parsing sorted. Made URL settings more relative, less redundant. Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8246 by aaron @ 2/18/2009 6:45 AM 2009-05-13 05:53:37 +01:00			`person.save()`
[svn] Julian playing with the logbooks and expoyears Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8180 by julian @ 1/18/2009 3:59 PM 2009-05-13 05:35:59 +01:00
			`def LoadPersonsExpos():`

folk.csv has moved into 'folk' dir out of 'noinfo' 2019-04-02 00:57:13 +01:00			`persontab = open(os.path.join(settings.EXPOWEB, "folk", "folk.csv"))`
[svn] Julian playing with the logbooks and expoyears Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8180 by julian @ 1/18/2009 3:59 PM 2009-05-13 05:35:59 +01:00			`personreader = csv.reader(persontab)`
Convert codebase for python3 usage 2020-05-24 01:57:06 +01:00			`headers = next(personreader)`
			`header = dict(list(zip(headers, list(range(len(headers))))))`
[svn] Julian playing with the logbooks and expoyears Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8180 by julian @ 1/18/2009 3:59 PM 2009-05-13 05:35:59 +01:00
[svn] yorkshire work with tunnel integration Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8232 by julian @ 1/29/2009 11:40 PM 2009-05-13 05:48:47 +01:00			`# make expeditions`
Thorough spring clean and profiling 2020-04-27 23:51:41 +01:00			`print(" - Loading expeditions")`
[svn] Julian playing with the logbooks and expoyears Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8180 by julian @ 1/18/2009 3:59 PM 2009-05-13 05:35:59 +01:00			`years = headers[5:]`
[svn] Photo file handling and mugshots parsing sorted. Made URL settings more relative, less redundant. Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8246 by aaron @ 2/18/2009 6:45 AM 2009-05-13 05:53:37 +01:00
[svn] Julian playing with the logbooks and expoyears Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8180 by julian @ 1/18/2009 3:59 PM 2009-05-13 05:35:59 +01:00			`for year in years:`
[svn] 2009-05-19 06:32:42 +01:00			`lookupAttribs = {'year':year}`
			`nonLookupAttribs = {'name':"CUCC expo %s" % year}`

			`save_carefully(models.Expedition, lookupAttribs, nonLookupAttribs)`
[svn] Julian playing with the logbooks and expoyears Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8180 by julian @ 1/18/2009 3:59 PM 2009-05-13 05:35:59 +01:00
[svn] yorkshire work with tunnel integration Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8232 by julian @ 1/29/2009 11:40 PM 2009-05-13 05:48:47 +01:00			`# make persons`
Thorough spring clean and profiling 2020-04-27 23:51:41 +01:00			`print(" - Loading personexpeditions")`
[svn] Julian playing with the logbooks and expoyears Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8180 by julian @ 1/18/2009 3:59 PM 2009-05-13 05:35:59 +01:00
[svn] yorkshire work with tunnel integration Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8232 by julian @ 1/29/2009 11:40 PM 2009-05-13 05:48:47 +01:00			`for personline in personreader:`
			`name = personline[header["Name"]]`
Update new management command for DB reset Switch to content_type from mimetype Make DB reset not nuke so much Tidy logbook parser 2019-03-30 13:58:38 +00:00			`name = re.sub(r"<.*?>", "", name)`
Support html and wiki logbook entrys Move nearest_station to nearest_station_name and make nearest_station a foreign key to SurvexStation Lots of tidying 2019-03-31 15:39:53 +01:00
Updating caves and entrances is no longer nuclear! Big overhaul of people processing, fullname added to the model lastname is now names -1 unless you only have one (yes you Wookey) this allows for Jon Arne Toft and Wookey to live it the same DB names can now have html chars in them, this should be real unicode but that can only happen when we go to Python 3! 2019-04-19 22:52:54 +01:00			`firstname = ""`
			`nickname = ""`

			`rawlastname = personline[header["Lastname"]].strip()`
			`matchlastname = re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?", rawlastname)`
			`lastname = matchlastname.group(1).strip()`

			`splitnick = re.match(r"^([\w&;\s]+)(?:\(([^)]*)\))?", name)`
			`fullname = splitnick.group(1)`

			`nickname = splitnick.group(2) or ""`

			`fullname = fullname.strip()`
			`names = fullname.split(' ')`
			`firstname = names[0]`
			`if len(names) == 1:`
			`lastname = ""`

Fully working dj 1.11.29 2020-06-19 16:39:05 +01:00			`if personline[header["VfHO member"]] =='':`
			`vfho = False`
			`else:`
			`vfho = True`

Updating caves and entrances is no longer nuclear! Big overhaul of people processing, fullname added to the model lastname is now names -1 unless you only have one (yes you Wookey) this allows for Jon Arne Toft and Wookey to live it the same DB names can now have html chars in them, this should be real unicode but that can only happen when we go to Python 3! 2019-04-19 22:52:54 +01:00			`lookupAttribs={'first_name':firstname, 'last_name':(lastname or "")}`
Fully working dj 1.11.29 2020-06-19 16:39:05 +01:00			`nonLookupAttribs={'is_vfho':vfho, 'fullname':fullname}`
[svn] 2009-05-19 06:32:42 +01:00			`person, created = save_carefully(models.Person, lookupAttribs, nonLookupAttribs)`
Support html and wiki logbook entrys Move nearest_station to nearest_station_name and make nearest_station a foreign key to SurvexStation Lots of tidying 2019-03-31 15:39:53 +01:00
[svn] Reverted the reverts from 8267. Fixed the next / previous trip in personexpedition on the LogbookEntry template -- I had misunderstood what this was supposed to do last time I messed with it. This involved adding the methods PersonTrip.get_persons_next_trip and persons_previous_trip. Couldn't find any other broken things. Kept the productive changes in 8267: extending the logbook parsing back to 1993, changing index page, changes to view_surveys.py Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8274 by aaron @ 3/14/2009 8:38 AM 2009-05-13 06:02:42 +01:00			`parseMugShotAndBlurb(personline=personline, header=header, person=person)`
[svn] Julian playing with the logbooks and expoyears Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8180 by julian @ 1/18/2009 3:59 PM 2009-05-13 05:35:59 +01:00
[svn] yorkshire work with tunnel integration Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8232 by julian @ 1/29/2009 11:40 PM 2009-05-13 05:48:47 +01:00			`# make person expedition from table`
Convert codebase for python3 usage 2020-05-24 01:57:06 +01:00			`for year, attended in list(zip(headers, personline))[5:]:`
[svn] yorkshire work with tunnel integration Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8232 by julian @ 1/29/2009 11:40 PM 2009-05-13 05:48:47 +01:00			`expedition = models.Expedition.objects.get(year=year)`
[svn] Julian playing with the logbooks and expoyears Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8180 by julian @ 1/18/2009 3:59 PM 2009-05-13 05:35:59 +01:00			`if attended == "1" or attended == "-1":`
[svn] 2009-05-19 06:32:42 +01:00			`lookupAttribs = {'person':person, 'expedition':expedition}`
fix schema and try cache caves import 2020-07-06 20:27:31 +01:00			`nonLookupAttribs = {'nickname':nickname, 'is_guest':(personline[header["Guest"]] == "1")}`
[svn] 2009-05-19 06:32:42 +01:00			`save_carefully(models.PersonExpedition, lookupAttribs, nonLookupAttribs)`
[svn] Julian playing with the logbooks and expoyears Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8180 by julian @ 1/18/2009 3:59 PM 2009-05-13 05:35:59 +01:00
[svn] Photo file handling and mugshots parsing sorted. Made URL settings more relative, less redundant. Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8246 by aaron @ 2/18/2009 6:45 AM 2009-05-13 05:53:37 +01:00
[svn] yorkshire work with tunnel integration Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8232 by julian @ 1/29/2009 11:40 PM 2009-05-13 05:48:47 +01:00			`# used in other referencing parser functions`
[svn] survey block object Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8199 by julian @ 1/19/2009 12:22 AM 2009-05-13 05:39:52 +01:00			`# expedition name lookup cached for speed (it's a very big list)`
			`Gpersonexpeditionnamelookup = { }`
			`def GetPersonExpeditionNameLookup(expedition):`
			`global Gpersonexpeditionnamelookup`
			`res = Gpersonexpeditionnamelookup.get(expedition.name)`
			`if res:`
			`return res`

[svn] yorkshire work with tunnel integration Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8232 by julian @ 1/29/2009 11:40 PM 2009-05-13 05:48:47 +01:00			`res = { }`
[svn] survey block object Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8199 by julian @ 1/19/2009 12:22 AM 2009-05-13 05:39:52 +01:00			`duplicates = set()`

add mysql startup documentation 2020-05-14 19:37:46 +01:00			`#print("Calculating GetPersonExpeditionNameLookup for " + expedition.year)`
[svn] survey block object Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8199 by julian @ 1/19/2009 12:22 AM 2009-05-13 05:39:52 +01:00			`personexpeditions = models.PersonExpedition.objects.filter(expedition=expedition)`
Allow comments against names in logbooks in brackets Convert accent chars in names into simple chars as this is what people enter in the logbook 2019-07-11 12:29:38 +01:00			`htmlparser = HTMLParser()`
[svn] survey block object Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8199 by julian @ 1/19/2009 12:22 AM 2009-05-13 05:39:52 +01:00			`for personexpedition in personexpeditions:`
			`possnames = [ ]`
Allow comments against names in logbooks in brackets Convert accent chars in names into simple chars as this is what people enter in the logbook 2019-07-11 12:29:38 +01:00			`f = unidecode(htmlparser.unescape(personexpedition.person.first_name.lower()))`
			`l = unidecode(htmlparser.unescape(personexpedition.person.last_name.lower()))`
			`full = unidecode(htmlparser.unescape(personexpedition.person.fullname.lower()))`
[svn] survey block object Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8199 by julian @ 1/19/2009 12:22 AM 2009-05-13 05:39:52 +01:00			`if l:`
			`possnames.append(f + " " + l)`
			`possnames.append(f + " " + l[0])`
			`possnames.append(f + l[0])`
			`possnames.append(f[0] + " " + l)`
			`possnames.append(f)`
Updating caves and entrances is no longer nuclear! Big overhaul of people processing, fullname added to the model lastname is now names -1 unless you only have one (yes you Wookey) this allows for Jon Arne Toft and Wookey to live it the same DB names can now have html chars in them, this should be real unicode but that can only happen when we go to Python 3! 2019-04-19 22:52:54 +01:00			`if full not in possnames:`
			`possnames.append(full)`
			`if personexpedition.nickname not in possnames:`
[svn] survey block object Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8199 by julian @ 1/19/2009 12:22 AM 2009-05-13 05:39:52 +01:00			`possnames.append(personexpedition.nickname.lower())`
Updating caves and entrances is no longer nuclear! Big overhaul of people processing, fullname added to the model lastname is now names -1 unless you only have one (yes you Wookey) this allows for Jon Arne Toft and Wookey to live it the same DB names can now have html chars in them, this should be real unicode but that can only happen when we go to Python 3! 2019-04-19 22:52:54 +01:00			`if l:`
			`# This allows for nickname to be used for short name eg Phil`
			`# adding Phil Sargent to the list`
			`if str(personexpedition.nickname.lower() + " " + l) not in possnames:`
			`possnames.append(personexpedition.nickname.lower() + " " + l)`
			`if str(personexpedition.nickname.lower() + " " + l[0]) not in possnames:`
			`possnames.append(personexpedition.nickname.lower() + " " + l[0])`
Allow comments against names in logbooks in brackets Convert accent chars in names into simple chars as this is what people enter in the logbook 2019-07-11 12:29:38 +01:00			`if str(personexpedition.nickname.lower() + l[0]) not in possnames:`
			`possnames.append(personexpedition.nickname.lower() + l[0])`
[svn] survey block object Copied from http://cucc@cucc.survex.com/svn/trunk/expoweb/troggle/, rev. 8199 by julian @ 1/19/2009 12:22 AM 2009-05-13 05:39:52 +01:00
			`for possname in possnames:`
			`if possname in res:`
			`duplicates.add(possname)`
			`else:`
			`res[possname] = personexpedition`

			`for possname in duplicates:`
			`del res[possname]`

			`Gpersonexpeditionnamelookup[expedition.name] = res`
			`return res`