2008-10-26 21:04:06 +00:00
#.-*- coding: utf-8 -*-
2009-01-07 04:50:55 +00:00
import settings
import expo . models as models
2008-10-26 21:04:06 +00:00
import csv
import re
import datetime
2009-01-15 06:22:54 +00:00
import os
2008-10-26 21:04:06 +00:00
# Open the "folk" spreadsheet once at import time.  LoadPersons() iterates the
# remaining rows of this reader later on, so the handle has to stay open.
persontab = open(os.path.join(settings.EXPOWEB, "noinfo", "folk.csv"))
personreader = csv.reader(persontab)

# The first row holds the column titles; `header` maps each title to its
# column index so that rows can be addressed by name.
headers = personreader.next()
header = dict((title, index) for index, title in enumerate(headers))
def LoadExpos():
    """Rebuild the Expedition table.

    Deletes every existing Expedition, then saves one Expedition for each
    year column of the folk spreadsheet (header columns 5 onwards) plus an
    explicitly added "2008" entry.
    """
    models.Expedition.objects.all().delete()

    # 2008 is added by hand on top of the spreadsheet's own year columns.
    expo_years = headers[5:] + ["2008"]
    for expo_year in expo_years:
        models.Expedition(year=expo_year, name="CUCC expo %s" % expo_year).save()

    print("lll %s" % (expo_years,))
2008-10-26 21:04:06 +00:00
def LoadPersons():
    """Rebuild the Person and PersonExpedition tables from folk.csv.

    Every existing Person and PersonExpedition is deleted first.  Each
    remaining row of the module-level ``personreader`` then produces one
    Person (plus mugshot/blurb via parseMugShotAndBlurb) and one
    PersonExpedition for every year column whose cell is "1" or "-1".

    Finally the hard-coded list of 2008 expedition members is attached to the
    2008 Expedition.  People already created from the spreadsheet are re-used;
    a new Person is created only for those whose first expo was 2008.
    (Previously every 2008 member got a brand-new Person, duplicating anyone
    who was also in the spreadsheet.)
    """
    models.Person.objects.all().delete()
    models.PersonExpedition.objects.all().delete()

    expoers2008 = """Edvin Deadman,Kathryn Hopkins,Djuke Veldhuis,Becka Lawson,Julian Todd,Natalie Uomini,Aaron Curtis,Tony Rooke,Ollie Stevens,Frank Tully,Martin Jahnke,Mark Shinwell,Jess Stirrups,Nial Peters,Serena Povia,Olly Madge,Steve Jones,Pete Harley,Eeva Makiranta,Keith Curtis""".split(",")

    # "First [van |ten ]Last (nickname)": surname and nickname are optional.
    name_pattern = re.compile(r"(\w+)(?:\s((?:van |ten )?\w+))?(?:\s\(([^)]*)\))?")

    # (firstname, lastname) -> (Person, nickname) for everybody in the sheet,
    # so the 2008 pass below can find them again without a database query.
    known_people = {}
    # year -> Expedition, filled lazily; avoids one query per person per year.
    expedition_by_year = {}

    for person in personreader:
        # Strip any HTML markup from the name cell before parsing it.
        name = re.sub("<.*?>", "", person[header["Name"]])
        mname = name_pattern.match(name)
        if not mname:
            print("Skipping row with unparseable name: %r" % (name,))
            continue

        firstname = mname.group(1)
        lastname = mname.group(2) or ""
        nickname = mname.group(3) or ""
        print("%s %s NNN %s" % (firstname, lastname, nickname))

        # NOTE(review): is_vfho receives the raw spreadsheet cell (a string);
        # confirm the model field coerces it as intended.
        pObject = models.Person(first_name=firstname,
                                last_name=lastname,
                                is_vfho=person[header["VfHO member"]],
                                )
        # this is really a per-expo category; not a permanent state
        is_guest = person[header["Guest"]] == "1"
        pObject.save()
        parseMugShotAndBlurb(firstname, lastname, person, header, pObject)
        known_people[(firstname, lastname)] = (pObject, nickname)

        # Columns 5 onwards are the per-year attendance flags.
        for year, attended in list(zip(headers, person))[5:]:
            if year not in expedition_by_year:
                expedition_by_year[year] = models.Expedition.objects.filter(year=year)[0]
            yo = expedition_by_year[year]

            went = attended == "1" or attended == "-1"
            if not went and (firstname, lastname) == ("Mike", "Richardson") and year == "2001":
                # Special case kept from the original loader, which labelled
                # this an "error" (presumably in the spreadsheet data).
                print("Mike Richardson(2001) error")
                went = True
            if went:
                pyo = models.PersonExpedition(person=pObject, expedition=yo,
                                              nickname=nickname, is_guest=is_guest)
                pyo.save()

    # 2008 has no column in the spreadsheet: attach the listed people to the
    # 2008 expedition, creating Person records only for first-timers.
    yo = models.Expedition.objects.filter(year="2008")[0]
    for name in sorted(set(expoers2008)):
        firstname, lastname = name.split()
        is_guest = name in ["Eeva Makiranta", "Keith Curtis"]
        print("2008: %s" % name)

        if (firstname, lastname) in known_people:
            pObject, nickname = known_people[(firstname, lastname)]
        else:
            nickname = ""
            pObject = models.Person(first_name=firstname,
                                    last_name=lastname,
                                    is_vfho=False,
                                    mug_shot="")
            pObject.save()

        pyo = models.PersonExpedition(person=pObject, expedition=yo,
                                      nickname=nickname, is_guest=is_guest)
        pyo.save()
def _savePersonMugshot(firstname, lastname, path, pObject):
    """Create and save a mugshot Photo for the image at `path`, linked to `pObject`."""
    mugShotObj = models.Photo(
        caption="Mugshot for " + firstname + " " + lastname,
        is_mugshot=True,
        file=path,
    )
    # Saved once before the person is attached (presumably so the relation
    # has a saved row to point at) and once afterwards, as the original did.
    mugShotObj.save()
    mugShotObj.contains_person.add(pObject)
    mugShotObj.save()


def parseMugShotAndBlurb(firstname, lastname, person, header, pObject):
    """Attach mugshot photo(s) and, where available, a blurb to a Person.

    firstname, lastname -- used for the photo caption.
    person              -- the spreadsheet row (list of cell strings).
    header              -- mapping of column title to index in `person`.
    pObject             -- the already-saved Person to update.

    The "Mugshot" cell names a file under EXPOWEB/folk.  A name ending in
    "jpg" becomes a single mugshot Photo.  A name ending in "htm" is read as
    the person's old HTML page: the text from "<body>" up to the last "<hr"
    becomes ``pObject.blurb`` and every "i/....jpg" reference in the page
    becomes a mugshot Photo.  ``pObject`` is saved in every case.
    """
    # os.path.join (as used elsewhere in this file) works whether or not
    # settings.EXPOWEB ends with a path separator.
    folk_dir = os.path.join(settings.EXPOWEB, "folk")
    mugShotPath = os.path.join(folk_dir, person[header["Mugshot"]])

    if mugShotPath.endswith("jpg"):
        # The person just has an image: add it.
        _savePersonMugshot(firstname, lastname, mugShotPath, pObject)

    elif mugShotPath.endswith("htm"):
        # The person has an HTML page: harvest its text and images.
        page = open(mugShotPath, "r")
        try:
            personPageOld = page.read()
        finally:
            page.close()

        # TODO: refine this - the HTML is kept verbatim and the greedy match
        # may run beyond the end of the blurb.
        blurb_match = re.search("<body>.*<hr", personPageOld, re.DOTALL)
        if blurb_match:
            pObject.blurb = blurb_match.group()
        else:
            print("No blurb found in %s" % mugShotPath)

        for photoFilename in re.findall("i/.*?jpg", personPageOld, re.DOTALL):
            _savePersonMugshot(firstname, lastname,
                               os.path.join(folk_dir, photoFilename), pObject)

    pObject.save()
2008-10-26 21:04:06 +00:00
#
# the logbook loading section
#
def GetTripPersons(trippeople, expedition):
    """Resolve a logbook "people" string into expedition members.

    trippeople -- names separated by ",", "+", "&amp;", a bare "&" or " and ".
                  A name wrapped in <u>...</u> marks the author of the entry;
                  names starting with "*" are ignored.
    expedition -- object providing GetPersonExpedition(name).

    Returns (res, author).  ``res`` holds the result of
    GetPersonExpedition() for each name in order (a falsy result is reported
    with a "NoMatchFor" message but still appended).  ``author`` is the
    underlined person, else the last person listed, else None when nobody
    was listed at all.
    """
    res = []
    author = None
    # NOTE(review): "&amp;" is restored here on the assumption that the
    # entity was decoded when this file was extracted; a plain "&" alternative
    # in its place would make the lookahead-guarded "&" alternative dead and
    # would cut other HTML entities in half.
    for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
        tripperson = tripperson.strip()
        # Flag passed as an argument: a trailing "(?i)" inside the pattern is
        # rejected by newer versions of the re module.
        mul = re.match(r"<u>(.*?)</u>$", tripperson, re.IGNORECASE)
        if mul:
            tripperson = mul.group(1).strip()
        if tripperson and tripperson[0] != '*':
            personyear = expedition.GetPersonExpedition(tripperson)
            if not personyear:
                print("NoMatchFor: '%s'" % tripperson)
            res.append(personyear)
            if mul:
                author = personyear
    # Fall back to the last person listed; guard against an empty list, which
    # used to raise IndexError.
    if not author and res:
        author = res[-1]
    return res, author
2009-01-04 14:06:01 +00:00
def GetTripCave(place):
    """Find the Cave that a logbook place name refers to.

    Lookup order:
      1. Caves whose official_name equals `place`, plus (when `place` is an
         integer string) caves with that kataster_number.  A single hit wins.
      2. Otherwise an OtherCaveName whose name equals `place`.
      3. Otherwise, if step 1 found several caves, the user is asked on the
         console which one is meant.

    Returns the Cave, or None (after printing a message) when nothing matches.

    TODO: needs to be fuzzier about matching.  Already a very slow function.
    """
    try:
        kataster_number = int(place)
    except ValueError:
        # Not a number, so it cannot be a kataster number.
        katastNumRes = []
    else:
        katastNumRes = list(models.Cave.objects.filter(kataster_number=kataster_number))

    officialNameRes = list(models.Cave.objects.filter(official_name=place))
    tripCaveRes = officialNameRes + katastNumRes
    if len(tripCaveRes) == 1:
        return tripCaveRes[0]

    # Use the record that actually matched exactly.  The previous code tested
    # for an exact match but then returned the first *substring* match, which
    # can belong to a different cave.
    otherNameRes = list(models.OtherCaveName.objects.filter(name=place))
    if otherNameRes:
        return otherNameRes[0].cave

    if len(tripCaveRes) > 1:
        print("Ambiguous place %s entered. Choose from %s" % (place, tripCaveRes))
        # NOTE(review): under Python 2, input() eval()s whatever is typed.
        # Acceptable for an operator-run loader, but consider
        # int(raw_input(...)) if this is ever fed untrusted input.
        correctIndex = input("type list index of correct cave")
        return tripCaveRes[correctIndex]

    print("No cave found for place %s" % place)
    return None
2008-11-08 18:24:03 +00:00
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, tu):
    """Store one logbook entry and a PersonTrip for everyone on the trip.

    The people string is resolved with GetTripPersons() and the place with
    GetTripCave().  A LogbookEntry is saved (title cut to 50 characters, cave
    set only when one was found), followed by one PersonTrip per participant
    that records whether that participant is the entry's author.  A falsy
    `tu` (time underground) is stored as the empty string.
    """
    trippersons, author = GetTripPersons(trippeople, expedition)
    tripCave = GetTripCave(place)

    entry_fields = dict(date=date, place=place, title=title[:50], text=text, author=author)
    lbo = models.LogbookEntry(**entry_fields)
    if tripCave:
        lbo.cave = tripCave
    lbo.save()
    print("ttt %s %s" % (date, place))

    time_underground = tu or ""
    for tripperson in trippersons:
        models.PersonTrip(
            person_expedition=tripperson,
            place=place,
            date=date,
            time_underground=time_underground,
            logbook_entry=lbo,
            is_logbook_entry_author=(tripperson == author),
        ).save()
2008-11-08 18:24:03 +00:00
def ParseDate(tripdate, year):
    """Convert a logbook date string into a ``datetime.date``.

    Two layouts are recognised at the start of `tripdate`:

    * ``YYYY-MM-DD``, whose year must equal `year`;
    * ``D/M/YY`` or ``D/M/YYYY`` (day first).  The century is taken from
      `year`, and an explicit "19"/"20" prefix must agree with it.  Months
      may have one or two digits (two-digit months such as 10-12 used to be
      rejected).

    tripdate -- the date text, already stripped of surrounding whitespace.
    year     -- the expedition year as a four-character string, e.g. "2008".

    Raises ValueError when the text matches neither layout, disagrees with
    `year`, or names an impossible calendar date.  Explicit exceptions are
    used instead of ``assert`` so the checks survive ``python -O``.
    """
    mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
    if mdatestandard:
        if mdatestandard.group(1) != year:
            raise ValueError("date %r is not in expedition year %r" % (tripdate, year))
        return datetime.date(int(mdatestandard.group(1)),
                             int(mdatestandard.group(2)),
                             int(mdatestandard.group(3)))

    mdategoof = re.match(r"(\d\d?)/(\d\d?)/(20|19)?(\d\d)", tripdate)
    if mdategoof:
        century = mdategoof.group(3)
        if century and century != year[:2]:
            raise ValueError("date %r is not in the century of %r" % (tripdate, year))
        # Two-digit years borrow their century from the expedition year.
        yadd = int(year[:2]) * 100
        return datetime.date(int(mdategoof.group(4)) + yadd,
                             int(mdategoof.group(2)),
                             int(mdategoof.group(1)))

    raise ValueError("unrecognised date format: %r" % (tripdate,))
2008-11-08 18:24:03 +00:00
# 2007, 2008, 2006
2008-10-27 02:03:52 +00:00
def Parselogwikitxt(year, expedition, txt):
    """Parse a wiki-text logbook and store every trip found in it.

    Each trip is a "===date|place|people===" heading followed by its text; a
    trip is only picked up when another "===" follows its text.  The cave is
    the part of the place before the first " - ", and the time underground is
    taken from the first "T/U" note in the text, if there is one.  Every trip
    is handed to EnterLogIntoDbase().
    """
    for triphead, triptext in re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt):
        tripheadp = triphead.split("|")
        assert len(tripheadp) == 3, (tripheadp, triptext)
        tripdate, tripplace, trippeople = tripheadp
        tripcave = tripplace.split(" - ")[0].strip()

        # Only the first T/U note counts; the optional hrs/hours unit that the
        # pattern captures is not used.
        tu = ""
        tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
        if tul:
            tu = tul[0][0]

        ldate = ParseDate(tripdate.strip(), year)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=tripplace, text=triptext,
                          trippeople=trippeople, expedition=expedition, tu=tu)
2008-10-26 21:04:06 +00:00
2008-11-08 18:24:03 +00:00
# 2002, 2004, 2005
2008-10-26 21:04:06 +00:00
def Parseloghtmltxt(year, expedition, txt):
    """Parse an HTML logbook that marks trips up with <div class="..."> blocks.

    Trips are the chunks between "<hr />" rules.  Each chunk must consist of
    an optional <a id> anchor, then tripdate, trippeople and triptitle divs,
    the body text and an optional timeug div; anything else fails the assert.
    The cave is the part of the title before the first " - ", or "UNKNOWN"
    when the title has no such separator.  Every trip is handed to
    EnterLogIntoDbase().
    """
    trip_pattern = r'''(?x)\s*(?:<a\s+id="(.*?)"\s*/>)?
                        \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>
                        \s*<div\s+class="trippeople">\s*(.*?)</div>
                        \s*<div\s+class="triptitle">\s*(.*?)</div>
                        ([\s\S]*?)
                        \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
                        \s*$
                     '''
    for trippara in re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt):
        s = re.match(trip_pattern, trippara)
        assert s, trippara

        # The first two groups are the anchor id and the date div's id; they
        # are not used.
        tripdate, trippeople, triptitle, triptext, tu = s.groups()[2:]
        ldate = ParseDate(tripdate.strip(), year)

        # Expand two abbreviated names before the people are looked up.
        trippeople = re.sub("Ol(?!l)", "Olly", trippeople)
        trippeople = re.sub("Wook(?!e)", "Wookey", trippeople)

        triptitles = triptitle.split(" - ")
        tripcave = "UNKNOWN"
        if len(triptitles) >= 2:
            tripcave = triptitles[0]

        # Flatten the HTML: drop </p>, join wrapped lines with a space and
        # turn each <p> into a blank line.
        unclosed = re.sub("</p>", "", triptext)
        unwrapped = re.sub(r"\s*?\n\s*", " ", unclosed)
        ltriptext = re.sub("<p>", "\n\n", unwrapped).strip()

        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, tu=tu)
2008-10-26 21:04:06 +00:00
2008-11-08 18:24:03 +00:00
# main parser for pre-2001. simpler because the data has been hacked so much to fit it
2008-11-06 21:29:46 +00:00
def Parseloghtml01(year, expedition, txt):
    """Parse the older, hand-normalised HTML logbooks.

    Trips are the chunks between <hr> rules.  The first paragraph of a chunk
    is the header "date|title|people" (with any <a>, <b> and <span> tags
    stripped); the remainder is the trip text.  A paragraph beginning "T/U"
    or "TU" is lifted out of the text and used as the time underground.  The
    cave is the part of the title before the first " - ".  Every trip is
    handed to EnterLogIntoDbase().

    Raises AssertionError when a chunk has no recognisable first paragraph
    and ValueError when its header does not have exactly three "|" fields.
    """
    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        # Flags are passed as arguments: the original embedded "(?i)" at the
        # end of the pattern, which newer versions of re reject.
        s = re.match(r"\s*(?:<p>)?(.*?)</?p>(.*)$", trippara, re.DOTALL | re.IGNORECASE)
        assert s, trippara[:100]
        tripheader, triptext = s.group(1), s.group(2)

        # The anchor id is extracted but not stored yet; it could be kept as
        # a url link for cross referencing.
        mtripid = re.search('<a id="(.*?)"', tripheader)
        tripid = ""
        if mtripid:
            tripid = mtripid.group(1)

        tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
        tripdate, triptitle, trippeople = tripheader.split("|")
        ldate = ParseDate(tripdate.strip(), year)

        # Pull the "T/U ..." paragraph out of the text, if there is one.
        mtu = re.search(r"<p[^>]*>(T/?U.*)", triptext)
        if mtu:
            tu = mtu.group(1)
            triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
        else:
            tu = ""

        triptitles = triptitle.split(" - ")
        tripcave = triptitles[0].strip()

        # Flatten the HTML: drop </p>, join wrapped lines with a space and
        # turn each <p> into a blank line.
        ltriptext = re.sub("</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()

        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, tu=tu)
2008-10-31 14:43:57 +00:00
def Parseloghtml03(year, expedition, txt):
    """Parse the 2003-style HTML logbook.

    Trips are the chunks between "<hr />" rules.  The first <p> paragraph is
    the header "date -- title -- people", optionally followed by a fourth
    " -- " field starting "T/U" or "Time underwater" that is used as the time
    underground.  The cave is the part of the title before the first " , ",
    or "UNKNOWN" when there is none.  Characters outside a small whitelist
    are replaced by "_NONASCII_" in the stored text.  Every trip is handed to
    EnterLogIntoDbase().

    Raises AssertionError when a chunk does not start with a <p> paragraph
    and ValueError when the header does not have exactly three fields.
    """
    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
        assert s, trippara
        tripheader, triptext = s.group(1), s.group(2)

        # NOTE(review): as extracted, this substitution replaced a space by a
        # space (a no-op); it is assumed to have been "&nbsp;" before the
        # entity was decoded.  Confirm against the repository copy.
        tripheader = re.sub("&nbsp;", " ", tripheader)
        tripheader = re.sub(r"\s+", " ", tripheader).strip()
        sheader = tripheader.split(" -- ")

        tu = ""
        if re.match("T/U|Time underwater", sheader[-1]):
            tu = sheader.pop()
        if len(sheader) != 3:
            # The unpacking below would raise ValueError anyway; raise it
            # here with a message that identifies the offending header.
            print(sheader)
            raise ValueError("malformed trip header (expected date -- title -- people): %r"
                             % (sheader,))
        tripdate, triptitle, trippeople = sheader
        ldate = ParseDate(tripdate.strip(), year)

        triptitles = triptitle.split(" , ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"

        # Flatten the HTML: drop </p>, join wrapped lines with a space and
        # turn each <p> into a blank line; then mask unexpected characters.
        ltriptext = re.sub("</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
        ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)

        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, tu=tu)
2008-10-26 21:04:06 +00:00
def LoadLogbooks():
    """Reload every logbook listed below into the database.

    Deletes all existing LogbookEntry rows, then for each
    (year, path under EXPOWEB/years, parser) triple reads the file and hands
    its text to the parser together with that year's Expedition.
    """
    models.LogbookEntry.objects.all().delete()
    expowebbase = os.path.join(settings.EXPOWEB, "years")
    yearlinks = [
        ("2008", "2008/2008logbook.txt", Parselogwikitxt),
        ("2007", "2007/2007logbook.txt", Parselogwikitxt),
        ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt),
        ("2005", "2005/logbook.html", Parseloghtmltxt),
        ("2004", "2004/logbook.html", Parseloghtmltxt),
        ("2003", "2003/logbook.html", Parseloghtml03),
        ("2002", "2002/logbook.html", Parseloghtmltxt),
        ("2001", "2001/log.htm", Parseloghtml01),
        ("2000", "2000/log.htm", Parseloghtml01),
        ("1999", "1999/log.htm", Parseloghtml01),
        ("1998", "1998/log.htm", Parseloghtml01),
        ("1997", "1997/log.htm", Parseloghtml01),
    ]
    # Debugging aid: uncomment to reload a single year only.
    #yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite

    for year, lloc, parsefunc in yearlinks:
        expedition = models.Expedition.objects.filter(year=year)[0]
        fin = open(os.path.join(expowebbase, lloc))
        try:
            # try/finally so the handle is released even if the read fails.
            txt = fin.read()
        finally:
            fin.close()
        parsefunc(year, expedition, txt)
2008-11-06 21:29:46 +00:00
2008-10-26 23:27:31 +00:00
2008-10-26 21:04:06 +00:00
# Command-line run through the loading stages.  LoadPersons and LoadLogbooks
# look up the Expedition rows that LoadExpos creates, so the order matters.
# Comment out individual stages below to control what gets reloaded.
for _load_stage in (
    LoadExpos,
    LoadPersons,
    LoadLogbooks,
):
    _load_stage()