# -*- coding: utf-8 -*-

from django.conf import settings
import core.models as models
from parsers.people import GetPersonExpeditionNameLookup
from parsers.cavetab import GetCaveLookup
from django.template.defaultfilters import slugify

import csv
import re
import datetime
import os

from utils import save_carefully
#
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
# it can be checked up later from the hard copy if necessary; or it's not possible to determine (name, trip place, etc.)
#

#
# the logbook loading section
#
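
# Splits a "people" string (e.g. "Fred, Bob + <u>Anthea</u>") on commas, "+", "&" and "and",
# resolves each name via GetPersonExpeditionNameLookup for this expedition, and returns
# (list of (personexpedition, logtime_underground) tuples, author).  A name wrapped in
# <u>...</u> is taken as the author; failing that, the last person listed is assumed.
# Names beginning with "*" are skipped.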
def GetTripPersons(trippeople, expedition, logtime_underground):
    res = []
    author = None
    for tripperson in re.split(",|\+|&amp;|&(?!\w+;)| and ", trippeople):
        tripperson = tripperson.strip()
        mul = re.match("<u>(.*?)</u>$(?i)", tripperson)
        if mul:
            tripperson = mul.group(1).strip()
        if tripperson and tripperson[0] != '*':
            #assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
            personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
            if not personyear:
                print "NoMatchFor: '%s'" % tripperson
            res.append((personyear, logtime_underground))
            if mul:
                author = personyear
    if not author:
        if not res:
            return None, None
        author = res[-1][0]
    return res, author
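
# Resolves a free-text place name to a Cave: first by kataster number (when the place is an
# integer), then by official_name, then by OtherCaveName.  Ambiguous matches prompt the user
# for a list index on the console; unknown places return None.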
def GetTripCave(place):   # need to be fuzzier about matching here. Already a very slow function...
    #print "Getting cave for ", place
    try:
        katastNumRes = []
        katastNumRes = list(models.Cave.objects.filter(kataster_number=int(place)))
    except ValueError:
        pass
    officialNameRes = list(models.Cave.objects.filter(official_name=place))
    tripCaveRes = officialNameRes + katastNumRes

    if len(tripCaveRes) == 1:
        #print "Place ", place, "entered as", tripCaveRes[0]
        return tripCaveRes[0]

    elif models.OtherCaveName.objects.filter(name=place):
        tripCaveRes = models.OtherCaveName.objects.filter(name__icontains=place)[0].cave
        #print "Place ", place, "entered as", tripCaveRes
        return tripCaveRes

    elif len(tripCaveRes) > 1:
        print "Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes)
        correctIndex = input("type list index of correct cave")
        return tripCaveRes[correctIndex]
    else:
        print "No cave found for place", place
        return
noncaveplaces = ["Journey", "Loser Plateau"]
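
# Writes one logbook entry and its PersonTrips.  save_carefully() (from utils) is used so
# that re-running the import updates any existing copy of the entry rather than creating a
# duplicate; this is what the "non-destructive importing" comment in LoadLogbooks refers to.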
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground):
    """ saves a logbook entry and related persontrips """
    trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
    if not author:
        print "skipping logentry", title
        return

    # tripCave = GetTripCave(place)

    cave = None   # left unset for non-cave places such as "Journey"
    lplace = place.lower()
    if lplace not in noncaveplaces:
        cave = GetCaveLookup().get(lplace)

    #Check for an existing copy of the current entry, and save
    expeditionday = expedition.get_expedition_day(date)
    lookupAttribs = {'date':date, 'title':title}
    nonLookupAttribs = {'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50]}
    lbo, created = save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)

    for tripperson, time_underground in trippersons:
        lookupAttribs = {'personexpedition':tripperson, 'logbook_entry':lbo}
        nonLookupAttribs = {'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
        #print nonLookupAttribs
        save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)
def ParseDate(tripdate, year):
    """ Interprets dates in the expo logbooks and returns a correct datetime.date object """
    mdatestandard = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
    mdategoof = re.match("(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
    if mdatestandard:
        assert mdatestandard.group(1) == year, (tripdate, year)
        year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
    elif mdategoof:
        assert not mdategoof.group(3) or mdategoof.group(3) == year[:2], mdategoof.groups()
        yadd = int(year[:2]) * 100
        day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
    else:
        assert False, tripdate
    return datetime.date(year, month, day)
# 2007, 2008, 2006
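# Wiki-style logbooks: each entry has a "=== date | place | people ===" header followed by
# free text.  A T/U ("time underground") figure is pulled out of the text where present,
# although logtime_underground is currently passed to EnterLogIntoDbase as 0.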
def Parselogwikitxt(year, expedition, txt):
    trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt)
    for triphead, triptext in trippara:
        tripheadp = triphead.split("|")
        #print "ttt", tripheadp
        assert len(tripheadp) == 3, (tripheadp, triptext)
        tripdate, tripplace, trippeople = tripheadp
        tripsplace = tripplace.split(" - ")
        tripcave = tripsplace[0].strip()

        tul = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
        if tul:
            #assert len(tul) <= 1, (triphead, triptext)
            #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
            tu = tul[0][0]
        else:
            tu = ""
        #assert tripcave == "Journey", (triphead, triptext)

        #print tripdate
        ldate = ParseDate(tripdate.strip(), year)
        #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
        EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople = trippeople, expedition = expedition, logtime_underground = 0)
# 2002, 2004, 2005
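# HTML logbooks with structured markup: entries are separated by <hr/> and carry
# div.tripdate, div.trippeople, div.triptitle and an optional div.timeug block, all captured
# by the verbose regex below.  Entries that fail to match are reported and skipped.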
def Parseloghtmltxt(year, expedition, txt):
    tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        s = re.match('''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
                        \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
                        \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
                        \s*<div\s+class="trippeople">\s*(.*?)</div>
                        \s*<div\s+class="triptitle">\s*(.*?)</div>
                    ([\s\S]*?)
                        \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
                        \s*$
                     ''', trippara)
        if not s:
            if not re.search("Rigging Guide", trippara):
                print "can't parse: ", trippara  # this is 2007 which needs editing
            #assert s, trippara
            continue
        tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
        ldate = ParseDate(tripdate.strip(), year)
        #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
        trippeople = re.sub("Ol(?!l)", "Olly", trippeople)
        trippeople = re.sub("Wook(?!e)", "Wookey", trippeople)
        triptitles = triptitle.split(" - ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"
        #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
        ltriptext = re.sub("</p>", "", triptext)
        ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
        EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople = trippeople, expedition = expedition, logtime_underground = 0)
# main parser for pre-2001. simpler because the data has been hacked so much to fit it
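# Entries are separated by <hr>; the first paragraph is a "date | title | people" header,
# an optional "T/U ..." paragraph is cut out of the body, and inline markup is rewritten
# as wiki-style text (<u>..</u> -> _.._, <i>..</i> -> ''..'', <b>..</b> -> '''..''').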
def Parseloghtml01(year, expedition, txt):
    tripparas = re.findall("<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        s = re.match(u"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
        assert s, trippara[:300]
        tripheader, triptext = s.group(1), s.group(2)
        mtripid = re.search('<a id="(.*?)"', tripheader)
        tripid = mtripid and mtripid.group(1) or ""
        tripheader = re.sub("</?(?:[ab]|span)[^>]*>", "", tripheader)

        #print "   ", [tripheader]
        #continue

        tripdate, triptitle, trippeople = tripheader.split("|")
        ldate = ParseDate(tripdate.strip(), year)

        mtu = re.search('<p[^>]*>(T/?U.*)', triptext)
        if mtu:
            tu = mtu.group(1)
            triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
        else:
            tu = ""

        triptitles = triptitle.split(" - ")
        tripcave = triptitles[0].strip()

        ltriptext = triptext

        mtail = re.search('(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&|</?p>|\((?:same day|\d+)\))*$', ltriptext)
        if mtail:
            #print mtail.group(0)
            ltriptext = ltriptext[:mtail.start(0)]
        ltriptext = re.sub("</p>", "", ltriptext)
        ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub("<p>|<br>", "\n\n", ltriptext).strip()
        #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
        ltriptext = re.sub("</?u>", "_", ltriptext)
        ltriptext = re.sub("</?i>", "''", ltriptext)
        ltriptext = re.sub("</?b>", "'''", ltriptext)

        #print ldate, trippeople.strip()
        # could include the tripid (url link for cross referencing)
        EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople = trippeople, expedition = expedition, logtime_underground = 0)
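
# 2003-style HTML logbook: entries are separated by <hr/>, the header paragraph is split on
# " -- " into date, title and people (with an optional trailing "T/U ..." piece), and any
# character outside a small whitelist is replaced by a _NONASCII_ marker in the body text.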
def Parseloghtml03(year, expedition, txt):
    tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        s = re.match(u"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
        assert s, trippara
        tripheader, triptext = s.group(1), s.group(2)
        tripheader = re.sub("&nbsp;", " ", tripheader)
        tripheader = re.sub("\s+", " ", tripheader).strip()
        sheader = tripheader.split(" -- ")
        tu = ""
        if re.match("T/U|Time underwater", sheader[-1]):
            tu = sheader.pop()
        if len(sheader) != 3:
            print "header not three pieces", sheader
        tripdate, triptitle, trippeople = sheader
        ldate = ParseDate(tripdate.strip(), year)
        triptitles = triptitle.split(", ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"
        #print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
        ltriptext = re.sub("</p>", "", triptext)
        ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
        ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
        EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople = trippeople, expedition = expedition, logtime_underground = 0)
yearlinks = [
( " 2011 " , " 2011/logbook.html " , Parseloghtmltxt ) ,
( " 2010 " , " 2010/logbook.html " , Parselogwikitxt ) ,
( " 2009 " , " 2009/2009logbook.txt " , Parselogwikitxt ) ,
( " 2008 " , " 2008/2008logbook.txt " , Parselogwikitxt ) ,
( " 2007 " , " 2007/logbook.html " , Parseloghtmltxt ) ,
( " 2006 " , " 2006/logbook/logbook_06.txt " , Parselogwikitxt ) ,
( " 2005 " , " 2005/logbook.html " , Parseloghtmltxt ) ,
( " 2004 " , " 2004/logbook.html " , Parseloghtmltxt ) ,
( " 2003 " , " 2003/logbook.html " , Parseloghtml03 ) ,
( " 2002 " , " 2002/logbook.html " , Parseloghtmltxt ) ,
( " 2001 " , " 2001/log.htm " , Parseloghtml01 ) ,
( " 2000 " , " 2000/log.htm " , Parseloghtml01 ) ,
( " 1999 " , " 1999/log.htm " , Parseloghtml01 ) ,
( " 1998 " , " 1998/log.htm " , Parseloghtml01 ) ,
( " 1997 " , " 1997/log.htm " , Parseloghtml01 ) ,
( " 1996 " , " 1996/log.htm " , Parseloghtml01 ) ,
( " 1995 " , " 1995/log.htm " , Parseloghtml01 ) ,
( " 1994 " , " 1994/log.htm " , Parseloghtml01 ) ,
( " 1993 " , " 1993/log.htm " , Parseloghtml01 ) ,
( " 1992 " , " 1992/log.htm " , Parseloghtml01 ) ,
( " 1991 " , " 1991/log.htm " , Parseloghtml01 ) ,
]
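
# Note: despite its docstring, this function currently only chains each person's trips
# together in date order via persontrip_prev / persontrip_next; it does not set any
# date_from / date_to fields.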
def SetDatesFromLogbookEntries(expedition):
    """
    Sets the date_from and date_to field for an expedition based on persontrips.
    Then sets the expedition date_from and date_to based on the personexpeditions.
    """
    for personexpedition in expedition.personexpedition_set.all():
        persontrips = personexpedition.persontrip_set.order_by('logbook_entry__date')
        # sequencing is difficult to do
        lprevpersontrip = None
        for persontrip in persontrips:
            persontrip.persontrip_prev = lprevpersontrip
            if lprevpersontrip:
                lprevpersontrip.persontrip_next = persontrip
                lprevpersontrip.save()
            persontrip.persontrip_next = None
            lprevpersontrip = persontrip
            persontrip.save()
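
# Re-parses the logbook for a single expedition by looking up the matching
# (year, file, parser) entry in yearlinks.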
def LoadLogbookForExpedition(expedition):
    """ Parses all logbook entries for one expedition """
    expowebbase = os.path.join(settings.EXPOWEB, "years")
    year = str(expedition.year)
    for lyear, lloc, parsefunc in yearlinks:
        if lyear == year:
            break
    fin = open(os.path.join(expowebbase, lloc))
    print "opening", lloc
    txt = fin.read().decode("latin1")
    fin.close()
    parsefunc(year, expedition, txt)
    SetDatesFromLogbookEntries(expedition)
    return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
def LoadLogbooks():
    """ This is the master function for parsing all logbooks into the Troggle database.
    Requires yearlinks, which is a list of tuples for each expedition with expedition year,
    logbook path, and parsing function. """
    #Deletion has been moved to a separate function to enable the non-destructive importing
    #models.LogbookEntry.objects.all().delete()
    expowebbase = os.path.join(settings.EXPOWEB, "years")
    #yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite
    #yearlinks = [ ("1996", "1996/log.htm", Parseloghtml01), ] #overwrite

    for year, lloc, parsefunc in yearlinks:
        expedition = models.Expedition.objects.filter(year = year)[0]
        fin = open(os.path.join(expowebbase, lloc))
        txt = fin.read().decode("latin1")
        fin.close()
        parsefunc(year, expedition, txt)
        SetDatesFromLogbookEntries(expedition)
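
# Regexes for the newer per-entry "auto logbook" HTML files: the date, expedition year,
# location, cave and per-person name/T/U are wrapped in class-tagged <span>/<div> elements,
# the title is in <H1> and the body in <div class="report">.  parseAutoLogBookEntry() below
# accumulates any problems in an errors list and only writes to the database if it stays empty.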
dateRegex = re.compile('<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
expeditionYearRegex = re.compile('<span\s+class="expeditionyear">(.*?)</span>', re.S)
titleRegex = re.compile('<H1>(.*?)</H1>', re.S)
reportRegex = re.compile('<div\s+class="report">(.*)</div>\s*</body>', re.S)
personRegex = re.compile('<div\s+class="person">(.*?)</div>', re.S)
nameAuthorRegex = re.compile('<span\s+class="name(,author|)">(.*?)</span>', re.S)
TURegex = re.compile('<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
locationRegex = re.compile('<span\s+class="location">(.*?)</span>', re.S)
caveRegex = re.compile('<span\s+class="cave">(.*?)</span>', re.S)
def parseAutoLogBookEntry(filename):
    errors = []
    f = open(filename, "r")
    contents = f.read()
    f.close()

    dateMatch = dateRegex.search(contents)
    if dateMatch:
        year, month, day = [int(x) for x in dateMatch.groups()]
        date = datetime.date(year, month, day)
    else:
        errors.append("Date could not be found")

    expeditionYearMatch = expeditionYearRegex.search(contents)
    if expeditionYearMatch:
        try:
            expedition = models.Expedition.objects.get(year = expeditionYearMatch.groups()[0])
            personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
        except models.Expedition.DoesNotExist:
            errors.append("Expedition not in database")
    else:
        errors.append("Expedition year could not be parsed")

    titleMatch = titleRegex.search(contents)
    if titleMatch:
        title, = titleMatch.groups()
        if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
            errors.append("Title too long")
    else:
        errors.append("Title could not be found")

    caveMatch = caveRegex.search(contents)
    if caveMatch:
        caveRef, = caveMatch.groups()
        try:
            cave = models.getCaveByReference(caveRef)
        except AssertionError:
            cave = None
            errors.append("Cave not found in database")
    else:
        cave = None

    locationMatch = locationRegex.search(contents)
    if locationMatch:
        location, = locationMatch.groups()
    else:
        location = None

    if cave is None and location is None:
        errors.append("Neither a cave nor a location could be found")

    reportMatch = reportRegex.search(contents)
    if reportMatch:
        report, = reportMatch.groups()
    else:
        errors.append("Contents could not be found")

    if errors:
        return errors # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.
    people = []
    for personMatch in personRegex.findall(contents):
        # search within this person's block rather than the whole file, so that each
        # <div class="person"> yields its own name and T/U
        nameAuthorMatch = nameAuthorRegex.search(personMatch)
        if nameAuthorMatch:
            author, name = nameAuthorMatch.groups()
            if name.lower() in personExpeditionNameLookup:
                personExpo = personExpeditionNameLookup[name.lower()]
            else:
                errors.append("Person could not be found in database")
            author = bool(author)
        else:
            errors.append("Person's name could not be found")

        TUMatch = TURegex.search(personMatch)
        if TUMatch:
            TU, = TUMatch.groups()
        else:
            errors.append("TU could not be found")

        if not errors:
            # store the resolved personexpedition so each PersonTrip is attached to the right person
            people.append((personExpo, author, TU))

    if errors:
        return errors # Bail out before committing to the database
    logbookEntry = models.LogbookEntry(date = date,
                                       expedition = expedition,
                                       title = title, cave = cave, place = location,
                                       text = report, slug = slugify(title)[:50],
                                       filename = filename)
    logbookEntry.save()
    for personExpo, author, TU in people:
        models.PersonTrip(personexpedition = personExpo,
                          time_underground = TU,
                          logbook_entry = logbookEntry,
                          is_logbook_entry_author = author).save()
    print logbookEntry