2009-05-13 05:13:38 +01:00
#.-*- coding: utf-8 -*-
2009-05-13 05:26:51 +01:00
2009-05-23 16:51:21 +01:00
from django . conf import settings
2009-07-02 20:43:18 +01:00
import core . models as models
2009-05-13 05:39:52 +01:00
2009-05-23 16:51:21 +01:00
from parsers . people import GetPersonExpeditionNameLookup
from parsers . cavetab import GetCaveLookup
2009-05-13 05:39:52 +01:00
2009-05-13 06:09:55 +01:00
from django . template . defaultfilters import slugify
2009-05-13 05:13:38 +01:00
import csv
import re
import datetime
2009-05-13 05:27:43 +01:00
import os
2009-05-13 05:13:38 +01:00
2009-07-03 05:31:49 +01:00
from utils import save_carefully
2009-05-13 05:13:38 +01:00
2009-05-13 05:48:47 +01:00
#
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc)
#
2009-05-13 05:13:38 +01:00
#
# the logbook loading section
#
2009-05-13 05:52:59 +01:00
def GetTripPersons(trippeople, expedition, logtime_underground):
    """Resolve the people named in a logbook trip header.

    trippeople is split on commas, "+", "&amp;", a bare "&" that does not
    start an HTML entity, and the word " and ".  Each remaining name is
    looked up (lower-cased) in GetPersonExpeditionNameLookup(expedition).
    A name wrapped in <u>...</u> (any case) marks the author of the entry.
    Empty fragments and names starting with "*" are ignored.

    Returns (res, author):
      res    -- list of (personexpedition, logtime_underground) pairs, one per
                usable name; names without a match are reported on stdout and
                still appended with None as the person.
      author -- the underlined person, or the last person in res when no
                underlined name produced a match.
    Returns (None, None) when the header contains no usable name at all.
    """
    res = []
    author = None
    lookup = None  # fetched on first use, so it is requested once per header rather than once per name
    for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
        tripperson = tripperson.strip()
        # The flag is passed explicitly: an inline "(?i)" at the end of a
        # pattern is rejected by newer versions of the re module.
        mul = re.match(r"<u>(.*?)</u>$", tripperson, re.IGNORECASE)
        if mul:
            tripperson = mul.group(1).strip()
        if not tripperson or tripperson[0] == '*':
            continue
        if lookup is None:
            lookup = GetPersonExpeditionNameLookup(expedition)
        personyear = lookup.get(tripperson.lower())
        if not personyear:
            print("NoMatchFor: '%s'" % (tripperson,))
        res.append((personyear, logtime_underground))
        if mul:
            author = personyear
    if not author:
        if not res:
            return None, None
        # nobody usable was underlined: fall back to the last person listed
        author = res[-1][0]
    return res, author
2009-05-13 05:26:14 +01:00
def GetTripCave(place):
    """Find the Cave that a logbook entry's place refers to.

    Candidates are the caves whose official_name equals place plus, when
    place parses as an integer, the caves with that kataster_number.

    Resolution order:
      1. exactly one candidate          -> that cave
      2. an OtherCaveName equal to place -> the cave it belongs to
      3. several candidates             -> they are listed on stdout and the
                                           index of the right one is read
                                           from stdin
      4. nothing found                  -> a message on stdout, returns None

    Needs to be fuzzier about matching; already a very slow function.
    """
    try:
        katastNumRes = list(models.Cave.objects.filter(kataster_number=int(place)))
    except ValueError:
        # place is not a number, so it cannot be a kataster number
        katastNumRes = []
    officialNameRes = list(models.Cave.objects.filter(official_name=place))
    tripCaveRes = officialNameRes + katastNumRes

    if len(tripCaveRes) == 1:
        return tripCaveRes[0]

    # Return the same exact-name match that was tested for, instead of the
    # first name__icontains hit, which could belong to a different cave.
    otherNameRes = models.OtherCaveName.objects.filter(name=place)
    if otherNameRes:
        return otherNameRes[0].cave

    if len(tripCaveRes) > 1:
        print("Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes))
        try:
            read_line = raw_input  # Python 2: returns the typed text; input() there would evaluate it
        except NameError:
            read_line = input
        correctIndex = int(read_line("type list index of correct cave"))
        return tripCaveRes[correctIndex]

    print("No cave found for place  %s" % (place,))
    return None
2009-05-13 05:35:59 +01:00
2009-05-13 05:46:12 +01:00
# Places that appear in logbook headers but are not caves.
noncaveplaces = ["Journey", "Loser Plateau"]


def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground):
    """Save a logbook entry and its related persontrips.

    The people are resolved with GetTripPersons; when that yields no author
    the entry is skipped with a message on stdout.  Otherwise a LogbookEntry
    keyed on (date, title) is written through save_carefully, followed by one
    PersonTrip per person keyed on (personexpedition, logbook_entry).

    The entry's cave comes from GetCaveLookup() using the lower-cased place,
    and is None for the places listed in noncaveplaces.
    """
    trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
    if not author:
        print("skipping logentry %s" % (title,))
        return

    # cave must be bound on every path; the comparison is made on lower-cased
    # names because lplace is lower-cased while noncaveplaces is capitalised.
    cave = None
    lplace = place.lower()
    if lplace not in [noncave.lower() for noncave in noncaveplaces]:
        cave = GetCaveLookup().get(lplace)

    # Check for an existing copy of the current entry, and save
    expeditionday = expedition.get_expedition_day(date)
    lookupAttribs = {'date': date, 'title': title}
    nonLookupAttribs = {
        'place': place,
        'text': text,
        'author': author,
        'expedition': expedition,
        'expeditionday': expeditionday,
        'cave': cave,
        'slug': slugify(title)[:50],
    }
    lbo, created = save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)

    for tripperson, time_underground in trippersons:
        lookupAttribs = {'personexpedition': tripperson, 'logbook_entry': lbo}
        nonLookupAttribs = {
            'time_underground': time_underground,
            'date': date,
            'expeditionday': expeditionday,
            'is_logbook_entry_author': (tripperson == author),
        }
        save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)
2009-05-13 05:21:05 +01:00
2009-05-13 05:35:59 +01:00
2009-05-13 05:21:05 +01:00
def ParseDate(tripdate, year):
    """Interpret a date from the expo logbooks and return a datetime.date.

    tripdate -- the date text: either "YYYY-MM-DD", or day-first "D/M/YY" or
                "D/M/YYYY" (leading zeros optional, centuries 19 and 20 only).
    year     -- the expedition year as a four-character string.  It must equal
                the year of a "YYYY-MM-DD" date; for day-first dates it
                supplies the century, which must agree with any century
                written in tripdate.

    Raises AssertionError when tripdate is in neither format or disagrees
    with year as described above.
    """
    mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
    if mdatestandard:
        if mdatestandard.group(1) != year:
            raise AssertionError((tripdate, year))
        return datetime.date(int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3)))

    # The month takes one or two digits so that months 10-12 are accepted.
    mdategoof = re.match(r"(\d\d?)/(\d\d?)/(20|19)?(\d\d)", tripdate)
    if mdategoof:
        century = mdategoof.group(3)
        if century and century != year[:2]:
            raise AssertionError(mdategoof.groups())
        yadd = int(year[:2]) * 100
        return datetime.date(int(mdategoof.group(4)) + yadd, int(mdategoof.group(2)), int(mdategoof.group(1)))

    # Raised explicitly (not via assert) so the check survives python -O.
    raise AssertionError(tripdate)
2009-05-13 05:21:05 +01:00
# 2007, 2008, 2006
2009-05-13 05:14:41 +01:00
def Parselogwikitxt(year, expedition, txt):
    """Parse a wiki-text logbook and store every trip with EnterLogIntoDbase.

    A trip is a "===date|place|people===" heading followed by its text.  The
    text runs up to the next "===", so anything after the last heading that
    is not followed by another "===" is not picked up.  The cave is the part
    of the place field before the first " - "; the whole place field becomes
    the entry title.  Time underground is always stored as 0; "T/U" notes in
    the text are not interpreted.

    Raises AssertionError when a heading does not have exactly three
    "|"-separated fields, or when ParseDate rejects the date.
    """
    for triphead, triptext in re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt):
        tripheadp = triphead.split("|")
        if len(tripheadp) != 3:
            raise AssertionError((tripheadp, triptext))
        tripdate, tripplace, trippeople = tripheadp
        tripcave = tripplace.split(" - ")[0].strip()
        ldate = ParseDate(tripdate.strip(), year)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=tripplace, text=triptext,
                          trippeople=trippeople, expedition=expedition, logtime_underground=0)
2009-05-13 05:13:38 +01:00
2009-05-13 05:21:05 +01:00
# 2002, 2004, 2005
2009-05-13 05:13:38 +01:00
def Parseloghtmltxt(year, expedition, txt):
    """Parse an HTML logbook whose trips are marked up with classed <div>s.

    The text is cut into sections, each running from an "<hr />" to the next
    "<hr".  A section must consist of tripdate, trippeople and triptitle divs
    followed by the trip text and an optional timeug div.  Sections that do
    not fit are skipped; they are reported on stdout unless they mention
    "Rigging Guide".  Every parsed trip is handed to EnterLogIntoDbase with
    the time underground set to 0.
    """
    entry_re = re.compile(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
                              \s*(?:<a\s+id="(.*?)"\s*/>)?
                              \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
                              \s*<div\s+class="trippeople">\s*(.*?)</div>
                              \s*<div\s+class="triptitle">\s*(.*?)</div>
                              ([\s\S]*?)
                              \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
                              \s*$
                           ''')
    for section in re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt):
        found = entry_re.match(section)
        if found is None:
            # this is 2007 which needs editing
            if re.search("Rigging Guide", section) is None:
                print("can't parse:  %s" % (section,))
            continue

        # groups 1, 2 (anchor ids) and 7 (time underground) are not used
        tripdate, trippeople, triptitle, triptext = found.group(3, 4, 5, 6)
        ldate = ParseDate(tripdate.strip(), year)

        # expand the short forms "Ol" and "Wook" to the full names
        trippeople = re.sub(r"Ol(?!l)", "Olly", trippeople)
        trippeople = re.sub(r"Wook(?!e)", "Wookey", trippeople)

        # the cave is the part of the title before " - ", when there is one
        titleparts = triptitle.split(" - ")
        if len(titleparts) < 2:
            tripcave = "UNKNOWN"
        else:
            tripcave = titleparts[0]

        # flatten the HTML paragraphs: line breaks become spaces, <p> a blank line
        ltriptext = triptext.replace("</p>", "")
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = ltriptext.replace("<p>", "\n\n").strip()

        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, logtime_underground=0)
2009-05-13 05:21:05 +01:00
2009-05-13 05:13:38 +01:00
2009-05-13 05:21:05 +01:00
# main parser for pre-2001. simpler because the data has been hacked so much to fit it
2009-05-13 05:18:07 +01:00
def Parseloghtml01(year, expedition, txt):
    """Parse an old-style HTML logbook and store every trip with EnterLogIntoDbase.

    The text is cut into sections, each running from an "<hr>" (optionally
    "<hr />") to the next "<hr".  The first paragraph of a section is the
    header "date|title|people"; <a>, <b> and <span> tags in it are dropped
    (so an <a id="..."> anchor, which could serve as a cross-reference, is
    discarded).  The rest is the trip text, from which a "<p>T/U..." line and
    trailing links, separators and "(same day)" / "(n)" markers are removed
    before the remaining HTML is turned into plain text with wiki-style
    emphasis.  Time underground is always stored as 0.

    Raises AssertionError when a section has no header paragraph, ValueError
    when the header does not split into three "|" fields.
    """
    # Flags are given explicitly: the original pattern ended in "(?i)", which
    # newer versions of the re module reject.
    header_re = re.compile(r"\s*(?:<p>)?(.*?)</?p>(.*)$", re.DOTALL | re.IGNORECASE)
    for trippara in re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt):
        s = header_re.match(trippara)
        if not s:
            raise AssertionError(trippara[:300])
        tripheader, triptext = s.group(1), s.group(2)
        tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)

        tripdate, triptitle, trippeople = tripheader.split("|")
        ldate = ParseDate(tripdate.strip(), year)

        # cut the "T/U ..." note (up to the end of its line) out of the text
        mtu = re.search(r"<p[^>]*>(T/?U.*)", triptext)
        if mtu:
            triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]

        tripcave = triptitle.split(" - ")[0].strip()

        # strip trailing links, whitespace, separators and "(same day)" / "(n)" markers
        ltriptext = triptext
        mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
        if mtail:
            ltriptext = ltriptext[:mtail.start(0)]

        # flatten paragraphs, then map <u>, <i>, <b> onto _, '' and '''
        ltriptext = re.sub(r"</p>", "", ltriptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
        ltriptext = re.sub(r"</?u>", "_", ltriptext)
        ltriptext = re.sub(r"</?i>", "''", ltriptext)
        ltriptext = re.sub(r"</?b>", "'''", ltriptext)

        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, logtime_underground=0)
2009-05-13 05:35:59 +01:00
2009-05-13 05:21:05 +01:00
2009-05-13 05:16:11 +01:00
def Parseloghtml03(year, expedition, txt):
    """Parse the HTML logbook format listed for 2003 in yearlinks.

    The text is cut into sections, each running from an "<hr />" to the next
    "<hr".  The first <p>...</p> of a section is the header
    "date -- title -- people", optionally followed by a further piece starting
    "T/U" or "Time underwater", which is dropped.  The cave is the part of the
    title before " , " when there is one, otherwise "UNKNOWN".  In the trip
    text every character outside a fixed set of letters, digits, whitespace
    and punctuation is replaced by "_NONASCII_".  Time underground is always
    stored as 0.

    Raises AssertionError when a section has no header paragraph, ValueError
    when the header does not have three pieces (after a message on stdout).
    """
    header_re = re.compile(r"\s*<p>(.*?)</p>(.*)$", re.DOTALL)
    for trippara in re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt):
        s = header_re.match(trippara)
        if not s:
            raise AssertionError(trippara)
        tripheader, triptext = s.group(1), s.group(2)

        # non-breaking-space entities count as ordinary spaces; then squeeze runs of whitespace
        tripheader = tripheader.replace("&nbsp;", " ")
        tripheader = re.sub(r"\s+", " ", tripheader).strip()

        sheader = tripheader.split(" -- ")
        if re.match("T/U|Time underwater", sheader[-1]):
            sheader.pop()  # the time-underground piece is not one of the three header fields
        if len(sheader) != 3:
            print("header not three pieces %s" % (sheader,))
        tripdate, triptitle, trippeople = sheader
        ldate = ParseDate(tripdate.strip(), year)

        triptitles = triptitle.split(" , ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"

        ltriptext = re.sub(r"</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
        ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)

        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext,
                          trippeople=trippeople, expedition=expedition, logtime_underground=0)
# One entry per expedition: (year as a string, logbook file path, parser function).
# LoadLogbooks and LoadLogbookForExpedition join the path onto
# settings.EXPOWEB/"years" and call the parser as parsefunc(year, expedition, txt).
yearlinks = [
    ("2009", "2009/2009logbook.txt", Parselogwikitxt),
    ("2008", "2008/2008logbook.txt", Parselogwikitxt),
    ("2007", "2007/logbook.html", Parseloghtmltxt),
    ("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt),
    ("2005", "2005/logbook.html", Parseloghtmltxt),
    ("2004", "2004/logbook.html", Parseloghtmltxt),
    ("2003", "2003/logbook.html", Parseloghtml03),
    ("2002", "2002/logbook.html", Parseloghtmltxt),
    ("2001", "2001/log.htm", Parseloghtml01),
    ("2000", "2000/log.htm", Parseloghtml01),
    ("1999", "1999/log.htm", Parseloghtml01),
    ("1998", "1998/log.htm", Parseloghtml01),
    ("1997", "1997/log.htm", Parseloghtml01),
    ("1996", "1996/log.htm", Parseloghtml01),
    ("1995", "1995/log.htm", Parseloghtml01),
    ("1994", "1994/log.htm", Parseloghtml01),
    ("1993", "1993/log.htm", Parseloghtml01),
    ("1992", "1992/log.htm", Parseloghtml01),
    ("1991", "1991/log.htm", Parseloghtml01),
]
def SetDatesFromLogbookEntries(expedition):
    """
    Chains each person's trips on this expedition together in date order.

    For every personexpedition of the expedition, its persontrips are taken
    ordered by logbook entry date; each one gets persontrip_prev and
    persontrip_next pointing at its neighbours (None at either end) and is saved.
    Despite the name, no date_from / date_to field is written here.
    """
    for person_on_expo in expedition.personexpedition_set.all():
        # sequencing is difficult to do
        earlier = None
        for current in person_on_expo.persontrip_set.order_by('logbook_entry__date'):
            current.persontrip_prev = earlier
            current.persontrip_next = None
            if earlier:
                # the earlier trip has just gained a successor, so it is saved again
                earlier.persontrip_next = current
                earlier.save()
            earlier = current
            current.save()
2009-05-13 05:35:59 +01:00
def LoadLogbookForExpedition(expedition):
    """Parse all logbook entries for one expedition.

    The logbook file and parser are taken from the yearlinks entry whose year
    equals str(expedition.year); the file is read from
    settings.EXPOWEB/"years"/<path> and decoded as latin1.  After parsing,
    the expedition's persontrips are re-linked with SetDatesFromLogbookEntries.

    Returns a status string "TOLOAD: <year>  <n>  <m>" where n is the number
    of LogbookEntry rows and m the number of PersonTrip rows for the expedition.

    Raises ValueError when yearlinks has no entry for the expedition's year.
    """
    expowebbase = os.path.join(settings.EXPOWEB, "years")
    year = str(expedition.year)
    for lyear, lloc, parsefunc in yearlinks:
        if lyear == year:
            break
    else:
        # Without this the loop variables would be left on the last table
        # entry and that year's logbook would be parsed for this expedition.
        raise ValueError("no logbook listed in yearlinks for expedition year %s" % (year,))

    # the context manager closes the file even if reading or decoding fails
    with open(os.path.join(expowebbase, lloc)) as fin:
        print("opennning %s" % (lloc,))
        txt = fin.read().decode("latin1")

    parsefunc(year, expedition, txt)
    SetDatesFromLogbookEntries(expedition)

    # Count the expedition's entries directly; indexing the second
    # personexpedition failed for expeditions with fewer than two members
    # and only counted that one person's entries.
    entrycount = models.LogbookEntry.objects.filter(expedition=expedition).count()
    tripcount = models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count()
    return "TOLOAD: " + year + "  " + str(entrycount) + "  " + str(tripcount)
2009-05-13 05:35:59 +01:00
2009-05-13 05:13:38 +01:00
def LoadLogbooks():
    """Master function for parsing all logbooks into the Troggle database.

    Requires yearlinks, a list of tuples for each expedition holding the
    expedition year, the logbook path and the parsing function.  For each of
    them the first Expedition with that year is fetched (IndexError if there
    is none), the logbook is read from settings.EXPOWEB/"years"/<path> and
    decoded as latin1, parsed, and the expedition's persontrips are re-linked
    with SetDatesFromLogbookEntries.

    Existing entries are not deleted here; deletion lives in a separate
    function so that importing is non-destructive.
    """
    expowebbase = os.path.join(settings.EXPOWEB, "years")

    for year, lloc, parsefunc in yearlinks:
        expedition = models.Expedition.objects.filter(year=year)[0]
        # the context manager closes the file even if reading or decoding fails
        with open(os.path.join(expowebbase, lloc)) as fin:
            txt = fin.read().decode("latin1")
        parsefunc(year, expedition, txt)
        SetDatesFromLogbookEntries(expedition)
2009-05-13 05:14:03 +01:00
2009-05-13 06:02:42 +01:00