# -*- coding: utf-8 -*-
from django.conf import settings
import troggle.core.models as models
from parsers.people import GetPersonExpeditionNameLookup
from parsers.cavetab import GetCaveLookup
from django.template.defaultfilters import slugify

import csv
import re
import datetime
import os

from utils import save_carefully
#
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc.)
#
#
# the logbook loading section
#
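# GetTripPersons: splits a "trip people" string on commas, "+", "&"/"&amp;" and " and ",
# looks each name up via GetPersonExpeditionNameLookup, and returns (list of
# (personexpedition, time_underground) pairs, author). A name wrapped in <u>...</u> is
# taken as the author; otherwise the last person listed is used.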
def GetTripPersons(trippeople, expedition, logtime_underground):
    res = []
    author = None
    for tripperson in re.split(r",|\+|&amp;|&(?!\w+;)| and ", trippeople):
        tripperson = tripperson.strip()
        mul = re.match(r"(?i)<u>(.*?)</u>$", tripperson)
        if mul:
            tripperson = mul.group(1).strip()
        if tripperson and tripperson[0] != '*':
            #assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
            personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
            if not personyear:
                print(" - No name match for: '%s'" % tripperson)
            res.append((personyear, logtime_underground))
            if mul:
                author = personyear
    if not author:
        if not res:
            return None, None
        author = res[-1][0]
    return res, author

def GetTripCave(place):   # need to be fuzzier about matching here. Already a very slow function...
    # print "Getting cave for " , place
    try:
        katastNumRes = []
        katastNumRes = list(models.Cave.objects.filter(kataster_number=int(place)))
    except ValueError:
        pass
    officialNameRes = list(models.Cave.objects.filter(official_name=place))
    tripCaveRes = officialNameRes + katastNumRes

    if len(tripCaveRes) == 1:
        # print "Place " , place , "entered as" , tripCaveRes[0]
        return tripCaveRes[0]

    elif models.OtherCaveName.objects.filter(name=place):
        tripCaveRes = models.OtherCaveName.objects.filter(name__icontains=place)[0].cave
        # print "Place " , place , "entered as" , tripCaveRes
        return tripCaveRes

    elif len(tripCaveRes) > 1:
        print("Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes))
        correctIndex = int(input("type list index of correct cave: "))
        return tripCaveRes[correctIndex]

    else:
        print("No cave found for place", place)
        return


noncaveplaces = ["Journey", "Loser Plateau"]

def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground):
    """ saves a logbook entry and related persontrips """
    trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
    if not author:
        print(" - skipping logentry " + title + " - no author for entry")
        return
    # tripCave = GetTripCave(place)

    lplace = place.lower()
    cave = None   # non-cave places ("Journey", "Loser Plateau") have no cave
    if lplace not in noncaveplaces:
        cave = GetCaveLookup().get(lplace)

    # Check for an existing copy of the current entry, and save
    expeditionday = expedition.get_expedition_day(date)
    lookupAttribs = {'date': date, 'title': title}
    nonLookupAttribs = {'place': place, 'text': text, 'expedition': expedition, 'cave': cave, 'slug': slugify(title)[:50]}
    lbo, created = save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)

    for tripperson, time_underground in trippersons:
        lookupAttribs = {'personexpedition': tripperson, 'logbook_entry': lbo}
        nonLookupAttribs = {'time_underground': time_underground, 'is_logbook_entry_author': (tripperson == author)}
        #print nonLookupAttribs
        save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)

def ParseDate(tripdate, year):
    """ Interprets dates in the expo logbooks and returns a correct datetime.date object """
    mdatestandard = re.match(r"(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
    mdategoof = re.match(r"(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
    if mdatestandard:
        assert mdatestandard.group(1) == year, (tripdate, year)
        year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
    elif mdategoof:
        assert not mdategoof.group(3) or mdategoof.group(3) == year[:2], mdategoof.groups()
        yadd = int(year[:2]) * 100
        day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
    else:
        assert False, tripdate
    return datetime.date(year, month, day)

# 2007, 2008, 2006
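# Parselogwikitxt: wiki-style logbooks where each trip starts with a
# "=== date | place | people ===" header followed by the trip text, with an
# optional "T/U: n hrs" note somewhere in the text.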
def Parselogwikitxt(year, expedition, txt):
    trippara = re.findall(r"===(.*?)===([\s\S]*?)(?====)", txt)
    for triphead, triptext in trippara:
        tripheadp = triphead.split("|")
        #print "ttt", tripheadp
        assert len(tripheadp) == 3, (tripheadp, triptext)
        tripdate, tripplace, trippeople = tripheadp
        tripsplace = tripplace.split(" - ")
        tripcave = tripsplace[0].strip()

        tul = re.findall(r"T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
        if tul:
            #assert len(tul) <= 1, (triphead, triptext)
            #assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
            tu = tul[0][0]
        else:
            tu = ""
        #assert tripcave == "Journey", (triphead, triptext)

        #print tripdate
        ldate = ParseDate(tripdate.strip(), year)
        #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=tripplace, text=triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)

# 2002, 2004, 2005
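# Parseloghtmltxt: html logbooks where trips are separated by <hr/> and marked up with
# <div class="tripdate">, <div class="trippeople">, <div class="triptitle"> and an
# optional <div class="timeug"> for time underground.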
def Parseloghtmltxt(year, expedition, txt):
    print(" - Using log html parser")
    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    logbook_entry_count = 0
    for trippara in tripparas:
        #print(" - HR detected - maybe a trip?")
        logbook_entry_count += 1

        s = re.match(r'''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)?  # second date
                        \s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
                        \s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
                        \s*<div\s+class="trippeople">\s*(.*?)</div>
                        \s*<div\s+class="triptitle">\s*(.*?)</div>
                        ([\s\S]*?)
                        \s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
                        \s*$
                     ''', trippara)
        if not s:
            if not re.search(r"Rigging Guide", trippara):
                print("can't parse: ", trippara)  # this is 2007 which needs editing
            #assert s, trippara
            continue
        tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
        ldate = ParseDate(tripdate.strip(), year)
        #assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
        trippeople = re.sub(r"Ol(?!l)", "Olly", trippeople)
        trippeople = re.sub(r"Wook(?!e)", "Wookey", trippeople)
        triptitles = triptitle.split(" - ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"
        #print "\n", tripcave, "--- ppp", trippeople, len(triptext)
        ltriptext = re.sub(r"</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)

    if logbook_entry_count == 0:
        print(" - No trip entries found in logbook, check the syntax matches htmltxt format")

# main parser for pre-2001. simpler because the data has been hacked so much to fit it
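# Parseloghtml01: trips separated by <hr>; the first paragraph is a "date | title | people"
# header, the rest is the trip text with an optional "T/U ..." paragraph.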
def Parseloghtml01(year, expedition, txt):
    tripparas = re.findall(r"<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        s = re.match(r"(?is)\s*(?:<p>)?(.*?)</?p>(.*)$", trippara)
        assert s, trippara[:300]
        tripheader, triptext = s.group(1), s.group(2)
        mtripid = re.search(r'<a id="(.*?)"', tripheader)
        tripid = mtripid and mtripid.group(1) or ""
        tripheader = re.sub(r"</?(?:[ab]|span)[^>]*>", "", tripheader)
        #print " ", [tripheader]
        #continue

        tripdate, triptitle, trippeople = tripheader.split("|")
        ldate = ParseDate(tripdate.strip(), year)

        mtu = re.search(r'<p[^>]*>(T/?U.*)', triptext)
        if mtu:
            tu = mtu.group(1)
            triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
        else:
            tu = ""

        triptitles = triptitle.split(" - ")
        tripcave = triptitles[0].strip()

        ltriptext = triptext

        mtail = re.search(r'(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
        if mtail:
            #print mtail.group(0)
            ltriptext = ltriptext[:mtail.start(0)]
        ltriptext = re.sub(r"</p>", "", ltriptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>|<br>", "\n\n", ltriptext).strip()
        #ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
        ltriptext = re.sub(r"</?u>", "_", ltriptext)
        ltriptext = re.sub(r"</?i>", "''", ltriptext)
        ltriptext = re.sub(r"</?b>", "'''", ltriptext)

        #print ldate, trippeople.strip()
        # could include the tripid (url link for cross referencing)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)

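# Parseloghtml03: trips separated by <hr/>; the first paragraph is a
# "date -- title -- people" header, optionally with a trailing "T/U ..." piece.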
def Parseloghtml03(year, expedition, txt):
    tripparas = re.findall(r"<hr\s*/>([\s\S]*?)(?=<hr)", txt)
    for trippara in tripparas:
        s = re.match(r"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
        assert s, trippara
        tripheader, triptext = s.group(1), s.group(2)
        tripheader = re.sub(r"&nbsp;", " ", tripheader)
        tripheader = re.sub(r"\s+", " ", tripheader).strip()
        sheader = tripheader.split(" -- ")
        tu = ""
        if re.match("T/U|Time underwater", sheader[-1]):
            tu = sheader.pop()
        if len(sheader) != 3:
            print("header not three pieces", sheader)
        tripdate, triptitle, trippeople = sheader
        ldate = ParseDate(tripdate.strip(), year)
        triptitles = triptitle.split(" , ")
        if len(triptitles) >= 2:
            tripcave = triptitles[0]
        else:
            tripcave = "UNKNOWN"
        #print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
        ltriptext = re.sub(r"</p>", "", triptext)
        ltriptext = re.sub(r"\s*?\n\s*", " ", ltriptext)
        ltriptext = re.sub(r"<p>", "\n\n", ltriptext).strip()
        ltriptext = re.sub(r"[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
        EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)

def SetDatesFromLogbookEntries(expedition):
    """
    Sets the date_from and date_to field for an expedition based on persontrips.
    Then sets the expedition date_from and date_to based on the personexpeditions.
    """
    for personexpedition in expedition.personexpedition_set.all():
        persontrips = personexpedition.persontrip_set.order_by('logbook_entry__date')
        # sequencing is difficult to do
        lprevpersontrip = None
        for persontrip in persontrips:
            persontrip.persontrip_prev = lprevpersontrip
            if lprevpersontrip:
                lprevpersontrip.persontrip_next = persontrip
                lprevpersontrip.save()
            persontrip.persontrip_next = None
            lprevpersontrip = persontrip
            persontrip.save()

def LoadLogbookForExpedition(expedition):
    """ Parses all logbook entries for one expedition """
    expowebbase = os.path.join(settings.EXPOWEB, "years")
    year = str(expedition.year)
    yearlinks = settings.LOGBOOK_PARSER_SETTINGS
    logbook_parseable = False
    if expedition.year in yearlinks:
        year_settings = yearlinks[expedition.year]
        file_in = open(os.path.join(expowebbase, year_settings[0]))
        txt = file_in.read().decode("latin1")
        file_in.close()
        parsefunc = year_settings[1]
        logbook_parseable = True
    else:
        try:
            file_in = open(os.path.join(expowebbase, expedition.year, settings.DEFAULT_LOGBOOK_FILE))
            txt = file_in.read().decode("latin1")
            file_in.close()
            logbook_parseable = True
            print("No set parser found, using default")
            parsefunc = settings.DEFAULT_LOGBOOK_PARSER
        except IOError:
            logbook_parseable = False
            print("Couldn't open default logbook file and nothing set for expo " + expedition.year)

    if logbook_parseable:
        parser = globals()[parsefunc]
        parser(expedition.year, expedition, txt)
        SetDatesFromLogbookEntries(expedition)

    return "TOLOAD: " + year + "  " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + "  " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())

def LoadLogbooks():
    """ This is the master function for parsing all logbooks into the Troggle database.
    Requires yearlinks (settings.LOGBOOK_PARSER_SETTINGS), which maps each expedition year
    to its logbook path and parsing function. """
    # Deletion has been moved to a separate function to enable the non-destructive importing
    #models.LogbookEntry.objects.all().delete()
    expowebbase = os.path.join(settings.EXPOWEB, "years")
    #yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite
    #yearlinks = [ ("1996", "1996/log.htm", Parseloghtml01),] # overwrite
    yearlinks = settings.LOGBOOK_PARSER_SETTINGS
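    # Assumed shape of settings.LOGBOOK_PARSER_SETTINGS, judging by how it is used below:
    # a dict keyed by year string, mapping to (path under EXPOWEB/years, parser function name), e.g.
    #   { "2003": ("2003/logbook.html", "Parseloghtml03") }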
    expos = models.Expedition.objects.all()
    for expo in expos:
        print("\nLoading Logbook for: " + expo.year)
        logbook_parseable = False
        if expo.year in yearlinks:
            #print(yearlinks[expo.year])
            year_settings = yearlinks[expo.year]
            file_in = open(os.path.join(expowebbase, year_settings[0]))
            txt = file_in.read().decode("latin1")
            file_in.close()
            parsefunc = year_settings[1]
            logbook_parseable = True
        else:
            try:
                file_in = open(os.path.join(expowebbase, expo.year, settings.DEFAULT_LOGBOOK_FILE))
                txt = file_in.read().decode("latin1")
                file_in.close()
                logbook_parseable = True
                print("No set parser found, using default")
                parsefunc = settings.DEFAULT_LOGBOOK_PARSER
            except IOError:
                logbook_parseable = False
                print("Couldn't open default logbook file and nothing in settings for expo " + expo.year)
        if logbook_parseable:
            parser = globals()[parsefunc]
            parser(expo.year, expo, txt)
            SetDatesFromLogbookEntries(expo)

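# Regexes for the "auto" logbook entry format: a standalone html file marked up with
# <span class="date">, <span class="expeditionyear">, <span class="name,author">,
# <span class="TU">, <span class="location"> and <span class="cave"> spans plus a
# <div class="report"> body, parsed by parseAutoLogBookEntry() below.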
dateRegex = re.compile(r'<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
expeditionYearRegex = re.compile(r'<span\s+class="expeditionyear">(.*?)</span>', re.S)
titleRegex = re.compile(r'<H1>(.*?)</H1>', re.S)
reportRegex = re.compile(r'<div\s+class="report">(.*)</div>\s*</body>', re.S)
personRegex = re.compile(r'<div\s+class="person">(.*?)</div>', re.S)
nameAuthorRegex = re.compile(r'<span\s+class="name(,author|)">(.*?)</span>', re.S)
TURegex = re.compile(r'<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
locationRegex = re.compile(r'<span\s+class="location">(.*?)</span>', re.S)
caveRegex = re.compile(r'<span\s+class="cave">(.*?)</span>', re.S)

def parseAutoLogBookEntry(filename):
    errors = []
    f = open(filename, "r")
    contents = f.read()
    f.close()

    dateMatch = dateRegex.search(contents)
    if dateMatch:
        year, month, day = [int(x) for x in dateMatch.groups()]
        date = datetime.date(year, month, day)
    else:
        errors.append("Date could not be found")

    expeditionYearMatch = expeditionYearRegex.search(contents)
    if expeditionYearMatch:
        try:
            expedition = models.Expedition.objects.get(year=expeditionYearMatch.groups()[0])
            personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
        except models.Expedition.DoesNotExist:
            errors.append("Expedition not in database")
    else:
        errors.append("Expedition year could not be parsed")

    titleMatch = titleRegex.search(contents)
    if titleMatch:
        title, = titleMatch.groups()
        if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
            errors.append("Title too long")
    else:
        errors.append("Title could not be found")

    caveMatch = caveRegex.search(contents)
    if caveMatch:
        caveRef, = caveMatch.groups()
        try:
            cave = models.getCaveByReference(caveRef)
        except AssertionError:
            cave = None
            errors.append("Cave not found in database")
    else:
        cave = None

    locationMatch = locationRegex.search(contents)
    if locationMatch:
        location, = locationMatch.groups()
    else:
        location = None

    if cave is None and location is None:
        errors.append("Neither cave nor location could be found")

    reportMatch = reportRegex.search(contents)
    if reportMatch:
        report, = reportMatch.groups()
    else:
        errors.append("Contents could not be found")

    if errors:
        return errors  # Easiest to bail out at this point as we need to make sure that we know which expedition to look for people from.

    people = []
    for personMatch in personRegex.findall(contents):
        # search within this person's div, not the whole file, so each person gets their own name and TU
        nameAuthorMatch = nameAuthorRegex.search(personMatch)
        if nameAuthorMatch:
            author, name = nameAuthorMatch.groups()
            if name.lower() in personExpeditionNameLookup:
                personExpo = personExpeditionNameLookup[name.lower()]
            else:
                errors.append("Person could not be found in database")
            author = bool(author)
        else:
            errors.append("Person's name could not be found")

        TUMatch = TURegex.search(personMatch)
        if TUMatch:
            TU, = TUMatch.groups()
        else:
            errors.append("TU could not be found")

        if not errors:
            people.append((personExpo, author, TU))

    if errors:
        return errors  # Bail out before committing to the database

    logbookEntry = models.LogbookEntry(date=date,
                                       expedition=expedition,
                                       title=title, cave=cave, place=location,
                                       text=report, slug=slugify(title)[:50],
                                       filename=filename)
    logbookEntry.save()
    for personExpo, author, TU in people:
        models.PersonTrip(personexpedition=personExpo,
                          time_underground=TU,
                          logbook_entry=logbookEntry,
                          is_logbook_entry_author=author).save()
    print(logbookEntry)