rest of martin's changes, without reverting lineend issues

Wookey
2011-07-11 23:28:23 +01:00
77 changed files with 6650 additions and 6652 deletions

View File

@@ -1,254 +1,253 @@
# -*- coding: utf-8 -*-
import troggle.core.models as models
from django.conf import settings
import csv, time, re, os, logging
from utils import save_carefully
from django.core.urlresolvers import reverse
import flatpages.models
##format of CAVETAB2.CSV is
KatasterNumber = 0
KatStatusCode = 1
Entrances = 2
UnofficialNumber = 3
MultipleEntrances = 4
AutogenFile = 5
LinkFile = 6
LinkEntrance = 7
Name = 8
UnofficialName = 9
Comment = 10
Area = 11
Explorers = 12
UndergroundDescription = 13
Equipment = 14
QMList = 15
KatasterStatus = 16
References = 17
UndergroundCentreLine = 18
UndergroundDrawnSurvey = 19
SurvexFile = 20
Length = 21
Depth = 22
Extent = 23
Notes = 24
EntranceName = 25
TagPoint = 26
OtherPoint = 27
DescriptionOfOtherPoint = 28
ExactEntrance = 29
TypeOfFix = 30
GPSpreSA = 31
GPSpostSA = 32
Northing = 33
Easting = 34
Altitude = 35
Bearings = 36
Map = 37
Location = 38
Approach = 39
EntranceDescription = 40
PhotoOfLocation = 41
Marking = 42
MarkingComment = 43
Findability = 44
FindabilityComment = 45
def LoadCaveTab():
cavetab = open(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'rU')
caveReader = csv.reader(cavetab)
caveReader.next() # Strip out column headers
logging.info("Beginning to import caves from "+str(cavetab)+"\n"+"-"*60+"\n")
for katArea in ['1623', '1626']:
if not models.Area.objects.filter(short_name = katArea):
newArea = models.Area(short_name = katArea)
newArea.save()
logging.info("Added area "+str(newArea.short_name)+"\n")
area1626 = models.Area.objects.filter(short_name = '1626')[0]
area1623 = models.Area.objects.filter(short_name = '1623')[0]
counter=0
for line in caveReader :
if line[Area] == 'nonexistent':
continue
entranceLetters=[] #Used in caves that have multiple entrances, which are not described on separate lines
if line[MultipleEntrances] == 'yes' or line[MultipleEntrances]=='': #When true, this line contains an actual cave, otherwise it is an extra entrance.
args = {}
defaultArgs = {}
def addToArgs(CSVname, modelName):
if line[CSVname]:
args[modelName] = line[CSVname]
def addToDefaultArgs(CSVname, modelName): #This has to do with the non-destructive import. These arguments will be passed as the "defaults" dictionary in a get_or_create
if line[CSVname]:
defaultArgs[modelName] = line[CSVname]
# The attributes added using "addToArgs" will be used to look up an existing cave. Those added using "addToDefaultArgs" will not.
addToArgs(KatasterNumber, "kataster_number")
addToDefaultArgs(KatStatusCode, "kataster_code")
addToArgs(UnofficialNumber, "unofficial_number")
addToArgs(Name, "official_name")
addToDefaultArgs(Comment, "notes")
addToDefaultArgs(Explorers, "explorers")
addToDefaultArgs(UndergroundDescription, "underground_description")
addToDefaultArgs(Equipment, "equipment")
addToDefaultArgs(KatasterStatus, "kataster_status")
addToDefaultArgs(References, "references")
addToDefaultArgs(UndergroundCentreLine, "underground_centre_line")
addToDefaultArgs(UndergroundDrawnSurvey, "survey")
addToDefaultArgs(Length, "length")
addToDefaultArgs(Depth, "depth")
addToDefaultArgs(Extent, "extent")
addToDefaultArgs(SurvexFile, "survex_file")
addToDefaultArgs(Notes, "notes")
addToDefaultArgs(AutogenFile, "url")
if line[Area] == "1626":
if line[KatasterNumber] != "":
args["slug"] = line[Area] + "-" + line[KatasterNumber]
else:
args["slug"] = line[Area] + "-" + line[UnofficialNumber]
else:
if line[KatasterNumber] != "":
args["slug"] = "1623" + "-" + line[KatasterNumber]
else:
args["slug"] = "1623" + "-" + line[UnofficialNumber]
#The following adds the legacy file path (description_file), which is always in either the Autogen file or the Link file column.
for header in (AutogenFile,LinkFile):
if line[header]:
addToDefaultArgs(header,"description_file")
break
#The following checks whether this cave is non-public, i.e. we don't have the rights to display it online.
#"noinfo" was the name of the old password-protected directory, so if the file path starts with it we
#set the non_public field of the model instance to true.
defaultArgs["non_public"]=line[AutogenFile].startswith('noinfo') or line[LinkFile].startswith('noinfo')
newCave, created=save_carefully(models.Cave, lookupAttribs=args, nonLookupAttribs=defaultArgs)
logging.info("Added cave "+str(newCave)+"\n")
#If we created a new cave, add the area to it. This does mean that if a cave's identifying features have not changed, areas will not be updated from csv.
if created and line[Area]:
if line[Area] == "1626":
newCave.area.add(area1626)
else:
area = models.Area.objects.filter(short_name = line[Area])
if area:
newArea = area[0]
else:
newArea = models.Area(short_name = line[Area], parent = area1623)
newArea.save()
newCave.area.add(newArea)
newCave.area.add(area1623)
elif created:
newCave.area.add(area1623)
newCave.save()
logging.info("Added area "+line[Area]+" to cave "+str(newCave)+"\n")
if created and line[UnofficialName]:
newUnofficialName = models.OtherCaveName(cave = newCave, name = line[UnofficialName])
newUnofficialName.save()
logging.info("Added unofficial name "+str(newUnofficialName)+" to cave "+str(newCave)+"\n")
if created and line[MultipleEntrances] == '' or \
line[MultipleEntrances] == 'entrance' or \
line[MultipleEntrances] == 'last entrance':
args = {}
if line[Entrances]:
entrance_letter = line[Entrances]
else:
entrance_letter = ''
def addToArgs(CSVname, modelName):
if line[CSVname]:
args[modelName] = line[CSVname]
def addToArgsViaDict(CSVname, modelName, dictionary):
if line[CSVname]:
args[modelName] = dictionary[line[CSVname]]
addToArgs(EntranceName, 'name')
addToArgs(Explorers, 'explorers')
addToArgs(Map, 'map_description')
addToArgs(Location, 'location_description')
addToArgs(Approach, 'approach')
addToArgs(EntranceDescription, 'entrance_description')
addToArgs(UndergroundDescription, 'underground_description')
addToArgs(PhotoOfLocation, 'photo')
addToArgsViaDict(Marking, 'marking', {"Paint": "P",
"Paint (?)": "P?",
"Tag": "T",
"Tag (?)": "T?",
"Retagged": "R",
"Retag": "R",
"Spit": "S",
"Spit (?)": "S?",
"Unmarked": "U",
"": "?",
})
addToArgs(MarkingComment, 'marking_comment')
addToArgsViaDict(Findability, 'findability', {"Surveyed": "S",
"Lost": "L",
"Refindable": "R",
"": "?",
"?": "?",
})
addToArgs(FindabilityComment, 'findability_description')
addToArgs(Easting, 'easting')
addToArgs(Northing, 'northing')
addToArgs(Altitude, 'alt')
addToArgs(DescriptionOfOtherPoint, 'other_description')
addToArgs(TagPoint, 'tag_station')
addToArgs(ExactEntrance, 'exact_station')
addToArgs(OtherPoint, 'other_station')
addToArgs(OtherPoint, 'other_description')
if line[GPSpreSA]:
addToArgs(GPSpreSA, 'other_station')
args['other_description'] = 'pre selective availability GPS'
if line[GPSpostSA]:
addToArgs(GPSpostSA, 'other_station')
args['other_description'] = 'post selective availability GPS'
addToArgs(Bearings, 'bearings')
args['slug'] = newCave.slug + entrance_letter
newEntrance = models.Entrance(**args)
newEntrance.save()
logging.info("Added entrance "+str(newEntrance)+"\n")
newCaveAndEntrance = models.CaveAndEntrance(cave = newCave, entrance = newEntrance, entrance_letter = entrance_letter)
newCaveAndEntrance.save()
logging.info("Added CaveAndEntrance "+str(newCaveAndEntrance)+"\n")
if line[AutogenFile] != "":
f = flatpages.models.EntranceRedirect(originalURL = line[AutogenFile], entrance = newEntrance)
f.save()
# lookup function modelled on GetPersonExpeditionNameLookup
Gcavelookup = None
def GetCaveLookup():
global Gcavelookup
if Gcavelookup:
return Gcavelookup
Gcavelookup = {"NONEPLACEHOLDER":None}
for cave in models.Cave.objects.all():
Gcavelookup[cave.official_name.lower()] = cave
if cave.kataster_number:
Gcavelookup[cave.kataster_number] = cave
if cave.unofficial_number:
Gcavelookup[cave.unofficial_number] = cave
Gcavelookup["tunnocks"] = Gcavelookup["258"]
Gcavelookup["hauchhole"] = Gcavelookup["234"]
return Gcavelookup
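For orientation, a minimal usage sketch of the two helpers that drive this importer: save_carefully() is handed the identifying attributes separately from the defaults, in the spirit of Django's get_or_create (as the comment above notes), and GetCaveLookup() resolves free-text cave references such as those found in logbooks. The cave number, slug and attribute values below are invented for illustration only.

# Hedged sketch; the cave "1623-999" and its attribute values are made up.
from utils import save_carefully
from parsers.cavetab import GetCaveLookup
import troggle.core.models as models
lookupAttribs = {"kataster_number": "999", "slug": "1623-999"}    # used to find an existing cave
nonLookupAttribs = {"explorers": "CUCC", "notes": "example only"} # applied to the cave but never used for lookup
cave, created = save_carefully(models.Cave, lookupAttribs=lookupAttribs, nonLookupAttribs=nonLookupAttribs)
# Free-text references resolve through the cached lookup table:
tunnocks = GetCaveLookup().get("tunnocks")   # returns None for unknown names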

View File

@@ -1,45 +1,45 @@
from django.conf import settings
import core.models as models
import os
from utils import html_to_wiki, get_html_body, get_html_title
pages = [(["smkridge", "204", "ariston-rigging.html"], "ariston-rigging"),
(["smkridge", "204", "ariston.html"], "ariston"),
(["smkridge", "204", "bivvy.html"], "bivvy"),
(["smkridge", "204", "bridge.html"], "bridge"),
(["smkridge", "204", "entrance-rigging.html"], "entrance-rigging"),
(["smkridge", "204", "entrance.html"], "entrance"),
(["smkridge", "204", "midlevel.html"], "midlevel"),
(["smkridge", "204", "millennium.html"], "millennium"),
(["smkridge", "204", "nopain.html"], "nopain"),
(["smkridge", "204", "razordance.html"], "razordance"),
(["smkridge", "204", "rhino.html"], "rhino"),
(["smkridge", "204", "sbview.html"], "sbview"),
(["smkridge", "204", "subway.html"], "subway"),
(["smkridge", "204", "swings.html"], "swings"),
(["smkridge", "204", "treeumphant.html"], "treeumphant"),
(["smkridge", "204", "uworld.html"], "uworld"), ]
def getDescriptions():
"""Creates objects in the database for each item in the list 'pages' . """
for filelocation, name in pages:
f = open(os.path.join(settings.EXPOWEB, *filelocation), "r")
html = f.read()
cd = models.CaveDescription(short_name = name,
long_name = unicode(get_html_title(html), "latin1"),
description = unicode(get_html_body(html), "latin1"))
cd.save()
def parseDescriptions():
"""Turns the HTML in each cave description into wikicode"""
for cd in models.CaveDescription.objects.all():
cd.description = html_to_wiki(cd.description)
cd.save()
def parseDescriptionsOnCaveObjects():
for cave in models.Cave.objects.all():
cave.underground_description=html_to_wiki(unicode(cave.underground_description))
cave.save()
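As a quick illustration of how the pages table above is consumed: each entry pairs a path, given as components relative to EXPOWEB, with a short name, and the legacy HTML is decoded from Latin-1. The lines below simply restate one entry from the list, reusing the imports at the top of this file.

# Sketch using one entry from the 'pages' list above.
filelocation, name = ["smkridge", "204", "subway.html"], "subway"
path = os.path.join(settings.EXPOWEB, *filelocation)   # <EXPOWEB>/smkridge/204/subway.html
html = open(path, "r").read()
long_name = unicode(get_html_title(html), "latin1")     # legacy pages are Latin-1 encoded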

View File

@@ -1,432 +1,432 @@
#.-*- coding: utf-8 -*-
from django.conf import settings
import core.models as models
from parsers.people import GetPersonExpeditionNameLookup
from parsers.cavetab import GetCaveLookup
from django.template.defaultfilters import slugify
import csv
import re
import datetime
import os
from utils import save_carefully
#
# When we edit logbook entries, allow a "?" after any piece of data to say we've frigged it and
# it can be checked up later from the hard-copy if necessary; or it's not possible to determine (name, trip place, etc)
#
#
# the logbook loading section
#
def GetTripPersons(trippeople, expedition, logtime_underground):
res = [ ]
author = None
for tripperson in re.split(",|\+|&amp;|&(?!\w+;)| and ", trippeople):
tripperson = tripperson.strip()
mul = re.match("<u>(.*?)</u>$(?i)", tripperson)
if mul:
tripperson = mul.group(1).strip()
if tripperson and tripperson[0] != '*':
#assert tripperson in personyearmap, "'%s' << %s\n\n %s" % (tripperson, trippeople, personyearmap)
personyear = GetPersonExpeditionNameLookup(expedition).get(tripperson.lower())
if not personyear:
print "NoMatchFor: '%s'" % tripperson
res.append((personyear, logtime_underground))
if mul:
author = personyear
if not author:
if not res:
return None, None
author = res[-1][0]
return res, author
def GetTripCave(place): #need to be fuzzier about matching here. Already a very slow function...
# print "Getting cave for " , place
try:
katastNumRes=[]
katastNumRes=list(models.Cave.objects.filter(kataster_number=int(place)))
except ValueError:
pass
officialNameRes=list(models.Cave.objects.filter(official_name=place))
tripCaveRes=officialNameRes+katastNumRes
if len(tripCaveRes)==1:
# print "Place " , place , "entered as" , tripCaveRes[0]
return tripCaveRes[0]
elif models.OtherCaveName.objects.filter(name=place):
tripCaveRes=models.OtherCaveName.objects.filter(name__icontains=place)[0].cave
# print "Place " , place , "entered as" , tripCaveRes
return tripCaveRes
elif len(tripCaveRes)>1:
print "Ambiguous place " + str(place) + " entered. Choose from " + str(tripCaveRes)
correctIndex=input("type list index of correct cave")
return tripCaveRes[correctIndex]
else:
print "No cave found for place " , place
return
noncaveplaces = [ "Journey", "Loser Plateau" ]
def EnterLogIntoDbase(date, place, title, text, trippeople, expedition, logtime_underground):
""" saves a logbook entry and related persontrips """
trippersons, author = GetTripPersons(trippeople, expedition, logtime_underground)
if not author:
print "skipping logentry", title
return
# tripCave = GetTripCave(place)
#
lplace = place.lower()
if lplace not in noncaveplaces:
cave=GetCaveLookup().get(lplace)
#Check for an existing copy of the current entry, and save
expeditionday = expedition.get_expedition_day(date)
lookupAttribs={'date':date, 'title':title}
nonLookupAttribs={'place':place, 'text':text, 'expedition':expedition, 'cave':cave, 'slug':slugify(title)[:50]}
lbo, created=save_carefully(models.LogbookEntry, lookupAttribs, nonLookupAttribs)
for tripperson, time_underground in trippersons:
lookupAttribs={'personexpedition':tripperson, 'logbook_entry':lbo}
nonLookupAttribs={'time_underground':time_underground, 'is_logbook_entry_author':(tripperson == author)}
#print nonLookupAttribs
save_carefully(models.PersonTrip, lookupAttribs, nonLookupAttribs)
def ParseDate(tripdate, year):
""" Interprets dates in the expo logbooks and returns a correct datetime.date object """
mdatestandard = re.match("(\d\d\d\d)-(\d\d)-(\d\d)", tripdate)
mdategoof = re.match("(\d\d?)/0?(\d)/(20|19)?(\d\d)", tripdate)
if mdatestandard:
assert mdatestandard.group(1) == year, (tripdate, year)
year, month, day = int(mdatestandard.group(1)), int(mdatestandard.group(2)), int(mdatestandard.group(3))
elif mdategoof:
assert not mdategoof.group(3) or mdategoof.group(3) == year[:2], mdategoof.groups()
yadd = int(year[:2]) * 100
day, month, year = int(mdategoof.group(1)), int(mdategoof.group(2)), int(mdategoof.group(4)) + yadd
else:
assert False, tripdate
return datetime.date(year, month, day)
# 2007, 2008, 2006
def Parselogwikitxt(year, expedition, txt):
trippara = re.findall("===(.*?)===([\s\S]*?)(?====)", txt)
for triphead, triptext in trippara:
tripheadp = triphead.split("|")
#print "ttt", tripheadp
assert len(tripheadp) == 3, (tripheadp, triptext)
tripdate, tripplace, trippeople = tripheadp
tripsplace = tripplace.split(" - ")
tripcave = tripsplace[0].strip()
tul = re.findall("T/?U:?\s*(\d+(?:\.\d*)?|unknown)\s*(hrs|hours)?", triptext)
if tul:
#assert len(tul) <= 1, (triphead, triptext)
#assert tul[0][1] in ["hrs", "hours"], (triphead, triptext)
tu = tul[0][0]
else:
tu = ""
#assert tripcave == "Journey", (triphead, triptext)
#print tripdate
ldate = ParseDate(tripdate.strip(), year)
#print "\n", tripcave, "--- ppp", trippeople, len(triptext)
EnterLogIntoDbase(date = ldate, place = tripcave, title = tripplace, text = triptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
# 2002, 2004, 2005
def Parseloghtmltxt(year, expedition, txt):
tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
for trippara in tripparas:
s = re.match('''(?x)(?:\s*<div\sclass="tripdate"\sid=".*?">.*?</div>\s*<p>)? # second date
\s*(?:<a\s+id="(.*?)"\s*/>\s*</a>)?
\s*<div\s+class="tripdate"\s*(?:id="(.*?)")?>(.*?)</div>(?:<p>)?
\s*<div\s+class="trippeople">\s*(.*?)</div>
\s*<div\s+class="triptitle">\s*(.*?)</div>
([\s\S]*?)
\s*(?:<div\s+class="timeug">\s*(.*?)</div>)?
\s*$
''', trippara)
if not s:
if not re.search("Rigging Guide", trippara):
print "can't parse: ", trippara # this is 2007 which needs editing
#assert s, trippara
continue
tripid, tripid1, tripdate, trippeople, triptitle, triptext, tu = s.groups()
ldate = ParseDate(tripdate.strip(), year)
#assert tripid[:-1] == "t" + tripdate, (tripid, tripdate)
trippeople = re.sub("Ol(?!l)", "Olly", trippeople)
trippeople = re.sub("Wook(?!e)", "Wookey", trippeople)
triptitles = triptitle.split(" - ")
if len(triptitles) >= 2:
tripcave = triptitles[0]
else:
tripcave = "UNKNOWN"
#print "\n", tripcave, "--- ppp", trippeople, len(triptext)
ltriptext = re.sub("</p>", "", triptext)
ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
# main parser for pre-2001. simpler because the data has been hacked so much to fit it
def Parseloghtml01(year, expedition, txt):
tripparas = re.findall("<hr[\s/]*>([\s\S]*?)(?=<hr)", txt)
for trippara in tripparas:
s = re.match(u"(?s)\s*(?:<p>)?(.*?)</?p>(.*)$(?i)", trippara)
assert s, trippara[:300]
tripheader, triptext = s.group(1), s.group(2)
mtripid = re.search('<a id="(.*?)"', tripheader)
tripid = mtripid and mtripid.group(1) or ""
tripheader = re.sub("</?(?:[ab]|span)[^>]*>", "", tripheader)
#print " ", [tripheader]
#continue
tripdate, triptitle, trippeople = tripheader.split("|")
ldate = ParseDate(tripdate.strip(), year)
mtu = re.search('<p[^>]*>(T/?U.*)', triptext)
if mtu:
tu = mtu.group(1)
triptext = triptext[:mtu.start(0)] + triptext[mtu.end():]
else:
tu = ""
triptitles = triptitle.split(" - ")
tripcave = triptitles[0].strip()
ltriptext = triptext
mtail = re.search('(?:<a href="[^"]*">[^<]*</a>|\s|/|-|&amp;|</?p>|\((?:same day|\d+)\))*$', ltriptext)
if mtail:
#print mtail.group(0)
ltriptext = ltriptext[:mtail.start(0)]
ltriptext = re.sub("</p>", "", ltriptext)
ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub("<p>|<br>", "\n\n", ltriptext).strip()
#ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!]", "NONASCII", ltriptext)
ltriptext = re.sub("</?u>", "_", ltriptext)
ltriptext = re.sub("</?i>", "''", ltriptext)
ltriptext = re.sub("</?b>", "'''", ltriptext)
#print ldate, trippeople.strip()
# could include the tripid (URL link for cross-referencing)
EnterLogIntoDbase(date=ldate, place=tripcave, title=triptitle, text=ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
def Parseloghtml03(year, expedition, txt):
tripparas = re.findall("<hr\s*/>([\s\S]*?)(?=<hr)", txt)
for trippara in tripparas:
s = re.match(u"(?s)\s*<p>(.*?)</p>(.*)$", trippara)
assert s, trippara
tripheader, triptext = s.group(1), s.group(2)
tripheader = re.sub("&nbsp;", " ", tripheader)
tripheader = re.sub("\s+", " ", tripheader).strip()
sheader = tripheader.split(" -- ")
tu = ""
if re.match("T/U|Time underwater", sheader[-1]):
tu = sheader.pop()
if len(sheader) != 3:
print "header not three pieces", sheader
tripdate, triptitle, trippeople = sheader
ldate = ParseDate(tripdate.strip(), year)
triptitles = triptitle.split(" , ")
if len(triptitles) >= 2:
tripcave = triptitles[0]
else:
tripcave = "UNKNOWN"
#print tripcave, "--- ppp", triptitle, trippeople, len(triptext)
ltriptext = re.sub("</p>", "", triptext)
ltriptext = re.sub("\s*?\n\s*", " ", ltriptext)
ltriptext = re.sub("<p>", "\n\n", ltriptext).strip()
ltriptext = re.sub("[^\s0-9a-zA-Z\-.,:;'!&()\[\]<>?=+*%]", "_NONASCII_", ltriptext)
EnterLogIntoDbase(date = ldate, place = tripcave, title = triptitle, text = ltriptext, trippeople=trippeople, expedition=expedition, logtime_underground=0)
yearlinks = [
("2009", "2009/2009logbook.txt", Parselogwikitxt),
("2008", "2008/2008logbook.txt", Parselogwikitxt),
("2007", "2007/logbook.html", Parseloghtmltxt),
("2006", "2006/logbook/logbook_06.txt", Parselogwikitxt),
("2005", "2005/logbook.html", Parseloghtmltxt),
("2004", "2004/logbook.html", Parseloghtmltxt),
("2003", "2003/logbook.html", Parseloghtml03),
("2002", "2002/logbook.html", Parseloghtmltxt),
("2001", "2001/log.htm", Parseloghtml01),
("2000", "2000/log.htm", Parseloghtml01),
("1999", "1999/log.htm", Parseloghtml01),
("1998", "1998/log.htm", Parseloghtml01),
("1997", "1997/log.htm", Parseloghtml01),
("1996", "1996/log.htm", Parseloghtml01),
("1995", "1995/log.htm", Parseloghtml01),
("1994", "1994/log.htm", Parseloghtml01),
("1993", "1993/log.htm", Parseloghtml01),
("1992", "1992/log.htm", Parseloghtml01),
("1991", "1991/log.htm", Parseloghtml01),
]
def SetDatesFromLogbookEntries(expedition):
"""
Sets the date_from and date_to field for an expedition based on persontrips.
Then sets the expedition date_from and date_to based on the personexpeditions.
"""
for personexpedition in expedition.personexpedition_set.all():
persontrips = personexpedition.persontrip_set.order_by('logbook_entry__date')
# sequencing is difficult to do
lprevpersontrip = None
for persontrip in persontrips:
persontrip.persontrip_prev = lprevpersontrip
if lprevpersontrip:
lprevpersontrip.persontrip_next = persontrip
lprevpersontrip.save()
persontrip.persontrip_next = None
lprevpersontrip = persontrip
persontrip.save()
def LoadLogbookForExpedition(expedition):
""" Parses all logbook entries for one expedition """
expowebbase = os.path.join(settings.EXPOWEB, "years")
year = str(expedition.year)
for lyear, lloc, parsefunc in yearlinks:
if lyear == year:
break
fin = open(os.path.join(expowebbase, lloc))
print "opennning", lloc
txt = fin.read().decode("latin1")
fin.close()
parsefunc(year, expedition, txt)
SetDatesFromLogbookEntries(expedition)
return "TOLOAD: " + year + " " + str(expedition.personexpedition_set.all()[1].logbookentry_set.count()) + " " + str(models.PersonTrip.objects.filter(personexpedition__expedition=expedition).count())
def LoadLogbooks():
""" This is the master function for parsing all logbooks into the Troggle database. Requires yearlinks, which is a list of tuples for each expedition with expedition year, logbook path, and parsing function. """
#Deletion has been moved to a separate function to enable non-destructive importing
#models.LogbookEntry.objects.all().delete()
expowebbase = os.path.join(settings.EXPOWEB, "years")
#yearlinks = [ ("2001", "2001/log.htm", Parseloghtml01), ] #overwrite
#yearlinks = [ ("1996", "1996/log.htm", Parseloghtml01),] # overwrite
for year, lloc, parsefunc in yearlinks:
expedition = models.Expedition.objects.filter(year = year)[0]
fin = open(os.path.join(expowebbase, lloc))
txt = fin.read().decode("latin1")
fin.close()
parsefunc(year, expedition, txt)
SetDatesFromLogbookEntries(expedition)
dateRegex = re.compile('<span\s+class="date">(\d\d\d\d)-(\d\d)-(\d\d)</span>', re.S)
expeditionYearRegex = re.compile('<span\s+class="expeditionyear">(.*?)</span>', re.S)
titleRegex = re.compile('<H1>(.*?)</H1>', re.S)
reportRegex = re.compile('<div\s+class="report">(.*)</div>\s*</body>', re.S)
personRegex = re.compile('<div\s+class="person">(.*?)</div>', re.S)
nameAuthorRegex = re.compile('<span\s+class="name(,author|)">(.*?)</span>', re.S)
TURegex = re.compile('<span\s+class="TU">([0-9]*\.?[0-9]+)</span>', re.S)
locationRegex = re.compile('<span\s+class="location">(.*?)</span>', re.S)
caveRegex = re.compile('<span\s+class="cave">(.*?)</span>', re.S)
def parseAutoLogBookEntry(filename):
errors = []
f = open(filename, "r")
contents = f.read()
f.close()
dateMatch = dateRegex.search(contents)
if dateMatch:
year, month, day = [int(x) for x in dateMatch.groups()]
date = datetime.date(year, month, day)
else:
errors.append("Date could not be found")
expeditionYearMatch = expeditionYearRegex.search(contents)
if expeditionYearMatch:
try:
expedition = models.Expedition.objects.get(year = expeditionYearMatch.groups()[0])
personExpeditionNameLookup = GetPersonExpeditionNameLookup(expedition)
except models.Expedition.DoesNotExist:
errors.append("Expedition not in database")
else:
errors.append("Expediton Year could not be parsed")
titleMatch = titleRegex.search(contents)
if titleMatch:
title, = titleMatch.groups()
if len(title) > settings.MAX_LOGBOOK_ENTRY_TITLE_LENGTH:
errors.append("Title too long")
else:
errors.append("Title could not be found")
caveMatch = caveRegex.search(contents)
if caveMatch:
caveRef, = caveMatch.groups()
try:
cave = models.getCaveByReference(caveRef)
except AssertionError:
cave = None
errors.append("Cave not found in database")
else:
cave = None
locationMatch = locationRegex.search(contents)
if locationMatch:
location, = locationMatch.groups()
else:
location = None
if cave is None and location is None:
errors.append("Location nor cave could not be found")
reportMatch = reportRegex.search(contents)
if reportMatch:
report, = reportMatch.groups()
else:
errors.append("Contents could not be found")
if errors:
return errors # Easiest to bail out at this point, since we need to know which expedition to look up people in.
people = []
for personMatch in personRegex.findall(contents):
nameAuthorMatch = nameAuthorRegex.search(contents)
if nameAuthorMatch:
author, name = nameAuthorMatch.groups()
if name.lower() in personExpeditionNameLookup:
personExpo = personExpeditionNameLookup[name.lower()]
else:
errors.append("Person could not be found in database")
author = bool(author)
else:
errors.append("Persons name could not be found")
TUMatch = TURegex.search(contents)
if TUMatch:
TU, = TUMatch.groups()
else:
errors.append("TU could not be found")
if not errors:
people.append((name, author, TU))
if errors:
return errors # Bail out before committing to the database
logbookEntry = models.LogbookEntry(date = date,
expedition = expedition,
title = title, cave = cave, place = location,
text = report, slug = slugify(title)[:50],
filename = filename)
logbookEntry.save()
for name, author, TU in people:
models.PersonTrip(personexpedition = personExpo,
time_underground = TU,
logbook_entry = logbookEntry,
is_logbook_entry_author = author).save()
print logbookEntry
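The regexes above imply a fairly rigid markup for the auto-generated logbook entries that parseAutoLogBookEntry() reads. The fragment below is a hypothetical input that would match those patterns; every value in it is invented for illustration.

# Hypothetical example input for parseAutoLogBookEntry(); all values are invented.
sample_entry = '''
<span class="expeditionyear">2009</span>
<span class="date">2009-07-21</span>
<H1>Rigging the entrance series</H1>
<span class="cave">204</span>
<span class="location">Loser Plateau</span>
<div class="person">
<span class="name,author">Wookey</span>
<span class="TU">5.5</span>
</div>
<div class="report">Rigged the entrance pitches and surveyed to the first chamber.</div>
</body>
'''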

View File

@@ -1,56 +1,56 @@
'''
This module is the part of troggle that parses descriptions of cave parts (subcaves) from the legacy HTML files and saves them in the troggle database as instances of the model Subcave. Unfortunately, this parser cannot be very flexible because the legacy format is poorly structured.
'''
import sys, os
import os, re, logging
from django.conf import settings
from core.models import Subcave, Cave
from utils import save_carefully
def getLinksInCaveDescription(cave):
'''
Returns all HTML <a href> tags from a given cave as a list of tuples
in the format ('filename.html','Description')
'''
pattern='<a href=\"(.*?)\">(.*?)</a>'
if cave.underground_description:
return re.findall(pattern,cave.underground_description)
else:
return []
def importSubcaves(cave):
for link in getLinksInCaveDescription(cave):
try:
subcaveFilePath=os.path.join(
settings.EXPOWEB,
os.path.dirname(cave.description_file),
link[0])
subcaveFile=open(subcaveFilePath,'r')
description=subcaveFile.read().decode('iso-8859-1').encode('utf-8')
lookupAttribs={'title':link[1], 'cave':cave}
nonLookupAttribs={'description':description}
newSubcave=save_carefully(Subcave,lookupAttribs=lookupAttribs,nonLookupAttribs=nonLookupAttribs)
logging.info("Added " + unicode(newSubcave) + " to " + unicode(cave))
except IOError:
logging.info("Subcave import couldn't open "+subcaveFilePath)
def getLinksInSubcaveDescription(subcave):
pattern='<a href=\"(.*?)\">(.*?)</a>'
if subcave.description:
return re.findall(pattern,subcave.description)
else:
return []
def getLinksInAllSubcaves():
bigList=[]
for subcave in Subcave.objects.all():
bigList+=getLinksInSubcaveDescription(subcave)
return bigList
def importAllSubcaves():
for cave in Cave.objects.all():
importSubcaves(cave)
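A short sketch of the path arithmetic in importSubcaves() above: a link harvested from a cave's underground description is resolved relative to that cave's own description_file. The cave object and href here are hypothetical, and the module's own imports are reused.

# Sketch: resolving one ('href', 'link text') tuple from getLinksInCaveDescription(cave).
# 'cave' stands for any Cave instance with description_file set; the href is invented.
href, link_text = "pitches.html", "The Big Pitch"
subcaveFilePath = os.path.join(settings.EXPOWEB, os.path.dirname(cave.description_file), href)
description = open(subcaveFilePath, 'r').read().decode('iso-8859-1').encode('utf-8')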

View File

@@ -1,301 +1,301 @@
import sys, os, types, logging, stat
#sys.path.append('C:\\Expo\\expoweb')
#from troggle import *
#os.environ['DJANGO_SETTINGS_MODULE']='troggle.settings'
import settings
from core.models import *
from PIL import Image
#import settings
#import core.models as models
import csv
import re
import datetime
from utils import save_carefully
def get_or_create_placeholder(year):
""" All surveys must be related to a logbookentry. We don't have a way to
automatically figure out which survey went with which logbookentry,
so we create a survey placeholder logbook entry for each year. This
function always returns such a placeholder, and creates it if it doesn't
exist yet.
"""
lookupAttribs={'date__year':int(year), 'title':"placeholder for surveys",}
nonLookupAttribs={'text':"surveys temporarily attached to this should be re-attached to their actual trips", 'date':datetime.date(int(year),1,1)}
placeholder_logbook_entry, newly_created = save_carefully(LogbookEntry, lookupAttribs, nonLookupAttribs)
return placeholder_logbook_entry
# dead
def readSurveysFromCSV():
try: # could probably combine these two
surveytab = open(os.path.join(settings.SURVEY_SCANS, "Surveys.csv"))
except IOError:
import cStringIO, urllib
surveytab = cStringIO.StringIO(urllib.urlopen(settings.SURVEY_SCANS + "/Surveys.csv").read())
dialect=csv.Sniffer().sniff(surveytab.read())
surveytab.seek(0,0)
surveyreader = csv.reader(surveytab,dialect=dialect)
headers = surveyreader.next()
header = dict(zip(headers, range(len(headers)))) #set up a dictionary whose keys are header names and whose values are column numbers
# test if the expeditions have been added yet
if Expedition.objects.count()==0:
print "There are no expeditions in the database. Please run the logbook parser."
sys.exit()
logging.info("Deleting all scanned images")
ScannedImage.objects.all().delete()
logging.info("Deleting all survey objects")
Survey.objects.all().delete()
logging.info("Beginning to import surveys from "+str(os.path.join(settings.SURVEYS, "Surveys.csv"))+"\n"+"-"*60+"\n")
for survey in surveyreader:
#I hate this, but some surveys have a letter eg 2000#34a. The next line deals with that.
walletNumberLetter = re.match(r'(?P<number>\d*)(?P<letter>[a-zA-Z]*)',survey[header['Survey Number']])
# print walletNumberLetter.groups()
year=survey[header['Year']]
surveyobj = Survey(
expedition = Expedition.objects.filter(year=year)[0],
wallet_number = walletNumberLetter.group('number'),
logbook_entry = get_or_create_placeholder(year),
comments = survey[header['Comments']],
location = survey[header['Location']]
)
surveyobj.wallet_letter = walletNumberLetter.group('letter')
if survey[header['Finished']]=='Yes':
#try and find the sketch_scan
pass
surveyobj.save()
logging.info("added survey " + survey[header['Year']] + "#" + surveyobj.wallet_number + "\r")
# dead
def listdir(*directories):
try:
return os.listdir(os.path.join(settings.SURVEYS, *directories))
except:
import urllib
url = settings.SURVEYS + reduce(lambda x, y: x + "/" + y, ["listdir"] + list(directories))
folders = urllib.urlopen(url.replace("#", "%23")).readlines()
return [folder.rstrip(r"/") for folder in folders]

# add survey scans
def parseSurveyScans(year, logfile=None):
#    yearFileList = listdir(year.year)
    yearPath=os.path.join(settings.SURVEY_SCANS, "years", year.year)
    yearFileList=os.listdir(yearPath)
    print yearFileList
    for surveyFolder in yearFileList:
        try:
            surveyNumber=re.match(r'\d\d\d\d#0*(\d+)',surveyFolder).groups()
#            scanList = listdir(year.year, surveyFolder)
            scanList=os.listdir(os.path.join(yearPath,surveyFolder))
        except AttributeError:
            print surveyFolder + " ignored",
            continue

        for scan in scanList:
            try:
                scanChopped=re.match(r'(?i).*(notes|elev|plan|elevation|extend)(\d*)\.(png|jpg|jpeg)',scan).groups()
                scanType,scanNumber,scanFormat=scanChopped
            except AttributeError:
                print scan + " ignored \r",
                continue
            if scanType == 'elev' or scanType == 'extend':
                scanType = 'elevation'
            if scanNumber=='':
                scanNumber=1
            if type(surveyNumber)==types.TupleType:
                surveyNumber=surveyNumber[0]

            try:
                placeholder=get_or_create_placeholder(year=int(year.year))
                survey=Survey.objects.get_or_create(wallet_number=surveyNumber, expedition=year, defaults={'logbook_entry':placeholder})[0]
            except Survey.MultipleObjectsReturned:
                survey=Survey.objects.filter(wallet_number=surveyNumber, expedition=year)[0]

            file=os.path.join(year.year, surveyFolder, scan)
            scanObj = ScannedImage(
                file=file,
                contents=scanType,
                number_in_wallet=scanNumber,
                survey=survey,
                new_since_parsing=False,
                )
            #print "Added scanned image at " + str(scanObj)
            if scanFormat=="png":
                if isInterlacedPNG(os.path.join(settings.SURVEY_SCANS,file)):
                    print file + " is an interlaced PNG. No can do."
                    continue
            scanObj.save()
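
# A minimal sketch (not called by the parser) of the two filename conventions
# recognised above; "2000#034" and "notes2.png" are hypothetical examples.
def _example_scan_filename_parsing():
    wallet = re.match(r'\d\d\d\d#0*(\d+)', "2000#034").groups()                                                # -> ('34',)
    scan = re.match(r'(?i).*(notes|elev|plan|elevation|extend)(\d*)\.(png|jpg|jpeg)', "notes2.png").groups()   # -> ('notes', '2', 'png')
    return wallet, scan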

# dead
def parseSurveys(logfile=None):
    readSurveysFromCSV()
    for year in Expedition.objects.filter(year__gte=2000): #expos since 2000, because paths and filenames were nonstandard before then
        parseSurveyScans(year)

# dead
def isInterlacedPNG(filePath): #We need to check for interlaced PNGs because the thumbnail engine can't handle them (uses PIL)
    file=Image.open(filePath)
    print filePath
    if 'interlace' in file.info:
        return file.info['interlace']
    else:
        return False
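
# A hedged sketch (commented out) of the same PIL check: an interlaced PNG
# exposes an 'interlace' entry in Image.info, which is what trips up the
# thumbnailer.  "some_scan.png" is a hypothetical path.
#
#   im = Image.open("some_scan.png")
#   if im.info.get('interlace'):
#       print "interlaced PNG - skip it"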

# handles url or file, so we can refer to a set of scans on another server
def GetListDir(sdir):
    res = [ ]
    if sdir[:7] == "http://":
        assert False, "Not written"
        s = urllib.urlopen(sdir)
    else:
        for f in os.listdir(sdir):
            if f[0] != ".":
                ff = os.path.join(sdir, f)
                res.append((f, ff, os.path.isdir(ff)))
    return res
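
# A minimal sketch (not used by the importers) of consuming the
# (name, fullpath, isdir) triples that GetListDir returns; "/tmp" is only an
# example directory.
def _example_subdirectories(sdir="/tmp"):
    return [name for (name, fullpath, isdir) in GetListDir(sdir) if isdir]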

def LoadListScansFile(survexscansfolder):
    gld = [ ]

    # flatten out any directories in these book files
    for (fyf, ffyf, fisdiryf) in GetListDir(survexscansfolder.fpath):
        if fisdiryf:
            gld.extend(GetListDir(ffyf))
        else:
            gld.append((fyf, ffyf, fisdiryf))

    for (fyf, ffyf, fisdiryf) in gld:
        assert not fisdiryf, ffyf
        if re.search("\.(?:png|jpg|jpeg)(?i)$", fyf):
            survexscansingle = SurvexScanSingle(ffile=ffyf, name=fyf, survexscansfolder=survexscansfolder)
            survexscansingle.save()

# this iterates through the scans directories (either here or on the remote server)
# and builds up the models we can access later
def LoadListScans():
    SurvexScanSingle.objects.all().delete()
    SurvexScansFolder.objects.all().delete()

    # first do the smkhs (large kh survey scans) directory
    survexscansfoldersmkhs = SurvexScansFolder(fpath=os.path.join(settings.SURVEY_SCANS, "smkhs"), walletname="smkhs")
    if os.path.isdir(survexscansfoldersmkhs.fpath):
        survexscansfoldersmkhs.save()
        LoadListScansFile(survexscansfoldersmkhs)

    # iterate into the surveyscans directory
    for f, ff, fisdir in GetListDir(os.path.join(settings.SURVEY_SCANS, "surveyscans")):
        if not fisdir:
            continue

        # do the year folders
        if re.match("\d\d\d\d$", f):
            for fy, ffy, fisdiry in GetListDir(ff):
                if fisdiry:
                    assert fisdiry, ffy
                    survexscansfolder = SurvexScansFolder(fpath=ffy, walletname=fy)
                    survexscansfolder.save()
                    LoadListScansFile(survexscansfolder)

        # do the other (non-year) wallet folders, skipping the thumbnails directory
        elif f != "thumbs":
            survexscansfolder = SurvexScansFolder(fpath=ff, walletname=f)
            survexscansfolder.save()
            LoadListScansFile(survexscansfolder)

def FindTunnelScan(tunnelfile, path):
    scansfolder, scansfile = None, None
    mscansdir = re.search("(\d\d\d\d#\d+\w?|1995-96kh|92-94Surveybookkh|1991surveybook|smkhs)/(.*?(?:png|jpg))$", path)
    if mscansdir:
        scansfolderl = SurvexScansFolder.objects.filter(walletname=mscansdir.group(1))
        if len(scansfolderl):
            assert len(scansfolderl) == 1
            scansfolder = scansfolderl[0]
        if scansfolder:
            scansfilel = scansfolder.survexscansingle_set.filter(name=mscansdir.group(2))
            if len(scansfilel):
                assert len(scansfilel) == 1
                scansfile = scansfilel[0]
        if scansfolder:
            tunnelfile.survexscansfolders.add(scansfolder)
        if scansfile:
            tunnelfile.survexscans.add(scansfile)

    elif path and not re.search("\.(?:png|jpg)$(?i)", path):
        name = os.path.split(path)[1]
        print "ttt", tunnelfile.tunnelpath, path, name
        rtunnelfilel = TunnelFile.objects.filter(tunnelname=name)
        if len(rtunnelfilel):
            assert len(rtunnelfilel) == 1, ("two paths with name of", path, "need more discrimination coded")
            rtunnelfile = rtunnelfilel[0]
            #print "ttt", tunnelfile.tunnelpath, path, name, rtunnelfile.tunnelpath
            tunnelfile.tunnelcontains.add(rtunnelfile)

    tunnelfile.save()
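
# An illustrative sketch (not called anywhere) of how the wallet/scan regex in
# FindTunnelScan splits a sketch path; "2008#12/notes.png" is a made-up example.
def _example_split_scan_path():
    m = re.search("(\d\d\d\d#\d+\w?|1995-96kh|92-94Surveybookkh|1991surveybook|smkhs)/(.*?(?:png|jpg))$", "2008#12/notes.png")
    return m.group(1), m.group(2)   # -> ('2008#12', 'notes.png')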

def SetTunnelfileInfo(tunnelfile):
    ff = os.path.join(settings.TUNNEL_DATA, tunnelfile.tunnelpath)
    tunnelfile.filesize = os.stat(ff)[stat.ST_SIZE]
    fin = open(ff)
    ttext = fin.read()
    fin.close()

    mtype = re.search("<(fontcolours|sketch)", ttext)
    assert mtype, ff
    tunnelfile.bfontcolours = (mtype.group(1)=="fontcolours")
    tunnelfile.npaths = len(re.findall("<skpath", ttext))
    tunnelfile.save()

    # example of the markup the next regex picks the sketch paths out of:
    # <tunnelxml tunnelversion="version2009-06-21 Matienzo" tunnelproject="ireby" tunneluser="goatchurch" tunneldate="2009-06-29 23:22:17">
    # <pcarea area_signal="frame" sfscaledown="12.282584" sfrotatedeg="-90.76982" sfxtrans="11.676667377221136" sfytrans="-15.677173422877454" sfsketch="204description/scans/plan(38).png" sfstyle="" nodeconnzsetrelative="0.0">
    for path, style in re.findall('<pcarea area_signal="frame".*?sfsketch="([^"]*)" sfstyle="([^"]*)"', ttext):
        FindTunnelScan(tunnelfile, path)

    # should also scan and look for survex blocks that might have been included
    # and also survex titles as well.
    tunnelfile.save()
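
# A minimal sketch (not called by the parser) of the <pcarea ... sfsketch=...>
# extraction above, run against a one-line sample like the commented fragment.
def _example_pcarea_sketch_paths():
    sample = '<pcarea area_signal="frame" sfscaledown="12.3" sfsketch="204description/scans/plan(38).png" sfstyle="" nodeconnzsetrelative="0.0">'
    return re.findall('<pcarea area_signal="frame".*?sfsketch="([^"]*)" sfstyle="([^"]*)"', sample)
    # -> [('204description/scans/plan(38).png', '')]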

def LoadTunnelFiles():
    tunneldatadir = settings.TUNNEL_DATA
    TunnelFile.objects.all().delete()
    tunneldirs = [ "" ]
    while tunneldirs:
        tunneldir = tunneldirs.pop()
        for f in os.listdir(os.path.join(tunneldatadir, tunneldir)):
            if f[0] == "." or f[-1] == "~":
                continue
            lf = os.path.join(tunneldir, f)
            ff = os.path.join(tunneldatadir, lf)
            if os.path.isdir(ff):
                tunneldirs.append(lf)
            elif f[-4:] == ".xml":
                tunnelfile = TunnelFile(tunnelpath=lf, tunnelname=os.path.split(f[:-4])[1])
                tunnelfile.save()

    for tunnelfile in TunnelFile.objects.all():
        SetTunnelfileInfo(tunnelfile)
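
# A hedged sketch (not called anywhere) of the same stack-based directory walk,
# collecting relative paths of .xml files under a root without touching the
# database; "/tmp/tunneldata" is a hypothetical root directory.
def _example_collect_xml_paths(root="/tmp/tunneldata"):
    found, pending = [], [""]
    while pending:
        subdir = pending.pop()
        for f in os.listdir(os.path.join(root, subdir)):
            if f[0] == "." or f[-1] == "~":
                continue
            lf = os.path.join(subdir, f)
            if os.path.isdir(os.path.join(root, lf)):
                pending.append(lf)
            elif f[-4:] == ".xml":
                found.append(lf)
    return found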