[svn] Work on turn html pages into cavedescription models.py.

Moved parser/cavetabs html_to_wiki function to utils.py
Added databaseReset.py desc to refresh the cavedescriptions.
This commit is contained in:
martin speleo 2009-07-04 16:42:17 +01:00
parent fa9a6416d1
commit c377d0376f
4 changed files with 133 additions and 93 deletions

View File

@ -1,5 +1,10 @@
import urllib, urlparse, string, os, datetime, logging
import troggle.mptt as mptt
try:
import mptt
except ImportError:
#I think we should be having troggle directory as the base import place
#but I am leaving the following line in to make sure I do not break anything
import troggle.mptt as mptt
from django.forms import ModelForm
from django.db import models
from django.contrib import admin
@ -539,18 +544,19 @@ try:
mptt.register(Subcave, order_insertion_by=['title'])
except mptt.AlreadyRegistered:
print "mptt already registered"
class CaveDescription(TroggleModel):
name = models.CharField(max_length=50)
short_name = models.CharField(max_length=50, unique = True)
long_name = models.CharField(max_length=200, blank=True, null=True)
description = models.TextField(blank=True,null=True)
linked_subcaves = models.ManyToManyField("Subcave")
linked_entrances = models.ManyToManyField("Entrance")
linked_qms = models.ManyToManyField("QM")
def __unicode__(self):
return unicode(self.name)
return unicode(self.short_name)
class NewSubCave(TroggleModel):
name = models.CharField(max_length=200)
name = models.CharField(max_length=200, unique = True)
def __unicode__(self):
return unicode(self.name)

View File

@ -58,6 +58,14 @@ def import_surveys():
import parsers.surveys
parsers.surveys.parseSurveys(logfile=settings.LOGFILE)
def import_descriptions():
import parsers.descriptions
parsers.descriptions.getDescriptions()
def parse_descriptions():
import parsers.descriptions
parsers.descriptions.parseDescriptions()
def reset():
""" Wipe the troggle database and import everything from legacy data
"""
@ -69,16 +77,29 @@ def reset():
import_survex()
import_QMs()
import_surveys()
import_descriptions()
parse_descriptions()
def resetdesc():
""" Wipe the troggle database and import descriptions
"""
import core.models
for desc in core.models.CaveDescription.objects.all():
desc.delete()
import_descriptions()
parse_descriptions()
def export_cavetab():
from export import tocavetab
outfile=file(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'w')
tocavetab.writeCaveTab(outfile)
outfile.close()
if __name__ == "__main__":
if __name__ == "__main__":
import sys
if "reset" in sys.argv:
if "desc" in sys.argv:
resetdesc()
elif "reset" in sys.argv:
reset()
else:
print "Do 'python databaseReset.py reset'"

View File

@ -3,6 +3,7 @@ import troggle.core.models as models
from django.conf import settings
import csv, time, re, os, logging
from utils import save_carefully
from utils import html_to_wiki
##format of CAVETAB2.CSV is
KatasterNumber = 0
@ -52,85 +53,6 @@ MarkingComment = 43
Findability = 44
FindabilityComment = 45
def html_to_wiki(text):
if type(text) != str:
return text
text = unicode(text, "utf-8")
#Characters
#text = re.sub("ü", u"\xfc", text)
#text = re.sub("ö", u"\xf6", text)
#text = re.sub("ä", u"\xe4", text)
#text = re.sub("°", u"\xb0", text)
#text = re.sub("©", u"\xa9", text)
#text = re.sub("&", u"\x26", text)
#text = re.sub("ß", u"\xdf", text)
#text = re.sub("ß", u"\xdf", text)
#text = re.sub("&lt;", u"<", text)
#text = re.sub("&gt;", u">", text)
#text = re.sub("&egrave;", u"\xe8", text)
#text = re.sub("&eacute;", u"\xe9", text)
#text = re.sub("&quote;", u'"', text)
#text = re.sub("&quot;", u'"', text)
#text = re.sub("&Ouml;", u'\xd6', text)
#text = re.sub("&times;", u'"', text)
#text = re.sub("&(.*);", "/1", text)
#if s:
# print s.groups()
#Lists
text = re.sub("</p>", r"", text)
text = re.sub("<p>$", r"", text)
text = re.sub("<p>", r"\n\n", text)
out = ""
lists = ""
while text:
mstar = re.match("^(.*?)<ul>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
munstar = re.match("^(\s*)</ul>(.*)$", text, re.DOTALL)
mhash = re.match("^(.*?)<ol>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
munhash = re.match("^(\s*)</ol>(.*)$", text, re.DOTALL)
mitem = re.match("^(\s*)<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
ms = [len(m.groups()[0]) for m in [mstar, munstar, mhash, munhash, mitem] if m]
def min_(i, l):
try:
v = i.groups()[0]
l.remove(len(v))
return len(v) < min(l, 1000000000)
except:
return False
if min_(mstar, ms):
lists += "*"
pre, val, post = mstar.groups()
out += pre + "\n" + lists + " " + val
text = post
elif min_(mhash, ms):
lists += "#"
pre, val, post = mhash.groups()
out += pre + "\n" + lists + " " + val
text = post
elif min_(mitem, ms):
pre, val, post = mitem.groups()
out += "\n" + lists + " " + val
text = post
elif min_(munstar, ms):
lists = lists[:-1]
text = munstar.groups()[1]
elif min_(munhash, ms):
lists.pop()
text = munhash.groups()[1]
else:
out += text
text = ""
text2 = out
while text2:
mtag = re.match("^(.*?)<(.*?)>(.*)$", text, re.DOTALL)
if mtag:
text2 = mtag.groups()[2]
print mtag.groups()[1]
else:
text2 = ""
return out
def LoadCaveTab():
cavetab = open(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'rU')
caveReader = csv.reader(cavetab)

103
utils.py
View File

@ -1,5 +1,8 @@
from django.conf import settings
from troggle.core.models import LogbookEntry
try:
from django.db import models
except:#We want to get rid of this try statement if possible
from troggle.core.models import LogbookEntry
import random, re, logging
def weighted_choice(lst):
@ -16,11 +19,11 @@ def randomLogbookSentence():
# needs to handle empty logbooks without crashing
#Choose a random logbook entry
randSent['entry']=LogbookEntry.objects.order_by('?')[0]
randSent['entry']=models.LogbookEntry.objects.order_by('?')[0]
#Choose again if there are no sentances (this happens if it is a placeholder entry)
while len(re.findall('[A-Z].*?\.',randSent['entry'].text))==0:
randSent['entry']=LogbookEntry.objects.order_by('?')[0]
randSent['entry']=models.LogbookEntry.objects.order_by('?')[0]
#Choose a random sentence from that entry. Store the sentence as randSent['sentence'], and the number of that sentence in the entry as randSent['number']
sentenceList=re.findall('[A-Z].*?\.',randSent['entry'].text)
@ -60,10 +63,98 @@ def save_carefully(objectType, lookupAttribs={}, nonLookupAttribs={}):
def render_with_context(req, *args, **kwargs):
"""this is the snippet from http://www.djangosnippets.org/snippets/3/
Django uses Context, not RequestContext when you call render_to_response. We always want to use RequestContext, so that django adds the context from settings.TEMPLATE_CONTEXT_PROCESSORS. This way we automatically get necessary settings variables passed to each template. So we use a custom method, render_response instead of render_to_response. Hopefully future Django releases will make this unnecessary."""
Django uses Context, not RequestContext when you call render_to_response.
We always want to use RequestContext, so that django adds the context from
settings.TEMPLATE_CONTEXT_PROCESSORS. This way we automatically get
necessary settings variables passed to each template. So we use a custom
method, render_response instead of render_to_response. Hopefully future
Django releases will make this unnecessary."""
from django.shortcuts import render_to_response
from django.template import RequestContext
kwargs['context_instance'] = RequestContext(req)
return render_to_response(*args, **kwargs)
return render_to_response(*args, **kwargs)
re_body = re.compile(r"\<body[^>]*\>(.*)\</body\>", re.DOTALL)
re_title = re.compile(r"\<title[^>]*\>(.*)\</title\>", re.DOTALL)
def get_html_body(text):
return get_single_match(re_body, text)
def get_html_title(text):
return get_single_match(re_title, text)
def get_single_match(regex, text):
match = regex.search(text)
if match:
return match.groups()[0]
else:
return None
re_subs = [(re.compile(r"\<b[^>]*\>(.*?)\</b\>", re.DOTALL), r"'''\1'''"),
(re.compile(r"\<i\>(.*?)\</i\>", re.DOTALL), r"''\1''"),
(re.compile(r"\<h1[^>]*\>(.*?)\</h1\>", re.DOTALL), r"=\1="),
(re.compile(r"\<h2[^>]*\>(.*?)\</h2\>", re.DOTALL), r"==\1=="),
(re.compile(r"\<h3[^>]*\>(.*?)\</h3\>", re.DOTALL), r"===\1==="),
(re.compile(r"\<h4[^>]*\>(.*?)\</h4\>", re.DOTALL), r"====\1===="),
(re.compile(r"\<h5[^>]*\>(.*?)\</h5\>", re.DOTALL), r"=====\1====="),
(re.compile(r"\<h6[^>]*\>(.*?)\</h6\>", re.DOTALL), r"======\1======"),
(re.compile(r"\<a\s+id=['\"]([^'\"]*)['\"]\s*\>(.*?)\</a\>", re.DOTALL), r"[subcave:\1|\2]"),
#interpage link needed
(re.compile(r"\<a\s+href=['\"]#([^'\"]*)['\"]\s*\>(.*?)\</a\>", re.DOTALL), r"[cavedescription:\1|\2]"),
(re.compile(r"\[\<a\s+href=['\"][^'\"]*['\"]\s+id=['\"][^'\"]*['\"]\s*\>([^\s]*).*?\</a\>\]", re.DOTALL), r"![qm:\1]"),
]
def html_to_wiki(text, codec = "utf-8"):
if type(text) == str:
text = unicode(text, codec)
text = re.sub("</p>", r"", text)
text = re.sub("<p>$", r"", text)
text = re.sub("<p>", r"\n\n", text)
out = ""
lists = ""
#lists
while text:
mstar = re.match("^(.*?)<ul[^>]*>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
munstar = re.match("^(\s*)</ul>(.*)$", text, re.DOTALL)
mhash = re.match("^(.*?)<ol[^>]*>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
munhash = re.match("^(\s*)</ol>(.*)$", text, re.DOTALL)
mitem = re.match("^(\s*)<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
ms = [len(m.groups()[0]) for m in [mstar, munstar, mhash, munhash, mitem] if m]
def min_(i, l):
try:
v = i.groups()[0]
l.remove(len(v))
return len(v) < min(l, 1000000000)
except:
return False
if min_(mstar, ms):
lists += "*"
pre, val, post = mstar.groups()
out += pre + "\n" + lists + " " + val
text = post
elif min_(mhash, ms):
lists += "#"
pre, val, post = mhash.groups()
out += pre + "\n" + lists + " " + val
text = post
elif min_(mitem, ms):
pre, val, post = mitem.groups()
out += "\n" + lists + " " + val
text = post
elif min_(munstar, ms):
lists = lists[:-1]
text = munstar.groups()[1]
elif min_(munhash, ms):
lists.pop()
text = munhash.groups()[1]
else:
out += text
text = ""
#substitutions
for regex, repl in re_subs:
out = regex.sub(repl, out)
return out