[svn] Work on turning html pages into cavedescription models.py.

Moved parser/cavetabs html_to_wiki function to utils.py
Added a 'desc' option to databaseReset.py to refresh the cavedescriptions.
This commit is contained in:
martin speleo 2009-07-04 16:42:17 +01:00
parent fa9a6416d1
commit c377d0376f
4 changed files with 133 additions and 93 deletions

View File

@ -1,5 +1,10 @@
import urllib, urlparse, string, os, datetime, logging import urllib, urlparse, string, os, datetime, logging
import troggle.mptt as mptt try:
import mptt
except ImportError:
#I think we should be having troggle directory as the base import place
#but I am leaving the following line in to make sure I do not break anything
import troggle.mptt as mptt
from django.forms import ModelForm from django.forms import ModelForm
from django.db import models from django.db import models
from django.contrib import admin from django.contrib import admin
@ -539,18 +544,19 @@ try:
mptt.register(Subcave, order_insertion_by=['title']) mptt.register(Subcave, order_insertion_by=['title'])
except mptt.AlreadyRegistered: except mptt.AlreadyRegistered:
print "mptt already registered" print "mptt already registered"
class CaveDescription(TroggleModel): class CaveDescription(TroggleModel):
name = models.CharField(max_length=50) short_name = models.CharField(max_length=50, unique = True)
long_name = models.CharField(max_length=200, blank=True, null=True)
description = models.TextField(blank=True,null=True) description = models.TextField(blank=True,null=True)
linked_subcaves = models.ManyToManyField("Subcave") linked_subcaves = models.ManyToManyField("Subcave")
linked_entrances = models.ManyToManyField("Entrance") linked_entrances = models.ManyToManyField("Entrance")
linked_qms = models.ManyToManyField("QM") linked_qms = models.ManyToManyField("QM")
def __unicode__(self): def __unicode__(self):
return unicode(self.name) return unicode(self.short_name)
class NewSubCave(TroggleModel): class NewSubCave(TroggleModel):
name = models.CharField(max_length=200) name = models.CharField(max_length=200, unique = True)
def __unicode__(self): def __unicode__(self):
return unicode(self.name) return unicode(self.name)

View File

@ -58,6 +58,14 @@ def import_surveys():
import parsers.surveys import parsers.surveys
parsers.surveys.parseSurveys(logfile=settings.LOGFILE) parsers.surveys.parseSurveys(logfile=settings.LOGFILE)
def import_descriptions():
    """Run parsers.descriptions.getDescriptions()."""
    # Imported lazily, like the other import_* helpers in this script.
    from parsers import descriptions
    descriptions.getDescriptions()
def parse_descriptions():
    """Run parsers.descriptions.parseDescriptions()."""
    # Imported lazily, like the other import_* helpers in this script.
    from parsers import descriptions
    descriptions.parseDescriptions()
def reset(): def reset():
""" Wipe the troggle database and import everything from legacy data """ Wipe the troggle database and import everything from legacy data
""" """
@ -69,16 +77,29 @@ def reset():
import_survex() import_survex()
import_QMs() import_QMs()
import_surveys() import_surveys()
import_descriptions()
parse_descriptions()
def resetdesc():
    """Delete every CaveDescription object, then import and parse the
    descriptions again.

    Only CaveDescription rows are removed here; nothing else in the
    database is deleted by this function itself.
    """
    from core.models import CaveDescription
    # Delete one object at a time so each instance's delete() is called.
    for description in CaveDescription.objects.all():
        description.delete()
    import_descriptions()
    parse_descriptions()
def export_cavetab(): def export_cavetab():
from export import tocavetab from export import tocavetab
outfile=file(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'w') outfile=file(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'w')
tocavetab.writeCaveTab(outfile) tocavetab.writeCaveTab(outfile)
outfile.close() outfile.close()
if __name__ == "__main__": if __name__ == "__main__":
import sys import sys
if "reset" in sys.argv: if "desc" in sys.argv:
resetdesc()
elif "reset" in sys.argv:
reset() reset()
else: else:
print "Do 'python databaseReset.py reset'" print "Do 'python databaseReset.py reset'"

View File

@ -3,6 +3,7 @@ import troggle.core.models as models
from django.conf import settings from django.conf import settings
import csv, time, re, os, logging import csv, time, re, os, logging
from utils import save_carefully from utils import save_carefully
from utils import html_to_wiki
##format of CAVETAB2.CSV is ##format of CAVETAB2.CSV is
KatasterNumber = 0 KatasterNumber = 0
@ -52,85 +53,6 @@ MarkingComment = 43
Findability = 44 Findability = 44
FindabilityComment = 45 FindabilityComment = 45
def html_to_wiki(text):
    """Convert the paragraph and list markup of an HTML fragment to wiki text.

    Only byte strings are converted: they are decoded as UTF-8 and the
    converted text is returned.  Any other value is returned unchanged.

    Conversions performed:
      * "</p>" and a trailing "<p>" are dropped, every other "<p>" becomes
        a blank line;
      * "<ul>"/"<ol>" lists become lines prefixed with "*"/"#", one marker
        per nesting level that is currently open.

    HTML character entities and all other tags are left untouched.
    """
    # bytes is the Python 2.6+ spelling of the byte-string type (str on
    # Python 2), so this keeps the original "only convert str" contract.
    if not isinstance(text, bytes):
        return text
    text = text.decode("utf-8")

    text = re.sub("</p>", r"", text)
    text = re.sub("<p>$", r"", text)
    text = re.sub("<p>", r"\n\n", text)

    # (action, pattern) pairs, in the priority order used to break ties.
    # Group 1 of every pattern is the text that precedes the tag, so its
    # length tells us which tag comes first in the remaining input.
    tokens = (
        ("*", r"^(.*?)<ul>\s*<li[^>]*>(.*?)</li>(.*)$"),
        ("#", r"^(.*?)<ol>\s*<li[^>]*>(.*?)</li>(.*)$"),
        ("item", r"^(\s*)<li[^>]*>(.*?)</li>(.*)$"),
        ("close", r"^(\s*)</ul>(.*)$"),
        ("close", r"^(\s*)</ol>(.*)$"),
    )

    out = []
    lists = ""  # one "*" or "#" per list that is currently open
    while text:
        best = None
        for action, pattern in tokens:
            found = re.match(pattern, text, re.DOTALL)
            if found is None:
                continue
            if best is None or len(found.group(1)) < len(best[1].group(1)):
                best = (action, found)
        if best is None:
            # No more list markup: keep the remainder verbatim.
            out.append(text)
            break
        action, found = best
        if action == "close":
            # Strings have no pop(); drop the innermost marker by slicing.
            lists = lists[:-1]
            text = found.group(2)
        elif action == "item":
            out.append("\n" + lists + " " + found.group(2))
            text = found.group(3)
        else:
            # Opening a list: the action is the marker character itself.
            lists += action
            out.append(found.group(1) + "\n" + lists + " " + found.group(2))
            text = found.group(3)
    return "".join(out)
def LoadCaveTab(): def LoadCaveTab():
cavetab = open(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'rU') cavetab = open(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'rU')
caveReader = csv.reader(cavetab) caveReader = csv.reader(cavetab)

103
utils.py
View File

@ -1,5 +1,8 @@
from django.conf import settings from django.conf import settings
from troggle.core.models import LogbookEntry try:
from django.db import models
except:#We want to get rid of this try statement if possible
from troggle.core.models import LogbookEntry
import random, re, logging import random, re, logging
def weighted_choice(lst): def weighted_choice(lst):
@ -16,11 +19,11 @@ def randomLogbookSentence():
# needs to handle empty logbooks without crashing # needs to handle empty logbooks without crashing
#Choose a random logbook entry #Choose a random logbook entry
randSent['entry']=LogbookEntry.objects.order_by('?')[0] randSent['entry']=models.LogbookEntry.objects.order_by('?')[0]
#Choose again if there are no sentances (this happens if it is a placeholder entry) #Choose again if there are no sentances (this happens if it is a placeholder entry)
while len(re.findall('[A-Z].*?\.',randSent['entry'].text))==0: while len(re.findall('[A-Z].*?\.',randSent['entry'].text))==0:
randSent['entry']=LogbookEntry.objects.order_by('?')[0] randSent['entry']=models.LogbookEntry.objects.order_by('?')[0]
#Choose a random sentence from that entry. Store the sentence as randSent['sentence'], and the number of that sentence in the entry as randSent['number'] #Choose a random sentence from that entry. Store the sentence as randSent['sentence'], and the number of that sentence in the entry as randSent['number']
sentenceList=re.findall('[A-Z].*?\.',randSent['entry'].text) sentenceList=re.findall('[A-Z].*?\.',randSent['entry'].text)
@ -60,10 +63,98 @@ def save_carefully(objectType, lookupAttribs={}, nonLookupAttribs={}):
def render_with_context(req, *args, **kwargs): def render_with_context(req, *args, **kwargs):
"""this is the snippet from http://www.djangosnippets.org/snippets/3/ """this is the snippet from http://www.djangosnippets.org/snippets/3/
Django uses Context, not RequestContext when you call render_to_response. We always want to use RequestContext, so that django adds the context from settings.TEMPLATE_CONTEXT_PROCESSORS. This way we automatically get necessary settings variables passed to each template. So we use a custom method, render_response instead of render_to_response. Hopefully future Django releases will make this unnecessary.""" Django uses Context, not RequestContext when you call render_to_response.
We always want to use RequestContext, so that django adds the context from
settings.TEMPLATE_CONTEXT_PROCESSORS. This way we automatically get
necessary settings variables passed to each template. So we use a custom
method, render_response instead of render_to_response. Hopefully future
Django releases will make this unnecessary."""
from django.shortcuts import render_to_response from django.shortcuts import render_to_response
from django.template import RequestContext from django.template import RequestContext
kwargs['context_instance'] = RequestContext(req) kwargs['context_instance'] = RequestContext(req)
return render_to_response(*args, **kwargs) return render_to_response(*args, **kwargs)
# Both patterns are greedy and use DOTALL, so the capture runs from the
# first opening tag to the last closing tag, across newlines.
re_title = re.compile(r"\<title[^>]*\>(.*)\</title\>", re.DOTALL)
re_body = re.compile(r"\<body[^>]*\>(.*)\</body\>", re.DOTALL)


def get_single_match(regex, text):
    """Search text with the compiled regex and return its first capture
    group, or None when the regex does not match anywhere."""
    found = regex.search(text)
    if found is None:
        return None
    return found.group(1)


def get_html_title(text):
    """Return what lies between <title ...> and </title> in text, or None."""
    return get_single_match(re_title, text)


def get_html_body(text):
    """Return what lies between <body ...> and </body> in text, or None."""
    return get_single_match(re_body, text)
# (compiled pattern, replacement) pairs applied in order by html_to_wiki
# once the list markup has been converted.
re_subs = [
    # "<b" must be followed by whitespace or ">" so that <br>, <body>,
    # <blockquote> ... are not mistaken for an opening bold tag.
    (re.compile(r"\<b(?:\s[^>]*)?\>(.*?)\</b\>", re.DOTALL), r"'''\1'''"),
    (re.compile(r"\<i\>(.*?)\</i\>", re.DOTALL), r"''\1''"),
    (re.compile(r"\<h1[^>]*\>(.*?)\</h1\>", re.DOTALL), r"=\1="),
    (re.compile(r"\<h2[^>]*\>(.*?)\</h2\>", re.DOTALL), r"==\1=="),
    (re.compile(r"\<h3[^>]*\>(.*?)\</h3\>", re.DOTALL), r"===\1==="),
    (re.compile(r"\<h4[^>]*\>(.*?)\</h4\>", re.DOTALL), r"====\1===="),
    (re.compile(r"\<h5[^>]*\>(.*?)\</h5\>", re.DOTALL), r"=====\1====="),
    (re.compile(r"\<h6[^>]*\>(.*?)\</h6\>", re.DOTALL), r"======\1======"),
    (re.compile(r"\<a\s+id=['\"]([^'\"]*)['\"]\s*\>(.*?)\</a\>", re.DOTALL), r"[subcave:\1|\2]"),
    #interpage link needed
    (re.compile(r"\<a\s+href=['\"]#([^'\"]*)['\"]\s*\>(.*?)\</a\>", re.DOTALL), r"[cavedescription:\1|\2]"),
    (re.compile(r"\[\<a\s+href=['\"][^'\"]*['\"]\s+id=['\"][^'\"]*['\"]\s*\>([^\s]*).*?\</a\>\]", re.DOTALL), r"![qm:\1]"),
]

# List markup recognised by html_to_wiki.  Group 1 of every pattern is the
# text in front of the tag; its length says which tag comes first.
_re_ul_open = re.compile(r"^(.*?)<ul[^>]*>\s*<li[^>]*>(.*?)</li>(.*)$", re.DOTALL)
_re_ol_open = re.compile(r"^(.*?)<ol[^>]*>\s*<li[^>]*>(.*?)</li>(.*)$", re.DOTALL)
_re_li = re.compile(r"^(\s*)<li[^>]*>(.*?)</li>(.*)$", re.DOTALL)
_re_ul_close = re.compile(r"^(\s*)</ul>(.*)$", re.DOTALL)
_re_ol_close = re.compile(r"^(\s*)</ol>(.*)$", re.DOTALL)


def html_to_wiki(text, codec = "utf-8"):
    """Convert an HTML fragment to wiki markup.

    text  -- the HTML; a byte string is first decoded using codec, text
             that is already decoded is used as it is.
    codec -- encoding used to decode a byte-string text (default utf-8).

    Returns the converted text:
      * "</p>" and a trailing "<p>" are dropped, every other "<p>" becomes
        a blank line;
      * "<ul>"/"<ol>" lists become lines prefixed with "*"/"#", one marker
        per list that is currently open;
      * finally every (pattern, replacement) pair of re_subs is applied
        in order (bold, italic, headings, anchors and QM links).
    """
    # bytes is the Python 2.6+ spelling of the byte-string type (str on
    # Python 2), so this is the original "decode str input" behaviour.
    if isinstance(text, bytes):
        text = text.decode(codec)
    text = re.sub("</p>", r"", text)
    text = re.sub("<p>$", r"", text)
    text = re.sub("<p>", r"\n\n", text)

    pieces = []
    lists = ""  # one "*" or "#" per list that is currently open
    #lists
    while text:
        # Listed in the priority order used to break ties.
        candidates = [(kind, found) for kind, found in (
            ("ul_open", _re_ul_open.match(text)),
            ("ol_open", _re_ol_open.match(text)),
            ("item", _re_li.match(text)),
            ("ul_close", _re_ul_close.match(text)),
            ("ol_close", _re_ol_close.match(text)),
        ) if found]
        if not candidates:
            # No more list markup: keep the remainder verbatim.
            pieces.append(text)
            break
        # Handle whichever tag occurs first in the remaining text.
        kind, found = min(candidates, key=lambda c: len(c[1].group(1)))
        if kind == "ul_open" or kind == "ol_open":
            if kind == "ul_open":
                lists += "*"
            else:
                lists += "#"
            pre, val, post = found.groups()
            pieces.append(pre + "\n" + lists + " " + val)
            text = post
        elif kind == "item":
            pre, val, post = found.groups()
            pieces.append("\n" + lists + " " + val)
            text = post
        else:
            # Closing tag: strings have no pop(), so slice off the
            # innermost marker instead.
            lists = lists[:-1]
            text = found.group(2)
    out = "".join(pieces)

    #substitutions
    for regex, repl in re_subs:
        out = regex.sub(repl, out)
    return out