[svn] Work on turning html pages into CaveDescription models (core/models.py).

Moved the html_to_wiki function from parsers/cavetab.py to utils.py. Added a "desc" option to databaseReset.py to refresh the cavedescriptions.
2009-07-04 16:42:17 +01:00 · 2009-07-04 16:42:17 +01:00 · c377d0376f
commit c377d0376f
parent fa9a6416d1
4 changed files with 133 additions and 93 deletions
--- a/core/models.py
+++ b/core/models.py
@ -1,5 +1,10 @@
 import urllib, urlparse, string, os, datetime, logging
-import troggle.mptt as mptt
+try:
    import mptt
 except ImportError:
    #I think we should be having troggle directory as the base import place
    #but I  am leaving the following line in to make sure I do not break anything
    import troggle.mptt as mptt
 from django.forms import ModelForm
 from django.db import models
 from django.contrib import admin
@ -541,16 +546,17 @@ except mptt.AlreadyRegistered:
    print "mptt already registered"
 class CaveDescription(TroggleModel):
-    name = models.CharField(max_length=50)
+    short_name = models.CharField(max_length=50, unique = True)
    long_name = models.CharField(max_length=200, blank=True, null=True)
    description = models.TextField(blank=True,null=True)
    linked_subcaves = models.ManyToManyField("Subcave")
    linked_entrances = models.ManyToManyField("Entrance")
    linked_qms = models.ManyToManyField("QM")
    def __unicode__(self):
-        return unicode(self.name)
+        return unicode(self.short_name)
 class NewSubCave(TroggleModel):
-    name = models.CharField(max_length=200)
+    name = models.CharField(max_length=200, unique = True)
    def __unicode__(self):
        return unicode(self.name)
--- a/databaseReset.py
+++ b/databaseReset.py
@ -58,6 +58,14 @@ def import_surveys():
    import parsers.surveys
    parsers.surveys.parseSurveys(logfile=settings.LOGFILE)
 def import_descriptions():
    """Call parsers.descriptions.getDescriptions()."""
    # The parser module is imported at call time rather than at module load.
    import parsers.descriptions as description_parser
    description_parser.getDescriptions()
 def parse_descriptions():
    """Call parsers.descriptions.parseDescriptions()."""
    # The parser module is imported at call time rather than at module load.
    import parsers.descriptions as description_parser
    description_parser.parseDescriptions()
 def reset():
    """ Wipe the troggle database and import everything from legacy data
    """
@ -69,6 +77,17 @@ def reset():
    import_survex()
    import_QMs()
    import_surveys()
    import_descriptions()
    parse_descriptions()
 def resetdesc():
    """Delete every CaveDescription object, then import and parse the descriptions again.

    This function itself deletes nothing except CaveDescription objects.
    """
    from core.models import CaveDescription
    # Objects are deleted one at a time through their own delete() method.
    for stale_description in CaveDescription.objects.all():
        stale_description.delete()
    import_descriptions()
    parse_descriptions()
 def export_cavetab():
    from export import tocavetab
@ -78,7 +97,9 @@ def export_cavetab():
 if __name__ == "__main__":
    import sys
-    if "reset" in sys.argv:
+    if "desc" in sys.argv:
        resetdesc()
    elif "reset" in sys.argv:
        reset()
    else:
        print "Do 'python databaseReset.py reset'"
--- a/parsers/cavetab.py
+++ b/parsers/cavetab.py
@ -3,6 +3,7 @@ import troggle.core.models as models
 from django.conf import settings
 import csv, time, re, os, logging
 from utils import save_carefully
 from utils import html_to_wiki
 ##format of CAVETAB2.CSV is
 KatasterNumber = 0
@ -52,85 +53,6 @@ MarkingComment = 43
 Findability = 44
 FindabilityComment = 45
 def html_to_wiki(text):
    """Convert HTML paragraph and list markup in a byte string to wiki text.

    Values that are not byte strings (for example None, or text that is
    already decoded) are returned unchanged.  Byte strings are decoded as
    UTF-8; then "</p>" is dropped, "<p>" becomes a blank line, and
    <ul>/<ol> lists become lines prefixed with one "*" / "#" per open list.
    HTML entities and every other tag are left as they are.

    Returns the converted text as a unicode string, or the argument itself
    when it was not a byte string.
    """
    # `bytes` is an alias of `str` on Python 2.6+, so this is the same
    # "plain str only" guard as before, and it also runs on Python 3.
    if not isinstance(text, bytes):
        return text
    text = text.decode("utf-8")

    # Paragraphs
    text = re.sub("</p>", r"", text)
    text = re.sub("<p>$", r"", text)
    text = re.sub("<p>", r"\n\n", text)

    # Lists
    out = ""
    lists = ""  # one "*" or "#" per list that is currently open
    while text:
        candidates = [
            ("ul_open", re.match(r"^(.*?)<ul>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)),
            ("ol_open", re.match(r"^(.*?)<ol>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)),
            ("item", re.match(r"^(\s*)<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)),
            ("ul_close", re.match(r"^(\s*)</ul>(.*)$", text, re.DOTALL)),
            ("ol_close", re.match(r"^(\s*)</ol>(.*)$", text, re.DOTALL)),
        ]
        # Group 1 is the text in front of the tag; its length tells us which
        # construct comes first in the remaining input.
        found = [(len(m.group(1)), kind, m) for kind, m in candidates if m]
        if not found:
            # No list markup left: keep the remainder verbatim.
            out += text
            break
        nearest = min(found, key=lambda entry: entry[0])
        kind, match = nearest[1], nearest[2]
        if kind == "ul_open" or kind == "ol_open":
            if kind == "ul_open":
                lists += "*"
            else:
                lists += "#"
            pre, val, post = match.groups()
            out += pre + "\n" + lists + " " + val
            text = post
        elif kind == "item":
            # The leading whitespace in front of a bare <li> is discarded.
            val, post = match.group(2), match.group(3)
            out += "\n" + lists + " " + val
            text = post
        else:
            # </ul> or </ol>: close the innermost list.
            lists = lists[:-1]
            text = match.group(2)
    return out
 def LoadCaveTab():
    cavetab = open(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'rU')
    caveReader = csv.reader(cavetab)
--- a/utils.py
+++ b/utils.py
@ -1,5 +1,8 @@
 from django.conf import settings
-from troggle.core.models import LogbookEntry
+try:
    from django.db import models
 except:#We want to get rid of this try statement if possible
    from troggle.core.models import LogbookEntry
 import random, re, logging
 def weighted_choice(lst):
@ -16,11 +19,11 @@ def randomLogbookSentence():
    # needs to handle empty logbooks without crashing
    #Choose a random logbook entry
-    randSent['entry']=LogbookEntry.objects.order_by('?')[0]
+    randSent['entry']=models.LogbookEntry.objects.order_by('?')[0]
    #Choose again if there are no sentances (this happens if it is a placeholder entry)
    while len(re.findall('[A-Z].*?\.',randSent['entry'].text))==0:
-        randSent['entry']=LogbookEntry.objects.order_by('?')[0]
+        randSent['entry']=models.LogbookEntry.objects.order_by('?')[0]
    #Choose a random sentence from that entry. Store the sentence as randSent['sentence'], and the number of that sentence in the entry as randSent['number']
    sentenceList=re.findall('[A-Z].*?\.',randSent['entry'].text)
@ -61,9 +64,97 @@ def save_carefully(objectType, lookupAttribs={}, nonLookupAttribs={}):
 def render_with_context(req, *args, **kwargs):
    """this is the snippet from http://www.djangosnippets.org/snippets/3/
-    Django uses Context, not RequestContext when you call render_to_response. We always want to use RequestContext, so that django adds the context from settings.TEMPLATE_CONTEXT_PROCESSORS. This way we automatically get necessary settings variables passed to each template. So we use a custom method, render_response instead of render_to_response. Hopefully future Django releases will make this unnecessary."""
+    Django uses Context, not RequestContext when you call render_to_response.
    We always want to use RequestContext, so that django adds the context from
    settings.TEMPLATE_CONTEXT_PROCESSORS. This way we automatically get
    necessary settings variables passed to each template. So we use a custom
    method, render_response instead of render_to_response. Hopefully future
    Django releases will make this unnecessary."""
    from django.shortcuts import render_to_response
    from django.template import RequestContext
    kwargs['context_instance'] = RequestContext(req)
    return render_to_response(*args, **kwargs)
 # Captures everything between the first <body ...> tag and the last </body>
 # tag (the group is greedy, and DOTALL lets "." span newlines).
 re_body = re.compile(r"\<body[^>]*\>(.*)\</body\>", re.DOTALL)
 # Same shape for <title ...> ... </title>.
 re_title = re.compile(r"\<title[^>]*\>(.*)\</title\>", re.DOTALL)
 def get_html_body(text):
    """Return the text captured by re_body in text, or None when re_body does not match."""
    body = get_single_match(re_body, text)
    return body
 def get_html_title(text):
    """Return the text captured by re_title in text, or None when re_title does not match."""
    title = get_single_match(re_title, text)
    return title
 def get_single_match(regex, text):
    """Search text with the compiled pattern regex and return its first group.

    Returns None when the pattern is not found anywhere in text.
    """
    found = regex.search(text)
    if not found:
        return None
    return found.group(1)
 # Ordered (compiled pattern, replacement) pairs; html_to_wiki applies them one
 # after another to turn inline HTML into wiki markup:
 #   <b ...>x</b>                 -> '''x'''
 #   <i>x</i>                     -> ''x''
 #   <hN ...>x</hN>               -> N "=" signs either side of x
 #   <a id="i">x</a>              -> [subcave:i|x]
 #   <a href="#t">x</a>           -> [cavedescription:t|x]
 #   [<a href=".." id="..">W ..</a>] -> ![qm:W]   (W = first run of non-space text)
 # The bold pattern needs whitespace (or ">") straight after "<b" so that it
 # does not start a match on other tags beginning with "b", such as <br>.
 re_subs = [(re.compile(r"\<b(?:\s[^>]*)?\>(.*?)\</b\>", re.DOTALL), r"'''\1'''"),
           (re.compile(r"\<i\>(.*?)\</i\>", re.DOTALL), r"''\1''"),
           (re.compile(r"\<h1[^>]*\>(.*?)\</h1\>", re.DOTALL), r"=\1="),
           (re.compile(r"\<h2[^>]*\>(.*?)\</h2\>", re.DOTALL), r"==\1=="),
           (re.compile(r"\<h3[^>]*\>(.*?)\</h3\>", re.DOTALL), r"===\1==="),
           (re.compile(r"\<h4[^>]*\>(.*?)\</h4\>", re.DOTALL), r"====\1===="),
           (re.compile(r"\<h5[^>]*\>(.*?)\</h5\>", re.DOTALL), r"=====\1====="),
           (re.compile(r"\<h6[^>]*\>(.*?)\</h6\>", re.DOTALL), r"======\1======"),
           (re.compile(r"\<a\s+id=['\"]([^'\"]*)['\"]\s*\>(.*?)\</a\>", re.DOTALL), r"[subcave:\1|\2]"),
           #interpage link needed
           (re.compile(r"\<a\s+href=['\"]#([^'\"]*)['\"]\s*\>(.*?)\</a\>", re.DOTALL), r"[cavedescription:\1|\2]"),
           (re.compile(r"\[\<a\s+href=['\"][^'\"]*['\"]\s+id=['\"][^'\"]*['\"]\s*\>([^\s]*).*?\</a\>\]", re.DOTALL), r"![qm:\1]"),
           ]
 def html_to_wiki(text, codec = "utf-8"):
    """Convert a fragment of HTML to wiki markup.

    text  -- the HTML; a byte string is first decoded with codec, anything
             else is processed as it is.
    codec -- encoding used to decode a byte-string text (default "utf-8").

    "</p>" is dropped, "<p>" becomes a blank line, <ul>/<ol> lists become
    lines prefixed with one "*" / "#" per open list, and finally every
    (pattern, replacement) pair in re_subs is applied in order.

    Returns the converted text.
    """
    # `bytes` is an alias of `str` on Python 2.6+, so this decodes exactly
    # the values the old `type(text) == str` test decoded, and also runs on
    # Python 3.
    if isinstance(text, bytes):
        text = text.decode(codec)

    # paragraphs
    text = re.sub("</p>", r"", text)
    text = re.sub("<p>$", r"", text)
    text = re.sub("<p>", r"\n\n", text)

    #lists
    out = ""
    lists = ""  # one "*" or "#" per list that is currently open
    while text:
        candidates = [
            ("ul_open", re.match(r"^(.*?)<ul[^>]*>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)),
            ("ol_open", re.match(r"^(.*?)<ol[^>]*>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)),
            ("item", re.match(r"^(\s*)<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)),
            ("ul_close", re.match(r"^(\s*)</ul>(.*)$", text, re.DOTALL)),
            ("ol_close", re.match(r"^(\s*)</ol>(.*)$", text, re.DOTALL)),
        ]
        # Group 1 is the text in front of the tag; its length tells us which
        # construct comes first in the remaining input.
        found = [(len(m.group(1)), kind, m) for kind, m in candidates if m]
        if not found:
            # No list markup left: keep the remainder verbatim.
            out += text
            break
        nearest = min(found, key=lambda entry: entry[0])
        kind, match = nearest[1], nearest[2]
        if kind == "ul_open" or kind == "ol_open":
            if kind == "ul_open":
                lists += "*"
            else:
                lists += "#"
            pre, val, post = match.groups()
            out += pre + "\n" + lists + " " + val
            text = post
        elif kind == "item":
            # The leading whitespace in front of a bare <li> is discarded.
            val, post = match.group(2), match.group(3)
            out += "\n" + lists + " " + val
            text = post
        else:
            # </ul> or </ol>: close the innermost list.
            lists = lists[:-1]
            text = match.group(2)

    #substitutions
    for regex, repl in re_subs:
        out = regex.sub(repl, out)
    return out