[svn] Work on turning html pages into cavedescription models.py.

Moved the html_to_wiki function from parsers/cavetab.py to utils.py. Added a "desc" option to databaseReset.py to refresh the cavedescriptions.
2009-07-04 16:42:17 +01:00
parent fa9a6416d1
commit c377d0376f
4 changed files with 133 additions and 93 deletions
--- a/core/models.py
+++ b/core/models.py
@@ -1,5 +1,10 @@
 import urllib, urlparse, string, os, datetime, logging
-import troggle.mptt as mptt
+try:
+    import mptt
+except ImportError:
+    #I think the troggle directory itself should be the base place for imports,
+    #but I am leaving the following line in to make sure I do not break anything
+    import troggle.mptt as mptt
 from django.forms import ModelForm
 from django.db import models
 from django.contrib import admin
@@ -539,18 +544,19 @@ try:
    mptt.register(Subcave, order_insertion_by=['title'])
 except mptt.AlreadyRegistered:
    print "mptt already registered"
-    
+
 class CaveDescription(TroggleModel):
-    name = models.CharField(max_length=50)
+    short_name = models.CharField(max_length=50, unique = True)
+    long_name = models.CharField(max_length=200, blank=True, null=True)
    description = models.TextField(blank=True,null=True)
    linked_subcaves = models.ManyToManyField("Subcave")
    linked_entrances = models.ManyToManyField("Entrance")
    linked_qms = models.ManyToManyField("QM")
    def __unicode__(self):
-        return unicode(self.name)
+        return unicode(self.short_name)

 class NewSubCave(TroggleModel):
-    name = models.CharField(max_length=200)
+    name = models.CharField(max_length=200, unique = True)
    def __unicode__(self):
        return unicode(self.name)

--- a/databaseReset.py
+++ b/databaseReset.py
@@ -58,6 +58,14 @@ def import_surveys():
    import parsers.surveys
    parsers.surveys.parseSurveys(logfile=settings.LOGFILE)

+def import_descriptions(): # presumably loads the raw cave description pages; parser not shown here - verify
+    import parsers.descriptions # imported inside the function, so the parser module is only loaded when this step runs
+    parsers.descriptions.getDescriptions()
+
+def parse_descriptions(): # presumably post-processes what import_descriptions() loaded; parser not shown here - verify
+    import parsers.descriptions # imported inside the function, so the parser module is only loaded when this step runs
+    parsers.descriptions.parseDescriptions()
+
 def reset():
    """ Wipe the troggle database and import everything from legacy data
    """
@@ -69,16 +77,29 @@ def reset():
    import_survex()
    import_QMs()
    import_surveys()
+    import_descriptions()
+    parse_descriptions()

+def resetdesc():
+    """ Delete every CaveDescription object, then rerun the description
+    import and parse steps; unlike reset(), no other import is run. """
+    import core.models
+    for desc in core.models.CaveDescription.objects.all():
+        desc.delete()
+    import_descriptions()
+    parse_descriptions()
+    
 def export_cavetab():
    from export import tocavetab
    outfile=file(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'w')
    tocavetab.writeCaveTab(outfile)
    outfile.close()
-    
-if __name__ == "__main__": 
+
+if __name__ == "__main__":
    import sys
-    if "reset" in sys.argv:
+    if "desc" in sys.argv:
+        resetdesc()
+    elif "reset" in sys.argv:
        reset()
    else:
        print "Do 'python databaseReset.py reset'"
--- a/parsers/cavetab.py
+++ b/parsers/cavetab.py
@@ -3,6 +3,7 @@ import troggle.core.models as models
 from django.conf import settings
 import csv, time, re, os, logging
 from utils import save_carefully
+from utils import html_to_wiki

 ##format of CAVETAB2.CSV is
 KatasterNumber = 0
@@ -52,85 +53,6 @@ MarkingComment = 43
 Findability = 44
 FindabilityComment = 45

-
-def html_to_wiki(text):
-    if type(text) != str:
-        return text
-    text = unicode(text, "utf-8")
-    #Characters
-    #text = re.sub("&uuml;", u"\xfc", text)
-    #text = re.sub("&ouml;", u"\xf6", text)
-    #text = re.sub("&auml;", u"\xe4", text)
-    #text = re.sub("&deg;", u"\xb0", text)
-    #text = re.sub("&copy;", u"\xa9", text)
-    #text = re.sub("&amp;", u"\x26", text)
-    #text = re.sub("&szlig;", u"\xdf", text)
-    #text = re.sub("&szlig;", u"\xdf", text)
-    #text = re.sub("&lt;", u"<", text)
-    #text = re.sub("&gt;", u">", text)
-    #text = re.sub("&egrave;", u"\xe8", text)
-    #text = re.sub("&eacute;", u"\xe9", text)
-    #text = re.sub("&quote;", u'"', text)
-    #text = re.sub("&quot;", u'"', text)
-    #text = re.sub("&Ouml;", u'\xd6', text)
-    #text = re.sub("&times;", u'"', text)
-
-    #text = re.sub("&(.*);", "/1", text)
-    #if s:
-    #    print s.groups()
-    #Lists
-    text = re.sub("</p>", r"", text)
-    text = re.sub("<p>$", r"", text)
-    text = re.sub("<p>", r"\n\n", text)
-    out = ""
-    lists = ""
-    while text:
-        mstar = re.match("^(.*?)<ul>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
-        munstar = re.match("^(\s*)</ul>(.*)$", text, re.DOTALL)
-        mhash = re.match("^(.*?)<ol>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
-        munhash = re.match("^(\s*)</ol>(.*)$", text, re.DOTALL)
-        mitem = re.match("^(\s*)<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
-        ms = [len(m.groups()[0]) for m in [mstar, munstar, mhash, munhash, mitem] if m]
-        def min_(i, l):
-            try:
-                v = i.groups()[0]
-                l.remove(len(v))
-                return len(v) < min(l, 1000000000)
-            except:
-                return False
-        if min_(mstar, ms):
-            lists += "*"
-            pre, val, post = mstar.groups()
-            out += pre + "\n" + lists + " " + val
-            text = post
-        elif min_(mhash, ms):
-            lists += "#"
-            pre, val, post = mhash.groups()
-            out += pre + "\n" + lists + " " + val
-            text = post
-        elif min_(mitem, ms):
-            pre, val, post = mitem.groups()
-            out += "\n" + lists + " " + val
-            text = post
-        elif min_(munstar, ms):
-            lists = lists[:-1]
-            text = munstar.groups()[1]
-        elif min_(munhash, ms):
-            lists.pop()
-            text = munhash.groups()[1]
-        else:
-            out += text
-            text = ""
-    text2 = out
-    while text2:
-        mtag = re.match("^(.*?)<(.*?)>(.*)$", text, re.DOTALL)
-        if mtag:
-            text2 = mtag.groups()[2]
-            print mtag.groups()[1]
-        else:
-            text2 = ""
-    return out
-
 def LoadCaveTab():
    cavetab = open(os.path.join(settings.EXPOWEB, "noinfo", "CAVETAB2.CSV"),'rU')
    caveReader = csv.reader(cavetab)
--- a/utils.py
+++ b/utils.py
@@ -1,5 +1,8 @@
 from django.conf import settings
-from troggle.core.models import LogbookEntry
+try:
+    from core import models # must be troggle's own models module: the code below uses models.LogbookEntry, which django.db.models does not have
+except ImportError:#We want to get rid of this try statement if possible
+    from troggle.core import models # same module via the old package path, so "models" is bound either way
 import random, re, logging

 def weighted_choice(lst):
@@ -16,11 +19,11 @@ def randomLogbookSentence():
    # needs to handle empty logbooks without crashing

    #Choose a random logbook entry
-    randSent['entry']=LogbookEntry.objects.order_by('?')[0]
+    randSent['entry']=models.LogbookEntry.objects.order_by('?')[0]

    #Choose again if there are no sentances (this happens if it is a placeholder entry)
    while len(re.findall('[A-Z].*?\.',randSent['entry'].text))==0:
-        randSent['entry']=LogbookEntry.objects.order_by('?')[0]
+        randSent['entry']=models.LogbookEntry.objects.order_by('?')[0]
    
    #Choose a random sentence from that entry. Store the sentence as randSent['sentence'], and the number of that sentence in the entry as randSent['number']
    sentenceList=re.findall('[A-Z].*?\.',randSent['entry'].text)
@@ -60,10 +63,98 @@ def save_carefully(objectType, lookupAttribs={}, nonLookupAttribs={}):

 def render_with_context(req, *args, **kwargs):
    """this is the snippet from http://www.djangosnippets.org/snippets/3/
-    
-    Django uses Context, not RequestContext when you call render_to_response. We always want to use RequestContext, so that django adds the context from settings.TEMPLATE_CONTEXT_PROCESSORS. This way we automatically get necessary settings variables passed to each template. So we use a custom method, render_response instead of render_to_response. Hopefully future Django releases will make this unnecessary."""
+
+    Django uses Context, not RequestContext when you call render_to_response.
+    We always want to use RequestContext, so that django adds the context from
+    settings.TEMPLATE_CONTEXT_PROCESSORS. This way we automatically get
+    necessary settings variables passed to each template. So we use a custom
+    method, render_response instead of render_to_response. Hopefully future
+    Django releases will make this unnecessary."""

    from django.shortcuts import render_to_response
    from django.template import RequestContext
    kwargs['context_instance'] = RequestContext(req)
-    return render_to_response(*args, **kwargs)
+    return render_to_response(*args, **kwargs)
+    
+re_body = re.compile(r"\<body[^>]*\>(.*)\</body\>", re.DOTALL) # group 1: everything between <body ...> and the last </body> (greedy)
+re_title = re.compile(r"\<title[^>]*\>(.*)\</title\>", re.DOTALL) # same for <title>; DOTALL lets the captured text span lines
+def get_html_body(text):
+    return get_single_match(re_body, text) # the body contents, or None when re_body does not match
+
+def get_html_title(text):
+    return get_single_match(re_title, text) # the title contents, or None when re_title does not match
+
+def get_single_match(regex, text):
+    """Return capture group 1 of the first match of regex in text, or None."""
+    found = regex.search(text)
+    if found is None:
+        # regex does not match anywhere in text
+        return None
+    return found.group(1)
+
+
+re_subs = [(re.compile(r"\<b(?:\s[^>]*)?\>(.*?)\</b\>", re.DOTALL), r"'''\1'''"), # <b> only: a bare [^>]* also matched <br>, <body>, <blockquote>
+           (re.compile(r"\<i\>(.*?)\</i\>", re.DOTALL), r"''\1''"),
+           (re.compile(r"\<h1[^>]*\>(.*?)\</h1\>", re.DOTALL), r"=\1="),
+           (re.compile(r"\<h2[^>]*\>(.*?)\</h2\>", re.DOTALL), r"==\1=="),
+           (re.compile(r"\<h3[^>]*\>(.*?)\</h3\>", re.DOTALL), r"===\1==="),
+           (re.compile(r"\<h4[^>]*\>(.*?)\</h4\>", re.DOTALL), r"====\1===="),
+           (re.compile(r"\<h5[^>]*\>(.*?)\</h5\>", re.DOTALL), r"=====\1====="),
+           (re.compile(r"\<h6[^>]*\>(.*?)\</h6\>", re.DOTALL), r"======\1======"),
+           (re.compile(r"\<a\s+id=['\"]([^'\"]*)['\"]\s*\>(.*?)\</a\>", re.DOTALL), r"[subcave:\1|\2]"),
+           #interpage link needed
+           (re.compile(r"\<a\s+href=['\"]#([^'\"]*)['\"]\s*\>(.*?)\</a\>", re.DOTALL), r"[cavedescription:\1|\2]"),
+           (re.compile(r"\[\<a\s+href=['\"][^'\"]*['\"]\s+id=['\"][^'\"]*['\"]\s*\>([^\s]*).*?\</a\>\]", re.DOTALL), r"![qm:\1]"),
+           # (compiled regex, replacement) pairs; html_to_wiki applies them in this order
+           ]
+
+def html_to_wiki(text, codec = "utf-8"):
+    """Convert an HTML fragment to wiki markup and return the result.
+    A byte string is first decoded with codec.  <p> becomes a blank line,
+    <ul>/<ol> items become lines starting with one "*"/"#" per nesting level,
+    and finally every (regex, replacement) pair in re_subs is applied."""
+    if type(text) == str:
+        text = unicode(text, codec)
+    text = re.sub("</p>", r"", text)
+    text = re.sub("<p>$", r"", text)
+    text = re.sub("<p>", r"\n\n", text)
+    out = ""
+    lists = "" # one "*" or "#" for each list that is currently open
+    #lists: consume text from the front, one list tag at a time
+    while text:
+        mstar = re.match("^(.*?)<ul[^>]*>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
+        munstar = re.match("^(\s*)</ul>(.*)$", text, re.DOTALL)
+        mhash = re.match("^(.*?)<ol[^>]*>\s*<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
+        munhash = re.match("^(\s*)</ol>(.*)$", text, re.DOTALL)
+        mitem = re.match("^(\s*)<li[^>]*>(.*?)</li>(.*)$", text, re.DOTALL)
+        ms = [len(m.groups()[0]) for m in [mstar, munstar, mhash, munhash, mitem] if m]
+        def min_(i, l):
+            # True only for the match with the shortest leading text (the earliest tag)
+            return i is not None and len(i.groups()[0]) == min(l)
+        if min_(mstar, ms):
+            lists += "*"
+            pre, val, post = mstar.groups()
+            out += pre + "\n" + lists + " " + val
+            text = post
+        elif min_(mhash, ms):
+            lists += "#"
+            pre, val, post = mhash.groups()
+            out += pre + "\n" + lists + " " + val
+            text = post
+        elif min_(mitem, ms):
+            pre, val, post = mitem.groups()
+            out += "\n" + lists + " " + val
+            text = post
+        elif min_(munstar, ms):
+            lists = lists[:-1]
+            text = munstar.groups()[1]
+        elif min_(munhash, ms):
+            lists = lists[:-1] # lists is a str, so drop the last marker by slicing (str has no pop())
+            text = munhash.groups()[1]
+        else:
+            out += text
+            text = ""
+    #substitutions: inline tags, headings and links, in re_subs order
+    for regex, repl in re_subs:
+        out = regex.sub(repl, out)
+    return out