troggle-unchained/utils.py

from django.conf import settings
import random, re, logging
from core.models import CaveDescription

def weighted_choice(lst):
    """Pick one item from *lst* at random, honouring per-item weights.

    lst is an iterable of ``(item, weight)`` pairs. A single draw from
    ``random.uniform(0, 1)`` is walked down the cumulative weights, so the
    weights are presumably meant to sum to 1. If they sum to less, the last
    item absorbs the remaining probability, because the loop falls through
    and returns it.

    Raises ValueError if *lst* yields no pairs (previously this surfaced as
    an UnboundLocalError on the final ``return``).
    """
    n = random.uniform(0, 1)
    seen_any = False
    item = None
    for item, weight in lst:
        seen_any = True
        if n < weight:
            return item
        n -= weight
    if not seen_any:
        raise ValueError("weighted_choice() requires at least one (item, weight) pair")
    # The draw exceeded the total weight: fall back to the last item seen.
    return item

def randomLogbookSentence():
    """Return a random sentence taken from a random logbook entry.

    The result is a dict with three keys:
        'entry'    -- the LogbookEntry the sentence came from
        'number'   -- index of the sentence within that entry's sentences
        'sentence' -- the sentence text itself

    A "sentence" is a capital letter followed by the shortest run of
    characters that ends in a full stop.

    Known limitations (unchanged): indexing ``[0]`` on an empty result fails
    when there are no logbook entries at all, and the retry loop never
    terminates if no entry contains a sentence.
    """
    from troggle.core.models import LogbookEntry

    sentence_pattern = re.compile(r'[A-Z].*?\.')

    # TODO: needs to handle empty logbooks without crashing
    while True:
        # Draw a random entry; placeholder entries have no sentences, so
        # keep drawing until one does.
        entry = LogbookEntry.objects.order_by('?')[0]
        sentenceList = sentence_pattern.findall(entry.text)
        if len(sentenceList) != 0:
            break

    number = random.randrange(0, len(sentenceList))
    return {
        'entry': entry,
        'number': number,
        'sentence': sentenceList[number],
    }


def save_carefully(objectType, lookupAttribs=None, nonLookupAttribs=None):
    """Create or refresh an instance without clobbering edits made in Troggle.

    Looks up an instance using lookupAttribs and carries out the following:
        - if the instance does not exist in the DB: add it, return (new instance, True)
        - if it exists and was modified using Troggle: do nothing,
          return (existing instance, False)
        - if it exists and was not modified using Troggle: overwrite the
          nonLookupAttribs on it and save, return (instance, False)

    The checking is accomplished using Django's get_or_create and the
    new_since_parsing boolean field defined in core.models.TroggleModel.

    objectType       -- class exposing ``objects.get_or_create``
    lookupAttribs    -- dict of field values identifying the instance
                        (None is treated as an empty dict)
    nonLookupAttribs -- dict of the remaining field values; used as the
                        ``defaults`` on creation and as the overwrite values
                        otherwise (None is treated as an empty dict)
    """
    # None defaults instead of {} so no dict object is shared between calls.
    if lookupAttribs is None:
        lookupAttribs = {}
    if nonLookupAttribs is None:
        nonLookupAttribs = {}

    instance, created = objectType.objects.get_or_create(defaults=nonLookupAttribs, **lookupAttribs)

    # Exactly one branch runs. The decision is taken before the overwrite so
    # the log message always describes what was actually done. Logging uses
    # lazy %-arguments so str(instance) is only built if the record is emitted.
    if created:
        logging.info("%s was just added to the database for the first time. \n", instance)
    elif instance.new_since_parsing:
        logging.info("%s has been modified using Troggle, so the current script left it as is. \n", instance)
    else:
        # Overwrite the existing attributes with the freshly parsed values.
        for k, v in nonLookupAttribs.items():
            setattr(instance, k, v)
        instance.save()
        logging.info("%s existed in the database unchanged since last parse. It was overwritten by the current script. \n", instance)

    return (instance, created)

def render_with_context(req, *args, **kwargs):
    """Call ``render_to_response`` with a ``RequestContext`` built from *req*.

    Based on the snippet at http://www.djangosnippets.org/snippets/3/

    render_to_response uses a plain Context unless told otherwise. We always
    want a RequestContext, so that Django adds the context from
    settings.TEMPLATE_CONTEXT_PROCESSORS and the necessary settings variables
    reach every template. Views therefore call this function instead of
    render_to_response; any positional and keyword arguments are passed
    straight through, with ``context_instance`` set (or overridden) here.
    """
    from django.shortcuts import render_to_response
    from django.template import RequestContext

    request_context = RequestContext(req)
    kwargs.update(context_instance=request_context)
    return render_to_response(*args, **kwargs)
    
# Capture everything between the opening and closing <body> / <title> tags.
# re.DOTALL lets "." span newlines. The patterns are case-sensitive and the
# capture group is greedy, so it runs to the LAST closing tag in the text.
re_body = re.compile(r"\<body[^>]*\>(.*)\</body\>", re.DOTALL)
re_title = re.compile(r"\<title[^>]*\>(.*)\</title\>", re.DOTALL)

def get_html_body(text):
    """Return what re_body captures in *text* (the <body> contents), or None if it does not match."""
    return get_single_match(re_body, text)

def get_html_title(text):
    """Return what re_title captures in *text* (the <title> contents), or None if it does not match."""
    return get_single_match(re_title, text)

def get_single_match(regex, text):
    """Search *text* with the compiled *regex* and return its first group.

    Returns None when the pattern is not found anywhere in *text* (or when
    the first group took no part in the match).
    """
    found = regex.search(text)
    if found is None:
        return None
    return found.group(1)

def href_to_wikilinks(matchobj):
    """Replacement callback: turn a matched html link into a wikilink if possible.

    *matchobj* must carry a named group ``text`` (the link text). Cave
    descriptions whose long_name contains that text (case-insensitively) are
    looked up; the first one found is rendered as a ``cavedescription``
    wikilink. If none exists, the matched html is returned unchanged.
    """
    link_text = matchobj.groupdict()['text']
    candidates = CaveDescription.objects.filter(long_name__icontains=link_text)
    if not candidates:
        # No existing target: leave the original markup alone.
        return matchobj.group()
    target = candidates[0]
    return r'[[cavedescription:' + target.short_name + '|' + target.long_name + ']]'
    

# Ordered (pattern, replacement) pairs that html_to_wiki applies one after the
# other to turn html markup into wiki markup. Each replacement is either a
# template string or a callable taking the match object. Because the pairs run
# in sequence, the specific <a ...> patterns see the text before the generic
# href pattern at the end does.
re_subs = [(re.compile(r"\<b[^>]*\>(.*?)\</b\>", re.DOTALL), r"'''\1'''"),
           (re.compile(r"\<i\>(.*?)\</i\>", re.DOTALL), r"''\1''"),
           # Headings: <hN> becomes N "=" signs on each side.
           (re.compile(r"\<h1[^>]*\>(.*?)\</h1\>", re.DOTALL), r"=\1="),
           (re.compile(r"\<h2[^>]*\>(.*?)\</h2\>", re.DOTALL), r"==\1=="),
           (re.compile(r"\<h3[^>]*\>(.*?)\</h3\>", re.DOTALL), r"===\1==="),
           (re.compile(r"\<h4[^>]*\>(.*?)\</h4\>", re.DOTALL), r"====\1===="),
           (re.compile(r"\<h5[^>]*\>(.*?)\</h5\>", re.DOTALL), r"=====\1====="),
           (re.compile(r"\<h6[^>]*\>(.*?)\</h6\>", re.DOTALL), r"======\1======"),
           # Images (optionally wrapped in a link) become [[display:<class> photo:<source>]].
           # NOTE(review): the optional "target" group is a greedy ".*" under re.DOTALL,
           # so it can swallow far more than one attribute value -- confirm intent.
           (re.compile(r'(<a href="?(?P<target>.*)"?>)?<img class="?(?P<class>\w*)"? src="?t/?(?P<source>[\w/\.]*)"?(?P<rest>></img>|\s/>(</a>)?)', re.DOTALL),r'[[display:\g<class> photo:\g<source>]]'), #
           (re.compile(r"\<a\s+id=['\"]([^'\"]*)['\"]\s*\>(.*?)\</a\>", re.DOTALL), r"[[subcave:\1|\2]]"), #assumes that all links with id attributes are subcaves. Not great.
           #interpage link needed
           (re.compile(r"\<a\s+href=['\"]#([^'\"]*)['\"]\s*\>(.*?)\</a\>", re.DOTALL), r"[[cavedescription:\1|\2]]"), #assumes that all links with target ids are cave descriptions. Not great.
           # Bracketed links carrying both href and id become [[qm:...]], keeping
           # only the first whitespace-free run of the link text.
           (re.compile(r"\[\<a\s+href=['\"][^'\"]*['\"]\s+id=['\"][^'\"]*['\"]\s*\>([^\s]*).*?\</a\>\]", re.DOTALL), r"[[qm:\1]]"),

           #BUGGED!
           # NOTE(review): both groups are greedy ".*" and this pattern is compiled
           # without re.DOTALL, so a single match can run from the first <a href on
           # a line to the last </a> on that line -- possibly what "BUGGED!" refers
           # to; confirm before relying on it.
           (re.compile(r'<a\shref="?(?P<target>.*)"?>(?P<text>.*)</a>'),href_to_wikilinks),


           ]

# List markup recognised by _convert_html_lists, as (action, marker, pattern).
# Group 1 of every pattern is the text in front of the tag, so its length says
# how far into the remaining input that tag starts. Tuple order is only a
# tie-break and mirrors the original if/elif order.
_LIST_TOKENS = (
    ("open", "*", re.compile(r"^(.*?)<ul[^>]*>\s*<li[^>]*>(.*?)</li>(.*)$", re.DOTALL)),
    ("open", "#", re.compile(r"^(.*?)<ol[^>]*>\s*<li[^>]*>(.*?)</li>(.*)$", re.DOTALL)),
    ("item", "", re.compile(r"^(\s*)<li[^>]*>(.*?)</li>(.*)$", re.DOTALL)),
    ("close", "", re.compile(r"^(\s*)</ul>(.*)$", re.DOTALL)),
    ("close", "", re.compile(r"^(\s*)</ol>(.*)$", re.DOTALL)),
)


def _convert_html_lists(text):
    """Rewrite <ul>/<ol>/<li> markup in *text* as wiki "*" / "#" list lines."""
    pieces = []
    markers = ""  # one "*" or "#" per currently open list level
    while text:
        hits = []
        for action, marker, pattern in _LIST_TOKENS:
            found = pattern.match(text)
            if found:
                hits.append((len(found.group(1)), action, marker, found))
        if not hits:
            # No list markup left: the rest is copied through untouched.
            pieces.append(text)
            break
        # Handle whichever tag starts EARLIEST in the remaining text.
        _, action, marker, found = min(hits, key=lambda hit: hit[0])
        if action == "open":
            pre, val, text = found.groups()
            markers += marker
            pieces.append(pre + "\n" + markers + " " + val)
        elif action == "item":
            _, val, text = found.groups()
            pieces.append("\n" + markers + " " + val)
        else:
            # </ul> or </ol>: leave one nesting level.
            markers = markers[:-1]
            text = found.group(2)
    return "".join(pieces)


def html_to_wiki(text, codec="utf-8", subs=None):
    """Convert a fragment of html into wiki markup.

    text  -- the html; a byte string is decoded with *codec* first
    codec -- encoding used when *text* is a byte string
    subs  -- optional iterable of ``(compiled_regex, replacement)`` pairs to
             apply after list conversion; defaults to the module-level
             ``re_subs`` table

    Steps, in order: paragraph tags are dropped or turned into blank lines,
    <ul>/<ol>/<li> markup becomes "*" / "#" list lines (one marker per
    nesting level), then every substitution in *subs* is applied in sequence.

    Fixes relative to the previous version: closing an <ol> no longer raises
    AttributeError (it called ``.pop()`` on a string), and the tag that
    starts earliest in the remaining text is now the one handled, instead of
    whichever pattern happened to be tested first.
    """
    if isinstance(text, bytes):
        text = text.decode(codec)
    if subs is None:
        subs = re_subs

    text = re.sub("</p>", r"", text)
    text = re.sub("<p>$", r"", text)
    text = re.sub("<p>", r"\n\n", text)

    out = _convert_html_lists(text)

    # substitutions
    for regex, repl in subs:
        out = regex.sub(repl, out)
    return out
[svn] Brief code cleanup. 2009-07-03 05:31:49 +01:00			`from django.conf import settings`
			`import random, re, logging`
[svn] * Make descriptions parser also replace links to descriptions from Cave models' underground_descriptions with wikilinks for valid (existing) links * Make entrances searchable in admin by cave kataster number 2009-07-16 05:37:33 +01:00			`from core.models import CaveDescription`
[svn] Brief code cleanup. 2009-07-03 05:31:49 +01:00
			`def weighted_choice(lst):`
			`n = random.uniform(0,1)`
			`for item, weight in lst:`
			`if n < weight:`
			`break`
			`n = n - weight`
			`return item`

			`def randomLogbookSentence():`
[svn] override save for CaveDescriptions to scan qm wikilinks and add into the manytomany field linked_qms 2009-07-11 01:36:00 +01:00			`from troggle.core.models import LogbookEntry`
[svn] Brief code cleanup. 2009-07-03 05:31:49 +01:00			`randSent={}`

			`# needs to handle empty logbooks without crashing`

			`#Choose a random logbook entry`
[svn] override save for CaveDescriptions to scan qm wikilinks and add into the manytomany field linked_qms 2009-07-11 01:36:00 +01:00			`randSent['entry']=LogbookEntry.objects.order_by('?')[0]`
[svn] Brief code cleanup. 2009-07-03 05:31:49 +01:00
			`#Choose again if there are no sentances (this happens if it is a placeholder entry)`
			`while len(re.findall('[A-Z].*?\.',randSent['entry'].text))==0:`
[svn] override save for CaveDescriptions to scan qm wikilinks and add into the manytomany field linked_qms 2009-07-11 01:36:00 +01:00			`randSent['entry']=LogbookEntry.objects.order_by('?')[0]`
[svn] Brief code cleanup. 2009-07-03 05:31:49 +01:00
			`#Choose a random sentence from that entry. Store the sentence as randSent['sentence'], and the number of that sentence in the entry as randSent['number']`
			`sentenceList=re.findall('[A-Z].*?\.',randSent['entry'].text)`
			`randSent['number']=random.randrange(0,len(sentenceList))`
			`randSent['sentence']=sentenceList[randSent['number']]`

			`return randSent`


			`def save_carefully(objectType, lookupAttribs={}, nonLookupAttribs={}):`
			`"""Looks up instance using lookupAttribs and carries out the following:`
			`-if instance does not exist in DB: add instance to DB, return (new instance, True)`
			`-if instance exists in DB and was modified using Troggle: do nothing, return (existing instance, False)`
			`-if instance exists in DB and was not modified using Troggle: overwrite instance, return (instance, False)`

			`The checking is accomplished using Django's get_or_create and the new_since_parsing boolean field`
			`defined in core.models.TroggleModel.`

			`"""`

			`instance, created=objectType.objects.get_or_create(defaults=nonLookupAttribs, **lookupAttribs)`

			`if not created and not instance.new_since_parsing:`
			`for k, v in nonLookupAttribs.items(): #overwrite the existing attributes from the logbook text (except date and title)`
			`setattr(instance, k, v)`
			`instance.save()`

			`if created:`
[svn] bugged 2009-07-21 07:20:34 +01:00			`logging.info(str(instance) + ' was just added to the database for the first time. \n')`
[svn] Brief code cleanup. 2009-07-03 05:31:49 +01:00
			`if not created and instance.new_since_parsing:`
[svn] bugged 2009-07-21 07:20:34 +01:00			`logging.info(str(instance) + " has been modified using Troggle, so the current script left it as is. \n")`
[svn] Brief code cleanup. 2009-07-03 05:31:49 +01:00
			`if not created and not instance.new_since_parsing:`
[svn] bugged 2009-07-21 07:20:34 +01:00			`logging.info(str(instance) + " existed in the database unchanged since last parse. It was overwritten by the current script. \n")`
[svn] Brief code cleanup. 2009-07-03 05:31:49 +01:00			`return (instance, created)`

			`def render_with_context(req, args, *kwargs):`
			`"""this is the snippet from http://www.djangosnippets.org/snippets/3/`
[svn] Work on turn html pages into cavedescription models.py. Moved parser/cavetabs html_to_wiki function to utils.py Added databaseReset.py desc to refresh the cavedescriptions. 2009-07-04 16:42:17 +01:00
			`Django uses Context, not RequestContext when you call render_to_response.`
			`We always want to use RequestContext, so that django adds the context from`
			`settings.TEMPLATE_CONTEXT_PROCESSORS. This way we automatically get`
			`necessary settings variables passed to each template. So we use a custom`
			`method, render_response instead of render_to_response. Hopefully future`
			`Django releases will make this unnecessary."""`
[svn] Brief code cleanup. 2009-07-03 05:31:49 +01:00
			`from django.shortcuts import render_to_response`
			`from django.template import RequestContext`
			`kwargs['context_instance'] = RequestContext(req)`
[svn] Work on turn html pages into cavedescription models.py. Moved parser/cavetabs html_to_wiki function to utils.py Added databaseReset.py desc to refresh the cavedescriptions. 2009-07-04 16:42:17 +01:00			`return render_to_response(args, *kwargs)`

			`re_body = re.compile(r"\<body[^>]\>(.)\</body\>", re.DOTALL)`
			`re_title = re.compile(r"\<title[^>]\>(.)\</title\>", re.DOTALL)`
[svn] * Make descriptions parser also replace links to descriptions from Cave models' underground_descriptions with wikilinks for valid (existing) links * Make entrances searchable in admin by cave kataster number 2009-07-16 05:37:33 +01:00
[svn] Work on turn html pages into cavedescription models.py. Moved parser/cavetabs html_to_wiki function to utils.py Added databaseReset.py desc to refresh the cavedescriptions. 2009-07-04 16:42:17 +01:00			`def get_html_body(text):`
			`return get_single_match(re_body, text)`

			`def get_html_title(text):`
			`return get_single_match(re_title, text)`

			`def get_single_match(regex, text):`
			`match = regex.search(text)`

			`if match:`
			`return match.groups()[0]`
			`else:`
			`return None`

[svn] * Make descriptions parser also replace links to descriptions from Cave models' underground_descriptions with wikilinks for valid (existing) links * Make entrances searchable in admin by cave kataster number 2009-07-16 05:37:33 +01:00			`def href_to_wikilinks(matchobj):`
			`"""`
			`Given an html link, checks for possible valid wikilinks.`

			`Returns the first valid wikilink. Valid means the target`
			`object actually exists.`
			`"""`
			`res=CaveDescription.objects.filter(long_name__icontains=matchobj.groupdict()['text'])`
			`if res:`
			`return r'[[cavedescription:'+res[0].short_name+'\|'+res[0].long_name+']]'`
			`else:`
[svn] Various bug fixes, using more raw_id fields in admin so it loads faster. I had to put onLoad="contentHeight();" back into the base template. This is a bad solution, I would rather use Martin's, but it wasn't working. 2009-07-22 16:18:00 +01:00			`return matchobj.group()`
[svn] * Make descriptions parser also replace links to descriptions from Cave models' underground_descriptions with wikilinks for valid (existing) links * Make entrances searchable in admin by cave kataster number 2009-07-16 05:37:33 +01:00			`#except:`
			`#print 'fail'`

[svn] Work on turn html pages into cavedescription models.py. Moved parser/cavetabs html_to_wiki function to utils.py Added databaseReset.py desc to refresh the cavedescriptions. 2009-07-04 16:42:17 +01:00
			`re_subs = [(re.compile(r"\<b[^>]\>(.?)\</b\>", re.DOTALL), r"'''\1'''"),`
			`(re.compile(r"\<i\>(.*?)\</i\>", re.DOTALL), r"''\1''"),`
			`(re.compile(r"\<h1[^>]\>(.?)\</h1\>", re.DOTALL), r"=\1="),`
			`(re.compile(r"\<h2[^>]\>(.?)\</h2\>", re.DOTALL), r"==\1=="),`
			`(re.compile(r"\<h3[^>]\>(.?)\</h3\>", re.DOTALL), r"===\1==="),`
			`(re.compile(r"\<h4[^>]\>(.?)\</h4\>", re.DOTALL), r"====\1===="),`
			`(re.compile(r"\<h5[^>]\>(.?)\</h5\>", re.DOTALL), r"=====\1====="),`
			`(re.compile(r"\<h6[^>]\>(.?)\</h6\>", re.DOTALL), r"======\1======"),`
[svn] Various bug fixes, using more raw_id fields in admin so it loads faster. I had to put onLoad="contentHeight();" back into the base template. This is a bad solution, I would rather use Martin's, but it wasn't working. 2009-07-22 16:18:00 +01:00			`(re.compile(r'(<a href="?(?P<target>.)"?>)?<img class="?(?P<class>\w)"? src="?t/?(?P<source>[\w/\.]*)"?(?P<rest>></img>\|\s/>(</a>)?)', re.DOTALL),r'[[display:\g<class> photo:\g<source>]]'), #`
[svn] * Make descriptions parser also replace links to descriptions from Cave models' underground_descriptions with wikilinks for valid (existing) links * Make entrances searchable in admin by cave kataster number 2009-07-16 05:37:33 +01:00			`(re.compile(r"\<a\s+id=['\"]([^'\"])['\"]\s\>(.*?)\</a\>", re.DOTALL), r"[[subcave:\1\|\2]]"), #assumes that all links with id attributes are subcaves. Not great.`
[svn] Work on turn html pages into cavedescription models.py. Moved parser/cavetabs html_to_wiki function to utils.py Added databaseReset.py desc to refresh the cavedescriptions. 2009-07-04 16:42:17 +01:00			`#interpage link needed`
[svn] Various bug fixes, using more raw_id fields in admin so it loads faster. I had to put onLoad="contentHeight();" back into the base template. This is a bad solution, I would rather use Martin's, but it wasn't working. 2009-07-22 16:18:00 +01:00			`(re.compile(r"\<a\s+href=['\"]#([^'\"])['\"]\s\>(.*?)\</a\>", re.DOTALL), r"[[cavedescription:\1\|\2]]"), #assumes that all links with target ids are cave descriptions. Not great.`
[svn] Make QM wikilinks work in new format, and fix cave description parser to output working wikilinks. 2009-07-09 05:08:21 +01:00			`(re.compile(r"\[\<a\s+href=['\"][^'\"]['\"]\s+id=['\"][^'\"]['\"]\s\>([^\s]).*?\</a\>\]", re.DOTALL), r"[[qm:\1]]"),`
[svn] Various bug fixes, using more raw_id fields in admin so it loads faster. I had to put onLoad="contentHeight();" back into the base template. This is a bad solution, I would rather use Martin's, but it wasn't working. 2009-07-22 16:18:00 +01:00
			`#BUGGED!`
			`(re.compile(r'<a\shref="?(?P<target>.)"?>(?P<text>.)</a>'),href_to_wikilinks),`

[svn] Work on turn html pages into cavedescription models.py. Moved parser/cavetabs html_to_wiki function to utils.py Added databaseReset.py desc to refresh the cavedescriptions. 2009-07-04 16:42:17 +01:00
			`]`

			`def html_to_wiki(text, codec = "utf-8"):`
			`if type(text) == str:`
			`text = unicode(text, codec)`
			`text = re.sub("</p>", r"", text)`
			`text = re.sub("<p>$", r"", text)`
			`text = re.sub("<p>", r"\n\n", text)`
			`out = ""`
			`lists = ""`
			`#lists`
			`while text:`
			`mstar = re.match("^(.?)<ul[^>]>\s<li[^>]>(.?)</li>(.)$", text, re.DOTALL)`
			`munstar = re.match("^(\s)</ul>(.)$", text, re.DOTALL)`
			`mhash = re.match("^(.?)<ol[^>]>\s<li[^>]>(.?)</li>(.)$", text, re.DOTALL)`
			`munhash = re.match("^(\s)</ol>(.)$", text, re.DOTALL)`
			`mitem = re.match("^(\s)<li[^>]>(.?)</li>(.)$", text, re.DOTALL)`
			`ms = [len(m.groups()[0]) for m in [mstar, munstar, mhash, munhash, mitem] if m]`
			`def min_(i, l):`
			`try:`
			`v = i.groups()[0]`
			`l.remove(len(v))`
			`return len(v) < min(l, 1000000000)`
			`except:`
			`return False`
			`if min_(mstar, ms):`
			`lists += "*"`
			`pre, val, post = mstar.groups()`
			`out += pre + "\n" + lists + " " + val`
			`text = post`
			`elif min_(mhash, ms):`
			`lists += "#"`
			`pre, val, post = mhash.groups()`
			`out += pre + "\n" + lists + " " + val`
			`text = post`
			`elif min_(mitem, ms):`
			`pre, val, post = mitem.groups()`
			`out += "\n" + lists + " " + val`
			`text = post`
			`elif min_(munstar, ms):`
			`lists = lists[:-1]`
			`text = munstar.groups()[1]`
			`elif min_(munhash, ms):`
			`lists.pop()`
			`text = munhash.groups()[1]`
			`else:`
			`out += text`
			`text = ""`
[svn] bugged 2009-07-21 07:20:34 +01:00
[svn] Work on turn html pages into cavedescription models.py. Moved parser/cavetabs html_to_wiki function to utils.py Added databaseReset.py desc to refresh the cavedescriptions. 2009-07-04 16:42:17 +01:00			`#substitutions`
			`for regex, repl in re_subs:`
			`out = regex.sub(repl, out)`
[svn] override save for CaveDescriptions to scan qm wikilinks and add into the manytomany field linked_qms 2009-07-11 01:36:00 +01:00			`return out`