update HTMLPurifier; enable embedded flash video in articles

2011-04-11 16:41:01 +04:00
parent ad92c6ac62
commit f4f0f80d21
341 changed files with 2014 additions and 643 deletions
--- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php
+++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DOMLex.php
@@ -41,7 +41,7 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer

        // attempt to armor stray angled brackets that cannot possibly
        // form tags and thus are probably being used as emoticons
-        if ($config->get('Core', 'AggressivelyFixLt')) {
+        if ($config->get('Core.AggressivelyFixLt')) {
            $char = '[^a-z!\/]';
            $comment = "/<!--(.*?)(-->|\z)/is";
            $html = preg_replace_callback($comment, array($this, 'callbackArmorCommentEntities'), $html);
@@ -72,23 +72,57 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
    }

    /**
-     * Recursive function that tokenizes a node, putting it into an accumulator.
-     *
+     * Iterative function that tokenizes a node, putting it into an accumulator.
+     * To iterate is human, to recurse divine - L. Peter Deutsch
     * @param $node     DOMNode to be tokenized.
     * @param $tokens   Array-list of already tokenized tokens.
-     * @param $collect  Says whether or start and close are collected, set to
-     *                  false at first recursion because it's the implicit DIV
-     *                  tag you're dealing with.
     * @returns Tokens of node appended to previously passed tokens.
     */
-    protected function tokenizeDOM($node, &$tokens, $collect = false) {
+    protected function tokenizeDOM($node, &$tokens) {

+        // Walks the tree under $node in document order without recursion.
+        // $nodes[$level] holds the not-yet-visited nodes of that depth in
+        // REVERSE document order, so the next node is taken with array_pop()
+        // (constant time) rather than array_shift(), which renumbers every
+        // remaining key and made wide trees quadratic.
+        $level = 0;
+        $nodes = array($level => array($node));
+        // $closingNodes[$level] holds elements opened at that depth that
+        // still need an end token once their children have been emitted.
+        $closingNodes = array();
+        do {
+            while (!empty($nodes[$level])) {
+                $node = array_pop($nodes[$level]); // next node in document order
+                // level 0 is the wrapper node passed in by the caller; its own
+                // start/end tokens are never emitted
+                $collect = $level > 0;
+                $needEndingTag = $this->createStartNode($node, $tokens, $collect);
+                if ($needEndingTag) {
+                    $closingNodes[$level][] = $node;
+                }
+                if ($node->childNodes && $node->childNodes->length) {
+                    // descend: the children become the pending list of the next level
+                    $level++;
+                    $children = array();
+                    foreach ($node->childNodes as $childNode) {
+                        $children[] = $childNode;
+                    }
+                    $nodes[$level] = array_reverse($children);
+                }
+            }
+            // this depth is exhausted: go back up and close what was opened there
+            $level--;
+            if ($level && isset($closingNodes[$level])) {
+                while($node = array_pop($closingNodes[$level])) {
+                    $this->createEndNode($node, $tokens);
+                }
+            }
+        } while ($level > 0);
+    }
+
+    /**
+     * @param $node  DOMNode to be tokenized.
+     * @param $tokens   Array-list of already tokenized tokens.
+     * @param $collect  Says whether start and end tokens are collected for
+     *                    this node; false for the top-level node because it
+     *                    is the implicit DIV wrapper.
+     * @returns bool if the token needs an endtoken
+     */
+    protected function createStartNode($node, &$tokens, $collect) {
        // intercept non element nodes. WE MUST catch all of them,
        // but we're not getting the character reference nodes because
        // those should have been preprocessed
        if ($node->nodeType === XML_TEXT_NODE) {
            $tokens[] = $this->factory->createText($node->data);
-            return;
+            return false;
        } elseif ($node->nodeType === XML_CDATA_SECTION_NODE) {
            // undo libxml's special treatment of <script> and <style> tags
            $last = end($tokens);
@@ -106,48 +140,44 @@ class HTMLPurifier_Lexer_DOMLex extends HTMLPurifier_Lexer
                }
            }
            $tokens[] = $this->factory->createText($this->parseData($data));
-            return;
+            return false;
        } elseif ($node->nodeType === XML_COMMENT_NODE) {
            // this is code is only invoked for comments in script/style in versions
            // of libxml pre-2.6.28 (regular comments, of course, are still
            // handled regularly)
            $tokens[] = $this->factory->createComment($node->data);
-            return;
+            return false;
        } elseif (
            // not-well tested: there may be other nodes we have to grab
            $node->nodeType !== XML_ELEMENT_NODE
        ) {
-            return;
+            return false;
        }

-        $attr = $node->hasAttributes() ?
-            $this->transformAttrToAssoc($node->attributes) :
-            array();
+        $attr = $node->hasAttributes() ? $this->transformAttrToAssoc($node->attributes) : array();

        // We still have to make sure that the element actually IS empty
        if (!$node->childNodes->length) {
            if ($collect) {
                $tokens[] = $this->factory->createEmpty($node->tagName, $attr);
            }
+            return false;
        } else {
-            if ($collect) { // don't wrap on first iteration
+            if ($collect) {
                $tokens[] = $this->factory->createStart(
                    $tag_name = $node->tagName, // somehow, it get's dropped
                    $attr
                );
            }
-            foreach ($node->childNodes as $node) {
-                // remember, it's an accumulator. Otherwise, we'd have
-                // to use array_merge
-                $this->tokenizeDOM($node, $tokens, true);
-            }
-            if ($collect) {
-                $tokens[] = $this->factory->createEnd($tag_name);
-            }
+            return true;
        }
-
    }

+    /**
+     * Appends the end token for an element node to the accumulator.
+     * @param $node    DOMNode whose tagName names the end token.
+     * @param $tokens  Array-list of already tokenized tokens.
+     */
+    protected function createEndNode($node, &$tokens) {
+        $endToken = $this->factory->createEnd($node->tagName);
+        $tokens[] = $endToken;
+    }
+
+
    /**
     * Converts a DOMNamedNodeMap of DOMAttr objects into an assoc array.
     *
--- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/DirectLex.php
+++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/DirectLex.php
@@ -33,7 +33,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        // special normalization for script tags without any armor
        // our "armor" heurstic is a < sign any number of whitespaces after
        // the first script tag
-        if ($config->get('HTML', 'Trusted')) {
+        if ($config->get('HTML.Trusted')) {
            $html = preg_replace_callback('#(<script[^>]*>)(\s*[^<].+?)(</script>)#si',
                array($this, 'scriptCallback'), $html);
        }
@@ -45,12 +45,12 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        $array = array(); // result array

        // This is also treated to mean maintain *column* numbers too
-        $maintain_line_numbers = $config->get('Core', 'MaintainLineNumbers');
+        $maintain_line_numbers = $config->get('Core.MaintainLineNumbers');

        if ($maintain_line_numbers === null) {
            // automatically determine line numbering by checking
            // if error collection is on
-            $maintain_line_numbers = $config->get('Core', 'CollectErrors');
+            $maintain_line_numbers = $config->get('Core.CollectErrors');
        }

        if ($maintain_line_numbers) {
@@ -67,10 +67,10 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        $nl = "\n";
        // how often to manually recalculate. This will ALWAYS be right,
        // but it's pretty wasteful. Set to 0 to turn off
-        $synchronize_interval = $config->get('Core', 'DirectLexLineNumberSyncInterval');
+        $synchronize_interval = $config->get('Core.DirectLexLineNumberSyncInterval');

        $e = false;
-        if ($config->get('Core', 'CollectErrors')) {
+        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

@@ -345,7 +345,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
        if ($string == '') return array(); // no attributes

        $e = false;
-        if ($config->get('Core', 'CollectErrors')) {
+        if ($config->get('Core.CollectErrors')) {
            $e =& $context->get('ErrorCollector');
        }

@@ -384,7 +384,7 @@ class HTMLPurifier_Lexer_DirectLex extends HTMLPurifier_Lexer
                }
            }
            if ($value === false) $value = '';
-            return array($key => $value);
+            return array($key => $this->parseData($value));
        }

        // setup loop environment
--- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php
+++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PEARSax3.php
@@ -26,13 +26,20 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
     * Internal accumulator array for SAX parsers.
     */
    protected $tokens = array();
+    protected $last_token_was_empty;
+
+    private $parent_handler;
+    private $stack = array();

    public function tokenizeHTML($string, $config, $context) {

        $this->tokens = array();
+        $this->last_token_was_empty = false;

        $string = $this->normalize($string, $config, $context);

+        $this->parent_handler = set_error_handler(array($this, 'muteStrictErrorHandler'));
+
        $parser = new XML_HTMLSax3();
        $parser->set_object($this);
        $parser->set_element_handler('openHandler','closeHandler');
@@ -44,6 +51,8 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer

        $parser->parse($string);

+        restore_error_handler();
+
        return $this->tokens;

    }
@@ -58,9 +67,11 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
        }
        if ($closed) {
            $this->tokens[] = new HTMLPurifier_Token_Empty($name, $attrs);
+            $this->last_token_was_empty = true;
        } else {
            $this->tokens[] = new HTMLPurifier_Token_Start($name, $attrs);
        }
+        $this->stack[] = $name;
        return true;
    }

@@ -71,10 +82,12 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
        // HTMLSax3 seems to always send empty tags an extra close tag
        // check and ignore if you see it:
        // [TESTME] to make sure it doesn't overreach
-        if ($this->tokens[count($this->tokens)-1] instanceof HTMLPurifier_Token_Empty) {
+        if ($this->last_token_was_empty) {
+            $this->last_token_was_empty = false;
            return true;
        }
        $this->tokens[] = new HTMLPurifier_Token_End($name);
+        if (!empty($this->stack)) array_pop($this->stack);
        return true;
    }

@@ -82,6 +95,7 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
     * Data event handler, interface is defined by PEAR package.
     */
    public function dataHandler(&$parser, $data) {
+        $this->last_token_was_empty = false; // clear the flag so closeHandler will not swallow the next close event
        $this->tokens[] = new HTMLPurifier_Token_Text($data); // append the character data as a Text token
        return true;
    }
@@ -91,7 +105,18 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
     */
    public function escapeHandler(&$parser, $data) {
        if (strpos($data, '--') === 0) {
-            $this->tokens[] = new HTMLPurifier_Token_Comment($data);
+            // remove trailing and leading double-dashes
+            $data = substr($data, 2);
+            if (strlen($data) >= 2 && substr($data, -2) == "--") {
+                $data = substr($data, 0, -2);
+            }
+            if (isset($this->stack[sizeof($this->stack) - 1]) &&
+                $this->stack[sizeof($this->stack) - 1] == "style") {
+                $this->tokens[] = new HTMLPurifier_Token_Text($data);
+            } else {
+                $this->tokens[] = new HTMLPurifier_Token_Comment($data);
+            }
+            $this->last_token_was_empty = false;
        }
        // CDATA is handled elsewhere, but if it was handled here:
        //if (strpos($data, '[CDATA[') === 0) {
@@ -101,6 +126,14 @@ class HTMLPurifier_Lexer_PEARSax3 extends HTMLPurifier_Lexer
        return true;
    }

+    /**
+     * An error handler that mutes strict errors and forwards every other
+     * error to the handler that was active before tokenizeHTML() installed
+     * this one.
+     * @param $errno       Level of the error raised.
+     * @param $errstr      Error message.
+     * @param $errfile     File in which the error was raised.
+     * @param $errline     Line at which the error was raised.
+     * @param $errcontext  Symbol table at the point of the error.
+     * @returns Nothing for E_STRICT; false when there is no earlier handler,
+     *          so that PHP's built-in handler runs; otherwise whatever the
+     *          earlier handler returns.
+     */
+    public function muteStrictErrorHandler($errno, $errstr, $errfile=null, $errline=null, $errcontext=null) {
+        if ($errno == E_STRICT) return;
+        // set_error_handler() hands back null when no user handler was
+        // installed before ours; there is nothing to call in that case
+        if ($this->parent_handler === null) return false;
+        return call_user_func($this->parent_handler, $errno, $errstr, $errfile, $errline, $errcontext);
+    }
+
 }

 // vim: et sw=4 sts=4
--- a/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php
+++ b/lib/htmlpurifier/library/HTMLPurifier/Lexer/PH5P.php
@@ -125,8 +125,6 @@ class HTML5 {
    const EOF      = 5;

    public function __construct($data) {
-        $data = str_replace("\r\n", "\n", $data);
-        $data = str_replace("\r", null, $data);

        $this->data = $data;
        $this->char = -1;