Server : nginx/1.24.0 System : Linux ip-172-31-33-48 6.14.0-1011-aws #11~24.04.1-Ubuntu SMP Fri Aug 1 02:07:25 UTC 2025 x86_64 User : www-data ( 33) PHP Version : 8.3.6 Disable Function : NONE Directory : /var/www/html/wp-content/plugins.off/sierotki/vendor/ |
Upload File : |
<?php /** * Website: http://sourceforge.net/projects/simplehtmldom/ * Additional projects: http://sourceforge.net/projects/debugobject/ * Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/) * * Licensed under The MIT License * See the LICENSE file in the project root for more information. * * Authors: * S.C. Chen * John Schlick * Rus Carroll * logmanoriginal * * Contributors: * Yousuke Kumakura * Vadim Voituk * Antcs * * Version Rev. 1.9.1 (291) */ define( 'HDOM_TYPE_ELEMENT', 1 ); define( 'HDOM_TYPE_COMMENT', 2 ); define( 'HDOM_TYPE_TEXT', 3 ); define( 'HDOM_TYPE_ENDTAG', 4 ); define( 'HDOM_TYPE_ROOT', 5 ); define( 'HDOM_TYPE_UNKNOWN', 6 ); define( 'HDOM_QUOTE_DOUBLE', 0 ); define( 'HDOM_QUOTE_SINGLE', 1 ); define( 'HDOM_QUOTE_NO', 3 ); define( 'HDOM_INFO_BEGIN', 0 ); define( 'HDOM_INFO_END', 1 ); define( 'HDOM_INFO_QUOTE', 2 ); define( 'HDOM_INFO_SPACE', 3 ); define( 'HDOM_INFO_TEXT', 4 ); define( 'HDOM_INFO_INNER', 5 ); define( 'HDOM_INFO_OUTER', 6 ); define( 'HDOM_INFO_ENDSPACE', 7 ); defined( 'DEFAULT_TARGET_CHARSET' ) || define( 'DEFAULT_TARGET_CHARSET', 'UTF-8' ); defined( 'DEFAULT_BR_TEXT' ) || define( 'DEFAULT_BR_TEXT', "\r\n" ); defined( 'DEFAULT_SPAN_TEXT' ) || define( 'DEFAULT_SPAN_TEXT', ' ' ); defined( 'MAX_FILE_SIZE' ) || define( 'MAX_FILE_SIZE', 600000 ); define( 'HDOM_SMARTY_AS_TEXT', 1 ); function file_get_html( $url, $use_include_path = false, $context = null, $offset = 0, $maxLen = -1, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT ) { if ( $maxLen <= 0 ) { $maxLen = MAX_FILE_SIZE; } $dom = new simple_html_dom( null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText ); /** * For sourceforge users: uncomment the next line and comment the * retrieve_url_contents line 2 lines down if it is not already done. */ $contents = file_get_contents( $url, $use_include_path, $context, $offset, $maxLen ); // $contents = retrieve_url_contents($url); if ( empty( $contents ) || strlen( $contents ) > $maxLen ) { $dom->clear(); return false; } return $dom->load( $contents, $lowercase, $stripRN ); } function str_get_html( $str, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT ) { $dom = new simple_html_dom( null, $lowercase, $forceTagsClosed, $target_charset, $stripRN, $defaultBRText, $defaultSpanText ); if ( empty( $str ) || strlen( $str ) > MAX_FILE_SIZE ) { $dom->clear(); return false; } return $dom->load( $str, $lowercase, $stripRN ); } function dump_html_tree( $node, $show_attr = true, $deep = 0 ) { $node->dump( $node ); } class simple_html_dom_node { public $nodetype = HDOM_TYPE_TEXT; public $tag = 'text'; public $attr = array(); public $children = array(); public $nodes = array(); public $parent = null; public $_ = array(); public $tag_start = 0; private $dom = null; function __construct( $dom ) { $this->dom = $dom; $dom->nodes[] = $this; } function __destruct() { $this->clear(); } function __toString() { return $this->outertext(); } function clear() { $this->dom = null; $this->nodes = null; $this->parent = null; $this->children = null; } function dump( $show_attr = true, $depth = 0 ) { echo str_repeat( "\t", $depth ) . $this->tag; if ( $show_attr && count( $this->attr ) > 0 ) { echo '('; foreach ( $this->attr as $k => $v ) { echo "[$k]=>\"$v\", "; } echo ')'; } echo "\n"; if ( $this->nodes ) { foreach ( $this->nodes as $node ) { $node->dump( $show_attr, $depth + 1 ); } } } function dump_node( $echo = true ) { $string = $this->tag; if ( count( $this->attr ) > 0 ) { $string .= '('; foreach ( $this->attr as $k => $v ) { $string .= "[$k]=>\"$v\", "; } $string .= ')'; } if ( count( $this->_ ) > 0 ) { $string .= ' $_ ('; foreach ( $this->_ as $k => $v ) { if ( is_array( $v ) ) { $string .= "[$k]=>("; foreach ( $v as $k2 => $v2 ) { $string .= "[$k2]=>\"$v2\", "; } $string .= ')'; } else { $string .= "[$k]=>\"$v\", "; } } $string .= ')'; } if ( isset( $this->text ) ) { $string .= " text: ({$this->text})"; } $string .= ' HDOM_INNER_INFO: '; if ( isset( $node->_[ HDOM_INFO_INNER ] ) ) { $string .= "'" . $node->_[ HDOM_INFO_INNER ] . "'"; } else { $string .= ' NULL '; } $string .= ' children: ' . count( $this->children ); $string .= ' nodes: ' . count( $this->nodes ); $string .= ' tag_start: ' . $this->tag_start; $string .= "\n"; if ( $echo ) { echo $string; return; } else { return $string; } } function parent( $parent = null ) { // I am SURE that this doesn't work properly. // It fails to unset the current node from it's current parents nodes or // children list first. if ( $parent !== null ) { $this->parent = $parent; $this->parent->nodes[] = $this; $this->parent->children[] = $this; } return $this->parent; } function has_child() { return ! empty( $this->children ); } function children( $idx = -1 ) { if ( $idx === -1 ) { return $this->children; } if ( isset( $this->children[ $idx ] ) ) { return $this->children[ $idx ]; } return null; } function first_child() { if ( count( $this->children ) > 0 ) { return $this->children[0]; } return null; } function last_child() { if ( count( $this->children ) > 0 ) { return end( $this->children ); } return null; } function next_sibling() { if ( $this->parent === null ) { return null; } $idx = array_search( $this, $this->parent->children, true ); if ( $idx !== false && isset( $this->parent->children[ $idx + 1 ] ) ) { return $this->parent->children[ $idx + 1 ]; } return null; } function prev_sibling() { if ( $this->parent === null ) { return null; } $idx = array_search( $this, $this->parent->children, true ); if ( $idx !== false && $idx > 0 ) { return $this->parent->children[ $idx - 1 ]; } return null; } function find_ancestor_tag( $tag ) { global $debug_object; if ( is_object( $debug_object ) ) { $debug_object->debug_log_entry( 1 ); } if ( $this->parent === null ) { return null; } $ancestor = $this->parent; while ( ! is_null( $ancestor ) ) { if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'Current tag is: ' . $ancestor->tag ); } if ( $ancestor->tag === $tag ) { break; } $ancestor = $ancestor->parent; } return $ancestor; } function innertext() { if ( isset( $this->_[ HDOM_INFO_INNER ] ) ) { return $this->_[ HDOM_INFO_INNER ]; } if ( isset( $this->_[ HDOM_INFO_TEXT ] ) ) { return $this->dom->restore_noise( $this->_[ HDOM_INFO_TEXT ] ); } $ret = ''; foreach ( $this->nodes as $n ) { $ret .= $n->outertext(); } return $ret; } function outertext() { global $debug_object; if ( is_object( $debug_object ) ) { $text = ''; if ( $this->tag === 'text' ) { if ( ! empty( $this->text ) ) { $text = ' with text: ' . $this->text; } } $debug_object->debug_log( 1, 'Innertext of tag: ' . $this->tag . $text ); } if ( $this->tag === 'root' ) { return $this->innertext(); } // todo: What is the use of this callback? Remove? if ( $this->dom && $this->dom->callback !== null ) { call_user_func_array( $this->dom->callback, array( $this ) ); } if ( isset( $this->_[ HDOM_INFO_OUTER ] ) ) { return $this->_[ HDOM_INFO_OUTER ]; } if ( isset( $this->_[ HDOM_INFO_TEXT ] ) ) { return $this->dom->restore_noise( $this->_[ HDOM_INFO_TEXT ] ); } $ret = ''; if ( $this->dom && $this->dom->nodes[ $this->_[ HDOM_INFO_BEGIN ] ] ) { $ret = $this->dom->nodes[ $this->_[ HDOM_INFO_BEGIN ] ]->makeup(); } if ( isset( $this->_[ HDOM_INFO_INNER ] ) ) { // todo: <br> should either never have HDOM_INFO_INNER or always if ( $this->tag !== 'br' ) { $ret .= $this->_[ HDOM_INFO_INNER ]; } } elseif ( $this->nodes ) { foreach ( $this->nodes as $n ) { $ret .= $this->convert_text( $n->outertext() ); } } if ( isset( $this->_[ HDOM_INFO_END ] ) && $this->_[ HDOM_INFO_END ] != 0 ) { $ret .= '</' . $this->tag . '>'; } return $ret; } function text() { if ( isset( $this->_[ HDOM_INFO_INNER ] ) ) { return $this->_[ HDOM_INFO_INNER ]; } switch ( $this->nodetype ) { case HDOM_TYPE_TEXT: return $this->dom->restore_noise( $this->_[ HDOM_INFO_TEXT ] ); case HDOM_TYPE_COMMENT: return ''; case HDOM_TYPE_UNKNOWN: return ''; } if ( strcasecmp( $this->tag, 'script' ) === 0 ) { return ''; } if ( strcasecmp( $this->tag, 'style' ) === 0 ) { return ''; } $ret = ''; // In rare cases, (always node type 1 or HDOM_TYPE_ELEMENT - observed // for some span tags, and some p tags) $this->nodes is set to NULL. // NOTE: This indicates that there is a problem where it's set to NULL // without a clear happening. // WHY is this happening? if ( ! is_null( $this->nodes ) ) { foreach ( $this->nodes as $n ) { // Start paragraph after a blank line if ( $n->tag === 'p' ) { $ret = trim( $ret ) . "\n\n"; } $ret .= $this->convert_text( $n->text() ); // If this node is a span... add a space at the end of it so // multiple spans don't run into each other. This is plaintext // after all. if ( $n->tag === 'span' ) { $ret .= $this->dom->default_span_text; } } } return $ret; } function xmltext() { $ret = $this->innertext(); $ret = str_ireplace( '<![CDATA[', '', $ret ); $ret = str_replace( ']]>', '', $ret ); return $ret; } function makeup() { // text, comment, unknown if ( isset( $this->_[ HDOM_INFO_TEXT ] ) ) { return $this->dom->restore_noise( $this->_[ HDOM_INFO_TEXT ] ); } $ret = '<' . $this->tag; $i = -1; foreach ( $this->attr as $key => $val ) { ++$i; // skip removed attribute if ( $val === null || $val === false ) { continue; } $ret .= $this->_[ HDOM_INFO_SPACE ][ $i ][0]; //no value attr: nowrap, checked selected... if ( $val === true ) { $ret .= $key; } else { switch ( $this->_[ HDOM_INFO_QUOTE ][ $i ] ) { case HDOM_QUOTE_DOUBLE: $quote = '"'; break; case HDOM_QUOTE_SINGLE: $quote = '\''; break; default: $quote = ''; } $ret .= $key . $this->_[ HDOM_INFO_SPACE ][ $i ][1] . '=' . $this->_[ HDOM_INFO_SPACE ][ $i ][2] . $quote . $val . $quote; } } $ret = $this->dom->restore_noise( $ret ); return $ret . $this->_[ HDOM_INFO_ENDSPACE ] . '>'; } function find( $selector, $idx = null, $lowercase = false ) { $selectors = $this->parse_selector( $selector ); if ( ( $count = count( $selectors ) ) === 0 ) { return array(); } $found_keys = array(); // find each selector for ( $c = 0; $c < $count; ++$c ) { // The change on the below line was documented on the sourceforge // code tracker id 2788009 // used to be: if (($levle=count($selectors[0]))===0) return array(); if ( ( $levle = count( $selectors[ $c ] ) ) === 0 ) { return array(); } if ( ! isset( $this->_[ HDOM_INFO_BEGIN ] ) ) { return array(); } $head = array( $this->_[ HDOM_INFO_BEGIN ] => 1 ); $cmd = ' '; // Combinator // handle descendant selectors, no recursive! for ( $l = 0; $l < $levle; ++$l ) { $ret = array(); foreach ( $head as $k => $v ) { $n = ( $k === -1 ) ? $this->dom->root : $this->dom->nodes[ $k ]; //PaperG - Pass this optional parameter on to the seek function. $n->seek( $selectors[ $c ][ $l ], $ret, $cmd, $lowercase ); } $head = $ret; $cmd = $selectors[ $c ][ $l ][4]; // Next Combinator } foreach ( $head as $k => $v ) { if ( ! isset( $found_keys[ $k ] ) ) { $found_keys[ $k ] = 1; } } } // sort keys ksort( $found_keys ); $found = array(); foreach ( $found_keys as $k => $v ) { $found[] = $this->dom->nodes[ $k ]; } // return nth-element or array if ( is_null( $idx ) ) { return $found; } elseif ( $idx < 0 ) { $idx = count( $found ) + $idx; } return ( isset( $found[ $idx ] ) ) ? $found[ $idx ] : null; } protected function seek( $selector, &$ret, $parent_cmd, $lowercase = false ) { global $debug_object; if ( is_object( $debug_object ) ) { $debug_object->debug_log_entry( 1 ); } list($tag, $id, $class, $attributes, $cmb) = $selector; $nodes = array(); if ( $parent_cmd === ' ' ) { // Descendant Combinator // Find parent closing tag if the current element doesn't have a closing // tag (i.e. void element) $end = ( ! empty( $this->_[ HDOM_INFO_END ] ) ) ? $this->_[ HDOM_INFO_END ] : 0; if ( $end == 0 ) { $parent = $this->parent; while ( ! isset( $parent->_[ HDOM_INFO_END ] ) && $parent !== null ) { $end -= 1; $parent = $parent->parent; } $end += $parent->_[ HDOM_INFO_END ]; } // Get list of target nodes $nodes_start = $this->_[ HDOM_INFO_BEGIN ] + 1; $nodes_count = $end - $nodes_start; $nodes = array_slice( $this->dom->nodes, $nodes_start, $nodes_count, true ); } elseif ( $parent_cmd === '>' ) { // Child Combinator $nodes = $this->children; } elseif ( $parent_cmd === '+' && $this->parent && in_array( $this, $this->parent->children ) ) { // Next-Sibling Combinator $index = array_search( $this, $this->parent->children, true ) + 1; if ( $index < count( $this->parent->children ) ) { $nodes[] = $this->parent->children[ $index ]; } } elseif ( $parent_cmd === '~' && $this->parent && in_array( $this, $this->parent->children ) ) { // Subsequent Sibling Combinator $index = array_search( $this, $this->parent->children, true ); $nodes = array_slice( $this->parent->children, $index ); } // Go throgh each element starting at this element until the end tag // Note: If this element is a void tag, any previous void element is // skipped. foreach ( $nodes as $node ) { $pass = true; // Skip root nodes if ( ! $node->parent ) { $pass = false; } // Handle 'text' selector if ( $pass && $tag === 'text' && $node->tag === 'text' ) { $ret[ array_search( $node, $this->dom->nodes, true ) ] = 1; unset( $node ); continue; } // Skip if node isn't a child node (i.e. text nodes) if ( $pass && ! in_array( $node, $node->parent->children, true ) ) { $pass = false; } // Skip if tag doesn't match if ( $pass && $tag !== '' && $tag !== $node->tag && $tag !== '*' ) { $pass = false; } // Skip if ID doesn't exist if ( $pass && $id !== '' && ! isset( $node->attr['id'] ) ) { $pass = false; } // Check if ID matches if ( $pass && $id !== '' && isset( $node->attr['id'] ) ) { // Note: Only consider the first ID (as browsers do) $node_id = explode( ' ', trim( $node->attr['id'] ) )[0]; if ( $id !== $node_id ) { $pass = false; } } // Check if all class(es) exist if ( $pass && $class !== '' && is_array( $class ) && ! empty( $class ) ) { if ( isset( $node->attr['class'] ) ) { $node_classes = explode( ' ', $node->attr['class'] ); if ( $lowercase ) { $node_classes = array_map( 'strtolower', $node_classes ); } foreach ( $class as $c ) { if ( ! in_array( $c, $node_classes ) ) { $pass = false; break; } } } else { $pass = false; } } // Check attributes if ( $pass && $attributes !== '' && is_array( $attributes ) && ! empty( $attributes ) ) { foreach ( $attributes as $a ) { list ( $att_name, $att_expr, $att_val, $att_inv, $att_case_sensitivity ) = $a; // Handle indexing attributes (i.e. "[2]") /** * Note: This is not supported by the CSS Standard but adds * the ability to select items compatible to XPath (i.e. * the 3rd element within it's parent). * * Note: This doesn't conflict with the CSS Standard which * doesn't work on numeric attributes anyway. */ if ( is_numeric( $att_name ) && $att_expr === '' && $att_val === '' ) { $count = 0; // Find index of current element in parent foreach ( $node->parent->children as $c ) { if ( $c->tag === $node->tag ) { ++$count; } if ( $c === $node ) { break; } } // If this is the correct node, continue with next // attribute if ( $count === (int) $att_name ) { continue; } } // Check attribute availability if ( $att_inv ) { // Attribute should NOT be set if ( isset( $node->attr[ $att_name ] ) ) { $pass = false; break; } } else { // Attribute should be set // todo: "plaintext" is not a valid CSS selector! if ( $att_name !== 'plaintext' && ! isset( $node->attr[ $att_name ] ) ) { $pass = false; break; } } // Continue with next attribute if expression isn't defined if ( $att_expr === '' ) { continue; } // If they have told us that this is a "plaintext" // search then we want the plaintext of the node - right? // todo "plaintext" is not a valid CSS selector! if ( $att_name === 'plaintext' ) { $nodeKeyValue = $node->text(); } else { $nodeKeyValue = $node->attr[ $att_name ]; } if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'testing node: ' . $node->tag . ' for attribute: ' . $att_name . $att_expr . $att_val . ' where nodes value is: ' . $nodeKeyValue ); } // If lowercase is set, do a case insensitive test of // the value of the selector. if ( $lowercase ) { $check = $this->match( $att_expr, strtolower( $att_val ), strtolower( $nodeKeyValue ), $att_case_sensitivity ); } else { $check = $this->match( $att_expr, $att_val, $nodeKeyValue, $att_case_sensitivity ); } if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'after match: ' . ( $check ? 'true' : 'false' ) ); } if ( ! $check ) { $pass = false; break; } } } // Found a match. Add to list and clear node if ( $pass ) { $ret[ $node->_[ HDOM_INFO_BEGIN ] ] = 1; } unset( $node ); } // It's passed by reference so this is actually what this function returns. if ( is_object( $debug_object ) ) { $debug_object->debug_log( 1, 'EXIT - ret: ', $ret ); } } protected function match( $exp, $pattern, $value, $case_sensitivity ) { global $debug_object; if ( is_object( $debug_object ) ) { $debug_object->debug_log_entry( 1 );} if ( $case_sensitivity === 'i' ) { $pattern = strtolower( $pattern ); $value = strtolower( $value ); } switch ( $exp ) { case '=': return ( $value === $pattern ); case '!=': return ( $value !== $pattern ); case '^=': return preg_match( '/^' . preg_quote( $pattern, '/' ) . '/', $value ); case '$=': return preg_match( '/' . preg_quote( $pattern, '/' ) . '$/', $value ); case '*=': return preg_match( '/' . preg_quote( $pattern, '/' ) . '/', $value ); case '|=': /** * [att|=val] * * Represents an element with the att attribute, its value * either being exactly "val" or beginning with "val" * immediately followed by "-" (U+002D). */ return strpos( $value, $pattern ) === 0; case '~=': /** * [att~=val] * * Represents an element with the att attribute whose value is a * whitespace-separated list of words, one of which is exactly * "val". If "val" contains whitespace, it will never represent * anything (since the words are separated by spaces). Also if * "val" is the empty string, it will never represent anything. */ return in_array( $pattern, explode( ' ', trim( $value ) ), true ); } return false; } protected function parse_selector( $selector_string ) { global $debug_object; if ( is_object( $debug_object ) ) { $debug_object->debug_log_entry( 1 ); } /** * Pattern of CSS selectors, modified from mootools (https://mootools.net/) * * Paperg: Add the colon to the attribute, so that it properly finds * <tag attr:ibute="something" > like google does. * * Note: if you try to look at this attribute, you MUST use getAttribute * since $dom->x:y will fail the php syntax check. * * Notice the \[ starting the attribute? and the @? following? This * implies that an attribute can begin with an @ sign that is not * captured. This implies that an html attribute specifier may start * with an @ sign that is NOT captured by the expression. Farther study * is required to determine of this should be documented or removed. * * Matches selectors in this order: * * [0] - full match * * [1] - tag name * ([\w:\*-]*) * Matches the tag name consisting of zero or more words, colons, * asterisks and hyphens. * * [2] - id name * (?:\#([\w-]+)) * Optionally matches a id name, consisting of an "#" followed by * the id name (one or more words and hyphens). * * [3] - class names (including dots) * (?:\.([\w\.-]+))? * Optionally matches a list of classs, consisting of an "." * followed by the class name (one or more words and hyphens) * where multiple classes can be chained (i.e. ".foo.bar.baz") * * [4] - attributes * ((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)? * Optionally matches the attributes list * * [5] - separator * ([\/, >+~]+) * Matches the selector list separator */ // phpcs:ignore Generic.Files.LineLength $pattern = "/([\w:\*-]*)(?:\#([\w-]+))?(?:|\.([\w\.-]+))?((?:\[@?(?:!?[\w:-]+)(?:(?:[!*^$|~]?=)[\"']?(?:.*?)[\"']?)?(?:\s*?(?:[iIsS])?)?\])+)?([\/, >+~]+)/is"; preg_match_all( $pattern, trim( $selector_string ) . ' ', // Add final ' ' as pseudo separator $matches, PREG_SET_ORDER ); if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'Matches Array: ', $matches ); } $selectors = array(); $result = array(); foreach ( $matches as $m ) { $m[0] = trim( $m[0] ); // Skip NoOps if ( $m[0] === '' || $m[0] === '/' || $m[0] === '//' ) { continue; } // Convert to lowercase if ( $this->dom->lowercase ) { $m[1] = strtolower( $m[1] ); } // Extract classes if ( $m[3] !== '' ) { $m[3] = explode( '.', $m[3] ); } /* Extract attributes (pattern based on the pattern above!) * [0] - full match * [1] - attribute name * [2] - attribute expression * [3] - attribute value * [4] - case sensitivity * * Note: Attributes can be negated with a "!" prefix to their name */ if ( $m[4] !== '' ) { preg_match_all( "/\[@?(!?[\w:-]+)(?:([!*^$|~]?=)[\"']?(.*?)[\"']?)?(?:\s+?([iIsS])?)?\]/is", trim( $m[4] ), $attributes, PREG_SET_ORDER ); // Replace element by array $m[4] = array(); foreach ( $attributes as $att ) { // Skip empty matches if ( trim( $att[0] ) === '' ) { continue; } $inverted = ( isset( $att[1][0] ) && $att[1][0] === '!' ); $m[4][] = array( $inverted ? substr( $att[1], 1 ) : $att[1], // Name ( isset( $att[2] ) ) ? $att[2] : '', // Expression ( isset( $att[3] ) ) ? $att[3] : '', // Value $inverted, // Inverted Flag ( isset( $att[4] ) ) ? strtolower( $att[4] ) : '', // Case-Sensitivity ); } } // Sanitize Separator if ( $m[5] !== '' && trim( $m[5] ) === '' ) { // Descendant Separator $m[5] = ' '; } else { // Other Separator $m[5] = trim( $m[5] ); } // Clear Separator if it's a Selector List if ( $is_list = ( $m[5] === ',' ) ) { $m[5] = ''; } // Remove full match before adding to results array_shift( $m ); $result[] = $m; if ( $is_list ) { // Selector List $selectors[] = $result; $result = array(); } } if ( count( $result ) > 0 ) { $selectors[] = $result; } return $selectors; } function __get( $name ) { if ( isset( $this->attr[ $name ] ) ) { return $this->convert_text( $this->attr[ $name ] ); } switch ( $name ) { case 'outertext': return $this->outertext(); case 'innertext': return $this->innertext(); case 'plaintext': return $this->text(); case 'xmltext': return $this->xmltext(); default: return array_key_exists( $name, $this->attr ); } } function __set( $name, $value ) { global $debug_object; if ( is_object( $debug_object ) ) { $debug_object->debug_log_entry( 1 ); } switch ( $name ) { case 'outertext': return $this->_[ HDOM_INFO_OUTER ] = $value; case 'innertext': if ( isset( $this->_[ HDOM_INFO_TEXT ] ) ) { return $this->_[ HDOM_INFO_TEXT ] = $value; } return $this->_[ HDOM_INFO_INNER ] = $value; } if ( ! isset( $this->attr[ $name ] ) ) { $this->_[ HDOM_INFO_SPACE ][] = array( ' ', '', '' ); $this->_[ HDOM_INFO_QUOTE ][] = HDOM_QUOTE_DOUBLE; } $this->attr[ $name ] = $value; } function __isset( $name ) { switch ( $name ) { case 'outertext': return true; case 'innertext': return true; case 'plaintext': return true; } //no value attr: nowrap, checked selected... return ( array_key_exists( $name, $this->attr ) ) ? true : isset( $this->attr[ $name ] ); } function __unset( $name ) { if ( isset( $this->attr[ $name ] ) ) { unset( $this->attr[ $name ] ); } } function convert_text( $text ) { global $debug_object; if ( is_object( $debug_object ) ) { $debug_object->debug_log_entry( 1 ); } $converted_text = $text; $sourceCharset = ''; $targetCharset = ''; if ( $this->dom ) { $sourceCharset = strtoupper( $this->dom->_charset ); $targetCharset = strtoupper( $this->dom->_target_charset ); } if ( is_object( $debug_object ) ) { $debug_object->debug_log( 3, 'source charset: ' . $sourceCharset . ' target charaset: ' . $targetCharset ); } if ( ! empty( $sourceCharset ) && ! empty( $targetCharset ) && ( strcasecmp( $sourceCharset, $targetCharset ) != 0 ) ) { // Check if the reported encoding could have been incorrect and the text is actually already UTF-8 if ( ( strcasecmp( $targetCharset, 'UTF-8' ) == 0 ) && ( $this->is_utf8( $text ) ) ) { $converted_text = $text; } else { $converted_text = iconv( $sourceCharset, $targetCharset, $text ); } } // Lets make sure that we don't have that silly BOM issue with any of the utf-8 text we output. if ( $targetCharset === 'UTF-8' ) { if ( substr( $converted_text, 0, 3 ) === "\xef\xbb\xbf" ) { $converted_text = substr( $converted_text, 3 ); } if ( substr( $converted_text, -3 ) === "\xef\xbb\xbf" ) { $converted_text = substr( $converted_text, 0, -3 ); } } return $converted_text; } static function is_utf8( $str ) { $c = 0; $b = 0; $bits = 0; $len = strlen( $str ); for ( $i = 0; $i < $len; $i++ ) { $c = ord( $str[ $i ] ); if ( $c > 128 ) { if ( ( $c >= 254 ) ) { return false; } elseif ( $c >= 252 ) { $bits = 6; } elseif ( $c >= 248 ) { $bits = 5; } elseif ( $c >= 240 ) { $bits = 4; } elseif ( $c >= 224 ) { $bits = 3; } elseif ( $c >= 192 ) { $bits = 2; } else { return false; } if ( ( $i + $bits ) > $len ) { return false; } while ( $bits > 1 ) { $i++; $b = ord( $str[ $i ] ); if ( $b < 128 || $b > 191 ) { return false; } $bits--; } } } return true; } function get_display_size() { global $debug_object; $width = -1; $height = -1; if ( $this->tag !== 'img' ) { return false; } // See if there is aheight or width attribute in the tag itself. if ( isset( $this->attr['width'] ) ) { $width = $this->attr['width']; } if ( isset( $this->attr['height'] ) ) { $height = $this->attr['height']; } // Now look for an inline style. if ( isset( $this->attr['style'] ) ) { // Thanks to user gnarf from stackoverflow for this regular expression. $attributes = array(); preg_match_all( '/([\w-]+)\s*:\s*([^;]+)\s*;?/', $this->attr['style'], $matches, PREG_SET_ORDER ); foreach ( $matches as $match ) { $attributes[ $match[1] ] = $match[2]; } // If there is a width in the style attributes: if ( isset( $attributes['width'] ) && $width == -1 ) { // check that the last two characters are px (pixels) if ( strtolower( substr( $attributes['width'], -2 ) ) === 'px' ) { $proposed_width = substr( $attributes['width'], 0, -2 ); // Now make sure that it's an integer and not something stupid. if ( filter_var( $proposed_width, FILTER_VALIDATE_INT ) ) { $width = $proposed_width; } } } // If there is a width in the style attributes: if ( isset( $attributes['height'] ) && $height == -1 ) { // check that the last two characters are px (pixels) if ( strtolower( substr( $attributes['height'], -2 ) ) == 'px' ) { $proposed_height = substr( $attributes['height'], 0, -2 ); // Now make sure that it's an integer and not something stupid. if ( filter_var( $proposed_height, FILTER_VALIDATE_INT ) ) { $height = $proposed_height; } } } } // Future enhancement: // Look in the tag to see if there is a class or id specified that has // a height or width attribute to it. // Far future enhancement // Look at all the parent tags of this image to see if they specify a // class or id that has an img selector that specifies a height or width // Note that in this case, the class or id will have the img subselector // for it to apply to the image. // ridiculously far future development // If the class or id is specified in a SEPARATE css file thats not on // the page, go get it and do what we were just doing for the ones on // the page. $result = array( 'height' => $height, 'width' => $width, ); return $result; } function save( $filepath = '' ) { $ret = $this->outertext(); if ( $filepath !== '' ) { file_put_contents( $filepath, $ret, LOCK_EX ); } return $ret; } function addClass( $class ) { if ( is_string( $class ) ) { $class = explode( ' ', $class ); } if ( is_array( $class ) ) { foreach ( $class as $c ) { if ( isset( $this->class ) ) { if ( $this->hasClass( $c ) ) { continue; } else { $this->class .= ' ' . $c; } } else { $this->class = $c; } } } else { if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'Invalid type: ', gettype( $class ) ); } } } function hasClass( $class ) { if ( is_string( $class ) ) { if ( isset( $this->class ) ) { return in_array( $class, explode( ' ', $this->class ), true ); } } else { if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'Invalid type: ', gettype( $class ) ); } } return false; } function removeClass( $class = null ) { if ( ! isset( $this->class ) ) { return; } if ( is_null( $class ) ) { $this->removeAttribute( 'class' ); return; } if ( is_string( $class ) ) { $class = explode( ' ', $class ); } if ( is_array( $class ) ) { $class = array_diff( explode( ' ', $this->class ), $class ); if ( empty( $class ) ) { $this->removeAttribute( 'class' ); } else { $this->class = implode( ' ', $class ); } } } function getAllAttributes() { return $this->attr; } function getAttribute( $name ) { return $this->__get( $name ); } function setAttribute( $name, $value ) { $this->__set( $name, $value ); } function hasAttribute( $name ) { return $this->__isset( $name ); } function removeAttribute( $name ) { $this->__set( $name, null ); } function remove() { if ( $this->parent ) { $this->parent->removeChild( $this ); } } function removeChild( $node ) { $nidx = array_search( $node, $this->nodes, true ); $cidx = array_search( $node, $this->children, true ); $didx = array_search( $node, $this->dom->nodes, true ); if ( $nidx !== false && $cidx !== false && $didx !== false ) { foreach ( $node->children as $child ) { $node->removeChild( $child ); } foreach ( $node->nodes as $entity ) { $enidx = array_search( $entity, $node->nodes, true ); $edidx = array_search( $entity, $node->dom->nodes, true ); if ( $enidx !== false && $edidx !== false ) { unset( $node->nodes[ $enidx ] ); unset( $node->dom->nodes[ $edidx ] ); } } unset( $this->nodes[ $nidx ] ); unset( $this->children[ $cidx ] ); unset( $this->dom->nodes[ $didx ] ); $node->clear(); } } function getElementById( $id ) { return $this->find( "#$id", 0 ); } function getElementsById( $id, $idx = null ) { return $this->find( "#$id", $idx ); } function getElementByTagName( $name ) { return $this->find( $name, 0 ); } function getElementsByTagName( $name, $idx = null ) { return $this->find( $name, $idx ); } function parentNode() { return $this->parent(); } function childNodes( $idx = -1 ) { return $this->children( $idx ); } function firstChild() { return $this->first_child(); } function lastChild() { return $this->last_child(); } function nextSibling() { return $this->next_sibling(); } function previousSibling() { return $this->prev_sibling(); } function hasChildNodes() { return $this->has_child(); } function nodeName() { return $this->tag; } function appendChild( $node ) { $node->parent( $this ); return $node; } } class simple_html_dom { public $root = null; public $nodes = array(); public $callback = null; public $lowercase = false; public $original_size; public $size; protected $pos; protected $doc; protected $char; protected $cursor; protected $parent; protected $noise = array(); protected $token_blank = " \t\r\n"; protected $token_equal = ' =/>'; protected $token_slash = " />\r\n\t"; protected $token_attr = ' >'; public $_charset = ''; public $_target_charset = ''; protected $default_br_text = ''; public $default_span_text = ''; private $optional_closing_array; protected $self_closing_tags = array( 'area' => 1, 'base' => 1, 'br' => 1, 'col' => 1, 'embed' => 1, 'hr' => 1, 'img' => 1, 'input' => 1, 'link' => 1, 'meta' => 1, 'param' => 1, 'source' => 1, 'track' => 1, 'wbr' => 1, ); protected $block_tags = array( 'body' => 1, 'div' => 1, 'form' => 1, 'root' => 1, 'span' => 1, 'table' => 1, ); protected $optional_closing_tags = array( // Not optional, see // https://www.w3.org/TR/html/textlevel-semantics.html#the-b-element 'b' => array( 'b' => 1 ), 'dd' => array( 'dd' => 1, 'dt' => 1, ), // Not optional, see // https://www.w3.org/TR/html/grouping-content.html#the-dl-element 'dl' => array( 'dd' => 1, 'dt' => 1, ), 'dt' => array( 'dd' => 1, 'dt' => 1, ), 'li' => array( 'li' => 1 ), 'optgroup' => array( 'optgroup' => 1, 'option' => 1, ), 'option' => array( 'optgroup' => 1, 'option' => 1, ), 'p' => array( 'p' => 1 ), 'rp' => array( 'rp' => 1, 'rt' => 1, ), 'rt' => array( 'rp' => 1, 'rt' => 1, ), 'td' => array( 'td' => 1, 'th' => 1, ), 'th' => array( 'td' => 1, 'th' => 1, ), 'tr' => array( 'td' => 1, 'th' => 1, 'tr' => 1, ), ); function __construct( $str = null, $lowercase = true, $forceTagsClosed = true, $target_charset = DEFAULT_TARGET_CHARSET, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT, $options = 0 ) { if ( $str ) { if ( preg_match( '/^http:\/\//i', $str ) || is_file( $str ) ) { $this->load_file( $str ); } else { $this->load( $str, $lowercase, $stripRN, $defaultBRText, $defaultSpanText, $options ); } } // Forcing tags to be closed implies that we don't trust the html, but // it can lead to parsing errors if we SHOULD trust the html. if ( ! $forceTagsClosed ) { $this->optional_closing_array = array(); } $this->_target_charset = $target_charset; } function __destruct() { $this->clear(); } function load( $str, $lowercase = true, $stripRN = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT, $options = 0 ) { global $debug_object; // prepare $this->prepare( $str, $lowercase, $defaultBRText, $defaultSpanText ); // Per sourceforge http://sourceforge.net/tracker/?func=detail&aid=2949097&group_id=218559&atid=1044037 // Script tags removal now preceeds style tag removal. // strip out <script> tags $this->remove_noise( "'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is" ); $this->remove_noise( "'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is" ); // strip out the \r \n's if we are told to. if ( $stripRN ) { $this->doc = str_replace( "\r", ' ', $this->doc ); $this->doc = str_replace( "\n", ' ', $this->doc ); // set the length of content since we have changed it. $this->size = strlen( $this->doc ); } // strip out cdata $this->remove_noise( "'<!\[CDATA\[(.*?)\]\]>'is", true ); // strip out comments $this->remove_noise( "'<!--(.*?)-->'is" ); // strip out <style> tags $this->remove_noise( "'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is" ); $this->remove_noise( "'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is" ); // strip out preformatted tags $this->remove_noise( "'<\s*(?:code)[^>]*>(.*?)<\s*/\s*(?:code)\s*>'is" ); // strip out server side scripts $this->remove_noise( "'(<\?)(.*?)(\?>)'s", true ); if ( $options & HDOM_SMARTY_AS_TEXT ) { // Strip Smarty scripts $this->remove_noise( "'(\{\w)(.*?)(\})'s", true ); } // parsing $this->parse(); // end $this->root->_[ HDOM_INFO_END ] = $this->cursor; $this->parse_charset(); // make load function chainable return $this; } function load_file() { $args = func_get_args(); if ( ( $doc = call_user_func_array( 'file_get_contents', $args ) ) !== false ) { $this->load( $doc, true ); } else { return false; } } function set_callback( $function_name ) { $this->callback = $function_name; } function remove_callback() { $this->callback = null; } function save( $filepath = '' ) { $ret = $this->root->innertext(); if ( $filepath !== '' ) { file_put_contents( $filepath, $ret, LOCK_EX ); } return $ret; } function find( $selector, $idx = null, $lowercase = false ) { return $this->root->find( $selector, $idx, $lowercase ); } function clear() { if ( isset( $this->nodes ) ) { foreach ( $this->nodes as $n ) { $n->clear(); $n = null; } } // This add next line is documented in the sourceforge repository. // 2977248 as a fix for ongoing memory leaks that occur even with the // use of clear. if ( isset( $this->children ) ) { foreach ( $this->children as $n ) { $n->clear(); $n = null; } } if ( isset( $this->parent ) ) { $this->parent->clear(); unset( $this->parent ); } if ( isset( $this->root ) ) { $this->root->clear(); unset( $this->root ); } unset( $this->doc ); unset( $this->noise ); } function dump( $show_attr = true ) { $this->root->dump( $show_attr ); } protected function prepare( $str, $lowercase = true, $defaultBRText = DEFAULT_BR_TEXT, $defaultSpanText = DEFAULT_SPAN_TEXT ) { $this->clear(); $this->doc = trim( $str ); $this->size = strlen( $this->doc ); $this->original_size = $this->size; // original size of the html $this->pos = 0; $this->cursor = 1; $this->noise = array(); $this->nodes = array(); $this->lowercase = $lowercase; $this->default_br_text = $defaultBRText; $this->default_span_text = $defaultSpanText; $this->root = new simple_html_dom_node( $this ); $this->root->tag = 'root'; $this->root->_[ HDOM_INFO_BEGIN ] = -1; $this->root->nodetype = HDOM_TYPE_ROOT; $this->parent = $this->root; if ( $this->size > 0 ) { $this->char = $this->doc[0]; } } protected function parse() { while ( true ) { // Read next tag if there is no text between current position and the // next opening tag. if ( ( $s = $this->copy_until_char( '<' ) ) === '' ) { if ( $this->read_tag() ) { continue; } else { return true; } } // Add a text node for text between tags $node = new simple_html_dom_node( $this ); ++$this->cursor; $node->_[ HDOM_INFO_TEXT ] = $s; $this->link_nodes( $node, false ); } } protected function parse_charset() { global $debug_object; $charset = null; if ( function_exists( 'get_last_retrieve_url_contents_content_type' ) ) { $contentTypeHeader = get_last_retrieve_url_contents_content_type(); $success = preg_match( '/charset=(.+)/', $contentTypeHeader, $matches ); if ( $success ) { $charset = $matches[1]; if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'header content-type found charset of: ' . $charset ); } } } if ( empty( $charset ) ) { // https://www.w3.org/TR/html/document-metadata.html#statedef-http-equiv-content-type $el = $this->root->find( 'meta[http-equiv=Content-Type]', 0, true ); if ( ! empty( $el ) ) { $fullvalue = $el->content; if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'meta content-type tag found' . $fullvalue ); } if ( ! empty( $fullvalue ) ) { $success = preg_match( '/charset=(.+)/i', $fullvalue, $matches ); if ( $success ) { $charset = $matches[1]; } else { // If there is a meta tag, and they don't specify the // character set, research says that it's typically // ISO-8859-1 if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'meta content-type tag couldn\'t be parsed. using iso-8859 default.' ); } $charset = 'ISO-8859-1'; } } } } if ( empty( $charset ) ) { // https://www.w3.org/TR/html/document-metadata.html#character-encoding-declaration if ( $meta = $this->root->find( 'meta[charset]', 0 ) ) { $charset = $meta->charset; if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'meta charset: ' . $charset ); } } } if ( empty( $charset ) ) { // Try to guess the charset based on the content // Requires Multibyte String (mbstring) support (optional) if ( function_exists( 'mb_detect_encoding' ) ) { /** * mb_detect_encoding() is not intended to distinguish between * charsets, especially single-byte charsets. Its primary * purpose is to detect which multibyte encoding is in use, * i.e. UTF-8, UTF-16, shift-JIS, etc. * * -- https://bugs.php.net/bug.php?id=38138 * * Adding both CP1251/ISO-8859-5 and CP1252/ISO-8859-1 will * always result in CP1251/ISO-8859-5 and vice versa. * * Thus, only detect if it's either UTF-8 or CP1252/ISO-8859-1 * to stay compatible. */ $encoding = mb_detect_encoding( $this->doc, array( 'UTF-8', 'CP1252', 'ISO-8859-1' ) ); if ( $encoding === 'CP1252' || $encoding === 'ISO-8859-1' ) { // Due to a limitation of mb_detect_encoding // 'CP1251'/'ISO-8859-5' will be detected as // 'CP1252'/'ISO-8859-1'. This will cause iconv to fail, in // which case we can simply assume it is the other charset. if ( ! @iconv( 'CP1252', 'UTF-8', $this->doc ) ) { $encoding = 'CP1251'; } } if ( $encoding !== false ) { $charset = $encoding; if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'mb_detect: ' . $charset ); } } } } if ( empty( $charset ) ) { // Assume it's UTF-8 as it is the most likely charset to be used $charset = 'UTF-8'; if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'No match found, assume ' . $charset ); } } // Since CP1252 is a superset, if we get one of it's subsets, we want // it instead. if ( ( strtolower( $charset ) == 'iso-8859-1' ) || ( strtolower( $charset ) == 'latin1' ) || ( strtolower( $charset ) == 'latin-1' ) ) { $charset = 'CP1252'; if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'replacing ' . $charset . ' with CP1252 as its a superset' ); } } if ( is_object( $debug_object ) ) { $debug_object->debug_log( 1, 'EXIT - ' . $charset ); } return $this->_charset = $charset; } protected function read_tag() { // Set end position if no further tags found if ( $this->char !== '<' ) { $this->root->_[ HDOM_INFO_END ] = $this->cursor; return false; } $begin_tag_pos = $this->pos; $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next // end tag if ( $this->char === '/' ) { $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next // Skip whitespace in end tags (i.e. in "</ html>") $this->skip( $this->token_blank ); $tag = $this->copy_until_char( '>' ); // Skip attributes in end tags if ( ( $pos = strpos( $tag, ' ' ) ) !== false ) { $tag = substr( $tag, 0, $pos ); } $parent_lower = strtolower( $this->parent->tag ); $tag_lower = strtolower( $tag ); // The end tag is supposed to close the parent tag. Handle situations // when it doesn't if ( $parent_lower !== $tag_lower ) { // Parent tag does not have to be closed necessarily (optional closing tag) // Current tag is a block tag, so it may close an ancestor if ( isset( $this->optional_closing_tags[ $parent_lower ] ) && isset( $this->block_tags[ $tag_lower ] ) ) { $this->parent->_[ HDOM_INFO_END ] = 0; $org_parent = $this->parent; // Traverse ancestors to find a matching opening tag // Stop at root node while ( ( $this->parent->parent ) && strtolower( $this->parent->tag ) !== $tag_lower ) { $this->parent = $this->parent->parent; } // If we don't have a match add current tag as text node if ( strtolower( $this->parent->tag ) !== $tag_lower ) { $this->parent = $org_parent; // restore origonal parent if ( $this->parent->parent ) { $this->parent = $this->parent->parent; } $this->parent->_[ HDOM_INFO_END ] = $this->cursor; return $this->as_text_node( $tag ); } } elseif ( ( $this->parent->parent ) && isset( $this->block_tags[ $tag_lower ] ) ) { // Grandparent exists and current tag is a block tag, so our // parent doesn't have an end tag $this->parent->_[ HDOM_INFO_END ] = 0; // No end tag $org_parent = $this->parent; // Traverse ancestors to find a matching opening tag // Stop at root node while ( ( $this->parent->parent ) && strtolower( $this->parent->tag ) !== $tag_lower ) { $this->parent = $this->parent->parent; } // If we don't have a match add current tag as text node if ( strtolower( $this->parent->tag ) !== $tag_lower ) { $this->parent = $org_parent; // restore origonal parent $this->parent->_[ HDOM_INFO_END ] = $this->cursor; return $this->as_text_node( $tag ); } } elseif ( ( $this->parent->parent ) && strtolower( $this->parent->parent->tag ) === $tag_lower ) { // Grandparent exists and current tag closes it $this->parent->_[ HDOM_INFO_END ] = 0; $this->parent = $this->parent->parent; } else { // Random tag, add as text node return $this->as_text_node( $tag ); } } // Set end position of parent tag to current cursor position $this->parent->_[ HDOM_INFO_END ] = $this->cursor; if ( $this->parent->parent ) { $this->parent = $this->parent->parent; } $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next return true; } // start tag $node = new simple_html_dom_node( $this ); $node->_[ HDOM_INFO_BEGIN ] = $this->cursor; ++$this->cursor; $tag = $this->copy_until( $this->token_slash ); // Get tag name $node->tag_start = $begin_tag_pos; // doctype, cdata & comments... // <!DOCTYPE html> // <![CDATA[ ... ]]> // <!-- Comment --> if ( isset( $tag[0] ) && $tag[0] === '!' ) { $node->_[ HDOM_INFO_TEXT ] = '<' . $tag . $this->copy_until_char( '>' ); if ( isset( $tag[2] ) && $tag[1] === '-' && $tag[2] === '-' ) { // Comment ("<!--") $node->nodetype = HDOM_TYPE_COMMENT; $node->tag = 'comment'; } else { // Could be doctype or CDATA but we don't care $node->nodetype = HDOM_TYPE_UNKNOWN; $node->tag = 'unknown'; } if ( $this->char === '>' ) { $node->_[ HDOM_INFO_TEXT ] .= '>'; } $this->link_nodes( $node, true ); $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next return true; } // The start tag cannot contain another start tag, if so add as text // i.e. "<<html>" if ( $pos = strpos( $tag, '<' ) !== false ) { $tag = '<' . substr( $tag, 0, -1 ); $node->_[ HDOM_INFO_TEXT ] = $tag; $this->link_nodes( $node, false ); $this->char = $this->doc[ --$this->pos ]; // prev return true; } // Handle invalid tag names (i.e. "<html#doc>") if ( ! preg_match( '/^\w[\w:-]*$/', $tag ) ) { $node->_[ HDOM_INFO_TEXT ] = '<' . $tag . $this->copy_until( '<>' ); // Next char is the beginning of a new tag, don't touch it. if ( $this->char === '<' ) { $this->link_nodes( $node, false ); return true; } // Next char closes current tag, add and be done with it. if ( $this->char === '>' ) { $node->_[ HDOM_INFO_TEXT ] .= '>'; } $this->link_nodes( $node, false ); $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next return true; } // begin tag, add new node $node->nodetype = HDOM_TYPE_ELEMENT; $tag_lower = strtolower( $tag ); $node->tag = ( $this->lowercase ) ? $tag_lower : $tag; // handle optional closing tags if ( isset( $this->optional_closing_tags[ $tag_lower ] ) ) { // Traverse ancestors to close all optional closing tags while ( isset( $this->optional_closing_tags[ $tag_lower ][ strtolower( $this->parent->tag ) ] ) ) { $this->parent->_[ HDOM_INFO_END ] = 0; $this->parent = $this->parent->parent; } $node->parent = $this->parent; } $guard = 0; // prevent infinity loop // [0] Space between tag and first attribute $space = array( $this->copy_skip( $this->token_blank ), '', '' ); // attributes do { // Everything until the first equal sign should be the attribute name $name = $this->copy_until( $this->token_equal ); if ( $name === '' && $this->char !== null && $space[0] === '' ) { break; } if ( $guard === $this->pos ) { // Escape infinite loop $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next continue; } $guard = $this->pos; // handle endless '<' // Out of bounds before the tag ended if ( $this->pos >= $this->size - 1 && $this->char !== '>' ) { $node->nodetype = HDOM_TYPE_TEXT; $node->_[ HDOM_INFO_END ] = 0; $node->_[ HDOM_INFO_TEXT ] = '<' . $tag . $space[0] . $name; $node->tag = 'text'; $this->link_nodes( $node, false ); return true; } // handle mismatch '<' // Attributes cannot start after opening tag if ( $this->doc[ $this->pos - 1 ] == '<' ) { $node->nodetype = HDOM_TYPE_TEXT; $node->tag = 'text'; $node->attr = array(); $node->_[ HDOM_INFO_END ] = 0; $node->_[ HDOM_INFO_TEXT ] = substr( $this->doc, $begin_tag_pos, $this->pos - $begin_tag_pos - 1 ); $this->pos -= 2; $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next $this->link_nodes( $node, false ); return true; } if ( $name !== '/' && $name !== '' ) { // this is a attribute name // [1] Whitespace after attribute name $space[1] = $this->copy_skip( $this->token_blank ); $name = $this->restore_noise( $name ); // might be a noisy name if ( $this->lowercase ) { $name = strtolower( $name ); } if ( $this->char === '=' ) { // attribute with value $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next $this->parse_attr( $node, $name, $space ); // get attribute value } else { //no value attr: nowrap, checked selected... $node->_[ HDOM_INFO_QUOTE ][] = HDOM_QUOTE_NO; $node->attr[ $name ] = true; if ( $this->char != '>' ) { $this->char = $this->doc[ --$this->pos ]; } // prev } $node->_[ HDOM_INFO_SPACE ][] = $space; // prepare for next attribute $space = array( $this->copy_skip( $this->token_blank ), '', '', ); } else { // no more attributes break; } } while ( $this->char !== '>' && $this->char !== '/' ); // go until the tag ended $this->link_nodes( $node, true ); $node->_[ HDOM_INFO_ENDSPACE ] = $space[0]; // handle empty tags (i.e. "<div/>") if ( $this->copy_until_char( '>' ) === '/' ) { $node->_[ HDOM_INFO_ENDSPACE ] .= '/'; $node->_[ HDOM_INFO_END ] = 0; } else { // reset parent if ( ! isset( $this->self_closing_tags[ strtolower( $node->tag ) ] ) ) { $this->parent = $node; } } $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next // If it's a BR tag, we need to set it's text to the default text. // This way when we see it in plaintext, we can generate formatting that the user wants. // since a br tag never has sub nodes, this works well. if ( $node->tag === 'br' ) { $node->_[ HDOM_INFO_INNER ] = $this->default_br_text; } return true; } protected function parse_attr( $node, $name, &$space ) { $is_duplicate = isset( $node->attr[ $name ] ); if ( ! $is_duplicate ) { // Copy whitespace between "=" and value $space[2] = $this->copy_skip( $this->token_blank ); } switch ( $this->char ) { case '"': $quote_type = HDOM_QUOTE_DOUBLE; $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next $value = $this->copy_until_char( '"' ); $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next break; case '\'': $quote_type = HDOM_QUOTE_SINGLE; $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next $value = $this->copy_until_char( '\'' ); $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next break; default: $quote_type = HDOM_QUOTE_NO; $value = $this->copy_until( $this->token_attr ); } $value = $this->restore_noise( $value ); // PaperG: Attributes should not have \r or \n in them, that counts as // html whitespace. $value = str_replace( "\r", '', $value ); $value = str_replace( "\n", '', $value ); // PaperG: If this is a "class" selector, lets get rid of the preceeding // and trailing space since some people leave it in the multi class case. if ( $name === 'class' ) { $value = trim( $value ); } if ( ! $is_duplicate ) { $node->_[ HDOM_INFO_QUOTE ][] = $quote_type; $node->attr[ $name ] = $value; } } protected function link_nodes( &$node, $is_child ) { $node->parent = $this->parent; $this->parent->nodes[] = $node; if ( $is_child ) { $this->parent->children[] = $node; } } protected function as_text_node( $tag ) { $node = new simple_html_dom_node( $this ); ++$this->cursor; $node->_[ HDOM_INFO_TEXT ] = '</' . $tag . '>'; $this->link_nodes( $node, false ); $this->char = ( ++$this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next return true; } protected function skip( $chars ) { $this->pos += strspn( $this->doc, $chars, $this->pos ); $this->char = ( $this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next } protected function copy_skip( $chars ) { $pos = $this->pos; $len = strspn( $this->doc, $chars, $pos ); $this->pos += $len; $this->char = ( $this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next if ( $len === 0 ) { return ''; } return substr( $this->doc, $pos, $len ); } protected function copy_until( $chars ) { $pos = $this->pos; $len = strcspn( $this->doc, $chars, $pos ); $this->pos += $len; $this->char = ( $this->pos < $this->size ) ? $this->doc[ $this->pos ] : null; // next return substr( $this->doc, $pos, $len ); } protected function copy_until_char( $char ) { if ( $this->char === null ) { return ''; } if ( ( $pos = strpos( $this->doc, $char, $this->pos ) ) === false ) { $ret = substr( $this->doc, $this->pos, $this->size - $this->pos ); $this->char = null; $this->pos = $this->size; return $ret; } if ( $pos === $this->pos ) { return ''; } $pos_old = $this->pos; $this->char = $this->doc[ $pos ]; $this->pos = $pos; return substr( $this->doc, $pos_old, $pos - $pos_old ); } protected function remove_noise( $pattern, $remove_tag = false ) { global $debug_object; if ( is_object( $debug_object ) ) { $debug_object->debug_log_entry( 1 ); } $count = preg_match_all( $pattern, $this->doc, $matches, PREG_SET_ORDER | PREG_OFFSET_CAPTURE ); for ( $i = $count - 1; $i > -1; --$i ) { $key = '___noise___' . sprintf( '% 5d', count( $this->noise ) + 1000 ); if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'key is: ' . $key ); } $idx = ( $remove_tag ) ? 0 : 1; // 0 = entire match, 1 = submatch $this->noise[ $key ] = $matches[ $i ][ $idx ][0]; $this->doc = substr_replace( $this->doc, $key, $matches[ $i ][ $idx ][1], strlen( $matches[ $i ][ $idx ][0] ) ); } // reset the length of content $this->size = strlen( $this->doc ); if ( $this->size > 0 ) { $this->char = $this->doc[0]; } } function restore_noise( $text ) { global $debug_object; if ( is_object( $debug_object ) ) { $debug_object->debug_log_entry( 1 ); } while ( ( $pos = strpos( $text, '___noise___' ) ) !== false ) { // Sometimes there is a broken piece of markup, and we don't GET the // pos+11 etc... token which indicates a problem outside of us... // todo: "___noise___1000" (or any number with four or more digits) // in the DOM causes an infinite loop which could be utilized by // malicious software if ( strlen( $text ) > $pos + 15 ) { $key = '___noise___' . $text[ $pos + 11 ] . $text[ $pos + 12 ] . $text[ $pos + 13 ] . $text[ $pos + 14 ] . $text[ $pos + 15 ]; if ( is_object( $debug_object ) ) { $debug_object->debug_log( 2, 'located key of: ' . $key ); } if ( isset( $this->noise[ $key ] ) ) { $text = substr( $text, 0, $pos ) . $this->noise[ $key ] . substr( $text, $pos + 16 ); } else { // do this to prevent an infinite loop. $text = substr( $text, 0, $pos ) . 'UNDEFINED NOISE FOR KEY: ' . $key . substr( $text, $pos + 16 ); } } else { // There is no valid key being given back to us... We must get // rid of the ___noise___ or we will have a problem. $text = substr( $text, 0, $pos ) . 'NO NUMERIC NOISE KEY' . substr( $text, $pos + 11 ); } } return $text; } function search_noise( $text ) { global $debug_object; if ( is_object( $debug_object ) ) { $debug_object->debug_log_entry( 1 ); } foreach ( $this->noise as $noiseElement ) { if ( strpos( $noiseElement, $text ) !== false ) { return $noiseElement; } } } function __toString() { return $this->root->innertext(); } function __get( $name ) { switch ( $name ) { case 'outertext': return $this->root->innertext(); case 'innertext': return $this->root->innertext(); case 'plaintext': return $this->root->text(); case 'charset': return $this->_charset; case 'target_charset': return $this->_target_charset; } } function childNodes( $idx = -1 ) { return $this->root->childNodes( $idx ); } function firstChild() { return $this->root->first_child(); } function lastChild() { return $this->root->last_child(); } function createElement( $name, $value = null ) { return @str_get_html( "<$name>$value</$name>" )->firstChild(); } function createTextNode( $value ) { return @end( str_get_html( $value )->nodes ); } function getElementById( $id ) { return $this->find( "#$id", 0 ); } function getElementsById( $id, $idx = null ) { return $this->find( "#$id", $idx ); } function getElementByTagName( $name ) { return $this->find( $name, 0 ); } function getElementsByTagName( $name, $idx = -1 ) { return $this->find( $name, $idx ); } function loadFile() { $args = func_get_args(); $this->load_file( $args ); } }