Xataface HTML Reports Module 0.2
HTML Reports Module for Xataface
classes/simple_html_dom.php
Go to the documentation of this file.
00001 <?php
00002 /*******************************************************************************
00003 Version: 1.0 ($Rev: 152 $)
00004 Website: http://sourceforge.net/projects/simplehtmldom/
00005 Author: S.C. Chen (me578022@gmail.com)
00006 Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
00007 Contributions by: Yousuke Kumakura (Attribute filters)
00008 Licensed under The MIT License
00009 Redistributions of files must retain the above copyright notice.
00010 *******************************************************************************/
00011 
// Node kinds stored in simple_html_dom_node::$nodetype.
const HDOM_TYPE_ELEMENT = 1;
const HDOM_TYPE_COMMENT = 2;
const HDOM_TYPE_TEXT    = 3;
const HDOM_TYPE_ENDTAG  = 4;
const HDOM_TYPE_ROOT    = 5;
const HDOM_TYPE_UNKNOWN = 6;

// Quoting style recorded for each parsed attribute value.
const HDOM_QUOTE_DOUBLE = 0;
const HDOM_QUOTE_SINGLE = 1;
const HDOM_QUOTE_NO     = 3;

// Keys of the per-node bookkeeping array simple_html_dom_node::$_.
const HDOM_INFO_BEGIN    = 0;
const HDOM_INFO_END      = 1;
const HDOM_INFO_QUOTE    = 2;
const HDOM_INFO_SPACE    = 3;
const HDOM_INFO_TEXT     = 4;
const HDOM_INFO_INNER    = 5;
const HDOM_INFO_OUTER    = 6;
const HDOM_INFO_ENDSPACE = 7;
00029 
00030 // helper functions
00031 // -----------------------------------------------------------------------------
// get html dom from file
/**
 * Builds a simple_html_dom from the contents of a file.
 *
 * Every argument is forwarded unchanged to file_get_contents(); the result is
 * loaded with lower-casing enabled.
 *
 * @return simple_html_dom the populated parser object
 */
function file_get_html() {
    $parser = new simple_html_dom;
    $contents = call_user_func_array('file_get_contents', func_get_args());
    $parser->load($contents, true);
    return $parser;
}
00039 
// get html dom from string
/**
 * Builds a simple_html_dom from an HTML string.
 *
 * @param string $str       HTML markup to parse
 * @param bool   $lowercase passed through to simple_html_dom::load()
 * @return simple_html_dom the populated parser object
 */
function str_get_html($str, $lowercase=true) {
    $parser = new simple_html_dom;
    $parser->load($str, $lowercase);

    return $parser;
}
00046 
// get dom from file (deprecated)
/**
 * Deprecated alias of file_get_html(): forwards every argument to it and
 * returns its result.
 *
 * @deprecated use file_get_html()
 * @return simple_html_dom the populated parser object
 */
function file_get_dom() {
    return call_user_func_array('file_get_html', func_get_args());
}
00054 
// get dom from string (deprecated)
/**
 * Deprecated alias of str_get_html().
 *
 * @deprecated use str_get_html()
 * @param string $str       HTML markup to parse
 * @param bool   $lowercase passed through to simple_html_dom::load()
 * @return simple_html_dom the populated parser object
 */
function str_get_dom($str, $lowercase=true) {
    return str_get_html($str, $lowercase);
}
00061 
00062 // simple html dom node
00063 // -----------------------------------------------------------------------------
/**
 * One node of a parsed document: an element, a text run, a comment, an
 * unknown "<!...>" construct, or the synthetic root.
 *
 * Every node registers itself in the owning parser's flat $nodes list; the
 * bookkeeping array $_ (keyed by the HDOM_INFO_* constants) stores positions
 * in that list plus the raw text / quoting / spacing needed to re-render the
 * node.
 */
class simple_html_dom_node {
    public $nodetype = HDOM_TYPE_TEXT; // one of the HDOM_TYPE_* constants
    public $tag = 'text';              // tag name, or 'text' / 'comment' / 'unknown' / 'root'
    public $attr = array();            // attribute name => value (true = no value, null/false = removed)
    public $children = array();        // nodes linked as children by the parser
    public $nodes = array();           // every linked sub-node, in document order
    public $parent = null;             // parent node, null when detached
    public $_ = array();               // parser bookkeeping, keyed by HDOM_INFO_*
    private $dom = null;               // owning parser object

    /**
     * Attaches the node to its parser and appends it to the parser's node list.
     *
     * @param object $dom owning parser (exposes public $nodes)
     */
    function __construct($dom) {
        $this->dom = $dom;
        // Objects are handles, so no reference is needed to share the node.
        $dom->nodes[] = $this;
    }

    function __destruct() {
        $this->clear();
    }

    /** Renders the node exactly like outertext(). */
    function __toString() {
        return $this->outertext();
    }

    // clean up memory due to php5 circular references memory leak...
    function clear() {
        $this->dom = null;
        $this->nodes = null;
        $this->parent = null;
        $this->children = null;
    }

    // returns the parent of node
    function parent() {
        return $this->parent;
    }

    /**
     * Returns all children (default) or the child at $idx.
     *
     * @param int $idx -1 for the whole list, otherwise a list index
     * @return array|simple_html_dom_node|null null when $idx does not exist
     */
    function children($idx=-1) {
        if ($idx===-1) return $this->children;
        if (isset($this->children[$idx])) return $this->children[$idx];
        return null;
    }

    // returns the first child of node, or null when there is none
    function first_child() {
        if (count($this->children)>0) return $this->children[0];
        return null;
    }

    // returns the last child of node, or null when there is none
    function last_child() {
        if (($count=count($this->children))>0) return $this->children[$count-1];
        return null;
    }

    // returns the next sibling of node (null for the last child or a detached node)
    function next_sibling() {
        if ($this->parent===null) return null;
        $idx = 0;
        $count = count($this->parent->children);
        while ($idx<$count && $this!==$this->parent->children[$idx])
            ++$idx;
        if (++$idx>=$count) return null;
        return $this->parent->children[$idx];
    }

    // returns the previous sibling of node (null for the first child or a detached node)
    function prev_sibling() {
        if ($this->parent===null) return null;
        $idx = 0;
        $count = count($this->parent->children);
        while ($idx<$count && $this!==$this->parent->children[$idx])
            ++$idx;
        if (--$idx<0) return null;
        return $this->parent->children[$idx];
    }

    /**
     * Returns the node's inner HTML: an assigned override if present, the raw
     * text for text-like nodes, otherwise the concatenated outer text of all
     * sub-nodes.
     */
    function innertext() {
        if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        $ret = '';
        foreach($this->nodes as $n)
            $ret .= $n->outertext();
        return $ret;
    }

    /**
     * Returns the node's HTML including its own tag.
     *
     * The parser's callback (if set) is invoked with this node before
     * rendering, so it may alter what is rendered.
     */
    function outertext() {
        if ($this->tag==='root') return $this->innertext();

        // trigger callback
        if ($this->dom->callback!==null)
            call_user_func_array($this->dom->callback, array($this));

        if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        // render begin tag
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();

        // render inner text
        if (isset($this->_[HDOM_INFO_INNER]))
            $ret .= $this->_[HDOM_INFO_INNER];
        else {
            foreach($this->nodes as $n)
                $ret .= $n->outertext();
        }

        // render end tag (an end position of 0 means no closing tag is emitted)
        if(isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
            $ret .= '</'.$this->tag.'>';
        return $ret;
    }

    /**
     * Returns the text content with tags stripped; comments, unknown nodes,
     * <script> and <style> contribute nothing.
     */
    function plaintext() {
        if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
        switch ($this->nodetype) {
            case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
            case HDOM_TYPE_COMMENT: return '';
            case HDOM_TYPE_UNKNOWN: return '';
        }
        if (strcasecmp($this->tag, 'script')===0) return '';
        if (strcasecmp($this->tag, 'style')===0) return '';

        $ret = '';
        foreach($this->nodes as $n)
            $ret .= $n->plaintext();
        return $ret;
    }

    /**
     * Builds the opening tag (or the raw text for text-like nodes) from the
     * recorded attribute values, quoting styles and spacing.
     */
    function makeup() {
        // text, comment, unknown
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        $ret = '<'.$this->tag;
        $i = -1;

        foreach($this->attr as $key=>$val) {
            // $i tracks the attribute's slot in the parallel QUOTE/SPACE lists
            ++$i;

            // skip removed attribute
            if ($val===null || $val===false)
                continue;

            $ret .= $this->_[HDOM_INFO_SPACE][$i][0];
            //no value attr: nowrap, checked selected...
            if ($val===true)
                $ret .= $key;
            else {
                switch($this->_[HDOM_INFO_QUOTE][$i]) {
                    case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                    case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                    default: $quote = '';
                }
                $ret .= $key.$this->_[HDOM_INFO_SPACE][$i][1].'='.$this->_[HDOM_INFO_SPACE][$i][2].$quote.$val.$quote;
            }
        }
        $ret = $this->dom->restore_noise($ret);
        return $ret . $this->_[HDOM_INFO_ENDSPACE] . '>';
    }

    /**
     * Finds descendant nodes by CSS-like selector.
     *
     * @param string $selector comma-separated selector groups; each group is a
     *                         space-separated descendant chain
     * @param int    $idx      negative for all matches, otherwise the index of
     *                         the single match to return
     * @return array|simple_html_dom_node|null matches in document order, one
     *                         match, or null when $idx does not exist
     */
    function find($selector, $idx=-1) {
        $selectors = $this->parse_selector($selector);

        if (($count=count($selectors))===0) return array();
        $found_keys = array();

        // find each selector group
        for ($c=0; $c<$count; ++$c) {
            // Depth of *this* group; using group 0 for every group walked too
            // many or too few levels when groups differ in length.
            if (($level=count($selectors[$c]))===0) return array();
            if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

            $head = array($this->_[HDOM_INFO_BEGIN]=>1);

            // handle descendant selectors, no recursive!
            for ($l=0; $l<$level; ++$l) {
                $ret = array();
                foreach($head as $k=>$v) {
                    $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                    $n->seek($selectors[$c][$l], $ret);
                }
                $head = $ret;
            }

            foreach($head as $k=>$v) {
                if (!isset($found_keys[$k]))
                    $found_keys[$k] = 1;
            }
        }

        // sort keys so results come back in document order
        ksort($found_keys);

        $found = array();
        foreach($found_keys as $k=>$v)
            $found[] = $this->dom->nodes[$k];

        // return nth-element or array
        if ($idx<0) return $found;
        return (isset($found[$idx])) ? $found[$idx] : null;
    }

    /**
     * Collects, into $ret, the node-list indexes between this node's begin and
     * end positions that satisfy one parsed selector step.
     *
     * @param array $selector array($tag, $key, $val, $exp) from parse_selector()
     * @param array $ret      receives matching indexes as keys (value 1)
     */
    protected function seek($selector, &$ret) {
        list($tag, $key, $val, $exp) = $selector;

        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end==0) {
            // No usable end position: derive one from the nearest ancestor that has it.
            $parent = $this->parent;
            while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
                $end -= 1;
                $parent = $parent->parent;
            }
            // Guard: a detached node has no ancestor to borrow an end from.
            if ($parent!==null) $end += $parent->_[HDOM_INFO_END];
        }

        for($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
            $node = $this->dom->nodes[$i];
            $pass = true;

            if ($tag==='*') {
                // '*' selects direct children only
                if (in_array($node, $this->children, true))
                    $ret[$i] = 1;
                continue;
            }

            // compare tag
            if ($tag && $tag!==$node->tag) {$pass=false;}
            // compare key
            if ($pass && $key && !(isset($node->attr[$key]))) {$pass=false;}
            // compare value
            if ($pass && $key && $val) {
                $check = $this->match($exp, $val, $node->attr[$key]);
                // handle multiple class
                if (!$check && strcasecmp($key, 'class')===0) {
                    foreach(explode(' ',$node->attr[$key]) as $k) {
                        $check = $this->match($exp, $val, $k);
                        if ($check) break;
                    }
                }
                if (!$check) $pass = false;
            }
            if ($pass) $ret[$i] = 1;
            unset($node);
        }
    }

    /**
     * Tests an attribute value against a selector pattern.
     *
     * @param string $exp     '=', '!=', '^=' (prefix), '$=' (suffix) or
     *                        '*=' (substring, case-insensitive)
     * @param string $pattern text from the selector
     * @param mixed  $value   attribute value of the candidate node
     * @return bool true for an unrecognised $exp
     */
    protected function match($exp, $pattern, $value) {
        switch ($exp) {
            case '=':
                return $value===$pattern;
            case '!=':
                return $value!==$pattern;
            case '^=':
                return (preg_match("/^".preg_quote($pattern,'/')."/", $value)) ? true : false;
            case '$=':
                return (preg_match("/".preg_quote($pattern,'/')."$/", $value)) ? true : false;
            case '*=':
                return (preg_match("/".preg_quote($pattern,'/')."/i", $value)) ? true : false;
        }
        return true;
    }

    /**
     * Parses a selector string into groups of steps.
     *
     * @param string $selector_string e.g. "div p.intro, a[href^=http]"
     * @return array list of groups; each group is a list of
     *               array($tag, $key, $val, $exp) steps
     */
    protected function parse_selector($selector_string) {
        // pattern of CSS selectors, modified from mootools
        // The hyphen in the first class is escaped: PCRE2 (PHP >= 7.3) rejects
        // "\w-:" as an invalid range.
        $pattern = "/([\w\-:\*]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[(\w+)(?:([!*^$]?=)[\"']?(.*?)[\"']?)?\])?([, ]+)/is";
        preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
        $selectors = array();
        $result = array();

        foreach ($matches as $m) {
            if (trim($m[0])==='') continue;

            list($tag, $key, $val, $exp) = array($m[1], null, null, '=');
            if(!empty($m[2])) {$key='id'; $val=$m[2];}
            if(!empty($m[3])) {$key='class'; $val=$m[3];}
            if(!empty($m[4])) {$key=$m[4];}
            if(!empty($m[5])) {$exp=$m[5];}
            if(!empty($m[6])) {$val=$m[6];}

            // convert to lowercase
            if ($this->dom->lowercase) {
                $tag = strtolower($tag);
                // a missing key stays null instead of being passed to strtolower()
                if ($key!==null) $key = strtolower($key);
            }

            $result[] = array($tag, $key, $val, $exp);
            // a comma in the separator closes the current group
            if (trim($m[7])===',') {
                $selectors[] = $result;
                $result = array();
            }
        }
        if (count($result)>0)
            $selectors[] = $result;

        return $selectors;
    }

    /**
     * Reads an attribute or one of the virtual properties outertext /
     * innertext / plaintext. For any other name whose attribute is not set,
     * returns whether the attribute key exists (bool).
     */
    function __get($name) {
        if (isset($this->attr[$name])) return $this->attr[$name];
        switch($name) {
            case 'outertext': return $this->outertext();
            case 'innertext': return $this->innertext();
            case 'plaintext': return $this->plaintext();
            default: return array_key_exists($name, $this->attr);
        }
    }

    /**
     * Writes an attribute, or overrides the rendered outer/inner text when
     * $name is 'outertext' / 'innertext'.
     */
    function __set($name, $value) {
        switch($name) {
            case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
            case 'innertext':
                if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
                return $this->_[HDOM_INFO_INNER] = $value;
        }
        // Only a genuinely new key needs new quote/space slots; a removed
        // attribute (value null) still owns its slot in the parallel lists.
        if (!array_key_exists($name, $this->attr)) {
            $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
            $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
        }
        $this->attr[$name] = $value;
    }

    /** True for the virtual text properties and for any existing attribute key. */
    function __isset($name) {
        switch($name) {
            case 'outertext': return true;
            case 'innertext': return true;
            case 'plaintext': return true;
        }
        //no value attr: nowrap, checked selected...
        return array_key_exists($name, $this->attr);
    }

    function __unset($name) {
        if (isset($this->attr[$name]))
            unset($this->attr[$name]);
    }

    // camel naming conventions
    function getAllAttributes() {return $this->attr;}
    function getAttribute($name) {return $this->__get($name);}
    function setAttribute($name, $value) {$this->__set($name, $value);}
    function hasAttribute($name) {return $this->__isset($name);}
    function removeAttribute($name) {$this->__set($name, null);}
    function getElementById($id) {return $this->find("#$id", 0);}
    function getElementsById($id, $idx=-1) {return $this->find("#$id", $idx);}
    function getElementByTagName($name) {return $this->find($name, 0);}
    function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);}
    function parentNode() {return $this->parent();}
    function childNodes($idx=-1) {return $this->children($idx);}
    function firstChild() {return $this->first_child();}
    function lastChild() {return $this->last_child();}
    function nextSibling() {return $this->next_sibling();}
    function previousSibling() {return $this->prev_sibling();}
}
00421 
00422 // simple html dom parser
00423 // -----------------------------------------------------------------------------
00424 class simple_html_dom {
    // Root node of the parsed tree; created in prepare().
    public $root = null;
    // Flat list of every node in creation order; nodes append themselves in their constructor.
    public $nodes = array();
    // Optional callable invoked with each node from simple_html_dom_node::outertext().
    public $callback = null;
    // When true, tag and attribute names are lower-cased while parsing.
    public $lowercase = false;
    // Scanner state: current offset into $doc, the document text, the
    // character at $pos (null at end of input) and the document length.
    protected $pos;
    protected $doc;
    protected $char;
    protected $size;
    // Running node counter, used for the HDOM_INFO_BEGIN / HDOM_INFO_END positions.
    protected $cursor;
    // Node that newly parsed nodes are currently attached to.
    protected $parent;
    // NOTE(review): presumably holds the text cut out by remove_noise() for
    // restore_noise() to put back - confirm against those methods.
    protected $noise = array();
    // Character sets used by the scanner helpers (skip / copy_skip / copy_until).
    protected $token_blank = " \t\r\n";
    protected $token_equal = ' =/><';
    protected $token_slash = " />\r\n\t";
    protected $token_attr = ' >';
    // The tables below are keyed by tag name so that isset() can be used
    // instead of in_array(), performance boost about 30%...
    // Tags that never become the current parent after their start tag is read.
    protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1, 'nobr'=>1);
    // Tags whose end tag may implicitly close still-open optional-closing elements.
    protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
    // tag => set of open parent tags that are implicitly closed when that tag starts.
    protected $optional_closing_tags = array(
        'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
        'th'=>array('th'=>1),
        'td'=>array('td'=>1),
        'ul'=>array('li'=>1),
        'li'=>array('li'=>1),
        'dt'=>array('dt'=>1, 'dd'=>1),
        'dd'=>array('dd'=>1, 'dt'=>1),
        'dl'=>array('dd'=>1, 'dt'=>1),
        'p'=>array('p'=>1),
    );
00454 
00455     function __destruct() {
00456         $this->clear();
00457     }
00458 
00459     // load html from string
00460     function load($str, $lowercase=true) {
00461         // prepare
00462         $this->prepare($str, $lowercase);
00463         // strip out comments
00464         $this->remove_noise("'<!--(.*?)-->'is");
00465         // strip out <style> tags
00466         $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
00467         $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
00468         // strip out <script> tags
00469         $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
00470         $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
00471         // strip out preformatted tags
00472         $this->remove_noise("'<\s*(?:pre|code)[^>]*>(.*?)<\s*/\s*(?:pre|code)\s*>'is");
00473         // strip out server side scripts
00474         $this->remove_noise("'(<\?)(.*?)(\?>)'is", true);
00475         
00476         //echo $this->doc;
00477         //die;
00478 
00479         // parsing
00480         while ($this->parse());
00481         // end
00482         $this->root->_[HDOM_INFO_END] = $this->cursor;
00483     }
00484 
00485     // load html from file
00486     function load_file() {
00487         $args = func_get_args();
00488         $this->load(call_user_func_array('file_get_contents', $args), true);
00489     }
00490 
00491     // set callback function
00492     function set_callback($function_name) {
00493         $this->callback = $function_name;
00494     }
00495 
00496     // remove callback function
00497     function remove_callback() {
00498         $this->callback = null;
00499     }
00500 
00501     // save dom as string
00502     function save($filepath='') {
00503         $ret = $this->root->innertext();
00504         if ($filepath!=='') file_put_contents($filepath, $ret);
00505         return $ret;
00506     }
00507 
00508     // find dom node by css selector
00509     function find($selector, $idx=-1) {
00510         return $this->root->find($selector, $idx);
00511     }
00512 
00513     // clean up memory due to php5 circular references memory leak...
00514     function clear() {
00515         foreach($this->nodes as $n) {$n->clear(); $n = null;}
00516         if (isset($this->parent)) {$this->parent->clear(); unset($this->parent);}
00517         if (isset($this->root)) {$this->root->clear(); unset($this->root);}
00518         unset($this->doc);
00519         unset($this->noise);
00520     }
00521 
00522     // prepare HTML data and init everything
00523     protected function prepare($str, $lowercase=true) {
00524         $this->clear();
00525         $this->doc = $str;
00526         $this->pos = 0;
00527         $this->cursor = 1;
00528         $this->noise = array();
00529         $this->nodes = array();
00530         $this->lowercase = $lowercase;
00531         $this->root = new simple_html_dom_node($this);
00532         $this->root->tag = 'root';
00533         $this->root->_[HDOM_INFO_BEGIN] = -1;
00534         $this->root->nodetype = HDOM_TYPE_ROOT;
00535         $this->parent = $this->root;
00536         // set the length of content
00537         $this->size = strlen($str);
00538         if ($this->size>0) $this->char = $this->doc[0];
00539     }
00540 
00541     // parse html content
00542     protected function parse() {
00543         if (($s = $this->copy_until_char('<'))==='')
00544             return $this->read_tag();
00545 
00546         // text
00547         $node = new simple_html_dom_node($this);
00548         ++$this->cursor;
00549         $node->_[HDOM_INFO_TEXT] = $s;
00550         $this->link_nodes($node, false);
00551         return true;
00552     }
00553 
00554     // read tag info
00555     protected function read_tag() {
00556         if ($this->char!=='<') {
00557             $this->root->_[HDOM_INFO_END] = $this->cursor;
00558             return false;
00559         }
00560         $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00561 
00562         // end tag
00563         if ($this->char==='/') {
00564             $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00565             $this->skip($this->token_blank_t);
00566             $tag = $this->copy_until_char('>');
00567 
00568             // skip attributes in end tag
00569             if (($pos = strpos($tag, ' '))!==false)
00570                 $tag = substr($tag, 0, $pos);
00571 
00572             $parent_lower = strtolower($this->parent->tag);
00573             $tag_lower = strtolower($tag);
00574 
00575             if ($parent_lower!==$tag_lower) {
00576                 if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
00577                     $this->parent->_[HDOM_INFO_END] = 0;
00578                     while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
00579                         $this->parent = $this->parent->parent;
00580 
00581                     if (strtolower($this->parent->tag)!==$tag_lower) {
00582                         $this->as_text_node($tag);
00583                         $this->char = (--$this->pos>-1) ? $this->doc[$this->pos] : null; // back
00584                     }
00585                 }
00586                 else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) {
00587                     $this->parent->_[HDOM_INFO_END] = 0;
00588                     $this->parent = $this->parent->parent;
00589                 }
00590                 else
00591                     return $this->as_text_node($tag);
00592             }
00593 
00594             $this->parent->_[HDOM_INFO_END] = $this->cursor;
00595             if ($this->parent->parent) $this->parent = $this->parent->parent;
00596 
00597             $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00598             return true;
00599         }
00600 
00601         $node = new simple_html_dom_node($this);
00602         $node->_[HDOM_INFO_BEGIN] = $this->cursor;
00603         ++$this->cursor;
00604         $tag = $this->copy_until($this->token_slash);
00605 
00606         // doctype, cdata & comments...
00607         if (isset($tag[0]) && $tag[0]==='!') {
00608             $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
00609 
00610             if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
00611                 $node->nodetype = HDOM_TYPE_COMMENT;
00612                 $node->tag = 'comment';
00613             } else {
00614                 $node->nodetype = HDOM_TYPE_UNKNOWN;
00615                 $node->tag = 'unknown';
00616             }
00617 
00618             if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
00619             $this->link_nodes($node, false);
00620             $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00621             return true;
00622         }
00623 
00624         // text
00625         if (!preg_match("/^[\w-:]+$/", $tag)) {
00626             $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
00627             if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
00628             $this->link_nodes($node, false);
00629             $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00630             return true;
00631         }
00632 
00633         // begin tag
00634         $node->nodetype = HDOM_TYPE_ELEMENT;
00635         $tag_lower = strtolower($tag);
00636         $node->tag = ($this->lowercase) ? $tag_lower : $tag;
00637 
00638         // handle optional closing tags
00639         if (isset($this->optional_closing_tags[$tag_lower]) ) {
00640             while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
00641                 $this->parent->_[HDOM_INFO_END] = 0;
00642                 $this->parent = $this->parent->parent;
00643             }
00644             $node->parent = $this->parent;
00645         }
00646         $this->link_nodes($node, true);
00647 
00648         $guard = 0; // prevent infinity loop
00649         $space = array($this->copy_skip($this->token_blank), '', '');
00650 
00651         // attributes
00652         do {
00653             if ($this->char!==null && $space[0]==='') break;
00654             $name = $this->copy_until($this->token_equal);
00655 
00656             if($guard===$this->pos) {
00657                 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00658                 continue;
00659             }
00660             $guard = $this->pos;
00661 
00662             // handle endless '<'
00663             if($this->pos>=$this->size-1 && $this->char!=='>') {
00664                 $node->nodetype = HDOM_TYPE_TEXT;
00665                 $node->_[HDOM_INFO_END] = 0;
00666                 $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
00667                 $node->tag = 'text';
00668                 return true;
00669             }
00670 
00671             if ($name!=='/' && $name!=='') {
00672                 $space[1] = $this->copy_skip($this->token_blank);
00673                 if ($this->lowercase) $name = strtolower($name);
00674                 if ($this->char==='=') {
00675                     $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00676                     $this->parse_attr($node, $name, $space);
00677                 }
00678                 else {
00679                     //no value attr: nowrap, checked selected...
00680                     $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
00681                     $node->attr[$name] = true;
00682                     if ($this->char!='>') $this->char = $this->doc[--$this->pos]; // prev
00683                 }
00684                 $node->_[HDOM_INFO_SPACE][] = $space;
00685                 $space = array($this->copy_skip($this->token_blank), '', '');
00686             }
00687             else
00688                 break;
00689         } while($this->char!=='>' && $this->char!=='/');
00690 
00691         $node->_[HDOM_INFO_ENDSPACE] = $space[0];
00692 
00693         // check self closing
00694         if ($this->copy_until_char_escape('>')==='/') {
00695             $node->_[HDOM_INFO_ENDSPACE] .= '/';
00696             $node->_[HDOM_INFO_END] = 0;
00697         }
00698         else {
00699             // reset parent
00700             if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
00701         }
00702         $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00703         return true;
00704     }
00705 
00706     // parse attributes
00707     protected function parse_attr($node, $name, &$space) {
00708         $space[2] = $this->copy_skip($this->token_blank);
00709         switch($this->char) {
00710             case '"':
00711                 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
00712                 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00713                 $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('"'));
00714                 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00715                 break;
00716             case '\'':
00717                 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
00718                 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00719                 $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('\''));
00720                 $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00721                 break;
00722             default:
00723                 $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
00724                 $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
00725         }
00726     }
00727 
00728     // link node's parent
00729     protected function link_nodes(&$node, $is_child) {
00730         $node->parent = $this->parent;
00731         $this->parent->nodes[] = &$node;
00732         if ($is_child)
00733             $this->parent->children[] = &$node;
00734     }
00735 
00736     // as a text node
00737     protected function as_text_node($tag) {
00738         $node = new simple_html_dom_node($this);
00739         ++$this->cursor;
00740         $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
00741         $this->link_nodes($node, false);
00742         $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00743         return true;
00744     }
00745 
00746     protected function skip($chars) {
00747         $this->pos += strspn($this->doc, $chars, $this->pos);
00748         $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00749     }
00750 
00751     protected function copy_skip($chars) {
00752         $pos = $this->pos;
00753         $len = strspn($this->doc, $chars, $pos);
00754         $this->pos += $len;
00755         $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00756         if ($len===0) return '';
00757         return substr($this->doc, $pos, $len);
00758     }
00759 
00760     protected function copy_until($chars) {
00761         $pos = $this->pos;
00762         $len = strcspn($this->doc, $chars, $pos);
00763         $this->pos += $len;
00764         $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
00765         return substr($this->doc, $pos, $len);
00766     }
00767 
00768     protected function copy_until_char($char) {
00769         if ($this->char===null) return '';
00770 
00771         if (($pos = strpos($this->doc, $char, $this->pos))===false) {
00772             $ret = substr($this->doc, $this->pos, $this->size-$this->pos);
00773             $this->char = null;
00774             $this->pos = $this->size;
00775             return $ret;
00776         }
00777 
00778         if ($pos===$this->pos) return '';
00779         $pos_old = $this->pos;
00780         $this->char = $this->doc[$pos];
00781         $this->pos = $pos;
00782         return substr($this->doc, $pos_old, $pos-$pos_old);
00783     }
00784 
00785     protected function copy_until_char_escape($char) {
00786         if ($this->char===null) return '';
00787 
00788         $start = $this->pos;
00789         while(1) {
00790             if (($pos = strpos($this->doc, $char, $start))===false) {
00791                 $ret = substr($this->doc, $this->pos, $this->size-$this->pos);
00792                 $this->char = null;
00793                 $this->pos = $this->size;
00794                 return $ret;
00795             }
00796 
00797             if ($pos===$this->pos) return '';
00798 
00799             if ($this->doc[$pos-1]==='\\') {
00800                 $start = $pos+1;
00801                 continue;
00802             }
00803 
00804             $pos_old = $this->pos;
00805             $this->char = $this->doc[$pos];
00806             $this->pos = $pos;
00807             return substr($this->doc, $pos_old, $pos-$pos_old);
00808         }
00809     }
00810 
00811     // remove noise from html content
00812     protected function remove_noise($pattern, $remove_tag=false) {
00813         $count = preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
00814 
00815         for ($i=$count-1; $i>-1; --$i) {
00816             $key = '___noise___'.sprintf('% 3d', count($this->noise)+100);
00817             $idx = ($remove_tag) ? 0 : 1;
00818             $this->noise[$key] = $matches[$i][$idx][0];
00819             $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
00820         }
00821 
00822         // reset the length of content
00823         $this->size = strlen($this->doc);
00824         if ($this->size>0) $this->char = $this->doc[0];
00825     }
00826 
00827     // restore noise to html content
00828     function restore_noise($text) {
00829         while(($pos=strpos($text, '___noise___'))!==false) {
00830             $key = '___noise___'.$text[$pos+11].$text[$pos+12].$text[$pos+13];
00831             if (isset($this->noise[$key]))
00832                 $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+14);
00833         }
00834         return $text;
00835     }
00836 
00837     function __toString() {
00838         return $this->root->innertext();
00839     }
00840 
00841     function __get($name) {
00842         switch($name) {
00843             case 'outertext': return $this->root->innertext();
00844             case 'innertext': return $this->root->innertext();
00845             case 'plaintext': return $this->root->plaintext();
00846         }
00847     }
00848 
00849     // camel naming conventions
00850     function childNodes($idx=-1) {return $this->root->childNodes($idx);}
00851     function firstChild() {return $this->root->first_child();}
00852     function lastChild() {return $this->root->last_child();}
00853     function getElementById($id) {return $this->find("#$id", 0);}
00854     function getElementsById($id, $idx=-1) {return $this->find("#$id", $idx);}
00855     function getElementByTagName($name) {return $this->find($name, 0);}
00856     function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);}
00857     function loadFile() {$args = func_get_args();$this->load(call_user_func_array('file_get_contents', $args), true);}
00858 }
00859 ?>
 All Data Structures Files Functions Variables Enumerations