Xataface HTML Reports Module 0.2
HTML Reports Module for Xataface
|
<?php
/*******************************************************************************
Version: 1.0 ($Rev: 152 $)
Website: http://sourceforge.net/projects/simplehtmldom/
Author: S.C. Chen (me578022@gmail.com)
Acknowledge: Jose Solorzano (https://sourceforge.net/projects/php-html/)
Contributions by: Yousuke Kumakura (Attribute filters)
Licensed under The MIT License
Redistributions of files must retain the above copyright notice.
*******************************************************************************/

// node types (simple_html_dom_node::$nodetype)
define('HDOM_TYPE_ELEMENT', 1);
define('HDOM_TYPE_COMMENT', 2);
define('HDOM_TYPE_TEXT',    3);
define('HDOM_TYPE_ENDTAG',  4);
define('HDOM_TYPE_ROOT',    5);
define('HDOM_TYPE_UNKNOWN', 6);
// attribute quoting styles (entries of $node->_[HDOM_INFO_QUOTE])
define('HDOM_QUOTE_DOUBLE', 0);
define('HDOM_QUOTE_SINGLE', 1);
define('HDOM_QUOTE_NO',     3);
// keys of the per-node bookkeeping array simple_html_dom_node::$_
define('HDOM_INFO_BEGIN',   0);
define('HDOM_INFO_END',     1);
define('HDOM_INFO_QUOTE',   2);
define('HDOM_INFO_SPACE',   3);
define('HDOM_INFO_TEXT',    4);
define('HDOM_INFO_INNER',   5);
define('HDOM_INFO_OUTER',   6);
define('HDOM_INFO_ENDSPACE',7);

// helper functions
// -----------------------------------------------------------------------------
// get html dom from file
// All arguments are forwarded to file_get_contents(). When the read fails
// (file_get_contents() returns false) an empty document is produced.
function file_get_html() {
    $args = func_get_args();
    $dom = new simple_html_dom;
    $dom->load(call_user_func_array('file_get_contents', $args), true);
    return $dom;
}

// get html dom from string
// $lowercase: store tag and attribute names in lower case.
function str_get_html($str, $lowercase=true) {
    $dom = new simple_html_dom;
    $dom->load($str, $lowercase);
    return $dom;
}

// get dom from file (deprecated alias of file_get_html)
function file_get_dom() {
    $args = func_get_args();
    return call_user_func_array('file_get_html', $args);
}

// get dom from string (deprecated alias of str_get_html)
function str_get_dom($str, $lowercase=true) {
    return str_get_html($str, $lowercase);
}

// simple html dom node
// -----------------------------------------------------------------------------
// One node of the parsed document. Every node registers itself in the owning
// simple_html_dom's $nodes list; $_[HDOM_INFO_BEGIN] / $_[HDOM_INFO_END] are
// indexes into that list and delimit the node's descendants.
class simple_html_dom_node {
    public $nodetype = HDOM_TYPE_TEXT;
    public $tag = 'text';
    public $attr = array();      // attribute name => value (true = no value, null/false = removed)
    public $children = array();  // element children only
    public $nodes = array();     // all child nodes, including text/comment nodes
    public $parent = null;
    public $_ = array();         // parser bookkeeping, keyed by HDOM_INFO_*
    private $dom = null;

    // $dom: the owning simple_html_dom; the node appends itself to $dom->nodes.
    public function __construct($dom) {
        $this->dom = $dom;
        // objects are handles, so no reference to $this is needed (and
        // taking one is rejected by modern PHP)
        $dom->nodes[] = $this;
    }

    public function __destruct() {
        $this->clear();
    }

    public function __toString() {
        return $this->outertext();
    }

    // clean up memory due to php5 circular references memory leak...
    // The lists are reset to empty arrays (not null) so count()/foreach on a
    // cleared node stay valid.
    public function clear() {
        $this->dom = null;
        $this->nodes = array();
        $this->parent = null;
        $this->children = array();
    }

    // returns the parent of node
    public function parent() {
        return $this->parent;
    }

    // returns children of node: the whole list for $idx===-1, otherwise the
    // child at $idx or null
    public function children($idx=-1) {
        if ($idx===-1) return $this->children;
        if (isset($this->children[$idx])) return $this->children[$idx];
        return null;
    }

    // returns the first child of node, or null
    public function first_child() {
        if (empty($this->children)) return null;
        return $this->children[0];
    }

    // returns the last child of node, or null
    public function last_child() {
        if (empty($this->children)) return null;
        return $this->children[count($this->children)-1];
    }

    // returns the next sibling of node, or null
    public function next_sibling() {
        return $this->sibling(1);
    }

    // returns the previous sibling of node, or null
    public function prev_sibling() {
        return $this->sibling(-1);
    }

    // Returns the element $offset positions away from this node in the
    // parent's children list. Null when there is no parent, when this node is
    // not listed in the parent's children (e.g. text nodes) or when the
    // position is out of range.
    private function sibling($offset) {
        if ($this->parent===null || !is_array($this->parent->children)) return null;
        $siblings = $this->parent->children;
        $idx = array_search($this, $siblings, true);
        if ($idx===false) return null;
        $idx += $offset;
        return isset($siblings[$idx]) ? $siblings[$idx] : null;
    }

    // get dom node's inner html
    public function innertext() {
        if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        $ret = '';
        foreach($this->nodes as $n)
            $ret .= $n->outertext();
        return $ret;
    }

    // get dom node's outer text (with tag)
    public function outertext() {
        if ($this->tag==='root') return $this->innertext();

        // trigger callback
        if ($this->dom->callback!==null)
            call_user_func_array($this->dom->callback, array($this));

        // user overrides and raw text win over re-rendering
        if (isset($this->_[HDOM_INFO_OUTER])) return $this->_[HDOM_INFO_OUTER];
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        // render begin tag
        $ret = $this->dom->nodes[$this->_[HDOM_INFO_BEGIN]]->makeup();

        // render inner text
        if (isset($this->_[HDOM_INFO_INNER]))
            $ret .= $this->_[HDOM_INFO_INNER];
        else {
            foreach($this->nodes as $n)
                $ret .= $n->outertext();
        }

        // render end tag (END==0 marks a tag that was never explicitly closed)
        if (isset($this->_[HDOM_INFO_END]) && $this->_[HDOM_INFO_END]!=0)
            $ret .= '</'.$this->tag.'>';
        return $ret;
    }

    // get dom node's plain text (comments, unknown nodes, script and style
    // contribute nothing)
    public function plaintext() {
        if (isset($this->_[HDOM_INFO_INNER])) return $this->_[HDOM_INFO_INNER];
        switch ($this->nodetype) {
            case HDOM_TYPE_TEXT: return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);
            case HDOM_TYPE_COMMENT: return '';
            case HDOM_TYPE_UNKNOWN: return '';
        }
        if (strcasecmp($this->tag, 'script')===0) return '';
        if (strcasecmp($this->tag, 'style')===0) return '';

        $ret = '';
        foreach($this->nodes as $n)
            $ret .= $n->plaintext();
        return $ret;
    }

    // build node's text with tag
    public function makeup() {
        // text, comment, unknown
        if (isset($this->_[HDOM_INFO_TEXT])) return $this->dom->restore_noise($this->_[HDOM_INFO_TEXT]);

        $ret = '<'.$this->tag;
        $i = -1;

        foreach($this->attr as $key=>$val) {
            // $i is the attribute's position; it indexes the parallel
            // HDOM_INFO_SPACE / HDOM_INFO_QUOTE lists
            ++$i;

            // skip removed attribute
            if ($val===null || $val===false)
                continue;

            // fall back to default spacing when no entry was recorded
            $space = isset($this->_[HDOM_INFO_SPACE][$i]) ? $this->_[HDOM_INFO_SPACE][$i] : array(' ', '', '');
            $ret .= $space[0];

            //no value attr: nowrap, checked selected...
            if ($val===true) {
                $ret .= $key;
                continue;
            }

            $quote_type = isset($this->_[HDOM_INFO_QUOTE][$i]) ? $this->_[HDOM_INFO_QUOTE][$i] : HDOM_QUOTE_DOUBLE;
            switch($quote_type) {
                case HDOM_QUOTE_DOUBLE: $quote = '"'; break;
                case HDOM_QUOTE_SINGLE: $quote = '\''; break;
                default: $quote = '';
            }
            $ret .= $key.$space[1].'='.$space[2].$quote.$val.$quote;
        }
        $ret = $this->dom->restore_noise($ret);
        $endspace = isset($this->_[HDOM_INFO_ENDSPACE]) ? $this->_[HDOM_INFO_ENDSPACE] : '';
        return $ret . $endspace . '>';
    }

    // find elements by css selector
    // Returns an array of nodes when $idx<0, otherwise the node at position
    // $idx or null. An unparsable selector yields an empty array.
    public function find($selector, $idx=-1) {
        $selectors = $this->parse_selector($selector);

        if (($count=count($selectors))===0) return array();
        $found_keys = array();

        // find each selector
        for ($c=0; $c<$count; ++$c) {
            // depth of THIS comma-separated selector
            if (($level=count($selectors[$c]))===0) return array();
            if (!isset($this->_[HDOM_INFO_BEGIN])) return array();

            $head = array($this->_[HDOM_INFO_BEGIN]=>1);

            // handle descendant selectors, no recursive!
            for ($l=0; $l<$level; ++$l) {
                $ret = array();
                foreach($head as $k=>$v) {
                    // the root node is registered under the pseudo index -1
                    $n = ($k===-1) ? $this->dom->root : $this->dom->nodes[$k];
                    $n->seek($selectors[$c][$l], $ret);
                }
                $head = $ret;
            }

            foreach($head as $k=>$v) {
                if (!isset($found_keys[$k]))
                    $found_keys[$k] = 1;
            }
        }

        // sort keys (document order)
        ksort($found_keys);

        $found = array();
        foreach($found_keys as $k=>$v)
            $found[] = $this->dom->nodes[$k];

        // return nth-element or array
        if ($idx<0) return $found;
        return (isset($found[$idx])) ? $found[$idx] : null;
    }

    // seek for given conditions
    // $selector: array(tag, attribute name, attribute value, operator).
    // Indexes of matching descendants are added as keys of $ret.
    protected function seek($selector, &$ret) {
        list($tag, $key, $val, $exp) = $selector;

        $end = (!empty($this->_[HDOM_INFO_END])) ? $this->_[HDOM_INFO_END] : 0;
        if ($end==0) {
            // no own end index: borrow one from the nearest ancestor that has it
            $parent = $this->parent;
            while ($parent!==null && !isset($parent->_[HDOM_INFO_END])) {
                $end -= 1;
                $parent = $parent->parent;
            }
            if ($parent!==null) $end += $parent->_[HDOM_INFO_END];
        }

        for($i=$this->_[HDOM_INFO_BEGIN]+1; $i<$end; ++$i) {
            $node = $this->dom->nodes[$i];

            // '*' selects direct element children only
            if ($tag==='*') {
                if (in_array($node, $this->children, true))
                    $ret[$i] = 1;
                continue;
            }

            // compare tag
            if ($tag && $tag!==$node->tag) continue;
            // compare key
            if ($key && !isset($node->attr[$key])) continue;
            // compare value ("0" is a valid value, so no truthiness test)
            if ($key && $val!==null && $val!=='') {
                $check = $this->match($exp, $val, $node->attr[$key]);
                // handle multiple class
                if (!$check && strcasecmp($key, 'class')===0) {
                    foreach(explode(' ',$node->attr[$key]) as $k) {
                        $check = $this->match($exp, $val, $k);
                        if ($check) break;
                    }
                }
                if (!$check) continue;
            }
            $ret[$i] = 1;
        }
    }

    // Tests attribute $value against $pattern with selector operator $exp
    // (=, !=, ^=, $=, *=). Unknown operators match everything; *= is
    // case-insensitive.
    protected function match($exp, $pattern, $value) {
        switch ($exp) {
            case '=':
                return ($value===$pattern);
            case '!=':
                return ($value!==$pattern);
            case '^=':
                return (bool)preg_match("/^".preg_quote($pattern,'/')."/", $value);
            case '$=':
                return (bool)preg_match("/".preg_quote($pattern,'/')."$/", $value);
            case '*=':
                return (bool)preg_match("/".preg_quote($pattern,'/')."/i", $value);
        }
        return true;
    }

    // Parses a css selector string into
    //   array( selector => array( level => array(tag, key, val, exp) ) )
    // where selectors are separated by commas and levels by spaces.
    protected function parse_selector($selector_string) {
        // pattern of CSS selectors, modified from mootools
        // '-' is kept last in each character class: PCRE2 (PHP >= 7.3)
        // rejects "\w-:" as an invalid range.
        $pattern = '/([\w:\*-]*)(?:\#([\w-]+)|\.([\w-]+))?(?:\[([\w-]+)(?:([!*^$]?=)["\']?(.*?)["\']?)?\])?([, ]+)/is';
        preg_match_all($pattern, trim($selector_string).' ', $matches, PREG_SET_ORDER);
        $selectors = array();
        $result = array();

        foreach ($matches as $m) {
            if (trim($m[0])==='') continue;

            $tag = $m[1];
            $key = null;
            $val = null;
            $exp = '=';
            // explicit '' checks so that "0" is accepted as id/class/value
            if ($m[2]!=='') {$key='id'; $val=$m[2];}
            if ($m[3]!=='') {$key='class'; $val=$m[3];}
            if ($m[4]!=='') {$key=$m[4];}
            if ($m[5]!=='') {$exp=$m[5];}
            if ($m[6]!=='') {$val=$m[6];}

            // convert to lowercase
            if ($this->dom->lowercase) {
                $tag = strtolower($tag);
                if ($key!==null) $key = strtolower($key);
            }

            $result[] = array($tag, $key, $val, $exp);
            if (trim($m[7])===',') {
                $selectors[] = $result;
                $result = array();
            }
        }
        if (count($result)>0)
            $selectors[] = $result;

        return $selectors;
    }

    // Attribute read access. An attribute value wins over the virtual
    // properties outertext/innertext/plaintext; for any other name the result
    // is whether the attribute key exists.
    public function __get($name) {
        if (isset($this->attr[$name])) return $this->attr[$name];
        switch($name) {
            case 'outertext': return $this->outertext();
            case 'innertext': return $this->innertext();
            case 'plaintext': return $this->plaintext();
            default: return array_key_exists($name, $this->attr);
        }
    }

    // Attribute write access; outertext/innertext override the rendered text.
    public function __set($name, $value) {
        switch($name) {
            case 'outertext': return $this->_[HDOM_INFO_OUTER] = $value;
            case 'innertext':
                if (isset($this->_[HDOM_INFO_TEXT])) return $this->_[HDOM_INFO_TEXT] = $value;
                return $this->_[HDOM_INFO_INNER] = $value;
        }
        // register spacing/quoting only for a genuinely new key; an attribute
        // previously set to null still owns its slot
        if (!array_key_exists($name, $this->attr)) {
            $this->_[HDOM_INFO_SPACE][] = array(' ', '', '');
            $this->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
        }
        $this->attr[$name] = $value;
    }

    public function __isset($name) {
        switch($name) {
            case 'outertext': return true;
            case 'innertext': return true;
            case 'plaintext': return true;
        }
        //no value attr: nowrap, checked selected...
        return array_key_exists($name, $this->attr);
    }

    // Removes an attribute together with its spacing/quoting slot so the
    // remaining attributes keep their own formatting.
    public function __unset($name) {
        if (!array_key_exists($name, $this->attr)) return;

        $idx = 0;
        foreach($this->attr as $key=>$unused) {
            if ((string)$key===(string)$name) break;
            ++$idx;
        }
        unset($this->attr[$name]);

        if (isset($this->_[HDOM_INFO_SPACE]) && is_array($this->_[HDOM_INFO_SPACE]))
            array_splice($this->_[HDOM_INFO_SPACE], $idx, 1);
        if (isset($this->_[HDOM_INFO_QUOTE]) && is_array($this->_[HDOM_INFO_QUOTE]))
            array_splice($this->_[HDOM_INFO_QUOTE], $idx, 1);
    }

    // camel naming conventions
    public function getAllAttributes() {return $this->attr;}
    public function getAttribute($name) {return $this->__get($name);}
    public function setAttribute($name, $value) {$this->__set($name, $value);}
    public function hasAttribute($name) {return $this->__isset($name);}
    public function removeAttribute($name) {$this->__set($name, null);}
    public function getElementById($id) {return $this->find("#$id", 0);}
    public function getElementsById($id, $idx=-1) {return $this->find("#$id", $idx);}
    public function getElementByTagName($name) {return $this->find($name, 0);}
    public function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);}
    public function parentNode() {return $this->parent();}
    public function childNodes($idx=-1) {return $this->children($idx);}
    public function firstChild() {return $this->first_child();}
    public function lastChild() {return $this->last_child();}
    public function nextSibling() {return $this->next_sibling();}
    public function previousSibling() {return $this->prev_sibling();}
}

// simple html dom parser
// -----------------------------------------------------------------------------
// Parses an HTML string into simple_html_dom_node objects. Comments, style,
// script, pre/code bodies and server-side script blocks ("noise") are replaced
// by placeholders before parsing and restored on output.
class simple_html_dom {
    // placeholder format: NOISE_MARKER followed by NOISE_DIGITS zero-padded
    // digits (fixed width keeps a digit following a placeholder unambiguous)
    const NOISE_MARKER = '___noise___';
    const NOISE_DIGITS = 5;

    public $root = null;
    public $nodes = array();
    public $callback = null;
    public $lowercase = false;
    protected $pos;
    protected $doc;
    protected $char;
    protected $size;
    protected $cursor;
    protected $parent;
    protected $noise = array();
    protected $token_blank = " \t\r\n";
    protected $token_equal = ' =/><';
    protected $token_slash = " />\r\n\t";
    protected $token_attr = ' >';
    // use isset instead of in_array, performance boost about 30%...
    protected $self_closing_tags = array('img'=>1, 'br'=>1, 'input'=>1, 'meta'=>1, 'link'=>1, 'hr'=>1, 'base'=>1, 'embed'=>1, 'spacer'=>1, 'nobr'=>1);
    protected $block_tags = array('root'=>1, 'body'=>1, 'form'=>1, 'div'=>1, 'span'=>1, 'table'=>1);
    protected $optional_closing_tags = array(
        'tr'=>array('tr'=>1, 'td'=>1, 'th'=>1),
        'th'=>array('th'=>1),
        'td'=>array('td'=>1),
        'ul'=>array('li'=>1),
        'li'=>array('li'=>1),
        'dt'=>array('dt'=>1, 'dd'=>1),
        'dd'=>array('dd'=>1, 'dt'=>1),
        'dl'=>array('dd'=>1, 'dt'=>1),
        'p'=>array('p'=>1),
    );

    public function __destruct() {
        $this->clear();
    }

    // load html from string
    public function load($str, $lowercase=true) {
        // prepare
        $this->prepare($str, $lowercase);
        // strip out comments
        $this->remove_noise("'<!--(.*?)-->'is");
        // strip out <style> tags
        $this->remove_noise("'<\s*style[^>]*[^/]>(.*?)<\s*/\s*style\s*>'is");
        $this->remove_noise("'<\s*style\s*>(.*?)<\s*/\s*style\s*>'is");
        // strip out <script> tags
        $this->remove_noise("'<\s*script[^>]*[^/]>(.*?)<\s*/\s*script\s*>'is");
        $this->remove_noise("'<\s*script\s*>(.*?)<\s*/\s*script\s*>'is");
        // strip out preformatted tags
        $this->remove_noise("'<\s*(?:pre|code)[^>]*>(.*?)<\s*/\s*(?:pre|code)\s*>'is");
        // strip out server side scripts
        $this->remove_noise("'(<\?)(.*?)(\?>)'is", true);

        // parsing
        while ($this->parse());
        // end
        $this->root->_[HDOM_INFO_END] = $this->cursor;
    }

    // load html from file (arguments are forwarded to file_get_contents)
    public function load_file() {
        $args = func_get_args();
        $this->load(call_user_func_array('file_get_contents', $args), true);
    }

    // set callback function, invoked with each node before it is rendered
    public function set_callback($function_name) {
        $this->callback = $function_name;
    }

    // remove callback function
    public function remove_callback() {
        $this->callback = null;
    }

    // save dom as string; also written to $filepath when one is given
    public function save($filepath='') {
        $ret = $this->root->innertext();
        if ($filepath!=='') file_put_contents($filepath, $ret);
        return $ret;
    }

    // find dom node by css selector
    public function find($selector, $idx=-1) {
        return $this->root->find($selector, $idx);
    }

    // clean up memory due to php5 circular references memory leak...
    // Properties are reset rather than unset so the object stays in a defined
    // state (clear() runs again from __destruct and from prepare()).
    public function clear() {
        if (is_array($this->nodes)) {
            foreach($this->nodes as $n) $n->clear();
        }
        if (isset($this->parent)) $this->parent->clear();
        if (isset($this->root)) $this->root->clear();
        $this->nodes = array();
        $this->parent = null;
        $this->root = null;
        $this->doc = null;
        $this->noise = array();
    }

    // prepare HTML data and init everything
    protected function prepare($str, $lowercase=true) {
        $this->clear();
        // a failed file read arrives here as false; treat it as empty input
        $this->doc = (string)$str;
        $this->pos = 0;
        $this->cursor = 1;
        $this->noise = array();
        $this->nodes = array();
        $this->lowercase = $lowercase;
        $this->root = new simple_html_dom_node($this);
        $this->root->tag = 'root';
        $this->root->_[HDOM_INFO_BEGIN] = -1;
        $this->root->nodetype = HDOM_TYPE_ROOT;
        $this->parent = $this->root;
        // set the length of content
        $this->size = strlen($this->doc);
        $this->char = ($this->size>0) ? $this->doc[0] : null;
    }

    // parse html content: consumes one text run or one tag.
    // Returns false once the end of the document is reached.
    protected function parse() {
        if (($s = $this->copy_until_char('<'))==='')
            return $this->read_tag();

        // text
        $node = new simple_html_dom_node($this);
        ++$this->cursor;
        $node->_[HDOM_INFO_TEXT] = $s;
        $this->link_nodes($node, false);
        return true;
    }

    // moves to the next character; $this->char becomes null past the end
    protected function next_char() {
        $this->char = (++$this->pos<$this->size) ? $this->doc[$this->pos] : null;
    }

    // read tag info
    protected function read_tag() {
        if ($this->char!=='<') {
            $this->root->_[HDOM_INFO_END] = $this->cursor;
            return false;
        }
        $this->next_char();

        // end tag
        if ($this->char==='/')
            return $this->read_end_tag();

        $node = new simple_html_dom_node($this);
        $node->_[HDOM_INFO_BEGIN] = $this->cursor;
        ++$this->cursor;
        $tag = $this->copy_until($this->token_slash);

        // doctype, cdata & comments...
        if (isset($tag[0]) && $tag[0]==='!') {
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');

            if (isset($tag[2]) && $tag[1]==='-' && $tag[2]==='-') {
                $node->nodetype = HDOM_TYPE_COMMENT;
                $node->tag = 'comment';
            } else {
                $node->nodetype = HDOM_TYPE_UNKNOWN;
                $node->tag = 'unknown';
            }

            if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
            $this->link_nodes($node, false);
            $this->next_char();
            return true;
        }

        // text ('-' last in the class: "\w-:" is an invalid range for PCRE2)
        if (!preg_match('/^[\w:-]+$/', $tag)) {
            $node->_[HDOM_INFO_TEXT] = '<' . $tag . $this->copy_until_char('>');
            if ($this->char==='>') $node->_[HDOM_INFO_TEXT].='>';
            $this->link_nodes($node, false);
            $this->next_char();
            return true;
        }

        // begin tag
        $node->nodetype = HDOM_TYPE_ELEMENT;
        $tag_lower = strtolower($tag);
        $node->tag = ($this->lowercase) ? $tag_lower : $tag;

        // handle optional closing tags
        if (isset($this->optional_closing_tags[$tag_lower]) ) {
            while (isset($this->optional_closing_tags[$tag_lower][strtolower($this->parent->tag)])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            $node->parent = $this->parent;
        }
        $this->link_nodes($node, true);

        $guard = 0; // prevent infinity loop
        $space = array($this->copy_skip($this->token_blank), '', '');

        // attributes
        do {
            if ($this->char!==null && $space[0]==='') break;
            $name = $this->copy_until($this->token_equal);

            // no progress since the last round: step over the character
            if($guard===$this->pos) {
                $this->next_char();
                continue;
            }
            $guard = $this->pos;

            // handle endless '<'
            if($this->pos>=$this->size-1 && $this->char!=='>') {
                $node->nodetype = HDOM_TYPE_TEXT;
                $node->_[HDOM_INFO_END] = 0;
                $node->_[HDOM_INFO_TEXT] = '<'.$tag . $space[0] . $name;
                $node->tag = 'text';
                return true;
            }

            if ($name==='/' || $name==='')
                break;

            $space[1] = $this->copy_skip($this->token_blank);
            if ($this->lowercase) $name = strtolower($name);
            if ($this->char==='=') {
                $this->next_char();
                $this->parse_attr($node, $name, $space);
            }
            else {
                //no value attr: nowrap, checked selected...
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = true;
                if ($this->char!=='>') $this->char = $this->doc[--$this->pos]; // prev
            }
            $node->_[HDOM_INFO_SPACE][] = $space;
            $space = array($this->copy_skip($this->token_blank), '', '');
        } while($this->char!=='>' && $this->char!=='/');

        $node->_[HDOM_INFO_ENDSPACE] = $space[0];

        // check self closing
        if ($this->copy_until_char_escape('>')==='/') {
            $node->_[HDOM_INFO_ENDSPACE] .= '/';
            $node->_[HDOM_INFO_END] = 0;
        }
        else {
            // reset parent
            if (!isset($this->self_closing_tags[strtolower($node->tag)])) $this->parent = $node;
        }
        $this->next_char();
        return true;
    }

    // Handles "</tag>"; called with $this->char on the '/'. Closes the
    // current parent (implicitly closing optional-end-tag elements on the way)
    // or records a stray end tag as text. Always returns true.
    protected function read_end_tag() {
        $this->next_char();
        $this->skip($this->token_blank);
        $tag = $this->copy_until_char('>');

        // skip attributes in end tag
        if (($pos = strpos($tag, ' '))!==false)
            $tag = substr($tag, 0, $pos);

        $parent_lower = strtolower($this->parent->tag);
        $tag_lower = strtolower($tag);

        if ($parent_lower!==$tag_lower) {
            if (isset($this->optional_closing_tags[$parent_lower]) && isset($this->block_tags[$tag_lower])) {
                $this->parent->_[HDOM_INFO_END] = 0;
                while (($this->parent->parent) && strtolower($this->parent->tag)!==$tag_lower)
                    $this->parent = $this->parent->parent;

                if (strtolower($this->parent->tag)!==$tag_lower) {
                    $this->as_text_node($tag);
                    $this->char = (--$this->pos>-1) ? $this->doc[$this->pos] : null; // back
                }
            }
            else if (($this->parent->parent) && strtolower($this->parent->parent->tag)===$tag_lower) {
                $this->parent->_[HDOM_INFO_END] = 0;
                $this->parent = $this->parent->parent;
            }
            else
                return $this->as_text_node($tag);
        }

        $this->parent->_[HDOM_INFO_END] = $this->cursor;
        if ($this->parent->parent) $this->parent = $this->parent->parent;

        $this->next_char();
        return true;
    }

    // parse attributes: reads the value following '=' and records its quoting
    protected function parse_attr($node, $name, &$space) {
        $space[2] = $this->copy_skip($this->token_blank);
        switch($this->char) {
            case '"':
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_DOUBLE;
                $this->next_char();
                $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('"'));
                $this->next_char();
                break;
            case '\'':
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_SINGLE;
                $this->next_char();
                $node->attr[$name] = $this->restore_noise($this->copy_until_char_escape('\''));
                $this->next_char();
                break;
            default:
                $node->_[HDOM_INFO_QUOTE][] = HDOM_QUOTE_NO;
                $node->attr[$name] = $this->restore_noise($this->copy_until($this->token_attr));
        }
    }

    // link node's parent; $is_child also lists it among the element children
    protected function link_nodes(&$node, $is_child) {
        $node->parent = $this->parent;
        $this->parent->nodes[] = $node;
        if ($is_child)
            $this->parent->children[] = $node;
    }

    // as a text node: keeps an unmatched end tag as literal text
    protected function as_text_node($tag) {
        $node = new simple_html_dom_node($this);
        ++$this->cursor;
        $node->_[HDOM_INFO_TEXT] = '</' . $tag . '>';
        $this->link_nodes($node, false);
        $this->next_char();
        return true;
    }

    // advances past any run of characters listed in $chars
    protected function skip($chars) {
        $this->pos += strspn($this->doc, $chars, $this->pos);
        $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
    }

    // like skip(), but returns the skipped characters
    protected function copy_skip($chars) {
        $pos = $this->pos;
        $len = strspn($this->doc, $chars, $pos);
        $this->pos += $len;
        $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        if ($len===0) return '';
        return substr($this->doc, $pos, $len);
    }

    // returns the text up to (not including) the first character of $chars
    protected function copy_until($chars) {
        $pos = $this->pos;
        $len = strcspn($this->doc, $chars, $pos);
        $this->pos += $len;
        $this->char = ($this->pos<$this->size) ? $this->doc[$this->pos] : null; // next
        return substr($this->doc, $pos, $len);
    }

    // returns the text up to (not including) the next $char, or the rest of
    // the document when $char does not occur again
    protected function copy_until_char($char) {
        if ($this->char===null) return '';

        if (($pos = strpos($this->doc, $char, $this->pos))===false) {
            $ret = substr($this->doc, $this->pos, $this->size-$this->pos);
            $this->char = null;
            $this->pos = $this->size;
            return $ret;
        }

        if ($pos===$this->pos) return '';
        $pos_old = $this->pos;
        $this->char = $this->doc[$pos];
        $this->pos = $pos;
        return substr($this->doc, $pos_old, $pos-$pos_old);
    }

    // like copy_until_char(), but an occurrence preceded by a backslash does
    // not terminate the copy
    protected function copy_until_char_escape($char) {
        if ($this->char===null) return '';

        $start = $this->pos;
        while(1) {
            if (($pos = strpos($this->doc, $char, $start))===false) {
                $ret = substr($this->doc, $this->pos, $this->size-$this->pos);
                $this->char = null;
                $this->pos = $this->size;
                return $ret;
            }

            if ($pos===$this->pos) return '';

            if ($this->doc[$pos-1]==='\\') {
                $start = $pos+1;
                continue;
            }

            $pos_old = $this->pos;
            $this->char = $this->doc[$pos];
            $this->pos = $pos;
            return substr($this->doc, $pos_old, $pos-$pos_old);
        }
    }

    // remove noise from html content
    // Every match of $pattern is swapped for a placeholder key: the first
    // capture group by default, the whole match when $remove_tag is true.
    protected function remove_noise($pattern, $remove_tag=false) {
        $count = (int)preg_match_all($pattern, $this->doc, $matches, PREG_SET_ORDER|PREG_OFFSET_CAPTURE);
        $idx = ($remove_tag) ? 0 : 1;

        // back to front, so earlier offsets stay valid while replacing
        for ($i=$count-1; $i>-1; --$i) {
            $key = self::NOISE_MARKER.sprintf('%0'.self::NOISE_DIGITS.'d', count($this->noise));
            $this->noise[$key] = $matches[$i][$idx][0];
            $this->doc = substr_replace($this->doc, $key, $matches[$i][$idx][1], strlen($matches[$i][$idx][0]));
        }

        // reset the length of content
        $this->size = strlen($this->doc);
        $this->char = ($this->size>0) ? $this->doc[0] : null;
    }

    // restore noise to html content
    // Placeholders are expanded in place and the expansion is re-scanned, as
    // noise can nest (a comment inside a script). Marker text that is not a
    // registered key is left untouched and skipped.
    public function restore_noise($text) {
        $marker_len = strlen(self::NOISE_MARKER);
        $key_len = $marker_len + self::NOISE_DIGITS;
        $offset = 0;

        while(($pos=strpos($text, self::NOISE_MARKER, $offset))!==false) {
            $key = substr($text, $pos, $key_len);
            if (isset($this->noise[$key])) {
                $text = substr($text, 0, $pos).$this->noise[$key].substr($text, $pos+$key_len);
                $offset = $pos;
            }
            else
                $offset = $pos+$marker_len;
        }
        return $text;
    }

    public function __toString() {
        return $this->root->innertext();
    }

    // virtual properties outertext/innertext/plaintext of the whole document
    public function __get($name) {
        switch($name) {
            case 'outertext': return $this->root->innertext();
            case 'innertext': return $this->root->innertext();
            case 'plaintext': return $this->root->plaintext();
        }
    }

    // camel naming conventions
    public function childNodes($idx=-1) {return $this->root->childNodes($idx);}
    public function firstChild() {return $this->root->first_child();}
    public function lastChild() {return $this->root->last_child();}
    public function getElementById($id) {return $this->find("#$id", 0);}
    public function getElementsById($id, $idx=-1) {return $this->find("#$id", $idx);}
    public function getElementByTagName($name) {return $this->find($name, 0);}
    public function getElementsByTagName($name, $idx=-1) {return $this->find($name, $idx);}
    public function loadFile() {$args = func_get_args();$this->load(call_user_func_array('file_get_contents', $args), true);}
}