[ Index ] |
PHP Cross Reference of Unnamed Project |
[Summary view] [Print] [Text view]
<?php
/**
 * HTML2PDF Library - parsingHtml class
 *
 * HTML => PDF convertor
 * distributed under the LGPL License
 *
 * @author  Laurent MINGUET <webmaster@html2pdf.fr>
 * @version 4.03
 */

/**
 * Splits an HTML string into a flat list of "actions" (opening tags, closing
 * tags and 'write' text actions) stored in the public $code property.
 */
class HTML2PDF_parsingHtml
{
    protected $_html     = '';       // HTML code to parse
    protected $_num      = 0;        // counter of the table/ul/ol elements opened so far
    protected $_level    = array(0); // stack of the numbers of the currently open table/ul/ol
    protected $_encoding = '';       // encoding given to html_entity_decode()
    public    $code      = array();  // parsed HTML code (list of actions)

    // replacement for a tab character inside <pre>/<code>
    // NOTE(review): a single character looks like extraction damage (the spaces are turned
    // into &nbsp; right after the tab replacement) - confirm the intended tab width.
    const HTML_TAB = ' ';

    /**
     * main constructor
     *
     * @param  string $encoding encoding used to decode the HTML entities
     * @access public
     */
    public function __construct($encoding = 'UTF-8')
    {
        $this->_num   = 0;
        $this->_level = array($this->_num);
        $this->_html  = '';
        $this->code   = array();
        $this->setEncoding($encoding);
    }

    /**
     * change the encoding
     *
     * @param  string $encoding
     * @access public
     */
    public function setEncoding($encoding)
    {
        $this->_encoding = $encoding;
    }

    /**
     * Define the HTML code to parse
     *
     * @param  string $html HTML code
     * @access public
     */
    public function setHTML($html)
    {
        // remove the HTML comments (ungreedy, so each comment is removed separately)
        $html = preg_replace('/<!--(.*)-->/isU', '', $html);

        // save the HTML code
        $this->_html = $html;
    }

    /**
     * parse the HTML code and store the resulting list of actions in $this->code
     *
     * @throws HTML2PDF_exception code 3: closing tag without any opened tag,
     *                            code 4: closing tag that does not match the last opened tag,
     *                            code 5: tags still opened at the end of the HTML
     * @access public
     */
    public function parse()
    {
        // stack of the names of the currently opened tags (HTML validation)
        $parents = array();

        // flag : are we in a <pre> (or <code>) tag ?
        $tagPreIn = false;

        // action inserted between the lines of the content of a <pre> tag
        $tagPreBr = array(
            'name'  => 'br',
            'close' => false,
            'param' => array(
                'style' => array(),
                'num'   => 0
            )
        );

        // tags that do not have to be closed
        $tagsNotClosed = array(
            'br', 'hr', 'img', 'col',
            'input', 'link', 'option',
            'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
        );

        // split the HTML into tags and texts
        $tmp = array();
        $this->_searchCode($tmp);

        // all the actions to do
        $actions = array();

        foreach ($tmp as $part) {
            // if it is a tag code
            if ($part[0]=='code') {
                // analyse the HTML code
                $res = $this->_analiseCode($part[1]);

                if ($res) {
                    // it is a real HTML tag : save its position in the HTML code
                    $res['html_pos'] = $part[2];

                    // if the tag must be closed
                    if (!in_array($res['name'], $tagsNotClosed, true)) {
                        if ($res['close']) {
                            // closing tag : HTML validation against the stack of opened tags
                            if (count($parents)<1) {
                                throw new HTML2PDF_exception(3, $res['name'], $this->getHtmlErrorCode($res['html_pos']));
                            } else if (end($parents)!=$res['name']) {
                                throw new HTML2PDF_exception(4, $parents, $this->getHtmlErrorCode($res['html_pos']));
                            }
                            array_pop($parents);
                        } else if ($res['autoclose']) {
                            // autoclosed tag : save the opening action...
                            $actions[] = $res;

                            // ...and prepare the closing one, saved below.
                            // NOTE(review): this writes the key 'params' (not 'param'), so the closing
                            // action keeps the 'param' of the opening one - confirm this is intended.
                            $res['params'] = array();
                            $res['close']  = true;
                        } else {
                            // opening tag : add a child for the validation
                            $parents[] = $res['name'];
                        }

                        // if it is a <pre> tag (or <code> tag) not autoclosed => update the flag
                        if (($res['name']=='pre' || $res['name']=='code') && !$res['autoclose']) {
                            $tagPreIn = !$res['close'];
                        }
                    }

                    // save the action
                    $actions[] = $res;
                } else {
                    // it is not a real HTML tag => it is handled as text just below
                    $part[0] = 'txt';
                }
            }

            // if it is text (not an "else" : a code part may have been converted just above)
            if ($part[0]=='txt') {
                if (!$tagPreIn) {
                    // normal text : a single 'write' action
                    $actions[] = array(
                        'name'  => 'write',
                        'close' => false,
                        'param' => array('txt' => $this->_prepareTxt($part[1])),
                    );
                } else {
                    // text of a <pre> tag : one 'write' action per line, separated by <br>
                    $lines = explode("\n", str_replace("\r", '', $part[1]));

                    foreach ($lines as $k => $txt) {
                        // keep the whitespace : tabs and spaces become non-breaking spaces, which are
                        // decoded by _prepareTxt and are not removed by the ltrim/rtrim done below
                        $txt = str_replace("\t", self::HTML_TAB, $txt);
                        $txt = str_replace(' ', '&nbsp;', $txt);

                        // add a line break before each line, except the first one
                        if ($k>0) $actions[] = $tagPreBr;

                        $actions[] = array(
                            'name'  => 'write',
                            'close' => false,
                            'param' => array('txt' => $this->_prepareTxt($txt, false)),
                        );
                    }
                }
            }
        }

        // for each text action, clean up the beginning and the end of the text,
        // depending on the tags that surround it

        // list of the tags to clean
        $tagsToClean = array(
            'page', 'page_header', 'page_footer', 'form',
            'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
            'div', 'hr', 'p', 'ul', 'ol', 'li',
            'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
            'bookmark', 'fieldset', 'legend',
            'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
            'option'
        );

        $nb = count($actions);
        for ($k=0; $k<$nb; $k++) {
            if ($actions[$k]['name']!='write') continue;

            // if the tag before the text is a tag to clean => ltrim on the text
            // (the previous action may have been removed by this loop, hence the isset)
            if (isset($actions[$k-1]) && in_array($actions[$k-1]['name'], $tagsToClean, true)) {
                $actions[$k]['param']['txt'] = ltrim($actions[$k]['param']['txt']);
            }

            // if the tag after the text is a tag to clean => rtrim on the text
            if (isset($actions[$k+1]) && in_array($actions[$k+1]['name'], $tagsToClean, true)) {
                $actions[$k]['param']['txt'] = rtrim($actions[$k]['param']['txt']);
            }

            // if the text is empty => remove the action
            if ($actions[$k]['param']['txt']==='') {
                unset($actions[$k]);
            }
        }

        // if some tags are still opened => HTML validator ERROR
        if (count($parents)) throw new HTML2PDF_exception(5, $parents);

        // save the actions to do, with consecutive keys
        $this->code = array_values($actions);
    }

    /**
     * prepare the text
     *
     * @param  string  $txt    text
     * @param  boolean $spaces true => replace each run of whitespace (space, \t, \r, \n) by a single space
     * @return string  text with the HTML entities decoded
     * @access protected
     */
    protected function _prepareTxt($txt, $spaces = true)
    {
        if ($spaces) $txt = preg_replace('/\s+/is', ' ', $txt);

        // replaced by hand, before html_entity_decode is asked to decode the other entities
        $txt = str_replace('&euro;', '€', $txt);
        $txt = html_entity_decode($txt, ENT_QUOTES, $this->_encoding);
        return $txt;
    }

    /**
     * split the HTML code into tags and texts
     *
     * Each item of the result is array('txt', text) or array('code', tag, position of the tag).
     *
     * @param  &array $tmp array's result
     * @return null
     */
    protected function _searchCode(&$tmp)
    {
        // initialise the array
        $tmp = array();

        // regexp to separate the tags from the texts
        $reg = '/(<[^>]+>)|([^<]+)+/isU';

        // text found since the last tag, and position where the next search starts
        $str    = '';
        $offset = 0;

        // as long as it finds a match
        while (preg_match($reg, $this->_html, $parse, PREG_OFFSET_CAPTURE, $offset)) {
            if ($parse[1][0]!=='') {
                // it is a tag : save the previous text if it exists
                if ($str!=='') $tmp[] = array('txt', $str);

                // save the tag with its real position (the match can start after $offset
                // when a '<' that does not open a tag had to be skipped)
                $tmp[] = array('code', trim($parse[1][0]), $parse[1][1]);

                // init the current text
                $str = '';
            } else {
                // it is a text : add it to the current text
                $str.= $parse[2][0];
            }

            // update the offset to the end of the match
            $offset = $parse[0][1] + strlen($parse[0][0]);
            unset($parse);
        }

        // if a text is present at the end, we save it
        if ($str!=='') $tmp[] = array('txt', $str);
    }

    /**
     * analyse a HTML tag
     *
     * @param  string $code HTML code of the tag to analyse
     * @return array|null corresponding action (keys : name, close, autoclose, param),
     *                    or null if the code is not a HTML tag
     */
    protected function _analiseCode($code)
    {
        // name of the tag, opening, closure, autoclosure
        $tag = '<([\/]{0,1})([_a-z0-9]+)([\/>\s]+)';
        if (!preg_match('/'.$tag.'/isU', $code, $match)) return null;
        $close     = ($match[1]=='/');
        $autoclose = preg_match('/\/>$/isU', $code);
        $name      = strtolower($match[2]);

        // required parameters (depends on the tag name)
        $param = array();
        $param['style'] = '';
        if ($name=='img') {
            $param['alt'] = '';
            $param['src'] = '';
        }
        if ($name=='a') {
            $param['href'] = '';
        }

        // read the parameters : name=value, then name="value", then name='value'
        // (in this order : a later form overwrites an earlier one for the same name)
        $props = array(
            '([a-zA-Z0-9_]+)=([^"\'\s>]+)',
            '([a-zA-Z0-9_]+)=["]([^"]*)["]',
            "([a-zA-Z0-9_]+)=[']([^']*)[']",
        );
        foreach ($props as $prop) {
            preg_match_all('/'.$prop.'/is', $code, $found);
            foreach ($found[1] as $k => $attr) {
                $param[trim(strtolower($attr))] = trim($found[2][$k]);
            }
        }

        // the declarations built below are appended to the inline style : make sure that
        // its last declaration is terminated, else the first added one would be merged with it
        $param['style'] = trim($param['style']);
        if ($param['style']!=='' && substr($param['style'], -1)!==';') {
            $param['style'].= ';';
        }

        // compliance of each parameter : the legacy attributes are converted to styles
        $color  = "#000000";
        $border = null;
        foreach ($param as $key => $val) {
            $key = strtolower($key);
            switch($key)
            {
                case 'width':
                    unset($param[$key]);
                    $param['style'] .= 'width: '.$val.'px; ';
                    break;

                case 'align':
                    // kept as an attribute on <table>
                    if ($name==='img') {
                        unset($param[$key]);
                        $param['style'] .= 'float: '.$val.'; ';
                    } elseif ($name!=='table') {
                        unset($param[$key]);
                        $param['style'] .= 'text-align: '.$val.'; ';
                    }
                    break;

                case 'valign':
                    unset($param[$key]);
                    $param['style'] .= 'vertical-align: '.$val.'; ';
                    break;

                case 'height':
                    unset($param[$key]);
                    $param['style'] .= 'height: '.$val.'px; ';
                    break;

                case 'bgcolor':
                    unset($param[$key]);
                    $param['style'] .= 'background: '.$val.'; ';
                    break;

                case 'bordercolor':
                    unset($param[$key]);
                    $color = $val;
                    break;

                case 'border':
                    unset($param[$key]);
                    if (preg_match('/^[0-9]+$/isU', $val)) $val = $val.'px';
                    $border = $val;
                    break;

                case 'cellpadding':
                case 'cellspacing':
                    if (preg_match('/^([0-9]+)$/isU', $val)) $param[$key] = $val.'px';
                    break;

                case 'colspan':
                case 'rowspan':
                    // keep only the digits, with 1 as the minimum
                    $val = preg_replace('/[^0-9]/isU', '', $val);
                    if (!$val) $val = 1;
                    $param[$key] = $val;
                    break;
            }
        }

        // compliance of the border
        if ($border!==null) {
            if ($border) $border = 'border: solid '.$border.' '.$color;
            else         $border = 'border: none';

            $param['style'] .= $border.'; ';
            $param['border'] = $border;
        }

        // reading styles: decomposition and standardization
        $styles = explode(';', $param['style']);
        $param['style'] = array();
        foreach ($styles as $style) {
            $tmp = explode(':', $style);
            if (count($tmp)>1) {
                // the name is before the first ':', the value is everything after it
                $cod = $tmp[0];
                unset($tmp[0]);
                $tmp = implode(':', $tmp);
                $param['style'][trim(strtolower($cod))] = preg_replace('/[\s]+/isU', ' ', trim($tmp));
            }
        }

        // opening of a table or of a list : add a new level
        if (in_array($name, array('ul', 'ol', 'table'), true) && !$close) {
            $this->_num++;
            $this->_level[count($this->_level)] = $this->_num;
        }

        // get the level of the table containing the element
        if (!isset($param['num'])) {
            $param['num'] = $this->_level[count($this->_level)-1];
        }

        // closure of a table or of a list : remove a level
        if (in_array($name, array('ul', 'ol', 'table'), true) && $close) {
            unset($this->_level[count($this->_level)-1]);
        }

        // prepare the parameters that contain text
        if (isset($param['value'])) $param['value'] = $this->_prepareTxt($param['value']);
        if (isset($param['alt']))   $param['alt']   = $this->_prepareTxt($param['alt']);
        if (isset($param['title'])) $param['title'] = $this->_prepareTxt($param['title']);
        if (isset($param['class'])) $param['class'] = $this->_prepareTxt($param['class']);

        // return the new action to do
        return array('name' => $name, 'close' => $close ? 1 : 0, 'autoclose' => $autoclose, 'param' => $param);
    }

    /**
     * get a full level of HTML, between an opening tag and the corresponding closing tag
     *
     * @param  integer $k key of the first action in $this->code
     * @return array   actions between the two tags (the two tags themselves are not included)
     */
    public function getLevel($k)
    {
        // if the code does not exist => return empty
        if (!isset($this->code[$k])) return array();

        // the tag to detect
        $detect = $this->code[$k]['name'];

        // if it is a text => return it alone
        if ($detect=='write') {
            return array($this->code[$k]);
        }

        $level = 0;         // depth level
        $end   = false;     // end of the search
        $code  = array();   // extracted code

        while (!$end) {
            // current action
            $row = $this->code[$k];

            if ($row['name']=='write') {
                // a text is always added
                $code[] = $row;
            } else {
                $not = false; // flag for not taking into account the current tag

                // if it is the searched tag
                if ($row['name']==$detect) {
                    // if we are just at the root level => do not take it
                    if ($level==0) {
                        $not = true;
                    }

                    // update the level
                    $level+= ($row['close'] ? -1 : 1);

                    // if we are back at the root level => it is the end, and do not take it
                    if ($level==0) {
                        $not = true;
                        $end = true;
                    }
                }

                // if we can take into account the current tag => save it
                if (!$not) {
                    // NOTE(review): the actions built by parse() store their styles in
                    // ['param']['style'], not in ['style'], so this does not match them -
                    // confirm which key was intended before changing it.
                    if (isset($row['style']['text-align'])) unset($row['style']['text-align']);
                    $code[] = $row;
                }
            }

            // it continues as long as there is code to analyse
            if (isset($this->code[$k+1])) {
                $k++;
            } else {
                $end = true;
            }
        }

        // return the extract
        return $code;
    }

    /**
     * return a part of the HTML code, for an error message
     *
     * @param  integer $pos    position in the HTML code
     * @param  integer $before number of characters to take before the position
     * @param  integer $after  number of characters to take after the position
     * @return string  part of the html code
     */
    public function getHtmlErrorCode($pos, $before=30, $after=40)
    {
        // a negative start would make substr() count from the end of the HTML : when the
        // position is too close to the beginning, the extract is started at the beginning
        $start  = max(0, $pos-$before);
        $length = max(0, $pos+$after-$start);

        return substr($this->_html, $start, $length);
    }
}
title
Description
Body
title
Description
Body
title
Description
Body
title
Body
Generated: Tue Mar 17 22:47:18 2015 | Cross-referenced by PHPXref 0.7.1 |