[ Index ]

PHP Cross Reference of Unnamed Project

title

Body

[close]

/se3master/var/www/se3/html2pdf/_class/ -> parsingHtml.class.php (source)

   1  <?php
   2  /**
   3   * HTML2PDF Library - parsingHtml class
   4   *
   5   * HTML => PDF convertor
   6   * distributed under the LGPL License
   7   *
   8   * @author      Laurent MINGUET <webmaster@html2pdf.fr>
   9   * @version     4.03
  10   */
  11  
  12  class HTML2PDF_parsingHtml
  13  {
  14      protected    $_html     = '';        // HTML code to parse
  15      protected    $_num      = 0;         // table number
  16      protected    $_level    = 0;         // table level
  17      protected    $_encoding = '';        // encoding
  18      public       $code      = array();   // parsed HTML codfe
  19  
  20      const HTML_TAB = '        ';
  21  
  22      /**
  23       * main constructor
  24       *
  25       * @param   string encoding
  26       * @access  public
  27       */
  28      public function __construct($encoding = 'UTF-8')
  29      {
  30          $this->_num   = 0;
  31          $this->_level = array($this->_num);
  32          $this->_html  = '';
  33          $this->code  = array();
  34          $this->setEncoding($encoding);
  35      }
  36  
  37      /**
  38       * change the encoding
  39       *
  40       * @param   string encoding
  41       * @access  public
  42       */
  43      public function setEncoding($encoding)
  44      {
  45          $this->_encoding = $encoding;
  46      }
  47  
  48      /**
  49       * Define the HTML code to parse
  50       *
  51       * @param   string HTML code
  52       * @access  public
  53       */
  54      public function setHTML($html)
  55      {
  56          // remove the HTML in comment
  57          $html = preg_replace('/<!--(.*)-->/isU', '', $html);
  58  
  59          // save the HTML code
  60          $this->_html = $html;
  61      }
  62  
  63      /**
  64       * parse the HTML code
  65       *
  66       * @access public
  67       */
  68      public function parse()
  69      {
  70          $parents = array();
  71  
  72          // flag : are we in a <pre> Tag ?
  73          $tagPreIn = false;
  74  
  75          // action to use for each line of the content of a <pre> Tag
  76          $tagPreBr = array(
  77                      'name' => 'br',
  78                      'close' => false,
  79                      'param' => array(
  80                          'style' => array(),
  81                          'num'    => 0
  82                      )
  83                  );
  84  
  85          // tag that can be not closed
  86          $tagsNotClosed = array(
  87              'br', 'hr', 'img', 'col',
  88              'input', 'link', 'option',
  89              'circle', 'ellipse', 'path', 'rect', 'line', 'polygon', 'polyline'
  90          );
  91  
  92          // search the HTML tags
  93          $tmp = array();
  94          $this->_searchCode($tmp);
  95  
  96          // all the actions to do
  97          $actions = array();
  98  
  99          // foreach part of the HTML code
 100          foreach ($tmp as $part) {
 101              // if it is a tag code
 102              if ($part[0]=='code') {
 103                  // analise the HTML code
 104                  $res = $this->_analiseCode($part[1]);
 105  
 106                  // if it is a real HTML tag
 107                  if ($res) {
 108                      // save the current posistion in the HTML code
 109                      $res['html_pos'] = $part[2];
 110  
 111                      // if the tag must be closed
 112                      if (!in_array($res['name'], $tagsNotClosed)) {
 113                          // if it is a closure tag
 114                          if ($res['close']) {
 115                              // HTML validation
 116                              if (count($parents)<1)
 117                                  throw new HTML2PDF_exception(3, $res['name'], $this->getHtmlErrorCode($res['html_pos']));
 118                              else if ($parents[count($parents)-1]!=$res['name'])
 119                                  throw new HTML2PDF_exception(4, $parents, $this->getHtmlErrorCode($res['html_pos']));
 120                              else
 121                                  unset($parents[count($parents)-1]);
 122                          } else {
 123                              // if it is a autoclosed tag
 124                              if ($res['autoclose']) {
 125                                  // save the opened tag
 126                                  $actions[] = $res;
 127  
 128                                  // prepare the closed tag
 129                                  $res['params'] = array();
 130                                  $res['close'] = true;
 131                              }
 132                              // else :add a child for validation
 133                              else
 134                                  $parents[count($parents)] = $res['name'];
 135                          }
 136  
 137                          // if it is a <pre> tag (or <code> tag) not auclosed => update the flag
 138                          if (($res['name']=='pre' || $res['name']=='code') && !$res['autoclose']) {
 139                              $tagPreIn = !$res['close'];
 140                          }
 141                      }
 142  
 143                      // save the actions to convert
 144                      $actions[] = $res;
 145                  } else { // else (it is not a real HTML tag => we transform it in Texte
 146                      $part[0]='txt';
 147                  }
 148              }
 149              // if it is text
 150              if ($part[0]=='txt') {
 151                  // if we are not in a <pre> tag
 152                  if (!$tagPreIn) {
 153                      // save the action
 154                      $actions[] = array(
 155                          'name'    => 'write',
 156                          'close'    => false,
 157                          'param' => array('txt' => $this->_prepareTxt($part[1])),
 158                      );
 159                  } else { // else (if we are in a <pre> tag)
 160                      // prepare the text
 161                      $part[1] = str_replace("\r", '', $part[1]);
 162                      $part[1] = explode("\n", $part[1]);
 163  
 164                      // foreach line of the text
 165                      foreach ($part[1] as $k => $txt) {
 166                          // transform the line
 167                          $txt = str_replace("\t", self::HTML_TAB, $txt);
 168                          $txt = str_replace(' ', '&nbsp;', $txt);
 169  
 170                          // add a break line
 171                          if ($k>0) $actions[] = $tagPreBr;
 172  
 173                          // save the action
 174                          $actions[] = array(
 175                              'name'    => 'write',
 176                              'close'    => false,
 177                              'param' => array('txt' => $this->_prepareTxt($txt, false)),
 178                          );
 179                      }
 180                  }
 181              }
 182          }
 183  
 184          // for each indentified action, we have to clean up the begin and the end of the texte
 185          // based on tags that surround it
 186  
 187          // list of the tags to clean
 188          $tagsToClean = array(
 189              'page', 'page_header', 'page_footer', 'form',
 190              'table', 'thead', 'tfoot', 'tr', 'td', 'th', 'br',
 191              'div', 'hr', 'p', 'ul', 'ol', 'li',
 192              'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
 193              'bookmark', 'fieldset', 'legend',
 194              'draw', 'circle', 'ellipse', 'path', 'rect', 'line', 'g', 'polygon', 'polyline',
 195              'option'
 196          );
 197  
 198          // foreach action
 199          $nb = count($actions);
 200          for ($k=0; $k<$nb; $k++) {
 201              // if it is a Text
 202              if ($actions[$k]['name']=='write') {
 203                  // if the tag before the text is a tag to clean => ltrim on the text
 204                  if ($k>0 && in_array($actions[$k-1]['name'], $tagsToClean))
 205                      $actions[$k]['param']['txt'] = ltrim($actions[$k]['param']['txt']);
 206  
 207                  // if the tag after the text is a tag to clean => rtrim on the text
 208                  if ($k<$nb-1 && in_array($actions[$k+1]['name'], $tagsToClean))
 209                      $actions[$k]['param']['txt'] = rtrim($actions[$k]['param']['txt']);
 210  
 211                  // if the text is empty => remove the action
 212                  if (!strlen($actions[$k]['param']['txt']))
 213                      unset($actions[$k]);
 214              }
 215          }
 216  
 217          // if we are not on the level 0 => HTML validator ERROR
 218          if (count($parents)) throw new HTML2PDF_exception(5, $parents);
 219  
 220          // save the actions to do
 221          $this->code = array_values($actions);
 222      }
 223  
 224      /**
 225       * prepare the text
 226       *
 227       * @param   string texte
 228       * @param   boolean true => replace multiple space+\t+\r+\n by a single space
 229       * @return  string texte
 230       * @access  protected
 231       */
 232      protected function _prepareTxt($txt, $spaces = true)
 233      {
 234          if ($spaces) $txt = preg_replace('/\s+/is', ' ', $txt);
 235          $txt = str_replace('&euro;', '€', $txt);
 236          $txt = html_entity_decode($txt, ENT_QUOTES, $this->_encoding);
 237          return $txt;
 238      }
 239  
 240      /**
 241       * parse the HTML code
 242       *
 243       * @param    &array    array's result
 244       * @return   null
 245       */
 246      protected function _searchCode(&$tmp)
 247      {
 248          // initialise the array
 249          $tmp = array();
 250  
 251          // regexp to separate the tags from the texts
 252          $reg = '/(<[^>]+>)|([^<]+)+/isU';
 253  
 254          // last match found
 255          $str = '';
 256          $offset = 0;
 257  
 258          // As it finds a match
 259          while (preg_match($reg, $this->_html, $parse, PREG_OFFSET_CAPTURE, $offset)) {
 260              // if it is a tag
 261              if ($parse[1][0]) {
 262                  // save the previous text if it exists
 263                  if ($str!=='')    $tmp[] = array('txt', $str);
 264  
 265                  // save the tag, with the offset
 266                  $tmp[] = array('code', trim($parse[1][0]), $offset);
 267  
 268                  // init the current text
 269                  $str = '';
 270              } else { // else (if it is a text)
 271                  // add the new text to the current text
 272                  $str.= $parse[2][0];
 273              }
 274  
 275              // Update offset to the end of the match
 276              $offset = $parse[0][1] + strlen($parse[0][0]);
 277              unset($parse);
 278          }
 279          // if a text is present in the end, we save it
 280          if ($str!='') $tmp[] = array('txt', $str);
 281          unset($str);
 282      }
 283  
 284      /**
 285       * analise a HTML tag
 286       *
 287       * @param   string   HTML code to analise
 288       * @return  array    corresponding action
 289       */
 290      protected function _analiseCode($code)
 291      {
 292          // name of the tag, opening, closure, autoclosure
 293          $tag = '<([\/]{0,1})([_a-z0-9]+)([\/>\s]+)';
 294          if (!preg_match('/'.$tag.'/isU', $code, $match)) return null;
 295          $close     = ($match[1]=='/' ? true : false);
 296          $autoclose = preg_match('/\/>$/isU', $code);
 297          $name      = strtolower($match[2]);
 298  
 299          // required parameters (depends on the tag name)
 300          $param    = array();
 301          $param['style'] = '';
 302          if ($name=='img') {
 303              $param['alt'] = '';
 304              $param['src'] = '';
 305          }
 306          if ($name=='a') {
 307              $param['href'] = '';
 308          }
 309  
 310          // read the parameters : nom=valeur
 311          $prop = '([a-zA-Z0-9_]+)=([^"\'\s>]+)';
 312          preg_match_all('/'.$prop.'/is', $code, $match);
 313          for($k=0; $k<count($match[0]); $k++)
 314              $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
 315  
 316          // read the parameters : nom="valeur"
 317          $prop = '([a-zA-Z0-9_]+)=["]([^"]*)["]';
 318          preg_match_all('/'.$prop.'/is', $code, $match);
 319          for($k=0; $k<count($match[0]); $k++)
 320              $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
 321  
 322          // read the parameters : nom='valeur'
 323          $prop = "([a-zA-Z0-9_]+)=[']([^']*)[']";
 324          preg_match_all('/'.$prop.'/is', $code, $match);
 325          for($k=0; $k<count($match[0]); $k++)
 326              $param[trim(strtolower($match[1][$k]))] = trim($match[2][$k]);
 327  
 328          // compliance of each parameter
 329          $color  = "#000000";
 330          $border = null;
 331          foreach ($param as $key => $val) {
 332              $key = strtolower($key);
 333              switch($key)
 334              {
 335                  case 'width':
 336                      unset($param[$key]);
 337                      $param['style'] .= 'width: '.$val.'px; ';
 338                      break;
 339  
 340                  case 'align':
 341                      if ($name==='img') {
 342                          unset($param[$key]);
 343                          $param['style'] .= 'float: '.$val.'; ';
 344                      } elseif ($name!=='table') {
 345                          unset($param[$key]);
 346                          $param['style'] .= 'text-align: '.$val.'; ';
 347                      }
 348                      break;
 349  
 350                  case 'valign':
 351                      unset($param[$key]);
 352                      $param['style'] .= 'vertical-align: '.$val.'; ';
 353                      break;
 354  
 355                  case 'height':
 356                      unset($param[$key]);
 357                      $param['style'] .= 'height: '.$val.'px; ';
 358                      break;
 359  
 360                  case 'bgcolor':
 361                      unset($param[$key]);
 362                      $param['style'] .= 'background: '.$val.'; ';
 363                      break;
 364  
 365                  case 'bordercolor':
 366                      unset($param[$key]);
 367                      $color = $val;
 368                      break;
 369  
 370                  case 'border':
 371                      unset($param[$key]);
 372                      if (preg_match('/^[0-9]+$/isU', $val)) $val = $val.'px';
 373                      $border = $val;
 374                      break;
 375  
 376                  case 'cellpadding':
 377                  case 'cellspacing':
 378                      if (preg_match('/^([0-9]+)$/isU', $val)) $param[$key] = $val.'px';
 379                      break;
 380  
 381                  case 'colspan':
 382                  case 'rowspan':
 383                      $val = preg_replace('/[^0-9]/isU', '', $val);
 384                      if (!$val) $val = 1;
 385                      $param[$key] = $val;
 386                      break;
 387              }
 388          }
 389  
 390          // compliance of the border
 391          if ($border!==null) {
 392              if ($border)    $border = 'border: solid '.$border.' '.$color;
 393              else            $border = 'border: none';
 394  
 395              $param['style'] .= $border.'; ';
 396              $param['border'] = $border;
 397          }
 398  
 399          // reading styles: decomposition and standardization
 400          $styles = explode(';', $param['style']);
 401          $param['style'] = array();
 402          foreach ($styles as $style) {
 403              $tmp = explode(':', $style);
 404              if (count($tmp)>1) {
 405                  $cod = $tmp[0];
 406                  unset($tmp[0]);
 407                  $tmp = implode(':', $tmp);
 408                  $param['style'][trim(strtolower($cod))] = preg_replace('/[\s]+/isU', ' ', trim($tmp));
 409              }
 410          }
 411  
 412          // determining the level of table opening, with an added level
 413          if (in_array($name, array('ul', 'ol', 'table')) && !$close) {
 414              $this->_num++;
 415              $this->_level[count($this->_level)] = $this->_num;
 416          }
 417  
 418          // get the level of the table containing the element
 419          if (!isset($param['num'])) {
 420              $param['num'] = $this->_level[count($this->_level)-1];
 421          }
 422  
 423          // for closures table: remove a level
 424          if (in_array($name, array('ul', 'ol', 'table')) && $close) {
 425              unset($this->_level[count($this->_level)-1]);
 426          }
 427  
 428          // prepare the parameters
 429          if (isset($param['value']))  $param['value']  = $this->_prepareTxt($param['value']);
 430          if (isset($param['alt']))    $param['alt']    = $this->_prepareTxt($param['alt']);
 431          if (isset($param['title']))  $param['title']  = $this->_prepareTxt($param['title']);
 432          if (isset($param['class']))  $param['class']  = $this->_prepareTxt($param['class']);
 433  
 434          // return the new action to do
 435          return array('name' => $name, 'close' => $close ? 1 : 0, 'autoclose' => $autoclose, 'param' => $param);
 436      }
 437  
 438      /**
 439       * get a full level of HTML, between an opening and closing corresponding
 440       *
 441       * @param   integer key
 442       * @return  array   actions
 443       */
 444      public function getLevel($k)
 445      {
 446          // if the code does not exist => return empty
 447          if (!isset($this->code[$k])) return array();
 448  
 449          // the tag to detect
 450          $detect = $this->code[$k]['name'];
 451  
 452          // if it is a text => return
 453          if ($detect=='write') {
 454              return array($this->code[$k]);
 455          }
 456  
 457          //
 458          $level = 0;      // depth level
 459          $end = false;    // end of the search
 460          $code = array(); // extract code
 461  
 462          // while it's not ended
 463          while (!$end) {
 464              // current action
 465              $row = $this->code[$k];
 466  
 467              // if 'write' => we add the text
 468              if ($row['name']=='write') {
 469                  $code[] = $row;
 470              } else { // else, it is a html tag
 471                  $not = false; // flag for not taking into account the current tag
 472  
 473                  // if it is the searched tag
 474                  if ($row['name']==$detect) {
 475                      // if we are just at the root level => dont take it
 476                      if ($level==0) {
 477                          $not = true;
 478                      }
 479  
 480                      // update the level
 481                      $level+= ($row['close'] ? -1 : 1);
 482  
 483                      // if we are now at the root level => it is the end, and dont take it
 484                      if ($level==0) {
 485                          $not = true;
 486                          $end = true;
 487                      }
 488                  }
 489  
 490                  // if we can takin into account the current tag => save it
 491                  if (!$not) {
 492                      if (isset($row['style']['text-align'])) unset($row['style']['text-align']);
 493                      $code[] = $row;
 494                  }
 495              }
 496  
 497              // it continues as long as there has code to analise
 498              if (isset($this->code[$k+1]))
 499                  $k++;
 500              else
 501                  $end = true;
 502          }
 503  
 504          // return the extract
 505          return $code;
 506      }
 507  
 508      /**
 509       * return a part of the HTML code, for error message
 510       *
 511       * @param   integer position
 512       * @param   integer take before
 513       * @param   integer take after
 514       * @return  string  part of the html code
 515       */
 516      public function getHtmlErrorCode($pos, $before=30, $after=40)
 517      {
 518          return substr($this->_html, $pos-$before, $before+$after);
 519      }
 520  }


Generated: Tue Mar 17 22:47:18 2015 Cross-referenced by PHPXref 0.7.1