PHP 縮短HTML內文的文字 html_substr

 

發表原文:http://tw2.php.net/manual/en/function.substr.php#59719

Here's a little addon to the html_substr function posted by fox.

Now it counts only chars outside of tags, and doesn't cut words.

Note: this will only work in xhtml strict/transitional due to the checking of "/>" tags and the requirement of quotations in every value of a tag. It's also only been tested with the presence of br, img, and a tags, but it should work with the presence of any tag.

 

<?php
/**
 * Shorten an XHTML fragment without cutting inside a tag.
 *
 * Input whose byte length does not exceed $minimum_length is returned as is.
 * Otherwise the text is scanned byte by byte while a small counter tracks
 * whether the scan is inside a tag; only bytes seen while that counter is
 * 0 or 2 are counted. The string is cut at the first position where more
 * than ($minimum_length - $length_offset) bytes have been counted, the tag
 * counter is back at 0, and either the next byte is a space or $cut_words
 * is enabled. If no such position exists the input is returned unchanged.
 *
 * The tag tracking relies on XHTML-style markup: attribute values must be
 * wrapped in double quotes so that slashes inside them are ignored.
 *
 * @param string $posttext        Markup to shorten.
 * @param int    $minimum_length  Approximate length wanted for the result.
 * @param int    $length_offset   How far below $minimum_length a cut may already happen.
 * @param bool   $cut_words       When FALSE, cut only in front of a space.
 * @param bool   $dots            When TRUE, append '...' to a shortened result.
 * @return string
 */
function html_substr($posttext, $minimum_length = 200, $length_offset = 20, $cut_words = FALSE, $dots = TRUE) {
    $total = strlen($posttext);

    // Nothing to shorten.
    if ($total <= $minimum_length) {
        return $posttext;
    }

    $threshold = $minimum_length - $length_offset;
    $tag_state = 0;          // +3 on "<x", +1 on "</", -2 on "/", -1 on ">"
    $inside_quotes = FALSE;  // TRUE between two double quotes
    $visible = 0;            // bytes counted so far

    for ($pos = 0; $pos < $total; $pos++) {
        $ch = substr($posttext, $pos, 1);
        $peek = ($pos + 1 < $total) ? substr($posttext, $pos + 1, 1) : "";

        if ($inside_quotes) {
            // Everything is ignored until the closing quote.
            if ($ch == '"') {
                $inside_quotes = FALSE;
            }
        } else {
            switch ($ch) {
                case '<':
                    $tag_state += ($peek == '/') ? 1 : 3;
                    break;
                case '/':
                    if ($tag_state <> 0) {
                        $tag_state -= 2;
                    }
                    break;
                case '>':
                    $tag_state -= 1;
                    break;
                case '"':
                    $inside_quotes = TRUE;
                    break;
            }
        }

        if ($tag_state == 0 || $tag_state == 2) {
            $visible++;
        }

        // Not yet allowed to cut: still inside markup or too few bytes counted.
        if ($tag_state != 0 || $visible <= $threshold) {
            continue;
        }
        // Respect word boundaries unless the caller opted out.
        if ($peek != ' ' && $cut_words != TRUE) {
            continue;
        }

        $shortened = substr($posttext, 0, $pos + 1);
        return $dots ? $shortened . '...' : $shortened;
    }

    return $posttext;
}
 
?>

<?php
 
/**
 * Length of a string where every HTML entity (e.g. "&nbsp;") counts as one
 * character; everything else is counted byte by byte.
 *
 * @param string $str
 * @return int
 */
function html_strlen($str) {
  // An entity is "&", one or more characters that are neither ";" nor
  // whitespace, then ";". The empty alternative makes preg_split() break
  // the remaining text into single characters.
  $chars = preg_split('/(&[^;\s]+;)|/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
  return count($chars);
}
 
/**
 * substr() variant that treats each HTML entity (e.g. "&amp;") as a single
 * character, so an entity is never cut in half.
 *
 * $start and $length follow the conventions of substr(): a negative $start
 * counts from the end, a negative $length omits that many characters from
 * the end, and a NULL $length means "to the end of the string".
 *
 * @param string   $str
 * @param int      $start
 * @param int|null $length
 * @return string
 */
function html_substr($str, $start, $length = NULL) {
  // A zero length can never yield anything (NULL means "no limit", so it
  // must be excluded before the loose comparison).
  if ($length !== NULL && $length == 0) return "";

  // Without "&" there cannot be an entity: defer to the built-in function.
  if (strpos($str, '&') === false) {
    if ($length === NULL)
      return substr($str, $start);
    else
      return substr($str, $start, $length);
  }

  // Each element is array(text, byte offset); entities stay in one piece.
  $chars = preg_split('/(&[^;\s]+;)|/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE);
  $html_length = count($chars);

  // Cases where the result is known to be empty.
  if (
       ($html_length === 0) /* nothing to take from */ or
       ($start >= $html_length) /* $start lies beyond the last character */ or
       (isset($length) and ($length <= -$html_length)) /* all characters would be omitted */
     )
    return "";

  // Translate $start into a non-negative character index.
  $first = ($start >= 0) ? $start : max($html_length + $start, 0);
  $real_start = $chars[$first][1];

  if (!isset($length)) // no $length argument passed, return all remaining characters
    return substr($str, $real_start);

  // Character index just past the last character to return.
  $end = ($length > 0) ? $first + $length : $html_length + $length;

  if ($end >= $html_length) // request reaches the end of the string
    return substr($str, $real_start);
  if ($end <= $first) // negative $length removes everything after $start
    return "";

  return substr($str, $real_start, $chars[$end][1] - $real_start);
}
 
?>

之前看了許多資料,後來發現還是會在某些情況下出問題,一是有BIG5碼中文字切割會產生最後一碼為亂碼,二是可能會切到HTML的Tag,索性先自己寫一個,但這還是有修正的必要。

PHP Function Code

    /**
     * Take a byte-wise substr() of $str and hand the slice to end_big5()
     * (defined elsewhere in this file) before returning it.
     *
     * @param string $str
     * @param int    $start  Passed to substr() unchanged.
     * @param int    $len    Passed to substr() unchanged.
     * @return string
     */
    function substr_big5($str,$start,$len)
    {
        $slice = substr($str, $start, $len);

        return end_big5($slice);
    }
 
    /**
     * Remove a dangling Big5 lead byte from the end of a string.
     *
     * A byte-wise cut can split a two-byte Big5 character and leave only
     * its first byte behind. All complete two-byte characters are stripped
     * from a working copy; if what remains still ends in a lead byte, the
     * last byte of the input is dropped.
     *
     * @param string $src  Big5-encoded bytes.
     * @return string      $src, possibly one byte shorter.
     */
    function end_big5($src){
        // Lead byte 0xA1-0xF9 followed by trail byte 0x40-0x7E or 0xA1-0xFE.
        $str = preg_replace('/[\xa1-\xf9][\x40-\x7e\xa1-\xfe]/', "", $src);
        // \z anchors at the very end, even when the string ends in "\n".
        return (preg_match('/[\xa1-\xf9]\z/', $str)) ? substr($src,0,-1) : $src;
    }
 
    /**
     * Byte-wise substring that is extended until it ends on a ">" (or on
     * the end of the string when no further ">" exists), then passed
     * through end_big5() (defined elsewhere in this file).
     *
     * @param string $str
     * @param int    $start  Byte offset of the first byte to return.
     * @param int    $len    Minimum number of bytes wanted.
     * @return string
     */
    function html_substr($str,$start,$len){
        $total = strlen($str);
        // $end is an absolute offset just past the last byte to keep; move
        // it forward until the byte in front of it is ">".
        for($end=$start+$len;$end<$total;$end++){
            if(substr($str,$end-1,1)==">"){
                break;
            }
        }
        // substr() expects a length, not an end offset.
        return end_big5(substr($str,$start,$end-$start));
    }

 

PHP CALL Code

<?php /* Print the "ArticleContent" field of $row shortened by html_substr(), starting at offset 0. _Web_ShortText_Length is presumably a constant defined elsewhere that holds the preview length - confirm. */ echo html_substr($row["ArticleContent"],0,_Web_ShortText_Length); ?>

當然這個模組還是有點小問題:遇到巢狀的HTML碼(如<ul><li></li></ul>),或是尚未收合的Tag(如<a href=...>)時,截短後仍會留下沒有收合的標籤。這可能要參考先前找到的兩篇資料來看看,希望能解決。

翻了一下 Smarty plugin裡面的 Html_Substr , 發現寫的實在是太漂亮了.

原文網址:http://www.phpinsider.com/smarty-forum/viewtopic.php?t=533

所以當然就要拿出來用看看.

 /**
  * Shorten an XHTML string to about $length text characters while keeping
  * the markup well-formed.
  *
  * Bytes inside tags are copied but not counted. Scanning stops once
  * $length text bytes have been seen. If the tag-stripped text is longer
  * than $length, the result is cut back to just before the last space met
  * in the text, or left at the scan position when no space was seen.
  * Every element still open at the cut position is then closed, innermost
  * first, each closer followed by a newline.
  *
  * Only XHTML-style markup is understood: an element counts as opened
  * unless its tag contains "/>" or "</", so unclosed void tags such as
  * "<br>" receive a closing tag as well.
  *
  * @param string $string  Markup to shorten.
  * @param int    $length  Number of text bytes to keep.
  * @return string         Shortened markup, or "" for empty input or a non-positive $length.
  */
 function html_substr($string, $length)
    {
        if( empty( $string ) || $length <= 0 ) {
            return "";
        }

        $isText = true;              // false while scanning inside <...>
        $ret = "";                   // everything scanned so far
        $i = 0;                      // text bytes scanned so far

        $lastSpacePosition = -1;     // offset of the last space seen in text
        $tagsAtLastSpace = array();  // elements that were open at that space

        $tagsArray = array();        // stack of currently open elements
        $currentTag = "";            // tag being collected

        $noTagLength = strlen( strip_tags( $string ) );
        $totalLength = strlen( $string );

        // Parser loop
        for( $j = 0; $j < $totalLength; $j++ ) {

            $currentChar = substr( $string, $j, 1 );
            $ret .= $currentChar;

            // "<" starts a tag
            if( $currentChar === "<" ) $isText = false;

            if( $isText ) {
                // Remember the space together with the elements open there,
                // so a later cut at this position closes exactly those.
                if( $currentChar === " " ) {
                    $lastSpacePosition = $j;
                    $tagsAtLastSpace = $tagsArray;
                }

                $i++;
            } else {
                $currentTag .= $currentChar;
            }

            // ">" ends a tag
            if( $currentChar === ">" ) {
                $isText = true;

                // Opening tag: neither self-closing nor a closing tag
                if( ( strpos( $currentTag, "<" ) !== FALSE ) &&
                    ( strpos( $currentTag, "/>" ) === FALSE ) &&
                    ( strpos( $currentTag, "</" ) === FALSE ) ) {

                    if( strpos( $currentTag, " " ) !== FALSE ) {
                        // Tag has attribute(s): the name ends at the first space
                        $currentTag = substr( $currentTag, 1, strpos( $currentTag, " " ) - 1 );
                    } else {
                        // Tag doesn't have attribute(s): strip "<" and ">"
                        $currentTag = substr( $currentTag, 1, -1 );
                    }

                    array_push( $tagsArray, $currentTag );

                } else if( strpos( $currentTag, "</" ) !== FALSE ) {

                    array_pop( $tagsArray );
                }

                $currentTag = "";
            }

            if( $i >= $length ) {
                break;
            }
        }

        // Cut back to the last space so that no word is split. Without a
        // space, $ret already holds exactly what was scanned.
        if( $length < $noTagLength && $lastSpacePosition != -1 ) {
            $ret = substr( $string, 0, $lastSpacePosition );
            $tagsArray = $tagsAtLastSpace;
        }

        // Close elements that are still open
        while( sizeof( $tagsArray ) != 0 ) {
            $aTag = array_pop( $tagsArray );
            $ret .= "</" . $aTag . ">\n";
        }

        return( $ret );
    }


 

之前看了 Smarty plugin 中的 Html_Substr 函數 , 發現在中文的處理上還是有點不夠完美 , 因此就打算自己寫一個來專門處理中文 , 並且一樣要保持 Html 碼在截短後的完整性 , 看是否能夠更完整的處理本文縮短的問題.

構想 :HTML標籤使用堆疊方法來記錄 , 並在截短後輸入補上結尾 . 中文字部份使用 php mbstring 系列函數來處理 , 包含長度及取字 . 取字長度使用內文長度 , 而非包含HTML的原始碼長度.

/**
 * Multibyte-aware shortening of an XHTML string to about $length text
 * characters while keeping the markup well-formed.
 *
 * Characters inside tags are copied but not counted. Scanning stops once
 * $length text characters have been seen. If the tag-stripped text is
 * longer than $length, the result is cut back to just before the last
 * space met in the text, or left at the scan position when no space was
 * seen. Every element still open at the cut position is then closed,
 * innermost first, each closer followed by a newline.
 *
 * Only XHTML-style markup is understood: an element counts as opened
 * unless its tag contains "/>" or "</".
 *
 * @param string $string    Markup to shorten.
 * @param int    $length    Number of text characters to keep.
 * @param string $encoding  mbstring encoding name of $string.
 * @return string           Shortened markup, or "" for empty input or a non-positive $length.
 */
function html_substr($string, $length, $encoding = 'BIG-5')
 {
  if( empty( $string ) || $length <= 0 ) {
   return "";
  }

  $isText = true;              // false while scanning inside <...>
  $ret = "";                   // everything scanned so far
  $i = 0;                      // text characters scanned so far

  $lastSpacePosition = -1;     // character index of the last space seen in text
  $tagsAtLastSpace = array();  // elements that were open at that space

  $tagsArray = array();        // stack of currently open elements
  $currentTag = "";            // tag being collected

  // Length of the text once all tags are stripped
  $noTagLength = mb_strlen( strip_tags( $string ), $encoding );
  $totalLength = mb_strlen( $string, $encoding );

  // Walk over every character
  for( $j = 0; $j < $totalLength; $j++ ) {

   $currentChar = mb_substr( $string, $j, 1, $encoding );
   $ret .= $currentChar;

   // "<" starts a tag
   if( $currentChar === "<" ) $isText = false;

   if( $isText ) {
    // Remember the space together with the elements open there,
    // so a later cut at this position closes exactly those.
    if( $currentChar === " " ) {
     $lastSpacePosition = $j;
     $tagsAtLastSpace = $tagsArray;
    }

    $i++;
   } else {
    $currentTag .= $currentChar;
   }

   // ">" ends a tag
   if( $currentChar === ">" ) {
    $isText = true;

    // Opening tag: neither self-closing nor a closing tag
    if( ( mb_strpos( $currentTag, "<", 0, $encoding ) !== FALSE ) &&
     ( mb_strpos( $currentTag, "/>", 0, $encoding ) === FALSE ) &&
     ( mb_strpos( $currentTag, "</", 0, $encoding ) === FALSE ) ) {

     if( mb_strpos( $currentTag, " ", 0, $encoding ) !== FALSE ) {
      // With attributes: the name ends at the first space
      $currentTag = mb_substr( $currentTag, 1, mb_strpos( $currentTag, " ", 0, $encoding ) - 1, $encoding );
     } else {
      // Without attributes: strip "<" and ">"
      $currentTag = mb_substr( $currentTag, 1, -1, $encoding );
     }

     array_push( $tagsArray, $currentTag );

    } else if( mb_strpos( $currentTag, "</", 0, $encoding ) !== FALSE ) {
     // A closing tag ends the innermost open element
     array_pop( $tagsArray );
    }

    $currentTag = "";
   }

   // Stop once enough text characters have been collected
   if( $i >= $length ) {
    break;
   }
  }

  // Cut back to the last space so that no word is split. Without a
  // space, $ret already holds exactly what was scanned.
  if( $length < $noTagLength && $lastSpacePosition != -1 ) {
   $ret = mb_substr( $string, 0, $lastSpacePosition, $encoding );
   $tagsArray = $tagsAtLastSpace;
  }

  // Close elements that are still open
  while( sizeof( $tagsArray ) != 0 ) {
   $aTag = array_pop( $tagsArray );
   $ret .= "</" . $aTag . ">\n";
  }

  return( $ret );
 }


 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章