發表原文:http://tw2.php.net/manual/en/function.substr.php#59719
Here's a little addon to the html_substr function posted by fox.
Now it counts only chars outside of tags, and doesn't cut words.
Note: this will only work in xhtml strict/transitional due to the checking of "/>" tags and the requirement of quotations in every value of a tag. It's also only been tested with the presence of br, img, and a tags, but it should work with the presence of any tag.
<?php
function html_substr($posttext, $minimum_length = 200, $length_offset = 20, $cut_words = FALSE, $dots = TRUE) {
    // Shorten an XHTML fragment without cutting inside a tag.
    //
    // $minimum_length: approximate length wanted for the shortened text.
    // $length_offset:  tolerance; cutting becomes possible once more than
    //                  ($minimum_length - $length_offset) characters have
    //                  been counted.
    // $cut_words:      when FALSE, only cut where the next character is a space.
    // $dots:           when truthy, append '...' to a shortened result.
    //
    // Returns the input unchanged when its byte length does not exceed
    // $minimum_length, or when no suitable cut position is found.
    $total = strlen($posttext);
    if ($total <= $minimum_length) {
        return $posttext;
    }

    // Tag state: 0 = outside any tag, 2 = between an opening tag and its
    // closing tag; any other value means the scanner is inside tag markup.
    $tag_state = 0;
    $in_quotes = FALSE;
    $counted = 0;
    $threshold = $minimum_length - $length_offset;

    for ($pos = 0; $pos < $total; $pos++) {
        $ch = $posttext[$pos];
        // One-character look-ahead; empty string on the last character.
        $peek = ($pos + 1 < $total) ? $posttext[$pos + 1] : "";

        if ($in_quotes) {
            // Inside a quoted attribute value: only a closing quote matters,
            // so slashes in paths/URLs do not disturb the tag state.
            if ($ch == '"') {
                $in_quotes = FALSE;
            }
        } else {
            // "<" adds 3 for an opening tag (<a ...) and 1 for a closing tag (</a>).
            if ($ch == '<') {
                $tag_state += ($peek == '/') ? 1 : 3;
            }
            // "/" marks an ending (</a> or ... />): subtract 2.
            if ($ch == '/' && $tag_state != 0) {
                $tag_state -= 2;
            }
            // ">" subtracts 1.
            if ($ch == '>') {
                $tag_state -= 1;
            }
            // A quote starts an attribute value.
            if ($ch == '"') {
                $in_quotes = TRUE;
            }
        }

        // Count only characters that are not part of tag markup.
        if ($tag_state == 0 || $tag_state == 2) {
            $counted++;
        }

        // Cut once enough characters were counted, every tag is balanced,
        // and we are at a word boundary (unless word cutting is allowed).
        $at_boundary = ($cut_words == TRUE || $peek == ' ');
        if ($counted > $threshold && $tag_state == 0 && $at_boundary) {
            $shortened = substr($posttext, 0, $pos + 1);
            return $dots ? $shortened . '...' : $shortened;
        }
    }

    return $posttext;
}
?>
<?php
/**
 * Count the characters of a string, treating each HTML entity
 * (e.g. "&amp;", "&nbsp;") as a single character.
 *
 * Counting is byte-based for everything that is not an entity.
 *
 * @param string $str Text that may contain HTML entities.
 * @return int Number of bytes/entities in $str.
 */
function html_strlen($str) {
    // An entity is "&", then one or more characters that are neither ";"
    // nor whitespace, then ";". The empty alternative makes preg_split()
    // break between every remaining single byte.
    // Fix: the class must be [^;\s]; without the backslash any entity
    // containing the letter "s" (such as &nbsp;) was split byte by byte.
    $chars = preg_split('/(&[^;\s]+;)|/', $str, -1, PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE);
    return count($chars);
}
/**
 * substr() variant that treats every HTML entity as one character.
 *
 * $start and $length follow the conventions of the built-in substr()
 * (negative $start counts from the end, negative $length omits characters
 * from the end), but are measured in "characters" where an entity such as
 * "&nbsp;" counts as one. Non-entity text is handled byte by byte.
 *
 * @param string   $str    Text that may contain HTML entities.
 * @param int      $start  Start position in entity-aware characters.
 * @param int|null $length Number of characters to return; NULL means
 *                         "everything up to the end".
 * @return string The selected part of $str, or "" when nothing is selected.
 */
function html_substr($str, $start, $length = NULL) {
    if ($length === 0) {
        return ""; // nothing requested
    }

    // Without "&" there cannot be an entity: defer to the built-in.
    if (strpos($str, '&') === false) {
        if ($length === NULL) {
            return substr($str, $start);
        }
        return substr($str, $start, $length);
    }

    // Split into [text, byte offset] pairs, one per byte or entity.
    // Fix: the class must be [^;\s]; without the backslash entities
    // containing the letter "s" (e.g. &nbsp;) were not recognised.
    $chars = preg_split(
        '/(&[^;\s]+;)|/',
        $str,
        -1,
        PREG_SPLIT_NO_EMPTY | PREG_SPLIT_DELIM_CAPTURE | PREG_SPLIT_OFFSET_CAPTURE
    );
    $html_length = count($chars);

    // Cases whose result is known to be empty.
    if (
        ($html_length === 0)                              // empty input
        or ($start >= $html_length)                       // start beyond the end
        or (isset($length) and ($length <= -$html_length)) // everything omitted
    ) {
        return "";
    }

    // Resolve $start to a non-negative index into $chars.
    if ($start >= 0) {
        $start_index = $start;
    } else {
        // start'th character from the end, clamped to the beginning
        $start_index = $html_length + max($start, -$html_length);
    }
    $real_start = $chars[$start_index][1];

    // No $length: return all remaining characters.
    if (!isset($length)) {
        return substr($str, $real_start);
    }

    // Resolve the exclusive end index. Fix: it is measured from the
    // resolved start index, so a negative $start combined with a positive
    // $length no longer produces an empty result.
    if ($length > 0) {
        $end_index = $start_index + $length;
    } else {
        $end_index = $html_length + $length; // omit |$length| chars from the end
    }

    if ($end_index >= $html_length) {
        return substr($str, $real_start);
    }
    // Fix: an end at or before the start selects nothing; previously a
    // negative byte count was handed to substr() and could return text.
    if ($end_index <= $start_index) {
        return "";
    }

    return substr($str, $real_start, $chars[$end_index][1] - $real_start);
}
?>
之前看了許多資料,後來發現還是會在某些情況下出問題,一是有BIG5碼中文字切割會產生最後一碼為亂碼,二是可能會切到HTML的Tag,索性先自己寫一個,但這還是有修正的必要。
PHP Function Code
// Byte-wise substr() whose result is passed through end_big5(), so the
// returned string does not end in the first half of a BIG5 character.
function substr_big5($str, $start, $len)
{
    $slice = substr($str, $start, $len);

    return end_big5($slice);
}
/**
 * Remove a dangling BIG5 lead byte from the end of a byte string.
 *
 * A BIG5 character is a lead byte (0xA1-0xF9) followed by a trail byte
 * (0x40-0x7E or 0xA1-0xFE). After a byte-wise cut the string may end with
 * a lead byte only, which displays as garbage.
 *
 * @param string $src BIG5-encoded byte string.
 * @return string $src without its last byte when that byte is an unpaired
 *                lead byte, otherwise $src unchanged.
 */
function end_big5($src)
{
    // Fix: the byte ranges need their \x escapes. Without the backslashes
    // the classes matched ordinary ASCII letters and digits, so plain
    // text lost its last character and real BIG5 tails went undetected.
    // Strip every complete two-byte character first...
    $str = preg_replace('/[\xa1-\xf9][\x40-\x7e\xa1-\xfe]/', "", $src);

    // ...so that a lead byte remaining at the very end must be unpaired.
    return (preg_match('/[\xa1-\xf9]$/', $str)) ? substr($src, 0, -1) : $src;
}
/**
 * Cut a BIG5 HTML string, extending the cut so that it ends right after a ">".
 *
 * Starting from the byte position $start + $len, the end of the cut is
 * moved forward until the byte just before it is ">" or the end of the
 * string is reached. The result is passed through end_big5().
 *
 * Nested tags and tags left open (e.g. an unclosed <a href=...>) are not
 * repaired by this function.
 *
 * @param string $str   BIG5-encoded HTML.
 * @param int    $start Byte offset where the cut starts (non-negative expected).
 * @param int    $len   Minimum number of bytes to keep.
 * @return string The cut string.
 */
function html_substr($str, $start, $len)
{
    $total = strlen($str);

    // The cut must end on ">": advance the end position until it does.
    for ($end = $start + $len; $end < $total; $end++) {
        if (substr($str, $end - 1, 1) == ">") {
            break;
        }
    }

    // Fix: substr() takes a length, not an end position. Passing the end
    // position returned $start extra bytes whenever $start was above 0.
    return end_big5(substr($str, $start, $end - $start));
}
PHP CALL Code
<?php
// Print the article content cut to _Web_ShortText_Length, starting at offset 0.
echo html_substr($row["ArticleContent"], 0, _Web_ShortText_Length);
?>
當然這個模組還是有點小問題,就是在巢狀的HTML碼(如<ul><li></li></ul>),或是Tag還是會有沒收合的問題(如<a href=...>),這可能要參考先前找到的兩篇資料來看看,希望能解決。
翻了一下 Smarty plugin裡面的 Html_Substr , 發現寫的實在是太漂亮了.
原文網址:http://www.phpinsider.com/smarty-forum/viewtopic.php?t=533
所以當然就要拿出來用看看.
/**
 * Truncate an (X)HTML string to $length text characters and close the
 * elements left open by the cut.
 *
 * Only characters outside of tags count towards $length. When the text is
 * longer than $length the cut is made at the last space seen in the text;
 * if there was no space, exactly $length text characters are kept. Every
 * element still open at the cut is closed with "</tag>\n".
 *
 * The scan is byte-based, and every tag that is neither "</...>" nor
 * ".../>" is treated as an element that needs a closing tag.
 *
 * @param string $string HTML to shorten.
 * @param int    $length Maximum number of text characters to keep.
 * @return string Shortened HTML, or "" for empty input or $length <= 0.
 */
function html_substr($string, $length)
{
    if (empty($string) || $length <= 0) {
        return "";
    }

    $isText = true;            // true while outside of a tag
    $ret = "";                 // everything consumed so far
    $textCount = 0;            // text characters consumed so far
    $lastSpacePosition = -1;   // byte offset of the last space in text
    $tagsArray = array();      // stack of currently open element names
    $tagsAtLastSpace = array(); // copy of the stack taken at the last space
    $currentTag = "";          // tag being collected
    $noTagLength = strlen(strip_tags($string));
    $total = strlen($string);

    // Parser loop
    for ($j = 0; $j < $total; $j++) {
        $currentChar = $string[$j];
        $ret .= $currentChar;

        // "<" starts a tag
        if ($currentChar == "<") {
            $isText = false;
        }

        if ($isText) {
            if ($currentChar == " ") {
                // Remember the cut candidate together with the elements
                // that are open at this exact position.
                $lastSpacePosition = $j;
                $tagsAtLastSpace = $tagsArray;
            }
            $textCount++;
        } else {
            $currentTag .= $currentChar;
        }

        // ">" ends a tag
        if ($currentChar == ">") {
            $isText = true;

            if ((strpos($currentTag, "<") !== FALSE) &&
                (strpos($currentTag, "/>") === FALSE) &&
                (strpos($currentTag, "</") === FALSE)) {
                // Opening tag: the name ends at the first space (tag with
                // attributes) or just before the closing ">".
                $spaceAt = strpos($currentTag, " ");
                if ($spaceAt !== FALSE) {
                    $tagName = substr($currentTag, 1, $spaceAt - 1);
                } else {
                    $tagName = substr($currentTag, 1, -1);
                }
                array_push($tagsArray, $tagName);
            } else if (strpos($currentTag, "</") !== FALSE) {
                // Closing tag: the innermost element is done.
                array_pop($tagsArray);
            }
            $currentTag = "";
        }

        if ($textCount >= $length) {
            break;
        }
    }

    // Cut at the last space when the text really had to be shortened.
    if ($length < $noTagLength && $lastSpacePosition != -1) {
        $ret = substr($string, 0, $lastSpacePosition);
        // Fix: close the elements open at the cut, not those open where
        // the scan stopped; otherwise tags opened or closed after the
        // last space produced stray or missing closing tags.
        $tagsArray = $tagsAtLastSpace;
    }
    // Fix: without a space $ret already holds exactly the consumed prefix.
    // The previous substr($string, $j) returned the tail of the string.

    // Close broken XHTML elements, innermost first.
    // Fix: the separator is a newline ("\n"), not a literal "n".
    while (sizeof($tagsArray) != 0) {
        $aTag = array_pop($tagsArray);
        $ret .= "</" . $aTag . ">\n";
    }

    return $ret;
}
之前看了 Smarty Plugin 中的 Html_Substr 函數 , 發現在中文的處理上還是有點不夠完美 , 因此就打算自己寫一個來專門處理中文 , 並且一樣要保持 Html 碼在截短後的完整性 , 看是否能夠更完整的處理本文縮短的問題.
構想 :HTML標籤使用堆疊方法來記錄 , 並在截短後輸入補上結尾 . 中文字部份使用 php mbstring 系列函數來處理 , 包含長度及取字 . 取字長度使用內文長度 , 而非包含HTML的原始碼長度.
/**
 * Multibyte-aware truncation of an (X)HTML string that keeps the markup
 * well-formed.
 *
 * Open elements are tracked on a stack and closed after the cut. Length is
 * measured in text characters (outside of tags) using the mbstring
 * functions, not in bytes of the HTML source. When the text is longer than
 * $length the cut is made at the last space seen in the text; without a
 * space exactly $length text characters are kept.
 *
 * Every tag that is neither "</...>" nor ".../>" is treated as an element
 * that needs a closing tag.
 *
 * @param string $string   HTML to shorten.
 * @param int    $length   Maximum number of text characters to keep.
 * @param string $encoding Character encoding of $string for the mbstring
 *                         functions; defaults to 'BIG-5'.
 * @return string Shortened HTML, or "" for empty input or $length <= 0.
 */
function html_substr($string, $length, $encoding = 'BIG-5')
{
    if (empty($string) || $length <= 0) {
        return "";
    }

    $isText = true;             // true while outside of a tag
    $ret = "";                  // everything consumed so far
    $textCount = 0;             // text characters consumed (length check)
    $lastSpacePosition = -1;    // character index of the last space in text
    $tagsArray = array();       // stack of currently open element names
    $tagsAtLastSpace = array(); // copy of the stack taken at the last space
    $currentTag = "";           // tag being collected
    // Character count of the string with all tags stripped.
    $noTagLength = mb_strlen(strip_tags($string), $encoding);
    // Loop invariant, so it is measured once instead of on every pass.
    $totalChars = mb_strlen($string, $encoding);

    // Walk over every character.
    for ($j = 0; $j < $totalChars; $j++) {
        $currentChar = mb_substr($string, $j, 1, $encoding);
        $ret .= $currentChar;

        // Start of an HTML tag
        if ($currentChar == "<") {
            $isText = false;
        }

        if ($isText) {
            if ($currentChar == " ") {
                // A space is a candidate cut position; remember which
                // elements are open at this exact position.
                $lastSpacePosition = $j;
                $tagsAtLastSpace = $tagsArray;
            }
            $textCount++;
        } else {
            $currentTag .= $currentChar;
        }

        // End of an HTML tag
        if ($currentChar == ">") {
            $isText = true;

            // Does this tag open an element that needs closing?
            if ((mb_strpos($currentTag, "<", 0, $encoding) !== FALSE) &&
                (mb_strpos($currentTag, "/>", 0, $encoding) === FALSE) &&
                (mb_strpos($currentTag, "</", 0, $encoding) === FALSE)) {
                // Extract the tag name, with or without attributes.
                $spaceAt = mb_strpos($currentTag, " ", 0, $encoding);
                if ($spaceAt !== FALSE) {
                    // with attributes: name ends at the first space
                    $tagName = mb_substr($currentTag, 1, $spaceAt - 1, $encoding);
                } else {
                    // without attributes: name ends before ">"
                    $tagName = mb_substr($currentTag, 1, -1, $encoding);
                }
                array_push($tagsArray, $tagName);
            } else if (mb_strpos($currentTag, "</", 0, $encoding) !== FALSE) {
                // Closing tag: the innermost element is finished.
                array_pop($tagsArray);
            }
            // Reset the tag collector.
            $currentTag = "";
        }

        // Stop once enough text characters were consumed.
        if ($textCount >= $length) {
            break;
        }
    }

    // Cut at the last space when the text really had to be shortened.
    if ($length < $noTagLength && $lastSpacePosition != -1) {
        $ret = mb_substr($string, 0, $lastSpacePosition, $encoding);
        // Fix: close the elements open at the cut, not those open where
        // the scan stopped; otherwise tags opened or closed after the
        // last space produced stray or missing closing tags.
        $tagsArray = $tagsAtLastSpace;
    }
    // Fix: without a space $ret already holds exactly $length text
    // characters. The previous mb_substr($string, 0, $j) dropped the last
    // counted character (off by one).

    // Append closing tags for elements still open, innermost first.
    // Fix: the separator is a newline ("\n"), not a literal "n".
    while (sizeof($tagsArray) != 0) {
        $aTag = array_pop($tagsArray);
        $ret .= "</" . $aTag . ">\n";
    }

    return $ret;
}