Java 正則表達式處理 HTML 標籤

package com.jrj.stock.common.util;

import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashSet;
import java.util.List;
import java.util.Locale;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Regex-based helpers for extracting and stripping HTML tags.
 *
 * <p>These methods work on plain text with regular expressions; they are not
 * an HTML parser. In particular a {@code >} inside an attribute value, HTML
 * comments and script bodies containing tag-like text are not understood.
 * Tag names are matched case-insensitively (ASCII) and as whole names, so
 * asking for {@code p} never matches {@code <pre>}.
 *
 * @author TonsonMiao
 */
public class HtmlTagUtil {

	/** Matches any tag-like span: {@code <}, any run of non-{@code >} characters, then {@code >}. */
	private static final Pattern ANY_TAG = Pattern.compile("<[^>]*>");

	/** Captures the tag name that follows an opening {@code <} (closing tags start with {@code </} and are skipped). */
	private static final Pattern TAG_NAME = Pattern.compile("<\\s*([a-zA-Z][a-zA-Z0-9-]*)");

	/** HTML void elements: they have no closing tag, so only the single tag is stripped. */
	private static final Set<String> VOID_ELEMENTS = new HashSet<String>(Arrays.asList(
			"area", "base", "br", "col", "embed", "hr", "img", "input",
			"link", "meta", "param", "source", "track", "wbr"));

	/**
	 * Returns the inner content of the first {@code <tag ...>...</tag>} element.
	 *
	 * @param target the HTML text to search; must not be null
	 * @param tag    the tag name, e.g. {@code "div"}; must not be null or empty
	 * @return the text between the first matching opening tag and the nearest
	 *         following closing tag, or {@code null} if no such element exists
	 * @throws NullPointerException     if {@code target} or {@code tag} is null
	 * @throws IllegalArgumentException if {@code tag} is empty
	 */
	public String getTagContentFirst(String target, String tag) {
		Matcher matcher = elementPattern(tag).matcher(requireNonNull(target, "target"));
		// group(1) is the inner content, consistent with getTagContentAll.
		return matcher.find() ? matcher.group(1) : null;
	}

	/**
	 * Returns the inner content of every {@code <tag ...>...</tag>} element,
	 * in document order.
	 *
	 * @param target the HTML text to search; must not be null
	 * @param tag    the tag name; must not be null or empty
	 * @return a new mutable list of inner contents; empty if nothing matches
	 * @throws NullPointerException     if {@code target} or {@code tag} is null
	 * @throws IllegalArgumentException if {@code tag} is empty
	 */
	public List<String> getTagContentAll(String target, String tag) {
		Matcher matcher = elementPattern(tag).matcher(requireNonNull(target, "target"));
		List<String> contents = new ArrayList<String>();
		while (matcher.find()) {
			contents.add(matcher.group(1));
		}
		return contents;
	}

	/**
	 * Removes every {@code <tag ...>...</tag>} element, including its content.
	 *
	 * <p>Matching is non-greedy, so for directly nested elements of the same
	 * name the outer closing tag is left behind.
	 *
	 * @param target the HTML text to clean; must not be null
	 * @param tag    the tag name; must not be null or empty
	 * @return {@code target} with all matching elements removed
	 * @throws NullPointerException     if {@code target} or {@code tag} is null
	 * @throws IllegalArgumentException if {@code tag} is empty
	 */
	public String clearTag(String target, String tag) {
		return elementPattern(tag).matcher(requireNonNull(target, "target")).replaceAll("");
	}

	/**
	 * Strips the markup of the given tag while keeping the enclosed content.
	 *
	 * <p>For paired tags the opening and closing tags are removed and the inner
	 * content is kept; the pass is repeated until nothing matches so that nested
	 * elements of the same name are unwrapped too. For void elements (such as
	 * {@code img}, {@code input}, {@code br}) the single tag is removed.
	 *
	 * @param target the HTML text to clean; must not be null
	 * @param tag    the tag name; must not be null or empty
	 * @return {@code target} without the given tag's markup
	 * @throws NullPointerException     if {@code target} or {@code tag} is null
	 * @throws IllegalArgumentException if {@code tag} is empty
	 */
	public String clearTagName(String target, String tag) {
		String result = requireNonNull(target, "target");
		boolean paired = !VOID_ELEMENTS.contains(requireTag(tag).toLowerCase(Locale.ROOT));
		Pattern pattern = paired ? elementPattern(tag) : voidTagPattern(tag);
		// "$1" re-inserts the captured inner content literally; void tags have none.
		String replacement = paired ? "$1" : "";

		while (true) {
			String stripped = pattern.matcher(result).replaceAll(replacement);
			// Every replacement removes at least the tag characters, so an
			// unchanged length means there was no match and we are done.
			if (stripped.length() == result.length()) {
				return result;
			}
			result = stripped;
		}
	}

	/**
	 * Removes every tag-like span ({@code <...>}) and keeps all other text.
	 *
	 * <p>Any text between a {@code <} and the next {@code >} is treated as a
	 * tag, including unescaped comparison operators in plain text.
	 *
	 * @param target the HTML text to clean; must not be null
	 * @return {@code target} with all {@code <...>} spans removed
	 * @throws NullPointerException if {@code target} is null
	 */
	public String clearTagNameAll(String target) {
		return ANY_TAG.matcher(requireNonNull(target, "target")).replaceAll("");
	}

	/**
	 * Lists the distinct names of all opening tags found in the text.
	 *
	 * @param target the HTML text to scan; must not be null
	 * @return a new mutable set of tag names, upper-cased (e.g. {@code "H1"},
	 *         {@code "DIV"}); empty if no opening tag is found
	 * @throws NullPointerException if {@code target} is null
	 */
	public Set<String> listTagNameAll(String target) {
		Matcher matcher = TAG_NAME.matcher(requireNonNull(target, "target"));
		Set<String> names = new HashSet<String>();
		while (matcher.find()) {
			// Locale.ROOT keeps the result stable regardless of the default locale.
			names.add(matcher.group(1).toUpperCase(Locale.ROOT));
		}
		return names;
	}

	/** Builds the pattern for a paired element; group 1 is the inner content. */
	private static Pattern elementPattern(String tag) {
		String name = Pattern.quote(requireTag(tag));
		// The lookahead forces the name to end here, so "p" does not match "<pre>".
		return Pattern.compile(
				"<" + name + "(?=[\\s/>])[^>]*>(.*?)</" + name + "\\s*>",
				Pattern.CASE_INSENSITIVE | Pattern.DOTALL);
	}

	/** Builds the pattern for a single (void or self-closing) tag. */
	private static Pattern voidTagPattern(String tag) {
		String name = Pattern.quote(requireTag(tag));
		return Pattern.compile("<" + name + "(?=[\\s/>])[^>]*>", Pattern.CASE_INSENSITIVE);
	}

	/** Validates a tag name argument and returns it. */
	private static String requireTag(String tag) {
		requireNonNull(tag, "tag");
		if (tag.length() == 0) {
			throw new IllegalArgumentException("tag must not be empty");
		}
		return tag;
	}

	/** Fails fast with a descriptive message instead of an anonymous NPE deeper in the regex engine. */
	private static String requireNonNull(String value, String name) {
		if (value == null) {
			throw new NullPointerException(name + " must not be null");
		}
		return value;
	}

}

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章