上一篇文章介紹了,通過rss抓取新聞正文。這裏介紹同時抓取多個RSS站點的正文,以及抓取正文中的圖片。
我的RSS不是抓取 站點 <body></body>中的內容,而是需要的正文,廣告評論等都排除在外。
第一部分:同時抓取多個站點,看看我的站點配置
<?xml version="1.0" encoding="GB2312"?>
<websites>
<site>
<name>IT之家</name>
<url>http://www.ithome.com/rss/</url>
<startTag><![CDATA[<div class="post_content" id="paragraph">]]></startTag>
<endTag><![CDATA[<div class="share">]]></endTag>
<encoding>GB2312</encoding>
<open>true</open>
</site>
<site>
<name>虎嗅網</name>
<url>http://www.huxiu.com/rss/0.xml</url>
<startTag><![CDATA[<table cellpadding="0" cellspacing="0" class="neirong-box" >]]></startTag>
<endTag><![CDATA[</table>]]></endTag>
<encoding>UTF-8</encoding>
<open>true</open>
</site>
</websites>
這兩個站點就是我需要抓取的,url是rss地址 startTag,endTag 是正文開始和結束的位置,encoding是站點的編碼格式,open 表示是否抓取該站點,如果不清晰 請看http://blog.csdn.net/kissliux/article/details/14227057
需要抓取的站點準備好了,開始解析吧。使用dom4j,請引入相關jar 我習慣使用maven管理這些jar
<dependency>
<groupId>dom4j</groupId>
<artifactId>dom4j</artifactId>
<version>1.6.1</version>
</dependency>
/**
 * Configuration bean for one RSS site, loaded from the websites XML file.
 *
 * Dom4jUtil populates instances reflectively: every XML child element name is
 * mapped to a {@code setXxx(String)} setter, so each field must keep a String
 * accessor pair named after its element. The {@code fid} field (forum id used
 * when saving articles to Discuz) is read by {@code FeedReader.getContent}.
 */
public class Website {
    private String name;      // display name of the site
    private String url;       // RSS feed address
    private String startTag;  // HTML marker where the article body starts
    private String endTag;    // HTML marker where the article body ends
    private String encoding;  // page character encoding, e.g. GB2312 / UTF-8
    private String open;      // "true" when this site should be crawled
    private String fid;       // target forum id; parsed with Integer.parseInt by callers

    public String getName() { return name; }
    public void setName(String name) { this.name = name; }

    public String getUrl() { return url; }
    public void setUrl(String url) { this.url = url; }

    public String getStartTag() { return startTag; }
    public void setStartTag(String startTag) { this.startTag = startTag; }

    public String getEndTag() { return endTag; }
    public void setEndTag(String endTag) { this.endTag = endTag; }

    public String getEncoding() { return encoding; }
    public void setEncoding(String encoding) { this.encoding = encoding; }

    public String getOpen() { return open; }
    public void setOpen(String open) { this.open = open; }

    public String getFid() { return fid; }
    public void setFid(String fid) { this.fid = fid; }
}
/**
* @author hongliang.dinghl
* Dom4j 生成XML文檔與解析XML文檔
*/
/**
 * @author hongliang.dinghl
 * Dom4j helpers for reading the website configuration XML.
 */
public class Dom4jUtil {
    /**
     * Parses the given XML file (resolved against the classpath root) into a
     * list of {@link Website} beans. Each child of the root element is one
     * site; its child element names are mapped to Website setters via
     * reflection, e.g. {@code <url>} -&gt; {@code setUrl(String)}.
     *
     * @param fileName file name relative to the classpath root
     * @return parsed sites; empty list when the file cannot be located or read
     */
    public List<Website> parserXml(String fileName) {
        SAXReader saxReader = new SAXReader();
        List<Website> list = new ArrayList<Website>();
        URL base = getClass().getResource("/");
        if (base == null) {
            // Classpath root not resolvable as a plain directory (e.g. when
            // running from a jar) - nothing we can read from disk.
            return list;
        }
        try {
            // Decode URL-escaped characters so the path is valid even when the
            // classpath directory contains spaces or other escaped characters
            // (the old replace("%20", " ") only handled spaces).
            String path = java.net.URLDecoder.decode(base.getFile(), "UTF-8") + fileName;
            Document document = saxReader.read(new File(path));
            Element root = document.getRootElement();
            for (Iterator i = root.elementIterator(); i.hasNext(); ) {
                Element siteElement = (Element) i.next();
                Website website = new Website();
                for (Iterator j = siteElement.elementIterator(); j.hasNext(); ) {
                    Element node = (Element) j.next();
                    String name = node.getName();
                    // Map the element name to the matching setter: url -> setUrl.
                    String methodName = "set" + name.substring(0, 1).toUpperCase() + name.substring(1);
                    Method method = website.getClass().getMethod(methodName, String.class);
                    method.invoke(website, node.getText());
                }
                list.add(website);
            }
        } catch (java.io.UnsupportedEncodingException e) {
            e.printStackTrace();
        } catch (DocumentException e) {
            e.printStackTrace();
        } catch (NoSuchMethodException e) {
            e.printStackTrace();
        } catch (InvocationTargetException e) {
            e.printStackTrace();
        } catch (IllegalAccessException e) {
            e.printStackTrace();
        }
        return list;
    }
}
多個站點解析結束。然後遍歷站點,訪問url,抓取正文,請看我上一篇文章。
第二部分:RSS圖片抓取,a鏈接的去除。直接看代碼吧,都有註釋的,在文章的最底部。
/**
 * Reads RSS feeds, extracts article bodies and localizes embedded images.
 *
 * Images referenced by {@code <img>} tags are downloaded into the directory
 * configured by the "image_path" property, and the src attributes are
 * rewritten with the "relative_path" prefix so pages serve the local copies.
 */
public class FeedReader {
    /** Local directory where downloaded images are written (property "image_path"). */
    private String imageDir;
    /** Public path prefix substituted into rewritten <img> src values (property "relative_path"). */
    private String relativePath;

    public FeedReader() {
        Properties properties = PropertiesUtil.getInstance().getProerties();
        imageDir = properties.getProperty("image_path");
        relativePath = properties.getProperty("relative_path");
    }

    /**
     * Fetches an RSS feed and maps every entry to an RSSItemBean.
     *
     * @param url RSS feed address, e.g. http://www.ithome.com/rss/
     * @return one bean per feed entry (title, feed title as type, uri, publish date, author)
     * @throws Exception on network or feed-parsing failure
     */
    public List<RSSItemBean> getRss(String url) throws Exception {
        URL feedUrl = new URL(url);
        URLConnection conn = feedUrl.openConnection();
        // Some sites reject requests without a browser-like User-Agent.
        conn.setRequestProperty("User-Agent", "Mozilla/4.0 (compatible; MSIE 5.0; Windows NT; DigExt)");
        // ROME builds a SyndFeed (common model for both RSS and Atom) from the stream.
        SyndFeedInput input = new SyndFeedInput();
        SyndFeed feed = input.build(new XmlReader(conn));
        List<SyndEntryImpl> entries = feed.getEntries();
        List<RSSItemBean> rssItemBeans = new ArrayList<RSSItemBean>();
        for (SyndEntryImpl entry : entries) {
            RSSItemBean item = new RSSItemBean();
            item.setTitle(entry.getTitle().trim());
            item.setType(feed.getTitleEx().getValue().trim());
            item.setUri(entry.getUri());
            item.setPubDate(entry.getPublishedDate());
            item.setAuthor(entry.getAuthor());
            rssItemBeans.add(item);
        }
        return rssItemBeans;
    }

    /**
     * Fetches the feed of a configured site and fills in the article body
     * (with images localized) and forum id for every entry.
     *
     * @param website site configuration (feed url, body start/end markers, encoding, fid)
     * @return feed entries with content and fid populated
     * @throws Exception on network, parsing or number-format failure
     */
    public List<RSSItemBean> getContent(Website website) throws Exception {
        List<RSSItemBean> rssList = getRss(website.getUrl());
        FindHtml findHtml = new FindHtml(website.getStartTag(), website.getEndTag(), website.getEncoding());
        for (RSSItemBean rsItem : rssList) {
            // Extract the article body between the configured start/end markers.
            String content = findHtml.getContent(rsItem.getUri());
            // Download the images and point the <img> tags at the local copies.
            content = processImages(content);
            rsItem.setContent(content);
            rsItem.setFid(Integer.parseInt(website.getFid()));
        }
        return rssList;
    }

    /**
     * Strips all {@code <a ...>} and {@code </a>} tags, leaving the link text.
     * NOTE(review): currently unreferenced within this class - kept in case an
     * external caller uses it.
     *
     * @param input HTML fragment
     * @return the fragment without anchor tags
     */
    private String removeLinks(String input) {
        String output = Pattern.compile("<a [^>]*>", Pattern.CASE_INSENSITIVE)
                .matcher(input).replaceAll("");
        output = Pattern.compile("</a>", Pattern.CASE_INSENSITIVE)
                .matcher(output).replaceAll("");
        return output;
    }

    /** Debug helper: generates one UUID and prints the same value twice. */
    public static void main(String[] args) {
        UUID uuid = UUID.randomUUID();
        System.out.println(uuid.toString());
        System.out.println(uuid.toString());
    }

    /**
     * Downloads every image referenced by an {@code <img>} tag in the given
     * HTML and rewrites the src attribute to point at the local copy.
     *
     * @param input article HTML
     * @return HTML with image URLs replaced by local relative paths
     */
    private String processImages(String input) {
        String output = input;
        Matcher m = Pattern.compile("<img [^>]*>", Pattern.CASE_INSENSITIVE).matcher(input);
        List<String> imgs = new ArrayList<String>();
        // Collect all <img> tags first, then process them one by one.
        while (m.find()) {
            imgs.add(m.group());
        }
        for (String img : imgs) {
            String path = extractSrc(img);
            if (!path.equals("")) {
                String filepath = writeToFile(path);
                // Normalize Windows separators for use inside a URL
                // (char-based replace already handles every occurrence).
                filepath = filepath.replace('\\', '/');
                // BUGFIX: use literal replace(), not replaceAll() - the image
                // URL would otherwise be interpreted as a regular expression
                // ('.', '?', '+' mis-match or throw PatternSyntaxException).
                output = output.replace(path, filepath);
            }
        }
        return output;
    }

    /**
     * Extracts the src attribute value of an {@code <img>} tag.
     * Checks double-quoted src first, then single-quoted (which wins if both
     * are present, preserving the original lookup order).
     *
     * @param img a single <img ...> tag
     * @return the src value, or "" when absent or malformed
     */
    private String extractSrc(String img) {
        String path = "";
        int begin = img.indexOf("src=\"");
        if (begin != -1) {
            String rest = img.substring(begin + 5);
            int end = rest.indexOf('"');
            path = (end != -1) ? rest.substring(0, end) : "";
        }
        begin = img.indexOf("src='");
        if (begin != -1) {
            String rest = img.substring(begin + 5);
            int end = rest.indexOf('\'');
            path = (end != -1) ? rest.substring(0, end) : "";
        }
        return path;
    }

    /**
     * Downloads an image and stores it in the configured local directory
     * under a random (UUID) file name.
     *
     * @param path source image URL
     * @return relative path of the stored copy, or "" on any failure
     */
    public String writeToFile(String path) {
        FileOutputStream fos = null;
        try {
            // Derive the image format from the extension; some sites append a
            // size hint after '!', e.g. jyijaktkyzkk.jpg!292x420
            String suffix = path.substring(path.lastIndexOf(".") + 1);
            int bang = suffix.indexOf('!');
            if (bang != -1) {
                suffix = suffix.substring(0, bang);
            }
            URL url = new URL(path);
            BufferedImage image = ImageIO.read(url);
            if (image == null) {
                // Unsupported or corrupt image data - ImageIO.read returns null.
                return "";
            }
            File directory = new File(imageDir);
            if (!directory.exists()) {
                directory.mkdirs();
            }
            if (directory.exists()) {
                String name = UUID.randomUUID() + "." + suffix;
                fos = new FileOutputStream(new File(imageDir + name));
                ImageIO.write(image, suffix, fos);
                return relativePath + name;
            }
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            // Always release the file handle, even when ImageIO.write throws.
            if (fos != null) {
                try {
                    fos.close();
                } catch (java.io.IOException ignored) {
                    // nothing useful to do when close itself fails
                }
            }
        }
        return "";
    }
}
還有第三篇文章:RSS抓取正文保存到Discuz。