這系列文章主要分析webmagic框架,沒有實戰內容,如有實戰問題可以討論,也可以提供技術支持。
歡迎加羣313557283(剛創建),小白互相學習~
Pipeline
我們先來看看接口,就一個process 方法
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
/**
 * Pipeline is the persistence and offline-processing part of the crawler.<br>
 * Implement this interface to customize how extracted results are persisted
 * (e.g. console, file, database).
 *
 * @author [email protected] <br>
 * @since 0.1.0
 * @see ConsolePipeline
 * @see FilePipeline
 */
public interface Pipeline {

    /**
     * Process the results extracted by a page processor.
     *
     * @param resultItems the extracted results of one page
     * @param task the task that produced the results
     */
    void process(ResultItems resultItems, Task task);
}
我們再來看看Pipeline的默認實現類ConsolePipeline,
它很簡單,就是把存儲在ResultItems中的結果打印到控制檯。
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.util.Map;
/**
 * Prints extracted results to the console.<br>
 * Usually used in tests.
 *
 * @author [email protected] <br>
 * @since 0.1.0
 */
public class ConsolePipeline implements Pipeline {

    @Override
    public void process(ResultItems resultItems, Task task) {
        // Echo the originating URL, then every extracted field on its own line.
        System.out.println("get page: " + resultItems.getRequest().getUrl());
        Map<String, Object> fields = resultItems.getAll();
        for (Map.Entry<String, Object> field : fields.entrySet()) {
            System.out.println(field.getKey() + ":\t" + field.getValue());
        }
    }
}
其他的
FilePipeline 以文件形式保存
package us.codecraft.webmagic.pipeline;
import org.apache.commons.codec.digest.DigestUtils;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import us.codecraft.webmagic.utils.FilePersistentBase;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStreamWriter;
import java.io.PrintWriter;
import java.util.Map;
/**
 * Stores results in files, one file per page.<br>
 * The file name is the MD5 hex of the page URL plus an ".html" suffix,
 * placed under {@code path + task UUID}.
 *
 * @author [email protected] <br>
 * @since 0.1.0
 */
public class FilePipeline extends FilePersistentBase implements Pipeline {

    private Logger logger = LoggerFactory.getLogger(getClass());

    /**
     * create a FilePipeline with default path "/data/webmagic/"
     */
    public FilePipeline() {
        setPath("/data/webmagic/");
    }

    /**
     * create a FilePipeline writing under the given base path
     *
     * @param path base directory for result files
     */
    public FilePipeline(String path) {
        setPath(path);
    }

    @Override
    public void process(ResultItems resultItems, Task task) {
        // One directory per task UUID so different spiders do not collide.
        String path = this.path + PATH_SEPERATOR + task.getUUID() + PATH_SEPERATOR;
        String fileName = path + DigestUtils.md5Hex(resultItems.getRequest().getUrl()) + ".html";
        // try-with-resources: the original leaked the writer when an
        // IOException was thrown between opening and the final close().
        try (PrintWriter printWriter = new PrintWriter(new OutputStreamWriter(
                new FileOutputStream(getFile(fileName)), "UTF-8"))) {
            printWriter.println("url:\t" + resultItems.getRequest().getUrl());
            for (Map.Entry<String, Object> entry : resultItems.getAll().entrySet()) {
                if (entry.getValue() instanceof Iterable) {
                    // Multi-valued field: print one element per line.
                    Iterable<?> value = (Iterable<?>) entry.getValue();
                    printWriter.println(entry.getKey() + ":");
                    for (Object o : value) {
                        printWriter.println(o);
                    }
                } else {
                    printWriter.println(entry.getKey() + ":\t" + entry.getValue());
                }
            }
        } catch (IOException e) {
            logger.warn("write file error", e);
        }
    }
}
結果集
ResultItemsCollectorPipeline 將結果收集到內存列表中,猜測主要是爲了方便批量處理,這樣效率更高
package us.codecraft.webmagic.pipeline;
import us.codecraft.webmagic.ResultItems;
import us.codecraft.webmagic.Task;
import java.util.ArrayList;
import java.util.List;
/**
 * Collects every ResultItems in memory so callers can fetch them in one batch
 * via {@link #getCollected()}.
 *
 * @author [email protected]
 * @since 0.4.0
 */
public class ResultItemsCollectorPipeline implements CollectorPipeline<ResultItems> {

    private List<ResultItems> collected = new ArrayList<ResultItems>();

    @Override
    public synchronized void process(ResultItems resultItems, Task task) {
        // Crawling is multi-threaded, so appends are synchronized.
        collected.add(resultItems);
    }

    @Override
    public List<ResultItems> getCollected() {
        return collected;
    }
}
擴展
代碼就不貼了
大概介紹下
FilePageModelPipeline 保存成.html
JsonFilePageModelPipeline 保存成.json
JsonFilePipeline 將內容轉換成json再保存成.json
MultiPagePipeline 用於需要拼接的地方
官網還有個集成MySQL的示例,可參考官方文檔。
總結
上面介紹了很多保存方式,個人習慣於在process就進行數據持久化,不知道有什麼不同,歡迎探討。