WebCollector多代理切換機制

使用同一IP長期爬取網站容易被網站的反爬蟲機制封殺IP。爬蟲往往使用多代理的方法來應對反爬蟲機制。

本教程利用WebCollector爬取大衆點評，展示WebCollector的多代理切換機制，相關內容都在代碼註釋中。

教程中僅僅將網頁保存在download文件夾中，如果需要抽取，請參考WebCollector其他教程。

import cn.edu.hfut.dmic.webcollector.crawler.BreadthCrawler;
import cn.edu.hfut.dmic.webcollector.model.Links;
import cn.edu.hfut.dmic.webcollector.model.Page;
import cn.edu.hfut.dmic.webcollector.net.HttpRequest;
import cn.edu.hfut.dmic.webcollector.net.HttpRequesterImpl;
import cn.edu.hfut.dmic.webcollector.net.RandomProxyGenerator;
import cn.edu.hfut.dmic.webcollector.util.Config;
import cn.edu.hfut.dmic.webcollector.util.FileUtils;
import java.io.IOException;
import java.net.InetSocketAddress;
import java.net.Proxy;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.logging.Level;
import java.util.logging.Logger;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

/**
 * Crawls dianping.com (Dazhong Dianping) through a pool of rotating proxies.
 * Written against WebCollector 2.07.
 *
 * <p>Every visited page is saved as a numbered HTML file under the
 * {@code download} directory; no content extraction is performed here.
 *
 * @author hu
 */
public class DazhongCrawler extends BreadthCrawler {

    private static final Logger LOGGER = Logger.getLogger(DazhongCrawler.class.getName());

    /**
     * Matches an IPv4-looking address followed (on the same line) by a port
     * number. Group 1 is the address, group 2 the port. The port allows up to
     * five digits so that ports above 9999 are not truncated.
     */
    private static final Pattern PROXY_PATTERN = Pattern.compile(
            "([0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}\\.[0-9]{1,3}).+?([0-9]{1,5})");

    /** Largest valid TCP port number. */
    private static final int MAX_PORT = 65535;

    /** Number of retries after the first failed attempt to fetch a proxy list page. */
    private static final int PROXY_PAGE_RETRIES = 3;

    /*
     * Sequence number used to name saved pages. An AtomicInteger is used
     * because main() configures the crawler with multiple threads.
     */
    final AtomicInteger id = new AtomicInteger(0);

    /**
     * Creates the crawler, registers the seed URL and the URL filter rules.
     *
     * @param crawlPath path of the directory which maintains the information
     * of this crawler
     * @param autoParse if true, BreadthCrawler automatically extracts links
     * which match the regex rules from each page
     */
    public DazhongCrawler(String crawlPath, boolean autoParse) {
        super(crawlPath, autoParse);
        /* start page */
        this.addSeed("http://www.dianping.com/");

        /* fetch urls like http://www.dianping.com/xxxxxxx */
        this.addRegex("http://www.dianping.com/.*");

        /* do not fetch jpg|png|gif */
        this.addRegex("-.*\\.(jpg|png|gif).*");
        /* do not fetch urls containing # */
        this.addRegex("-.*#.*");
    }

    /**
     * Saves the content of the visited page to
     * {@code download/<sequence number>.html}. An I/O failure is logged and
     * does not propagate to the caller.
     *
     * @param page the page that was fetched
     * @param nextLinks links to follow next; not modified here
     */
    @Override
    public void visit(Page page, Links nextLinks) {
        String target = "download/" + id.incrementAndGet() + ".html";
        try {
            FileUtils.writeFileWithParent(target, page.getContent());
        } catch (IOException ex) {
            LOGGER.log(Level.SEVERE, null, ex);
        }
    }

    /**
     * Downloads a page that lists proxies, extracts every {@code ip ... port}
     * pair found in it and registers each pair with the given proxy generator.
     *
     * <p>The download uses {@code HttpRequest} to demonstrate that class; the
     * original author recommends using a BreadthCrawler to collect proxies in
     * real applications.
     *
     * <p>The page is attempted once plus up to {@link #PROXY_PAGE_RETRIES}
     * retries. This is best-effort: if every attempt fails, the failures are
     * logged and the method returns without adding anything.
     *
     * @param url page containing proxy information
     * @param proxyGenerator random proxy generator that receives the proxies
     * @throws Exception declared for compatibility with existing callers
     */
    public static void addProxy(String url, RandomProxyGenerator proxyGenerator) throws Exception {
        HttpRequest request = new HttpRequest(url);
        for (int attempt = 0; attempt <= PROXY_PAGE_RETRIES; attempt++) {
            try {
                String html = request.getResponse().getHtmlByCharsetDetect();
                Matcher matcher = PROXY_PATTERN.matcher(html);
                while (matcher.find()) {
                    String ip = matcher.group(1);
                    String portText = matcher.group(2);
                    /* at most five digits, so this cannot overflow an int */
                    int port = Integer.parseInt(portText);
                    if (port < 1 || port > MAX_PORT) {
                        /* not a usable port number; ignore this match */
                        continue;
                    }
                    System.out.println("add proxy:" + ip + ":" + portText);
                    proxyGenerator.addProxy(ip, port);
                }
                /* page processed successfully, no retry needed */
                return;
            } catch (Exception ex) {
                LOGGER.log(Level.WARNING,
                        "Failed to load proxies from " + url + " (attempt " + (attempt + 1) + ")", ex);
            }
        }
    }

    /**
     * Configures the crawler and its timeouts, loads proxies from a public
     * proxy list, installs the proxy generator and starts the crawl.
     *
     * @param args unused
     * @throws Exception if proxy loading or crawling fails
     */
    public static void main(String[] args) throws Exception {

        DazhongCrawler crawler = new DazhongCrawler("crawl_dazhong", true);

        crawler.setThreads(50);
        crawler.setTopN(100);

        /* When crawling through proxies the various timeouts should be longer,
           otherwise timeouts occur easily. */
        /* connect timeout */
        Config.TIMEOUT_CONNECT = 5000;
        /* read timeout */
        Config.TIMEOUT_READ = 20000;
        /* Per the original author's notes: over the whole crawl (including
           resumed crawls) a URL is abandoned once it has failed more than
           MAX_RETRY times. Failures are more likely with proxies, so MAX_RETRY
           is set to a large value. A URL that fails in one layer is retried in
           later layers until the limit is reached. */
        Config.MAX_RETRY = 30;
        /* Per the original author's notes: if the fetcher thread pool sends no
           http request for longer than requestMaxInterval, the pool is
           forcibly stopped. */
        Config.requestMaxInterval = 1000 * 60 * 2;

        /* RandomProxyGenerator is WebCollector's proxy-switching plugin. Users
           may write their own plugin by implementing ProxyGenerator. */
        RandomProxyGenerator proxyGenerator = new RandomProxyGenerator() {

            /* invoked each time a fetch through a proxy succeeds */
            @Override
            public void markGood(Proxy proxy, String url) {
                InetSocketAddress address = (InetSocketAddress) proxy.address();
                System.out.println("Good Proxy:" + address.toString() + "   " + url);
            }

            /* invoked each time a fetch through a proxy fails */
            @Override
            public void markBad(Proxy proxy, String url) {
                InetSocketAddress address = (InetSocketAddress) proxy.address();
                System.out.println("Bad Proxy:" + address.toString() + "   " + url);

                /* The feedback from markGood/markBad can be used to adjust the
                   proxies held by the generator, e.g. by calling
                   removeProxy(proxy) here. The original author states that
                   adding and removing proxies dynamically is thread-safe. */

                /* The original author considers random selection a rather poor
                   strategy and recommends writing a ProxyGenerator suited to
                   your own needs, mainly by implementing its next method. */
            }

        };

        for (int i = 1; i <= 5; i++) {
            /* collect proxy information from these pages into proxyGenerator */
            addProxy("http://proxy.com.ru/list_" + i + ".html", proxyGenerator);
        }

        /* obtain the crawler's http requester */
        HttpRequesterImpl requester = (HttpRequesterImpl) crawler.getHttpRequester();
        /* make the requester pick its proxies from the generator */
        requester.setProxyGenerator(proxyGenerator);

        /* call crawler.setResumable(true) here to resume a previous crawl */
        /* start crawl with depth of 30 */
        crawler.start(30);

    }

}

AJAXHu

發佈了90 篇原創文章 · 獲贊 67 · 訪問量 52萬+

他的留言板關注

WebCollector多代理切換機制

ziw2pdf

apisix~helm方式的部署到k8s

firmeye - IoT固件漏洞挖掘工具

基於WebCollector 2.x的增量更新機制，製作新聞採集APP

WebCollector 2.09 發佈

WebCollector多代理切換機制

我和權威的故事——王垠

Nutch教程中文翻譯1（官方教程，中英對照）——Nutch的編譯、安裝和簡單運行

https://yachay.unat.edu.pe/blog/index.php?comment_area=format_blog&comment_component=blog&comment_co

linux以太網驅動總結