HttpClient 4.x 爬蟲教學

import java.io.ByteArrayOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.List;
import java.util.Timer;
import java.util.TimerTask;

import org.apache.http.HttpEntity;
import org.apache.http.HttpResponse;
import org.apache.http.client.ClientProtocolException;
import org.apache.http.client.HttpClient;
import org.apache.http.client.methods.HttpGet;
import org.apache.http.conn.ClientConnectionManager;
import org.apache.http.conn.params.ConnManagerParams;
import org.apache.http.conn.params.ConnPerRouteBean;
import org.apache.http.conn.scheme.PlainSocketFactory;
import org.apache.http.conn.scheme.Scheme;
import org.apache.http.conn.scheme.SchemeRegistry;
import org.apache.http.conn.ssl.SSLSocketFactory;
import org.apache.http.impl.client.DefaultHttpClient;
import org.apache.http.impl.conn.tsccm.ThreadSafeClientConnManager;
import org.apache.http.params.BasicHttpParams;
import org.apache.http.params.HttpConnectionParams;
import org.apache.http.params.HttpParams;
import org.apache.http.protocol.BasicHttpContext;
import org.apache.http.protocol.HttpContext;

import org.htmlparser.Node;
import org.htmlparser.NodeFilter;
import org.htmlparser.Parser;
import org.htmlparser.filters.AndFilter;
import org.htmlparser.filters.NodeClassFilter;
import org.htmlparser.tags.LinkTag;
import org.htmlparser.util.NodeList;
import org.htmlparser.util.ParserException;
 
 public class Crawler implements Runnable{
     public static String SAVE="C:/Users/Administrator/Downloads";//下載保存路徑
     private String url="";//要抓取的網頁地址
     public Crawler(String url){
         this.url=url;
     }
     public Crawler(){}
     /**
      * 
      * @param url 要抓取的網頁的地址
      * @return 這個對應的內容
      * @throws ClientProtocolException
      * @throws IOException
 */
     private String crawl(String url) throws ClientProtocolException, IOException{
         System.out.println("[INFO] Crawl From : "+url);
         HttpClient httpClient = new DefaultHttpClient();
         HttpGet httpGet=new HttpGet(url);
         HttpResponse httpResponse = httpClient.execute(httpGet);
         HttpEntity httpEntity=httpResponse.getEntity();
         InputStream inStream=httpEntity.getContent();
         String content="";
         while(true){
             byte[] bytes=new byte[1024*1000];
             int k=inStream.read(bytes);
             if(k>=0)content=content+new String(bytes,0,k);
             else break;
             System.out.println(content);
             System.out.println("=========================================================================================");
         }
         return content;
     }
     
     public void run(){
         try {
             String prefix=this.url.substring(0,this.url.lastIndexOf("/"));
             String content=this.crawl(this.url);//抓取網頁內容
             Parser parser=new Parser(content); //使用HTMLParser對網頁內容進行解析
             NodeFilter filter;
             NodeList list;
             filter=new NodeClassFilter(LinkTag.class);
             filter=new AndFilter(filter,new NodeFilter(){
                 public boolean accept(Node node) {
                     return ((LinkTag)node).isHTTPLink();
                 }});
             list=parser.extractAllNodesThatMatch(filter);
             List<String> urlsList =new ArrayList<String>();
             for(int i=0;i<list.size();i++){
                 String[] array=list.elementAt(i).getText().split("\"");
                 if(array[1].endsWith(".pdf")||array[1].endsWith(".PDF")){//只下載pdf
                     String downloadUrl=new String(prefix+"/"+array[1]);
                     urlsList.add(downloadUrl);//生成需要下載的地址
                 }
             }
             //從這裏開始是進行下載,使用了多線程執行請求
             HttpParams params=new BasicHttpParams();
             //ConnManagerParams.setTimeout(params, 60000*3); //設置連接最大等待時間
             ConnManagerParams.setMaxConnectionsPerRoute(params, new ConnPerRouteBean(50));//設置併發數
 //HttpConnectionParams.setConnectionTimeout(params, 60000*2);  //設置連接超時時間
             HttpConnectionParams.setSoTimeout(params, 60000*10);//設置讀取超時時間
             
             SchemeRegistry schemeRegistry=new SchemeRegistry();
             schemeRegistry.register(new Scheme("http",PlainSocketFactory.getSocketFactory(),80));
             schemeRegistry.register(new Scheme("https", SSLSocketFactory.getSocketFactory(), 443)); 
             ThreadSafeClientConnManager cm=new ThreadSafeClientConnManager(params,schemeRegistry);
             
             HttpClient httpClient=new DefaultHttpClient(cm,params);
             Thread[] threads=new Thread[urlsList.size()];
             int n=0;
             for(String url:urlsList){
                 String path=Crawler.SAVE+url.substring(url.lastIndexOf("/"), url.length());
                 url=url.substring(0, url.lastIndexOf("/"))+"/"+URLEncoder.encode(url.substring(url.lastIndexOf("/")+1,url.length()),"UTF-8");
                 HttpGet httpGet=new HttpGet(url);
                 threads[n]=new Thread(new Downloader(httpClient,httpGet,url,path));
                 n++;
             }
             for(Thread thread:threads)thread.start();
             for(Thread thread:threads)if(thread.isAlive())thread.join();
             }catch (InterruptedException e) {
                 System.out.println("[ERROR] Download InterruptedException : "+e.toString());
                 //e.printStackTrace();
             } catch (ParserException e) {
             System.out.println("[ERROR] Parse ParserException : "+e.toString());
             //e.printStackTrace();
         }catch (ClientProtocolException e) {
             System.out.println("[ERROR] Crawl ClientProtocolException : "+e.toString());
             //e.printStackTrace();
         } catch (IOException e) {
             System.out.println("[ERROR] Crawl IOException : "+e.toString());
             //e.printStackTrace();
         }
     }
     public static void main(String[] args) {
         //入口程序
         Crawler crawler=new Crawler("http://www3.tjcu.edu.cn/wangshangketang/yuanneike/guanlixue/sjxz.htm");//這裏設定網頁地址
         Thread thread=new Thread(crawler);
         thread.start();
         
     }
 
 }
 
 //類Downloader真正的執行了寫入網絡數據到文件的步驟
 class Downloader implements Runnable{
     private String url="";
     private String path="";
     private final HttpClient httpClient;
     private final HttpContext httpContext;
     private final HttpGet httpGet;
     /**
      * 
      * @param httpClient 多個線程共享的HtppClient
      * @param httpGet 要下載的HttpGet
      * @param url 資源網絡地址
      * @param path 資源下載之後本地的保存路徑
 */
     public Downloader(HttpClient httpClient,HttpGet httpGet,String url,String path){
         this.httpClient=httpClient;
         this.httpGet=httpGet;
         this.httpContext=new BasicHttpContext();
         this.path=path;
         this.url=url;
         
     }
     
     public void run() {
         System.out.println("[INFO] Download From : "+this.url);
         File file=new File(this.path);
         if(file.exists())file.delete();
         try {
             //使用file來寫入本地數據
             file.createNewFile();
             FileOutputStream outStream = new FileOutputStream(this.path);
             
             //執行請求,獲得響應
             HttpResponse httpResponse = this.httpClient.execute(this.httpGet,this.httpContext);
             
             System.out.println("[STATUS] Download : "+httpResponse.getStatusLine()+" [FROM] "+this.path);
             
             HttpEntity httpEntity=httpResponse.getEntity();
             InputStream inStream=httpEntity.getContent();
             while(true){//這個循環讀取網絡數據,寫入本地文件
                 byte[] bytes=new byte[1024*1000];
                 int k=inStream.read(bytes);
                 if(k>=0){
                     outStream.write(bytes,0,k);
                     outStream.flush();
                 }
                 else break;
             }
             inStream.close();
             outStream.close();
         } catch (IOException e){
             this.httpGet.abort();
             System.out.println("[ERROR] Download IOException : "+e.toString()+" [FROM] : "+this.path);
             //e.printStackTrace();
         }
     }
     
 }


發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章