HttpURLConnection的使用

           引言:最近一直在做爬蟲,近期遇到個問題,發現有些網站使用爬蟲的框架去爬取是無法得到網頁的內容的,所以就換了種方式,使用HttpURLConnection去爬取,由於這次的目的只是想得到網頁部分的字符串,就簡單的使用了下HttpURLConnection,同時順便自己也學習了下HttpURLConnection。

     下面代碼的具體功能是這樣的,先用ScheduledExecutorService 讓HttpURLConnection按固定的時間間隔去獲得網頁內容,然後對得到的內容進行截取,並寫入文件,最後對文件的內容進行去重操作。哈哈。。。聽起來好像很麻煩,我想對於大家來說ScheduledExecutorService 、HttpURLConnection或許有用,至於後面文件的操作就沒啥作用了。我之所以在這都詳細的寫出來,是因爲我對於文件的操作一直都不是很熟悉,所以就想藉此機會讓自己再次學習下IO流的操作,加深自己的印象!

ok!話有點多了,下面來看代碼:

import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;

/**
 * Utility class that simulates HTTP access.
 *
 * <p>It fetches a page with {@link HttpURLConnection}, periodically appends
 * the URL list extracted from that page to a log file, and finally writes a
 * de-duplicated copy of the collected URLs to a second file.
 */
public class HttpConnectionUtil {
	/** Charset used for every file this class reads or writes. */
	private static final String FILE_CHARSET = "UTF-8";

	/** Connect and read timeout applied to each HTTP request, in milliseconds. */
	private static final int TIMEOUT_MILLIS = 30000;

	/** Page that is polled by {@link #getFile(String)}. */
	private static final String PAGE_URL = "https://s3.amazonaws.com/cdtimes/index.html";

	/** How long {@link #getFile(String)} keeps polling before it stops. */
	private static final long CRAWL_DURATION_MILLIS = 24 * 3600 * 1000L;

	/**
	 * Timestamp format for log lines. SimpleDateFormat is not thread-safe; it is
	 * only ever used from the single scheduler thread started by getFile.
	 */
	private static final SimpleDateFormat sdf = new SimpleDateFormat(
			"yyyy-MM-dd HH:mm:ss");

	/**
	 * Fetches a page and decodes it as GB2312.
	 *
	 * @param url address of the page
	 * @return the page content with line terminators removed, or "" on any failure
	 */
	public static String getHttpContent(String url) {
		return getHttpContent(url, "GB2312");
	}

	/**
	 * Fetches a page with a GET request and decodes it with the given charset.
	 *
	 * @param url     address of the page
	 * @param charSet name of the charset used to decode the response body
	 * @return the response lines concatenated without line terminators, or ""
	 *         when the URL is malformed, the response code is not 200, or an
	 *         I/O error occurs
	 */
	public static String getHttpContent(String url, String charSet) {
		HttpURLConnection connection = null;
		BufferedReader reader = null;
		try {
			URL addressUrl = new URL(url);
			connection = (HttpURLConnection) addressUrl.openConnection();
			// Set the timeouts on this connection rather than through global
			// system properties, which are not guaranteed to be re-read.
			connection.setConnectTimeout(TIMEOUT_MILLIS);
			connection.setReadTimeout(TIMEOUT_MILLIS);

			if (connection.getResponseCode() != HttpURLConnection.HTTP_OK) {
				return "";
			}
			InputStream in = connection.getInputStream();
			reader = new BufferedReader(new InputStreamReader(in, charSet));
			StringBuilder content = new StringBuilder();
			String line;
			while ((line = reader.readLine()) != null) {
				content.append(line);
			}
			return content.toString();
		} catch (MalformedURLException e) {
			e.printStackTrace();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(reader);
			if (connection != null) {
				connection.disconnect();
			}
		}
		return "";
	}

	/**
	 * De-duplicates the URLs collected in a log file.
	 *
	 * <p>Every line containing {@code 'https:} is cut from that marker onwards
	 * and split on commas; each distinct piece is written on its own line to
	 * {@code outfile} (in no particular order). Failures are reported on
	 * standard output and never thrown.
	 *
	 * @param insertFile path of the log file to read (UTF-8)
	 * @param outfile    path of the file that receives the distinct entries (UTF-8)
	 */
	public static void readFile(String insertFile, String outfile) {
		try {
			Set<String> urls = readUniqueUrls(insertFile);
			createFile(outfile);
			writeLines(outfile, urls);
		} catch (Exception e) {
			// Best-effort utility: report the problem instead of propagating it.
			System.out.println("readFile errors:" + e.getMessage());
		}
	}

	/** Reads the log file and returns the distinct comma-separated URL entries. */
	private static Set<String> readUniqueUrls(String insertFile) throws IOException {
		Set<String> urls = new HashSet<String>();
		BufferedReader in = new BufferedReader(new InputStreamReader(
				new FileInputStream(insertFile), FILE_CHARSET));
		try {
			String row;
			while ((row = in.readLine()) != null) {
				int start = row.indexOf("'https:");
				if (start == -1) {
					continue;
				}
				String[] parts = row.substring(start).split(",");
				for (String part : parts) {
					urls.add(part); // the set drops duplicates
				}
			}
		} finally {
			in.close();
		}
		return urls;
	}

	/** Writes each entry on its own line, replacing any previous file content. */
	private static void writeLines(String outfile, Set<String> lines) throws IOException {
		BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
				new FileOutputStream(outfile), FILE_CHARSET));
		try {
			for (String entry : lines) {
				out.write(entry);
				out.newLine();
			}
		} finally {
			out.close();
		}
	}

	/**
	 * Creates an empty file if it does not exist yet.
	 *
	 * @param insert path of the file
	 * @throws IOException if the file cannot be created
	 */
	public static void  createFile(String insert) throws IOException{
		File file = new File(insert);
		if(!file.exists()){
			file.createNewFile();
			System.out.println("文件創建成功!");
		}
	}

	/**
	 * Polls the target page once a minute for 24 hours and appends each
	 * extracted URL list, prefixed with a timestamp, to the given file.
	 *
	 * <p>This call blocks until the polling period is over or the calling
	 * thread is interrupted. Failures are reported on standard output.
	 *
	 * @param insert path of the log file; missing parent directories are created
	 */
	public static void getFile(final String insert) {
		ScheduledExecutorService execService = null;
		try {
			// Use File to find the parent so both separators and bare file
			// names work, and create nested directories if needed.
			File dir = new File(insert).getParentFile();
			if (dir != null && !dir.exists() && dir.mkdirs()) {
				System.out.println("文件夾創建成功!");
			}
			createFile(insert);

			execService = Executors.newSingleThreadScheduledExecutor();
			execService.scheduleWithFixedDelay(new Runnable() {
				public void run() {
					try {
						String content = getHttpContent(PAGE_URL);
						String str = extractUrlList(content);
						System.out.println(str);
						AppendFile(insert, sdf.format(new Date()) + "   " + str);
					} catch (RuntimeException e) {
						// An exception escaping a periodic task cancels every
						// later run, so report it and keep the schedule alive.
						System.out.println("getFile errors:" + e.getMessage());
					}
				}
			}, 1, 1, TimeUnit.MINUTES); // poll once a minute
			Thread.sleep(CRAWL_DURATION_MILLIS); // bounds the total crawl time
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
			System.out.println("getFile errors:" + e.getMessage());
		} catch (Exception e) {
			System.out.println("getFile errors:" + e.getMessage());
		} finally {
			if (execService != null) {
				shutdownAndAwait(execService);
			}
		}
	}

	/**
	 * Extracts the quoted URL list from the page's {@code var b_urls = [...]}
	 * declaration; returns "" when the expected markers are missing or out of order.
	 */
	private static String extractUrlList(String content) {
		int start = content.indexOf("var b_urls");
		if (start < 0) {
			return "";
		}
		int end = content.indexOf("b_urls = shuffle(b_urls)", start);
		if (end < 0) {
			return "";
		}
		String declaration = content.substring(start, end);
		int open = declaration.indexOf("['");
		int close = declaration.indexOf("']");
		if (open < 0 || close < open) {
			return "";
		}
		// Drop the brackets but keep the surrounding quotes of the first/last URL.
		return declaration.substring(open + 1, close + 1);
	}

	/** Stops the scheduler and waits briefly for a run that is still in flight. */
	private static void shutdownAndAwait(ScheduledExecutorService execService) {
		execService.shutdown();
		try {
			execService.awaitTermination(2L * TIMEOUT_MILLIS, TimeUnit.MILLISECONDS);
		} catch (InterruptedException e) {
			Thread.currentThread().interrupt();
		}
	}

	/**
	 * Appends one line to a file, creating the file if necessary.
	 *
	 * <p>The text is written as UTF-8 so that {@link #readFile(String, String)}
	 * can read it back regardless of the platform default charset. I/O errors
	 * are printed and not thrown.
	 *
	 * @param fileName path of the file to append to
	 * @param content  text of the line (a line terminator is added)
	 */
	public static void AppendFile(String fileName, String content) {
		BufferedWriter writer = null;
		try {
			// The second FileOutputStream argument (true) selects append mode.
			writer = new BufferedWriter(new OutputStreamWriter(
					new FileOutputStream(fileName, true), FILE_CHARSET));
			writer.write(content);
			writer.newLine();
		} catch (IOException e) {
			e.printStackTrace();
		} finally {
			closeQuietly(writer);
		}
	}

	/** Closes a resource, printing (not throwing) any error; null is ignored. */
	private static void closeQuietly(java.io.Closeable resource) {
		if (resource == null) {
			return;
		}
		try {
			resource.close();
		} catch (IOException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Entry point: crawls for 24 hours, then de-duplicates the result.
	 *
	 * @param args either empty (default Windows paths are used) or two file
	 *             names: the full result file and the de-duplicated result file
	 * @throws IOException declared for compatibility with existing callers
	 */
	public static void main(String[] args) throws IOException {
		String insert;
		String outfile;
		if (args == null || args.length == 0) { // default files
			insert="d:\\HttpConnection\\allResultFile.txt";
			outfile="d:\\HttpConnection\\resultFile.txt";
		} else if (args.length >= 2) {
			// Two arguments make it easy to run this class from a jar.
			insert = args[0];  // all results
			outfile = args[1]; // de-duplicated results
		} else {
			System.err.println("Usage: HttpConnectionUtil <allResultFile> <resultFile>");
			return;
		}
		System.out.println("開始獲取全部信息...");
		getFile(insert);
		System.out.println("全部信息獲取成功!");
		System.out.println("全部結果內容存放文件:"+insert);
		System.out.println("開始對全部結果去重...");
		readFile(insert, outfile);
		System.out.println("去重完成!");
		System.out.println("去重後結果內容存放文件:"+outfile);
	}
}

發佈了64 篇原創文章 · 獲贊 341 · 訪問量 20萬+
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章