下面代碼的具體功能是這樣的,先用ScheduledExecutorService 讓HttpURLConnection去按固定間隔時間去獲得網頁內容,然後對得到的內容進行截取,並寫入文件,最後對文件的內容進行去重操作。哈哈。。。聽起來好像很麻煩,我想對於大家來說ScheduledExecutorService、HttpURLConnection或許都用過,至於後面的文件操作就沒啥新鮮的了。我之所以在這都詳細地寫出來,是因爲我對於文件的操作一直都不是很熟悉,所以就想藉此機會讓自己再次學習下IO流的操作,加深自己的印象!
ok!話有點多了,下面來看代碼:
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.net.HttpURLConnection;
import java.net.MalformedURLException;
import java.net.URL;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import java.util.concurrent.Executors;
import java.util.concurrent.ScheduledExecutorService;
import java.util.concurrent.TimeUnit;
/**
* Created by IntelliJ IDEA. User: user Date: 2009-5-11 Time: 13:34:10
* 模擬Http訪問的工具類
*/
/**
 * Utility that simulates HTTP access: a single-threaded scheduler fetches a
 * web page at a fixed interval via {@link HttpURLConnection}, extracts the
 * {@code b_urls} JavaScript array from the page source, appends each
 * timestamped result to a file, and finally de-duplicates the collected URLs
 * into a second file.
 *
 * Created by IntelliJ IDEA. User: user Date: 2009-5-11 Time: 13:34:10
 */
public class HttpConnectionUtil {
    // Timestamp format prepended to every crawled entry.
    // NOTE: SimpleDateFormat is not thread-safe, but it is only touched from
    // the single-threaded scheduler created in getFile(), so this is safe.
    private static SimpleDateFormat sdf = new SimpleDateFormat(
            "yyyy-MM-dd HH:mm:ss");

    /**
     * Fetches the page at {@code url}, decoding the body as GB2312.
     *
     * @param url address to fetch
     * @return the page body with line terminators removed, or "" on failure
     */
    public static String getHttpContent(String url) {
        return getHttpContent(url, "GB2312");
    }

    /**
     * Fetches the page at {@code url} and returns its body concatenated into a
     * single line (readLine() drops the terminators).
     *
     * @param url     address to fetch
     * @param charSet charset used to decode the response body
     * @return the page body, or "" on any error or non-200 response
     */
    public static String getHttpContent(String url, String charSet) {
        HttpURLConnection connection = null;
        try {
            URL addressUrl = new URL(url);
            connection = (HttpURLConnection) addressUrl.openConnection();
            // Per-connection timeouts (JDK 5+). The original mutated the
            // global sun.net.client.* system properties, which affects every
            // connection in the JVM; these setters affect only this one.
            connection.setConnectTimeout(30000);
            connection.setReadTimeout(30000);
            if (connection.getResponseCode() == HttpURLConnection.HTTP_OK) {
                // StringBuilder avoids the O(n^2) cost of String +=.
                StringBuilder content = new StringBuilder();
                BufferedReader reader = new BufferedReader(
                        new InputStreamReader(connection.getInputStream(), charSet));
                try {
                    String line;
                    while ((line = reader.readLine()) != null) {
                        content.append(line);
                    }
                } finally {
                    reader.close(); // the original leaked this stream
                }
                return content.toString();
            }
        } catch (MalformedURLException e) {
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            if (connection != null) {
                connection.disconnect();
            }
        }
        return "";
    }

    /**
     * De-duplicates the crawled results (the fetched page contains repeated
     * URLs). Every input line containing {@code 'https:} is cut from that
     * marker onward, split on commas, and the pieces are collected into a
     * Set; the unique entries are then written to {@code outfile}, one per
     * line, UTF-8 encoded. Errors are reported to stdout, not rethrown.
     *
     * @param insertFile file holding the raw (possibly duplicated) entries
     * @param outfile    file that receives the unique entries
     */
    public static void readFile(String insertFile, String outfile) {
        Set<String> set = new HashSet<String>();
        try {
            BufferedReader inbr = new BufferedReader(new InputStreamReader(
                    new FileInputStream(insertFile), "UTF8"));
            try {
                String temp;
                while ((temp = inbr.readLine()) != null) {
                    int pos = temp.indexOf("'https:");
                    if (pos != -1) {
                        // Drop the leading timestamp, keep the URL list.
                        String[] strs = temp.substring(pos).split(",");
                        for (String url : strs) {
                            set.add(url); // a Set silently drops duplicates
                        }
                    }
                }
            } finally {
                inbr.close(); // the original leaked the reader on error
            }
            createFile(outfile); // make sure the target file exists
            BufferedWriter out = new BufferedWriter(new OutputStreamWriter(
                    new FileOutputStream(outfile), "UTF8"));
            try {
                for (String element : set) {
                    out.write(element);
                    out.newLine();
                }
            } finally {
                out.close(); // the original leaked the writer on error
            }
        } catch (Exception e) {
            System.out.println("readFile errors:" + e.getMessage());
        }
    }

    /**
     * Creates the file at {@code insert} if it does not already exist.
     *
     * @param insert path of the file to create
     * @throws IOException if the file cannot be created
     */
    public static void createFile(String insert) throws IOException {
        File file = new File(insert);
        if (!file.exists()) {
            file.createNewFile();
            System.out.println("文件創建成功!");
        }
    }

    /**
     * Crawls the hard-coded target page once a minute for 24 hours and
     * appends each timestamped extraction result to {@code insert}. The
     * calling thread blocks (sleeps) for the whole crawl duration.
     *
     * @param insert path of the file that accumulates all crawled results;
     *               must contain a backslash-separated parent directory
     */
    public static void getFile(final String insert) {
        try {
            String catalog = insert.substring(0, insert.lastIndexOf("\\"));
            File dir = new File(catalog);
            if (!dir.exists()) {
                dir.mkdirs(); // mkdirs also creates missing parent folders
                System.out.println("文件夾創建成功!");
            }
            createFile(insert); // make sure the result file exists
            ScheduledExecutorService execService =
                    Executors.newSingleThreadScheduledExecutor();
            execService.scheduleWithFixedDelay(new Runnable() {
                public void run() {
                    String content = HttpConnectionUtil
                            .getHttpContent("https://s3.amazonaws.com/cdtimes/index.html");
                    String str = "";
                    if (content.contains("var b_urls")) {
                        // Keep only the span between the array declaration
                        // and the shuffle call.
                        str = content.substring(content.indexOf("var b_urls"), content
                                .indexOf("b_urls = shuffle(b_urls)"));
                    }
                    // Trim down to ['...']; when the markers are absent both
                    // indexOf calls return -1 and this yields "".
                    str = str.substring(str.indexOf("['") + 1, str.indexOf("']") + 1);
                    System.out.println(str);
                    AppendFile(insert, sdf.format(new Date()) + " " + str);
                }
            }, 1, 1, TimeUnit.MINUTES); // fetch once per minute
            Thread.sleep(24 * 3600 * 1000L); // let the crawler run for 24h
            execService.shutdown();
        } catch (Exception e) {
            System.out.println("readFile errors:" + e.getMessage());
        }
    }

    /**
     * Appends {@code content} plus a newline to {@code fileName}, creating
     * the file if necessary (platform default charset).
     *
     * @param fileName path of the file to append to
     * @param content  line of text to append
     */
    public static void AppendFile(String fileName, String content) {
        BufferedWriter writer = null;
        try {
            // Second FileWriter argument: true = append instead of truncate.
            writer = new BufferedWriter(new FileWriter(fileName, true));
            writer.write(content);
            writer.newLine();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            try {
                if (writer != null) {
                    writer.close();
                }
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /**
     * Entry point. Accepts two optional arguments so the class can be run
     * from a jar: args[0] = file for all results, args[1] = file for the
     * de-duplicated results; defaults to d:\HttpConnection\*.txt.
     */
    public static void main(String[] args) throws IOException {
        String insert = "";
        String outfile = "";
        if (args != null && args.length != 0) {
            insert = args[0];  // all crawled results
            outfile = args[1]; // de-duplicated results
        } else { // default files
            insert = "d:\\HttpConnection\\allResultFile.txt";
            outfile = "d:\\HttpConnection\\resultFile.txt";
        }
        System.out.println("開始獲取全部信息...");
        getFile(insert); // crawl everything (blocks for 24h)
        System.out.println("全部信息獲取成功!");
        System.out.println("全部結果內容存放文件:" + insert);
        System.out.println("開始對全部結果去重...");
        readFile(insert, outfile); // de-duplicate the results
        System.out.println("去重完成!");
        System.out.println("去重後結果內容存放文件:" + outfile);
    }
}