在以前看電影《社交網絡》的時候,Mark黑進了7棟公寓的網站,獲取圖片信息,當時覺得十分cool!所以嘗試做了一個網頁爬蟲,爬取一個網頁上的圖片!
來看代碼:
package com.MySpider;
import java.io.BufferedOutputStream;
import java.io.BufferedReader;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
public class GetImage {

    // Queue of image URLs discovered while scanning the page; drained by init().
    private List<String> imageUrl = new ArrayList<String>();
    // Counter used to name downloaded files (0.jpg, 1.jpg, ...); only advanced on success.
    private int count = 0;

    // Compiled once (Pattern is thread-safe and compilation is expensive).
    // Non-greedy, stops at quotes/whitespace/angle brackets, and the dot before
    // "jpg" is escaped — the original "http://.{1,}.jpg" greedily merged every
    // URL on a line into one bogus match and also matched e.g. "Xjpg".
    private static final Pattern IMAGE_URL_PATTERN =
            Pattern.compile("http://[^\\s\"'<>]+?\\.jpg");

    public static void main(String[] args) {
        String netUrl = "http://.com/"; // placeholder — replace with the page to crawl
        new GetImage().init(netUrl);
    }

    /**
     * Crawls {@code netUrl}, collecting every .jpg URL found in its HTML,
     * then downloads each one in turn.
     *
     * @param netUrl the page to crawl
     */
    public void init(String netUrl) {
        getPage(netUrl);
        while (!imageUrl.isEmpty()) {
            getImage(imageUrl.remove(0));
        }
    }

    /**
     * Extracts every .jpg URL from one line of HTML.
     *
     * @param line one line of page text
     * @return the URLs found, in order of appearance (empty list if none)
     */
    public List<String> extractImageUrls(String line) {
        List<String> found = new ArrayList<String>();
        Matcher matcher = IMAGE_URL_PATTERN.matcher(line);
        while (matcher.find()) {
            found.add(matcher.group());
        }
        return found;
    }

    /**
     * Scans one line of page text and queues any .jpg URLs it contains.
     *
     * @param line one line of page text
     */
    public void getImageUrl(String line) {
        imageUrl.addAll(extractImageUrls(line));
    }

    /**
     * Fetches the page at {@code netUrl} and scans it line by line for
     * image URLs. Errors are reported on stdout, matching the original style.
     *
     * @param netUrl the page to fetch
     */
    public void getPage(String netUrl) {
        try {
            URL myurl = new URL(netUrl);
            URLConnection myconn = myurl.openConnection();
            // try-with-resources guarantees the reader (and underlying stream)
            // is closed even on error. Charset fixed to UTF-8 instead of the
            // platform default so behavior does not vary by machine.
            try (BufferedReader mybr = new BufferedReader(
                    new InputStreamReader(myconn.getInputStream(), "UTF-8"))) {
                String line;
                while ((line = mybr.readLine()) != null) {
                    getImageUrl(line); // scan this line for .jpg URLs
                }
            }
        } catch (MalformedURLException e) {
            System.out.println("url異常");
        } catch (IOException e) {
            System.out.println("url連接異常");
        }
    }

    /**
     * Downloads one image to e:\spiderImage\&lt;count&gt;.jpg.
     * The original finally-block tested {@code myin != null} but closed
     * {@code myos} — leaking the input stream and risking a double close;
     * try-with-resources closes both streams correctly in all paths.
     *
     * @param imageUrl absolute URL of the image to download
     */
    public void getImage(String imageUrl) {
        try {
            URL myurl = new URL(imageUrl);
            URLConnection myconn = myurl.openConnection();
            try (InputStream myin = myconn.getInputStream();
                 BufferedOutputStream myos = new BufferedOutputStream(
                         new FileOutputStream("e:\\spiderImage\\" + count + ".jpg"))) {
                byte[] buff = new byte[1024];
                int num;
                while ((num = myin.read(buff)) != -1) {
                    myos.write(buff, 0, num);
                }
                myos.flush(); // once, after the copy — per-chunk flush defeated the buffer
                count++;      // only advance the name counter on a successful download
            }
        } catch (MalformedURLException e) {
            System.out.println("url異常");
        } catch (IOException e) {
            System.out.println("url連接異常");
        }
    }
}
運行就ok了!
當然有些網站的圖片雖然爬到了URL,但是下載不了(可能有防盜鏈等限制)!程序還有待改善!
下面是我爬到的成果!