import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.URL;
import java.net.URLConnection;
import java.util.ArrayList;
import java.util.Calendar;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.sun.xml.internal.fastinfoset.stax.events.Util;
public class CatchPicture {
/**
* @param args
*/
public static void main(String[] args) {
// TODO Auto-generated method stub
//定義抓取圖片的 正則表達式
String regular="[*]<b>.*?</b><br/><img src=\"(.*?)\" border=0 alt=\'(.*?)\' style=\".*?\" class=\".*?\">
";
List<Picture> list=new CatchPicture().lookWeiboPic("http://gaoxiao.jokeji.cn/GrapHtml/dongtai/20120921221658.htm","GBK",regular,"2,1");
System.out.println(list.size());
}
//根據URL查看網站上的圖片
public List<Picture> lookWeiboPic(String url,String charset,String regular,String attIndex){
List<Picture> list=new ArrayList<Picture>();
try {
//獲取填寫的url
//判斷所屬網站 獲取 正則表達式
//獲取圖片存放到 list集合
if(!Util.isEmptyString(url)){
String htmls = getPageSource(url.trim(),charset);
Pattern pattern =null;
pattern = Pattern.compile(regular.trim());
if(!Util.isEmptyString(htmls)){
Matcher matcher = pattern.matcher(htmls);
//得到參數屬性順序
String[] sort = regular.trim().split(","); //下標:0 表示 標題title , 1 表示 圖片路徑
//判斷後綴後 得到網站的請求頭部 http://www.moonbasa.com/p-032111106.html-->得到 http://www.moonbasa.com
String[] suffix;
suffix =url.trim().split("cn");
String httphread = "";
if (suffix.length > 1) {
httphread = suffix[0] + "cn";
} else {
suffix = url.trim().split("com");
httphread = suffix[0] + "com";
}
//循環匹配找到的
while(matcher.find()){
Picture picture=new Picture();
//匹配出title
if (-1 == Integer.parseInt(sort[0])) {
// 頁面上抓不到標題
picture.setTitle("");
} else {
// 去標題的#
String title=matcher.group(Integer.parseInt(sort[0])).replace("#", " ");
picture.setTitle(title);
}
//匹配出source
if (-1 == Integer.parseInt(sort[1])) {
// 頁面上抓不到圖片路徑
picture.setSource("");
}else{
String webImgUrl=matcher.group(Integer.parseInt(sort[1]));
//判斷是絕對路徑還是相對路徑
String[] pathType=webImgUrl.split(":");
if(pathType.length>1){
//絕對路徑
picture.setSource(webImgUrl);
}else{
//判斷相對路徑是否含有..
pathType=webImgUrl.split("\\.\\.");
if(pathType.length>1){
picture.setSource(httphread+pathType[1]);
}else{
if(webImgUrl.startsWith("/")){
picture.setSource(httphread+pathType[0]);
}else{
picture.setSource(httphread+"/"+pathType[0]);
}
}
}
}
String upPath=upload(picture.getSource(),"d:\\image\\");
picture.setUpPath(upPath);
list.add(picture);
}//--end while
}
}
}catch (Exception e) {
e.printStackTrace();
}
return list;
}
/**
* 根據網路路徑獲取 頁面源碼
* @param pageUrl
* @param encoding
* @return
*/
public String getPageSource(String pageUrl,String encoding) {
StringBuffer sb = new StringBuffer();
try {
//構建一URL對象
URL url = new URL(pageUrl);
//使用openStream得到一輸入流並由此構造一個BufferedReader對象
BufferedReader in = new BufferedReader(new InputStreamReader(url
.openStream(), encoding));
String line;
//讀取www資源
while ((line = in.readLine()) != null) {
sb.append(line);
sb.append("\n");
}
in.close();
} catch (Exception ex) {
System.err.println(ex);
}
return sb.toString();
}
/**
* 上傳 圖片
* @param urlStr
* @param path
* @return
* @throws Exception
*/
public String upload(String urlStr,String path) throws Exception{
Calendar calendar = Calendar.getInstance();
String month = calendar.get(Calendar.YEAR) + "/"
+ (calendar.get(Calendar.MONTH) + 1);
String filename = java.util.UUID.randomUUID().toString()
+ getExtension(urlStr);
path =path + month + "/";
download(urlStr,path,filename);
return path+month + "/" + filename;
}
/**
* 根據路徑 下載圖片 然後 保存到對應的目錄下
* @param urlString
* @param filename
* @param savePath
* @return
* @throws Exception
*/
public void download(String urlString, String filename,String savePath) throws Exception {
// 構造URL
URL url = new URL(urlString);
// 打開連接
URLConnection con = url.openConnection();
//設置請求的路徑
con.setConnectTimeout(5*1000);
// 輸入流
InputStream is = con.getInputStream();
// 1K的數據緩衝
byte[] bs = new byte[1024];
// 讀取到的數據長度
int len;
// 輸出的文件流
File sf=new File(savePath);
if(!sf.exists()){
sf.mkdirs();
}
OutputStream os = new FileOutputStream(sf.getPath()+"\\"+filename);
// 開始讀取
while ((len = is.read(bs)) != -1) {
os.write(bs, 0, len);
}
// 完畢,關閉所有鏈接
os.close();
is.close();
}
/**
* 根據文件名 獲取文件的後綴名
* @param fileUrl
* @return
*/
public String getExtension(String fileUrl){
return fileUrl.substring(fileUrl.lastIndexOf("."), fileUrl.length());
}
}
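/*
 * Leftover text from the web page this listing was copied from:
 *
 * Article title: "Fetching images from a web page with Java"
 * Post a comment
 * All comments
 * No one has commented yet. Want to be the first? Enter your comment in the box above and click Publish.
 */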