// Java中,操作文件(讀寫內容)工具、讀寫properties文件工具、序列化工具、正則提取工具、網頁抓取和解析工具。
package com.shao.utils;
import java.io.BufferedReader;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileNotFoundException;
import java.io.FileReader;
import java.io.FileWriter;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.List;
/**
 * Static helpers for reading and writing text files.
 *
 * <p>All methods go through {@link FileReader} / {@link FileWriter}, so text is
 * decoded and encoded with the JVM's default charset. I/O failures are reported
 * with {@code printStackTrace()} and are never propagated to the caller.
 */
public class FileUtil {

    /**
     * Reads a text file line by line into a list.
     *
     * @param fileStr path of the file to read
     * @return the lines of the file (without line terminators); {@code null} when
     *         {@code fileStr} is null or empty, or when the path does not exist
     *         or is a directory. If reading fails part-way, the lines read so far
     *         are returned.
     */
    public static List<String> getDataFromFile(String fileStr) {
        List<String> data = null;
        if (fileStr != null && fileStr.length() > 0) {
            BufferedReader br = null;
            try {
                File file = new File(fileStr);
                if (!file.exists() || file.isDirectory()) {
                    throw new FileNotFoundException(fileStr);
                }
                br = new BufferedReader(new FileReader(file));
                data = new ArrayList<String>();
                String line = br.readLine();
                while (line != null) {
                    data.add(line);
                    line = br.readLine();
                }
            } catch (IOException e) {
                // FileNotFoundException is an IOException; both are only reported.
                e.printStackTrace();
            } finally {
                closeResource(br);
            }
        }
        return data;
    }

    /**
     * Writes every element of {@code data} to a file, each followed by {@code "\n"}.
     * Nothing is written (and no file is created) when {@code data} is null or empty.
     *
     * @param fileStr path of the file to (over)write
     * @param data    lines to write
     */
    public static void writeDataToFile(String fileStr, List<String> data) {
        if (data == null || data.size() == 0) {
            return;
        }
        BufferedWriter bw = null;
        try {
            bw = new BufferedWriter(new FileWriter(fileStr));
            for (String line : data) {
                // String concatenation keeps the original rendering of a null element ("null").
                bw.write(line + "\n");
            }
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Closing the BufferedWriter flushes it and closes the wrapped FileWriter.
            closeResource(bw);
        }
    }

    /**
     * Reads a whole text file into a single string.
     *
     * @param fileStr path of the file to read
     * @return the file content with every line terminated by {@code "\n"}; an
     *         empty string when {@code fileStr} is null or empty or the file
     *         cannot be opened
     */
    public static String getContentFromFile(String fileStr) {
        StringBuilder sb = new StringBuilder();
        if (fileStr != null && fileStr.length() > 0) {
            BufferedReader br = null;
            try {
                File file = new File(fileStr);
                if (!file.exists() || file.isDirectory()) {
                    throw new FileNotFoundException(fileStr);
                }
                br = new BufferedReader(new FileReader(file));
                String line = br.readLine();
                while (line != null) {
                    sb.append(line).append("\n");
                    line = br.readLine();
                }
            } catch (IOException e) {
                e.printStackTrace();
            } finally {
                closeResource(br);
            }
        }
        return sb.toString();
    }

    /**
     * Writes {@code content} to a file, replacing any previous content.
     *
     * @param fileStr path of the file to (over)write
     * @param content text to write
     */
    public static void writeContentToFile(String fileStr, String content) {
        BufferedWriter bw = null;
        try {
            bw = new BufferedWriter(new FileWriter(fileStr));
            bw.write(content);
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            closeResource(bw);
        }
    }

    /**
     * Recursively reads the content of every regular file below a directory.
     *
     * @param file directory to scan
     * @return one string per file (see {@link #getContentFromFile(String)});
     *         {@code null} when {@code file} is not a directory; an empty list
     *         when the directory cannot be listed
     */
    public static List<String> getContentsFromDir(File file) {
        List<String> cons = null;
        if (file.isDirectory()) {
            cons = new ArrayList<String>();
            File[] files = file.listFiles();
            if (files == null) {
                // listFiles() returns null when the directory cannot be listed.
                return cons;
            }
            for (File f : files) {
                if (f.isDirectory()) {
                    List<String> nested = getContentsFromDir(f);
                    if (nested != null) {
                        cons.addAll(nested);
                    }
                } else {
                    cons.add(getContentFromFile(f.getAbsolutePath()));
                }
            }
        }
        return cons;
    }

    /** Closes a resource if it was opened, reporting (not propagating) a close failure. */
    private static void closeResource(java.io.Closeable resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
package com.shao.utils;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Properties;
/**
 * Helpers for loading and storing {@code .properties} files.
 *
 * <p>I/O failures are reported with {@code printStackTrace()} and are not
 * propagated to the caller.
 *
 * @author heshaopeng
 * @date 2012-11-23
 */
public class PropertiesUtil {

    /**
     * Loads a properties file.
     *
     * @param file path of the properties file
     * @return the loaded properties; {@code null} when the file cannot be opened.
     *         If loading fails part-way, the entries read so far are returned.
     */
    public static Properties readPropertiesFile(String file) {
        Properties prop = null;
        InputStream is = null;
        try {
            is = new BufferedInputStream(new FileInputStream(file));
            prop = new Properties();
            prop.load(is);
        } catch (FileNotFoundException e) {
            System.out.println("File Not Found!");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The stream was previously never closed.
            closeResource(is);
        }
        return prop;
    }

    /**
     * Stores properties to a file, replacing any previous content.
     *
     * @param filename path of the file to (over)write
     * @param p        properties to store
     */
    public static void writePropertiesFile(String filename, Properties p) {
        OutputStream outputStream = null;
        try {
            outputStream = new FileOutputStream(filename);
            p.store(outputStream, "");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close even when store() fails so the file handle is not leaked.
            closeResource(outputStream);
        }
    }

    /** Closes a resource if it was opened, reporting (not propagating) a close failure. */
    private static void closeResource(java.io.Closeable resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
package com.shao.utils;
import java.io.BufferedInputStream;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.Properties;
/**
 * Helpers for loading and storing {@code .properties} files.
 *
 * <p>I/O failures are reported with {@code printStackTrace()} and are not
 * propagated to the caller.
 *
 * @author heshaopeng
 * @date 2012-11-23
 */
public class PropertiesUtil {

    /**
     * Loads a properties file.
     *
     * @param file path of the properties file
     * @return the loaded properties; {@code null} when the file cannot be opened.
     *         If loading fails part-way, the entries read so far are returned.
     */
    public static Properties readPropertiesFile(String file) {
        Properties prop = null;
        InputStream is = null;
        try {
            is = new BufferedInputStream(new FileInputStream(file));
            prop = new Properties();
            prop.load(is);
        } catch (FileNotFoundException e) {
            System.out.println("File Not Found!");
            e.printStackTrace();
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // The stream was previously never closed.
            closeResource(is);
        }
        return prop;
    }

    /**
     * Stores properties to a file, replacing any previous content.
     *
     * @param filename path of the file to (over)write
     * @param p        properties to store
     */
    public static void writePropertiesFile(String filename, Properties p) {
        OutputStream outputStream = null;
        try {
            outputStream = new FileOutputStream(filename);
            p.store(outputStream, "");
        } catch (IOException e) {
            e.printStackTrace();
        } finally {
            // Close even when store() fails so the file handle is not leaked.
            closeResource(outputStream);
        }
    }

    /** Closes a resource if it was opened, reporting (not propagating) a close failure. */
    private static void closeResource(java.io.Closeable resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }
}
package com.shao.utils;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Regular-expression helper.
 *
 * @author Administrator
 */
public class RegxUtil {

    /**
     * Collects every match of a regular expression in a piece of text.
     *
     * @param content text to search, e.g. the full content of a web page
     * @param regex   regular expression to look for,
     *                e.g. {@code "(((?<=(<a))[\\s\\S]*?(?=(</a>))))"}
     * @return the matched substrings in order of appearance; empty when nothing matches
     */
    public List<String> parseContent(String content, String regex) {
        List<String> matches = new ArrayList<String>();
        Matcher matcher = Pattern.compile(regex).matcher(content);
        while (matcher.find()) {
            matches.add(matcher.group());
        }
        return matches;
    }
}
package com.shao.utils;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
/**
 * Java-serialization helpers that store an object in, and load it from, a file.
 *
 * @author heshaopeng
 * @date 2012-11-23
 */
public class SerializationUtil {

    // Not referenced by the methods of this class.
    private final static String SERIALIZATION_PATH = "file2uuid.map";

    /**
     * Serializes an object to a file, replacing the file if it already exists.
     *
     * @param o    object to serialize
     * @param path path of the target file
     * @throws Exception if the file cannot be written or the object cannot be serialized
     */
    public static void writeObject(Object o, String path) throws Exception {
        File f = new File(path);
        if (f.exists()) {
            f.delete();
        }
        FileOutputStream os = new FileOutputStream(f);
        try {
            ObjectOutputStream oos = new ObjectOutputStream(os);
            oos.writeObject(o);
            // Push buffered object data into the file stream before it is closed.
            oos.flush();
        } finally {
            // Close even when serialization fails so the file handle is not leaked.
            os.close();
        }
    }

    /**
     * Deserializes the object stored in a file.
     *
     * <p>NOTE(review): this uses native Java deserialization; only call it on
     * files from a trusted source.
     *
     * @param path path of the file to read
     * @return the deserialized object, or {@code null} when the file does not exist
     * @throws Exception if the file cannot be read or does not contain a valid serialized object
     */
    public static Object readObject(String path) throws Exception {
        File f = new File(path);
        if (!f.exists()) {
            return null;
        }
        InputStream is = new FileInputStream(f);
        try {
            ObjectInputStream ois = new ObjectInputStream(is);
            return ois.readObject();
        } finally {
            // The streams were previously never closed.
            is.close();
        }
    }
}
package com.shao.spider;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStreamReader;
import java.net.MalformedURLException;
import java.net.URL;
import java.util.ArrayList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.shao.utils.FileUtil;
/**
* 抓取網頁
* @author Administrator
*
*/
publicclass WebSpider {
/**
* 讀取一個網頁全部內容
*/
public String getOneHtml(String htmlurl) throws IOException {
URL url;
String temp;
StringBuffer sb = new StringBuffer();
try {
//1.根據網址,創建一個URL對象
url = new URL(htmlurl);
//2.讀取網頁全部內容,採用utf-8編碼方式來讀取
//url.openStream()獲取輸入流,並用BufferedReader進行封裝
BufferedReader in = new BufferedReader(new InputStreamReader(url.openStream(), "utf-8"));
while ((temp = in.readLine()) != null) {
if(temp.trim().length()>0){
sb.append(temp+"\n");
}
}
in.close();
} catch (MalformedURLException me) {
System.out.println("你輸入的URL格式有問題!請仔細輸入");
me.getMessage();
throw me;
} catch (IOException e) {
e.printStackTrace();
throw e;
}
return sb.toString();
}
/**
* 抓取網頁,並保存爲html文件
* @param htmlURL
*/
publicvoid getHtmlFileFromURL(String filename,String htmlURL){
try {
String content = getOneHtml(htmlURL);
List<String> data = new ArrayList<String>();
data.add(content);
FileUtil.writeDataToFile(filename, data);
} catch (IOException e) {
e.printStackTrace();
}
}
/**
* 將網頁內容寫入到文件,按照網頁標題保存文件
* @param htmlURL
*/
publicvoid writeHtmlToFile(String path,String content){
String filename = path+getTitle(content)+".html";
FileUtil.writeContentToFile(filename, content);
}
/**
*
* @param s
* @return 獲得網頁標題
*/
public String getTitle(String s) {
String regex;
String title = "";
List<String> list = new ArrayList<String>();
regex = "<title>.*?</title>";
list = parseContent(s,regex);
for (int i = 0; i < list.size(); i++) {
title = title + list.get(i);
}
return outTag(title);
}
/**
*
* @param s
* @return 去掉標記
*/
public String outTag(final String s) {
return s.replaceAll("<.*?>", "");
}
/**
* 利用正則表達式解析網頁內容
* @param s 全部網頁內容
* @param regex 正則表達式
* @return
*/
public List<String> parseContent(String content,String regex) {
//regex = "(((?<=(<a))[\\s\\S]*?(?=(</a>))))";
List<String> list = new ArrayList<String>();
Pattern pa = Pattern.compile(regex);
Matcher ma = pa.matcher(content);
while (ma.find()) {
String ss = ma.group();
list.add(ss);
}
return list;
}
publicstaticvoid main(String[] args) throws IOException {
WebSpider wc = new WebSpider();
String urlhtml = "http://bus.mapbar.com/shenzhen/station_list/A.shtml";
/*//抓取這個網頁 的內容
String con = wc.getOneHtml(urlhtml);
System.out.println(con);*/
/*//將抓取的頁面保存爲html頁面
wc.getHtmlFileFromURL("A.html", urlhtml);*/
/*//按照抓物的網頁的標題保存網頁文件
String con = wc.getOneHtml(urlhtml);
wc.writeHtmlToFile("",con);*/
//============測試批量抓取網頁,並保存爲文件========================
/*String urlhtml2 = "http://bus.mapbar.com/shenzhen/station_list/%%.shtml";
String[] ss = {"A","B","C","D","E","F","G",
"H","I","J","K","L","M","N",
"O","P","Q","R","S","T",
"U","V","W","X","Y","Z",
"1","2","3","4","5","6","7","8","9"};
List<String> htmlContent = new ArrayList<String>();
for (int i = 0; i < ss.length; i++) {
String temp = urlhtml2.replace("%%", ss[i]);
String con = wc.getOneHtml(temp);
wc.writeHtmlToFile("", con);
}*/
/*//從抓取後的網頁文件中讀取網頁內容
String path = "深圳公交站點查詢列表-- A字頭-- Mapbar深圳公交.html";
String content = FileUtil.getContentFromFile(path);
System.out.println(content);*/
// File file = new File("test.txt");
// String path = file.getAbsolutePath();
// System.out.println(path);
//============測試批量抓取網頁,並保存爲文件,到特定目錄下========================
/*String urlhtml2 = "http://bus.mapbar.com/shenzhen/station_list/%%.shtml";
String[] ss = {"A","B","C","D","E","F","G",};
List<String> htmlContent = new ArrayList<String>();
for (int i = 0; i < ss.length; i++) {
String temp = urlhtml2.replace("%%", ss[i]);
String con = wc.getOneHtml(temp);
wc.writeHtmlToFile("G:/test/", con);
}*/
//============從特定目錄下,讀取所有抓取下來的文件內容========================
/*File file = new File("G:/test/");
List<String> cons = FileUtil.getContentsFromDir(file);
System.out.println(cons.size());*/
}
}
package com.shao.spider;
import java.awt.image.BufferedImage;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.OutputStream;
import java.net.MalformedURLException;
import java.net.URL;
import javax.imageio.ImageIO;
/**
 * Downloads arbitrary resources (text, images, media) as raw bytes.
 *
 * @author Administrator
 */
public class AllSpider {

    /** Directory used by {@link #getImageFromWeb(String)}. */
    private static final String DEFAULT_TARGET_DIR = "G:/test/";

    /**
     * Downloads a resource into the default directory {@code G:/test/}.
     * The file is named after the part of the URL following the last {@code '/'}.
     *
     * @param htmlurl address of the resource
     * @throws IOException if the URL is malformed or the download or file write fails
     */
    public void getImageFromWeb(String htmlurl) throws IOException {
        getImageFromWeb(htmlurl, DEFAULT_TARGET_DIR);
    }

    /**
     * Downloads a resource into the given directory.
     * The file is named after the part of the URL following the last {@code '/'}.
     *
     * @param htmlurl   address of the resource
     * @param targetDir directory that receives the downloaded file
     * @throws IOException if the URL is malformed or the download or file write fails
     */
    public void getImageFromWeb(String htmlurl, String targetDir) throws IOException {
        InputStream is = null;
        OutputStream os = null;
        try {
            // 1. Build a URL object from the address.
            URL url = new URL(htmlurl);
            String imageName = htmlurl.substring(htmlurl.lastIndexOf("/") + 1);
            File file = new File(targetDir, imageName);
            // 2. Copy the raw bytes of the resource into the file, a buffer at a time.
            is = url.openStream();
            os = new FileOutputStream(file);
            byte[] buffer = new byte[8192];
            int read;
            while ((read = is.read(buffer)) != -1) {
                os.write(buffer, 0, read);
            }
        } catch (MalformedURLException me) {
            System.out.println("你輸入的URL格式有問題!請仔細輸入");
            throw me;
        } catch (IOException e) {
            e.printStackTrace();
            throw e;
        } finally {
            // Release both streams on every path; previously they leaked on failure.
            closeResource(os);
            closeResource(is);
        }
    }

    /** Closes a resource if it was opened, reporting (not propagating) a close failure. */
    private static void closeResource(java.io.Closeable resource) {
        if (resource != null) {
            try {
                resource.close();
            } catch (IOException e) {
                e.printStackTrace();
            }
        }
    }

    /** Manual test driver: downloads one image and one audio file into the default directory. */
    public static void main(String[] args) throws IOException {
        AllSpider as = new AllSpider();
        // Plain image download
        String imaurl = "http://a.hiphotos.baidu.com/album/s%3D680%3Bq%3D90/sign=22cf18d6f3d3572c62e29fd4ba28121a/48540923dd54564eba46ab06b2de9c82d1584fe9.jpg";
        as.getImageFromWeb(imaurl);
        // Second example: the same method fetches an audio file
        String mp3url = "http://mc.djkk.com/mix/2012/2012-09/2012-09-20/2012092020483953.wma";
        as.getImageFromWeb(mp3url);
    }
}