因爲需要詳細的地址數據信息,所以需要爬取國家統計局的地址數據:
1、抓取 URL 地址:
http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html
2、Java 代碼(Servlet):
package zzz;
import java.io.IOException;
import java.io.PrintWriter;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import javax.servlet.ServletException;
import javax.servlet.http.HttpServlet;
import javax.servlet.http.HttpServletRequest;
import javax.servlet.http.HttpServletResponse;
import config.Common;
@SuppressWarnings("serial")
public class GetAddress extends HttpServlet {
/**
* 遞歸便利獲取地區的信息
* @author yuyu
*/
public String basicUrl="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/";
public String json=null;
public String status="";
public void doGet(HttpServletRequest request, HttpServletResponse response)
throws ServletException, IOException {
try{
response.setHeader("Content-Type", "application/xml; charset=UTF-8");//編碼
response.setHeader("Access-Control-Allow-Origin", "*");//跨域問題
String url="http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2015/index.html";
String data=Common.sendGet(url);
json="";
Pattern pattern = Pattern.compile("\\d+\\.html'>(\\D+)</a>");
Matcher matcher =pattern.matcher(data);
//對應的省市地區的id
int Id=Integer.parseInt(request.getParameter("id"))-1;
if(Id>30||Id<0){
throw new Exception("id錯誤,應取1-31");
}
//便利省市地區
int i=0;
while (matcher.find()) {
if(i==Id){
String info=matcher.group();
String aUrl=basicUrl+info.replaceAll(".>.*", "");
String aData=info.replaceAll("\\w|\\.|<|>|/|'", "");
System.out.println(aUrl);
System.out.println(aData);
json+="{\""+aData+"\":["+getInfo(aUrl)+"]}";
}
i++;
}
//將跑完的數據保存到txt文件
Common.contentToTxt("C:/Users/yuyu/Desktop/"+Id+".txt",json.replaceAll(",]", "]"));
response.setContentType("text/html");
request.setCharacterEncoding("UTF-8");//亂碼問題
PrintWriter out = response.getWriter();
out.println("已寫入C:/Users/yuyu/Desktop/"+Id+".txt");
out.flush();
out.close();
}catch(Exception e){
//發生錯誤的時候輸出錯誤
e.printStackTrace();
response.setContentType("text/html");
request.setCharacterEncoding("UTF-8");//亂碼問題
PrintWriter out = response.getWriter();
out.println("錯誤:"+e.getMessage());
out.flush();
out.close();
}
}
/**
* 根據url獲取對應的頁面信息
* @param url
* @return
* @throws Exception
*/
public String getInfo(String url) throws Exception{
String json="";
String data=Common.sendGet(url);
//請求出錯的我時候
int y=0;
while("".equals(data)||null==data){
if(y==10){
break;
}
data=Common.sendGet(url);
y++;
}
if("".equals(data)||null==data){
throw new Exception("未請求到數據");
}
//取得對應區域的數據
Pattern pattern = Pattern.compile("<tr class='[a-z]*'>.+?</tr>");
Matcher matcher =pattern.matcher(data);
int x=0;
while (matcher.find()) {
if(x==0){
x++;
continue;
}
String info=matcher.group();
//獲得正確的url
String status=url.replaceAll("\\d+\\.html", "");
String shh=getDataByRegex(info,"\\d+/\\d+.html");
//匹配到url
String aUrl=status+shh;
//匹配Id
String aId=getDataByRegex(info,"\\d{12}");
//匹配中文
String aData=getDataByRegex(info,"[\u4e00-\u9fa5]+");
//打印匹配信息
// System.out.println(aUrl);
// System.out.println(aId);
// System.out.println(aData);
//添加匹配帶的信息
if("".equals(shh)){
json+="{\"id\":\""+aId+"\",\"name\":\""+aData+"\"},";
}else{
json+="{\"id\":\""+aId+"\",\"name\":\""+aData+"\",\"children\":["+getInfo(aUrl)+"]},";
}
}
return json;
}
/**
* 執行正則
* @param data
* @param regex
* @return
*/
public String getDataByRegex(String data,String regex){
try{
Pattern pattern = Pattern.compile(regex);
Matcher matcher =pattern.matcher(data);
matcher.find();
return matcher.group();
}catch(Exception e){
return "";
}
}
}
3、需要使用的(Common)工具類
/**
 * Sends a GET request to the given URL and returns the response body.
 *
 * <p>The body is read line by line and the lines are joined WITHOUT line
 * separators, so the returned text is a single line. The response is decoded
 * with the charset named in the response's Content-Type, falling back to the
 * platform default charset when none is declared or it is not supported.
 *
 * <p>Failures are printed to the console rather than thrown.
 *
 * @param url
 *            the URL to request
 * @return the response body with line breaks removed; whatever was read before
 *         a failure, which is an empty string if nothing was read
 */
public static String sendGet(String url) {
    StringBuilder result = new StringBuilder();
    try {
        URL realUrl = new URL(url);
        // Open the connection to the URL.
        URLConnection connection = realUrl.openConnection();
        connection.setRequestProperty("Content-Type", "application/json; charset=utf-8");
        // Bound both phases so one unresponsive page cannot block the caller forever.
        connection.setConnectTimeout(10000);
        connection.setReadTimeout(30000);
        // Establish the actual connection.
        connection.connect();

        // Prefer the charset announced by the server over the platform default.
        java.nio.charset.Charset charset = java.nio.charset.Charset.defaultCharset();
        String contentType = connection.getContentType();
        if (contentType != null) {
            for (String part : contentType.split(";")) {
                String token = part.trim();
                if (token.toLowerCase(java.util.Locale.ROOT).startsWith("charset=")) {
                    String name = token.substring("charset=".length()).trim().replace("\"", "");
                    try {
                        charset = java.nio.charset.Charset.forName(name);
                    } catch (IllegalArgumentException ignored) {
                        // Unknown or malformed charset name: keep the platform default.
                    }
                }
            }
        }

        // try-with-resources closes the reader even when reading fails.
        try (BufferedReader in = new BufferedReader(
                new InputStreamReader(connection.getInputStream(), charset))) {
            String line;
            while ((line = in.readLine()) != null) {
                result.append(line);
            }
        }
    } catch (Exception e) {
        System.out.println("發送GET請求出現異常!" + e);
        e.printStackTrace();
    }
    return result.toString();
}
/**
 * Appends text to a file, creating the file first if it does not exist.
 *
 * <p>The existing content is read line by line, every existing line is
 * terminated with {@code "\n"}, and {@code content} is written after it. The
 * existing content is also echoed to the console. Failures are printed to the
 * console rather than thrown.
 *
 * @param filePath path of the file to append to
 * @param content  text to add after the existing content
 */
public static void contentToTxt(String filePath, String content) {
    StringBuilder merged = new StringBuilder();
    try {
        File f = new File(filePath);
        if (f.exists()) {
            System.out.print("文件存在");
        } else {
            System.out.print("文件不存在");
            f.createNewFile(); // create the file when it is missing
        }
        // Read what is already there; the reader is closed even if reading fails.
        try (BufferedReader input = new BufferedReader(new FileReader(f))) {
            String line;
            while ((line = input.readLine()) != null) {
                merged.append(line).append("\n");
            }
        }
        System.out.println(merged.toString());
        merged.append(content);
        // Rewrite the whole file with the old content followed by the new content.
        try (BufferedWriter output = new BufferedWriter(new FileWriter(f))) {
            output.write(merged.toString());
        }
    } catch (Exception e) {
        e.printStackTrace();
    }
}
4、抓取的數據結構
5、文件下載
http://download.csdn.net/download/weixin_36751895/9820068