引言
因爲涉及到一個省市縣三級聯動的模塊,而國家統計局在2016年7月重新發布了統計用區劃代碼,對一些縣區進行了調整,並且這次還新增了鎮和村一級的數據,所以想了一下,就把這些數據全部爬了出來。但是水平有限,爬到村一級時總是有數據丟失,原因是我使用的jsoup連接超時;爲此也做了調整獲取速率的處理。廢話不多說,直接上代碼。
還有一點需要注意的就是:中國的這5個地級市,既不設市轄區,又不管轄縣、自治縣、旗、自治旗,亦不代管縣級市,而是直接轄鄉級行政區,俗稱“直筒子市”。 分別是:1、東莞市(廣東省)2、中山市(廣東省)3、三沙市(海南省)4、儋州市(海南省)5、嘉峪關市(甘肅省)特別注意!!!
正文
代碼寫得有點臃腫,但是好在重複率不高,而且是一次性使用的東西,就沒有進行優化。
package com.jsoup;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.net.URL;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.ibatis.session.SqlSession;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;
import com.region.dao.Region;
import com.region.dao.RegionMapper;
import com.region.factory.MybatisFactory;
public class Html {
    /** Maximum number of GET attempts per page before giving up. */
    private static final int MAX_ATTEMPTS = 3;
    /** Connect/read timeout of a single attempt, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 30000;
    /** Upper bound of the random pause before the first attempt, in milliseconds. */
    private static final int MAX_DELAY_MILLIS = 100;

    /**
     * Fetches the page at {@code url} with a single HTTP GET, retrying a bounded
     * number of times on I/O failure.
     *
     * @param url
     *            absolute page URL; {@code null} or empty yields {@code null}
     * @return the parsed document, or {@code null} when the URL is empty, every
     *         attempt failed, or the thread was interrupted while pausing
     */
    private Document htmlTextByUrl(String url) {
        if (url == null || url.isEmpty()) {
            // absUrl("href") yields "" for cells without a resolvable link.
            return null;
        }
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                // Random pause so requests are not fired back-to-back; grows with each retry.
                Thread.sleep((long) (Math.random() * MAX_DELAY_MILLIS) * attempt);
            } catch (InterruptedException e) {
                // Restore the flag and stop instead of silently carrying on.
                Thread.currentThread().interrupt();
                return null;
            }
            try {
                return Jsoup.connect(url).userAgent("Mozilla").timeout(TIMEOUT_MILLIS).get();
            } catch (IOException e) {
                System.err.println("Fetch failed (attempt " + attempt + "/" + MAX_ATTEMPTS + "): " + url);
                e.printStackTrace();
            }
        }
        return null;
    }

    /**
     * Parses a locally stored HTML file, decoding it as GBK.
     *
     * @param path
     *            path of the HTML file
     * @return the parsed document, or {@code null} if the file could not be read
     */
    @SuppressWarnings("unused")
    private Document htmlTextByPath(String path) {
        Document doc = null;
        File input = new File(path);
        try {
            doc = Jsoup.parse(input, "GBK");
            System.out.println("本地網頁已獲取成功,正在返回");
        } catch (IOException e) {
            e.printStackTrace();
        }
        return doc;
    }

    /**
     * Downloads the resource at {@code url} and stores its bytes in the file at
     * {@code path}, overwriting any existing file. I/O failures are reported on
     * standard error and otherwise ignored.
     *
     * @param url
     *            URL of the page to download
     * @param path
     *            destination file path
     */
    @SuppressWarnings("unused")
    private void Save_Html(String url, String path) {
        // The remote stream is opened first so a failed download does not
        // truncate an existing destination file; both streams are always closed.
        try (InputStream in = new BufferedInputStream(new URL(url).openStream());
                BufferedOutputStream out = new BufferedOutputStream(new FileOutputStream(new File(path)))) {
            byte[] buffer = new byte[1024 * 20];
            int length;
            while ((length = in.read(buffer, 0, buffer.length)) != -1) {
                out.write(buffer, 0, length);
            }
        } catch (IOException e) {
            e.printStackTrace();
        }
    }

    /**
     * Extracts (code, name, url) rows from a city, county or town listing page.
     * <p>
     * The elements matched by {@code typeName + ">*>*"} are consumed in pairs:
     * the text of the first is stored under {@code "code"} and its absolute
     * {@code href} under {@code "url"}; the text of the second under
     * {@code "name"}. An unpaired trailing element is ignored.
     *
     * @param doc
     *            page to parse; {@code null} yields an empty list
     * @param typeName
     *            CSS selector of the table rows, e.g. {@code "tr.citytr"}
     * @return one map per row with the keys {@code code}, {@code name} and
     *         {@code url}; never {@code null}
     */
    private List<Map<String, String>> analysis(Document doc, String typeName) {
        List<Map<String, String>> resultList = new ArrayList<>();
        if (doc == null) {
            return resultList;
        }
        Elements cells = doc.select(typeName + ">*>*");
        for (int i = 0, length = cells.size(); i + 1 < length; i += 2) {
            Map<String, String> row = new HashMap<>();
            row.put("code", cells.get(i).text());
            row.put("name", cells.get(i + 1).text());
            row.put("url", cells.get(i).absUrl("href"));
            resultList.add(row);
        }
        return resultList;
    }

    /**
     * Extracts (code, name) rows from a village listing page.
     * <p>
     * The elements matched by {@code typeName + ">*"} are consumed in groups of
     * three: the first holds the code and the third the name; the middle one is
     * skipped. An incomplete trailing group is ignored. Every row is echoed to
     * standard output.
     *
     * @param doc
     *            page to parse; {@code null} yields an empty list
     * @param typeName
     *            CSS selector of the table rows, e.g. {@code "tr.villagetr"}
     * @return one map per row with the keys {@code code} and {@code name};
     *         never {@code null}
     */
    @SuppressWarnings("unused")
    private List<Map<String, String>> analysisVillagetr(Document doc, String typeName) {
        List<Map<String, String>> resultList = new ArrayList<>();
        if (doc == null) {
            return resultList;
        }
        Elements cells = doc.select(typeName + ">*");
        System.out.println(cells.size());
        for (int i = 0, length = cells.size(); i + 2 < length; i += 3) {
            String code = cells.get(i).text();
            String name = cells.get(i + 2).text();
            Map<String, String> row = new HashMap<>();
            row.put("code", code);
            row.put("name", name);
            resultList.add(row);
            System.out.println("名稱:" + name + " 編號:" + code);
        }
        return resultList;
    }

    /**
     * Builds a {@link Region} from one row produced by {@link #analysis}.
     *
     * @throws NumberFormatException
     *             if the row's code is not a decimal number
     */
    private Region toRegion(Map<String, String> row) {
        Region region = new Region();
        region.setName(row.get("name"));
        region.setUrl(row.get("url"));
        region.setCode(Long.parseLong(row.get("code")));
        return region;
    }

    /** Batch-inserts {@code regions} and commits; does nothing for an empty list. */
    private void saveRegions(List<Region> regions, SqlSession session) {
        if (regions.isEmpty()) {
            return;
        }
        RegionMapper mapper = session.getMapper(RegionMapper.class);
        mapper.insertBatch(regions);
        session.commit();
    }

    /** Stores every {@code tr.towntr} row found in {@code document}. */
    private void storeTowns(Document document, SqlSession session) {
        List<Region> towns = new ArrayList<>();
        for (Map<String, String> row : analysis(document, "tr.towntr")) {
            towns.add(toRegion(row));
        }
        saveRegions(towns, session);
    }

    /**
     * Stores the towns ({@code tr.towntr} rows) listed on a county page.
     *
     * @param url
     *            URL of the county page
     * @param session
     *            open session used for the batch insert and commit
     */
    public void getTowntrData(String url, SqlSession session) {
        storeTowns(htmlTextByUrl(url), session);
    }

    // An earlier, commented-out variant of getTowntrData also fetched each town
    // page, parsed it with analysisVillagetr("tr.villagetr") and emitted
    // "this.regionMap.put(code, name)" lines through SaveUtil. It was dead code
    // and has been removed.

    /**
     * Stores the counties ({@code tr.countytr} rows) listed on a city page and,
     * for each county, the towns on its page.
     * <p>
     * When the city page has no county rows, the same page is read for
     * {@code tr.towntr} rows instead, so that cities which administer towns
     * directly are not dropped.
     * NOTE(review): assumes such city pages list {@code tr.towntr} rows
     * directly; confirm against the site.
     *
     * @param url
     *            URL of the city page
     * @param session
     *            open session used for the batch inserts and commits
     */
    public void getCountytrData(String url, SqlSession session) {
        Document document = htmlTextByUrl(url);
        List<Map<String, String>> countyList = analysis(document, "tr.countytr");
        System.out.println(" 共有 縣區:" + countyList.size());
        if (countyList.isEmpty()) {
            storeTowns(document, session);
            return;
        }
        List<Region> counties = new ArrayList<>();
        for (Map<String, String> row : countyList) {
            counties.add(toRegion(row));
            // Descend into the county page and store its towns.
            getTowntrData(row.get("url"), session);
        }
        saveRegions(counties, session);
    }

    /**
     * Stores the cities ({@code tr.citytr} rows) listed on a province page and
     * descends into every city via {@link #getCountytrData}.
     *
     * @param url
     *            URL of the province page
     * @param session
     *            open session used for the batch inserts and commits
     */
    public void getCityData(String url, SqlSession session) {
        Document document = htmlTextByUrl(url);
        List<Region> cities = new ArrayList<>();
        for (Map<String, String> row : analysis(document, "tr.citytr")) {
            cities.add(toRegion(row));
            System.out.println("目前所掃描的市:" + row.get("name"));
            // Descend into the city page and store its counties and towns.
            getCountytrData(row.get("url"), session);
        }
        // Insert first, then commit; the reverse order left the rows uncommitted.
        saveRegions(cities, session);
    }

    /**
     * Crawls the whole tree starting from the national index page: stores every
     * province link ({@code tr.provincetr} rows) and descends into each province
     * via {@link #getCityData}. Province records carry only a name and a URL.
     * <p>
     * Opens its own session and always closes it, including on failure.
     *
     * @param url
     *            URL of the national index page
     */
    public void getProvincetr(String url) {
        Document document = htmlTextByUrl(url);
        if (document == null) {
            System.err.println("Index page could not be loaded: " + url);
            return;
        }
        Elements links = document.select("tr.provincetr>*>*");
        SqlSession session = MybatisFactory.getSession();
        try {
            List<Region> provinces = new ArrayList<>();
            for (int i = 0, length = links.size(); i < length; i++) {
                String provinceUrl = links.get(i).absUrl("href");
                Region region = new Region();
                region.setName(links.get(i).text());
                region.setUrl(provinceUrl);
                provinces.add(region);
                // Descend into the province page and store everything below it.
                getCityData(provinceUrl, session);
            }
            saveRegions(provinces, session);
        } finally {
            session.close();
        }
    }
}
具體調用的函數是這樣:
package com;
import java.io.IOException;
import org.apache.ibatis.session.SqlSession;
import com.jsoup.Html;
import com.region.factory.MybatisFactory;
/**
 * Command-line entry point for the region crawler. As written it opens and
 * closes a session and prints the elapsed time; enable one of the sample calls
 * to actually crawl.
 */
public class Main
{
    /**
     * Runs the crawler and prints the elapsed wall-clock time in milliseconds.
     *
     * @param args
     *            unused
     */
    public static void main(String[] args) throws IOException, InterruptedException
    {
        // CSS row classes used on the source pages, one per administrative level:
        //   province / municipality : provincetr
        //   city                    : citytr
        //   county / district       : countytr
        //   town / sub-district     : towntr
        //   village / community     : villagetr
        Html html = new Html();
        long startMillis = System.currentTimeMillis();
        // National index page of the 2016 edition; pass it to html.getProvincetr(url)
        // to crawl the whole tree (that method manages its own session).
        String url = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/index.html";
        SqlSession session = MybatisFactory.getSession();
        try {
            // Sample partial crawls; the finally block closes the session even if one throws.
            // html.getCityData("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/46.html", session);
            // html.getTowntrData("http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2016/46/4604.html", session);
        } finally {
            session.close();
        }
        System.out.println("共耗時:" + (System.currentTimeMillis() - startMillis) + "ms");
    }
}
這裏就不對代碼進行過多的解釋了。整理出來的數據我自己寫了一個小demo,打成了一個jar包,獲取速度在5ms左右,大小約0.98M。
統計用區劃代碼和城鄉劃分代碼所涉及的數據的sql文件:https://download.csdn.net/download/weixin_39923425/10297338
統計用區劃代碼和城鄉劃分代碼整理 region-1.0.0鏈接:https://github.com/shouyeHua/region