Java使用Jsoup爬取省市區鄉鎮的數據源碼

Java使用Jsoup爬取省市區鄉鎮的數據源碼

由於業務需求,需要讓用戶選擇地址信息,所以想在網上找一份最新的省、市、區及鄉鎮數據。可是這些資源竟然都要積分、C幣一類才能下載,所以自己就利用開源的Jsoup寫了一個爬蟲,爬取國家統計局公佈的省市區及鄉鎮劃分數據。
1.介紹org.jsoup
jsoup是一個Java的html解析器
2.Maven依賴

<!-- https://mvnrepository.com/artifact/org.jsoup/jsoup -->
      <dependency>
          <groupId>org.jsoup</groupId>
          <artifactId>jsoup</artifactId>
          <version>1.9.2</version>
      </dependency>

3.編寫實體類,用於儲存數據
RegionEntry.java

package cn.jiangdoc.utils;

import java.util.ArrayList;
import java.util.List;

/**
 * One node of the administrative-division tree: a region code, a display name
 * and the list of regions directly below it.
 *
 * <p>The child list is never {@code null}: passing {@code null} to the
 * constructor or to {@link #setSub(List)} yields an empty, mutable list so
 * callers can always iterate over {@link #getSub()} safely.
 */
public class RegionEntry {
    private String code;
    private String name;
    private List<RegionEntry> sub = new ArrayList<>();

    /** Creates an entry with no code, no name and an empty child list. */
    public RegionEntry() {
    }

    /**
     * Creates a fully populated entry.
     *
     * @param code region code
     * @param name region display name
     * @param sub  child regions; {@code null} is treated as "no children".
     *             A non-null list is stored as-is (not copied).
     */
    public RegionEntry(String code, String name, List<RegionEntry> sub) {
        this.code = code;
        this.name = name;
        this.sub = orEmpty(sub);
    }

    /** @return the region code, or {@code null} if it was never set */
    public String getCode() {
        return code;
    }

    /** @param code region code */
    public void setCode(String code) {
        this.code = code;
    }

    /** @return the region display name, or {@code null} if it was never set */
    public String getName() {
        return name;
    }

    /** @param name region display name */
    public void setName(String name) {
        this.name = name;
    }

    /** @return the live, mutable list of child regions; never {@code null} */
    public List<RegionEntry> getSub() {
        return sub;
    }

    /**
     * Replaces the child list.
     *
     * @param sub new child regions; {@code null} resets to an empty list
     */
    public void setSub(List<RegionEntry> sub) {
        this.sub = orEmpty(sub);
    }

    /** Returns the given list, or a fresh empty mutable list when it is null. */
    private static List<RegionEntry> orEmpty(List<RegionEntry> candidate) {
        return candidate != null ? candidate : new ArrayList<RegionEntry>();
    }
}

4.正式開始我們的爬蟲數據
AddressData.java

package cn.jiangdoc.utils;

import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Date;
import java.util.List;

import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls the province / city / county / town pages below {@link #SITE_URL}
 * with jsoup and writes the result as SQL {@code insert} statements to a file.
 *
 * <p>Usage: {@code java cn.jiangdoc.utils.AddressData [outputFile]}. When no
 * argument is given the output goes to {@code G:\log\city.txt}.
 *
 * <p>NOTE(review): cities whose page lists towns directly (no
 * {@code tr.countytr} rows) end up with no children here; verify the output
 * for such cities by hand.
 *
 * @author jiangdoc
 * @date 2019-3-16
 */
public class AddressData {
    /** Index page that lists the provinces. Left non-final so existing callers may still reassign it. */
    public static String SITE_URL = "http://www.stats.gov.cn/tjsj/tjbz/tjyqhdmhcxhfdm/2018/";

    /** Output file used when no path is passed on the command line. */
    private static final String DEFAULT_OUTPUT_PATH = "G:\\log\\city.txt";
    /** Connect/read timeout for every page request, in milliseconds. */
    private static final int TIMEOUT_MILLIS = 10000;
    /** How many times a page is requested before giving up on it. */
    private static final int MAX_ATTEMPTS = 3;
    /** Wait between two attempts for the same page, in milliseconds. */
    private static final long RETRY_DELAY_MILLIS = 1000L;
    /** Wait after each province so the site is not hammered, in milliseconds. */
    private static final long PROVINCE_DELAY_MILLIS = 1000L;
    /** Province codes are right-padded with zeros to this length. */
    private static final int PROVINCE_CODE_LENGTH = 6;

    /** Crawled provinces, each holding its cities, counties and towns. */
    private static final List<RegionEntry> regions = new ArrayList<RegionEntry>();

    /**
     * Runs the crawl and writes the generated SQL.
     *
     * @param args optional; {@code args[0]} overrides the output file path
     */
    public static void main(String[] args) {
        String outputPath = (args != null && args.length > 0) ? args[0] : DEFAULT_OUTPUT_PATH;
        System.out.println("抓取開始:" + new Date());
        getProvince();
        writeFile(outputPath, buildSql(regions));
        System.out.println("抓取完畢:" + new Date());
    }

    /** Renders one insert statement per province, city, county and town. */
    private static String buildSql(List<RegionEntry> provinces) {
        StringBuilder sql = new StringBuilder();
        for (RegionEntry one : provinces) {
            sql.append("insert into sys_province values(null,'").append(quote(one.getCode()))
                    .append("', '").append(quote(one.getName())).append("', 1 );\r\n");
            for (RegionEntry two : one.getSub()) {
                sql.append("insert into sys_city values(null,'").append(quote(one.getCode()))
                        .append("', '").append(quote(two.getCode()))
                        .append("','").append(quote(two.getName())).append("', 2);\r\n");
                for (RegionEntry three : two.getSub()) {
                    sql.append("insert into sys_county values(null,'").append(quote(one.getCode()))
                            .append("', '").append(quote(two.getCode()))
                            .append("', '").append(quote(three.getCode()))
                            .append("', '").append(quote(three.getName())).append("', 3 );\r\n");
                    for (RegionEntry four : three.getSub()) {
                        sql.append("insert into sys_town values(null,'").append(quote(one.getCode()))
                                .append("', '").append(quote(two.getCode()))
                                .append("', '").append(quote(three.getCode()))
                                .append("', '").append(quote(four.getCode()))
                                .append("','").append(quote(four.getName())).append("', 4 );\r\n");
                    }
                }
            }
        }
        return sql.toString();
    }

    /** Doubles single quotes so scraped text cannot break out of the SQL string literal. */
    private static String quote(String value) {
        return String.valueOf(value).replace("'", "''");
    }

    /** Writes the content as UTF-8, creating the parent directory when it is missing. */
    private static void writeFile(String path, String content) {
        File target = new File(path);
        File parent = target.getParentFile();
        if (parent != null && !parent.exists() && !parent.mkdirs()) {
            System.err.println("Could not create output directory: " + parent);
        }
        // Explicit charset: the platform default would make the file contents machine-dependent.
        try (FileOutputStream out = new FileOutputStream(target)) {
            out.write(content.getBytes(StandardCharsets.UTF_8));
            out.flush();
        } catch (IOException e) {
            System.err.println("Could not write " + target + ": " + e);
            e.printStackTrace();
        }
    }

    /**
     * Reads the province links from the index page, crawls each province and
     * appends it to {@link #regions}.
     */
    private static void getProvince() {
        try {
            Document doc = fetch(SITE_URL);
            for (Element link : doc.select("tr.provincetr").select("a")) {
                RegionEntry province = new RegionEntry();
                province.setCode(provinceCode(link.attr("href")));
                province.setName(link.text());
                // Absolute form of the href, resolved against the page URL.
                String absHref = link.attr("abs:href");
                System.out.println(absHref);
                getCity(absHref, province);
                regions.add(province);
                if (!pause(PROVINCE_DELAY_MILLIS)) {
                    // Interrupted: stop crawling; whatever was collected so far is still written.
                    break;
                }
            }
        } catch (IOException e) {
            reportFailure("province index", SITE_URL, e);
        }
    }

    /** Takes the part of the href before the first dot and right-pads it with zeros. */
    private static String provinceCode(String href) {
        StringBuilder code = new StringBuilder(href.split("\\.")[0]);
        while (code.length() < PROVINCE_CODE_LENGTH) {
            code.append('0');
        }
        return code.toString();
    }

    /**
     * Crawls the cities of one province.
     *
     * @param url    absolute URL of the province page
     * @param region province entry that receives the cities as children
     */
    private static void getCity(String url, RegionEntry region) {
        try {
            Document doc = fetch(url);
            // Sample row: <tr class='citytr'><td><a href='65/6501.html'>650100000000</a></td><td><a href='65/6501.html'>...</a></td></tr>
            for (Element row : doc.select("tr.citytr")) {
                RegionEntry city = parseRow(row);
                if (city == null) {
                    continue;
                }
                if (isMunicipalDistrictLabel(city.getName())) {
                    // The placeholder label is replaced by the province's own name.
                    city.setName(region.getName());
                }
                String childUrl = childUrl(row);
                if (!childUrl.isEmpty()) {
                    getArea(childUrl, city);
                }
                region.getSub().add(city);
            }
        } catch (IOException e) {
            reportFailure("city page", url, e);
        }
    }

    /**
     * Crawls the counties/districts of one city.
     *
     * @param url    absolute URL of the city page
     * @param region city entry that receives the counties as children
     */
    private static void getArea(String url, RegionEntry region) {
        try {
            Document doc = fetch(url);
            // Sample row: <tr class='countytr'><td><a href='01/130102.html'>130102000000</a></td><td><a href='01/130102.html'>...</a></td></tr>
            for (Element row : doc.select("tr.countytr")) {
                RegionEntry area = parseRow(row);
                if (area == null) {
                    continue;
                }
                // Rows without links have no town page to descend into.
                String childUrl = childUrl(row);
                if (!childUrl.isEmpty()) {
                    getTown(childUrl, area);
                }
                region.getSub().add(area);
            }
        } catch (IOException e) {
            reportFailure("county page", url, e);
        }
    }

    /**
     * Crawls the towns of one county/district.
     *
     * @param url    absolute URL of the county page
     * @param region county entry that receives the towns as children
     */
    private static void getTown(String url, RegionEntry region) {
        try {
            Document doc = fetch(url);
            // Sample row: <tr class='towntr'><td><a href='07/110107001.html'>110107001000</a></td><td><a href='07/110107001.html'>...</a></td></tr>
            for (Element row : doc.select("tr.towntr")) {
                RegionEntry town = parseRow(row);
                if (town != null) {
                    region.getSub().add(town);
                }
            }
        } catch (IOException e) {
            reportFailure("town page", url, e);
        }
    }

    /**
     * Reads code (first cell) and name (second cell) from a table row, using
     * the row's links when it has at least two and its {@code td} cells otherwise.
     *
     * @return the parsed entry, or {@code null} when the row has fewer than two cells
     */
    private static RegionEntry parseRow(Element row) {
        Elements cells = row.select("a");
        if (cells.size() < 2) {
            cells = row.select("td");
        }
        if (cells.size() < 2) {
            return null;
        }
        RegionEntry entry = new RegionEntry();
        entry.setCode(cells.get(0).text());
        entry.setName(cells.get(1).text());
        return entry;
    }

    /** Returns the absolute URL of the row's first link, or an empty string when there is none. */
    private static String childUrl(Element row) {
        Element link = row.select("a").first();
        return link == null ? "" : link.attr("abs:href");
    }

    /**
     * Tells whether a city name is the generic "municipal district" label.
     * Both the Simplified and the Traditional spelling are accepted.
     * NOTE(review): the site presumably publishes Simplified Chinese, in which
     * case only the first literal ever matches - confirm against a live page.
     */
    private static boolean isMunicipalDistrictLabel(String name) {
        return "市辖区".equals(name) || "市轄區".equals(name);
    }

    /**
     * Downloads and parses one page, retrying on I/O failure.
     *
     * @param url absolute page URL
     * @return the parsed document
     * @throws IOException when the URL is empty or every attempt failed
     */
    private static Document fetch(String url) throws IOException {
        if (url == null || url.isEmpty()) {
            throw new IOException("No URL to fetch");
        }
        IOException lastFailure = null;
        for (int attempt = 1; attempt <= MAX_ATTEMPTS; attempt++) {
            try {
                return Jsoup.connect(url).timeout(TIMEOUT_MILLIS).get();
            } catch (IOException e) {
                lastFailure = e;
                System.err.println("Attempt " + attempt + "/" + MAX_ATTEMPTS + " failed for " + url + ": " + e);
                if (attempt < MAX_ATTEMPTS && !pause(RETRY_DELAY_MILLIS)) {
                    break;
                }
            }
        }
        throw lastFailure;
    }

    /**
     * Sleeps for the given time.
     *
     * @return {@code false} when the thread was interrupted (the interrupt flag is restored)
     */
    private static boolean pause(long millis) {
        try {
            Thread.sleep(millis);
            return true;
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            return false;
        }
    }

    /** Logs a page that could not be crawled; the crawl continues with the next page. */
    private static void reportFailure(String what, String url, IOException e) {
        System.err.println("Could not crawl " + what + " " + url + ": " + e);
        e.printStackTrace();
    }
}

注意:運行中可能會出現鏈接超時,這很正常;在訪問量比較少的時間段運行,成功率會高很多。另外,廣東省的東莞市和中山市比較特殊,沒有區(縣)級的劃分,市下面直接就是鄉鎮。

自己矯正後的數據的下載地址:點擊下載

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章