java+jsoup 爬取智聯招聘 簡化版

1、獲取列表

需要分析正確的接口url

請求參數:

start=180
&pageSize=90
&cityId=763
&salary=0,0
&workExperience=-1
&education=-1
&companyType=-1
&employmentType=-1
&jobWelfareTag=-1
&kw=Java%E5%BC%80%E5%8F%91  這裏的漢字(「Java开发」)需要先按 UTF-8 做 URL 編碼
&kt=3   沒有這個參數,請求失敗,不知道有什麼用,望告知!!

 

public static void main(String[] args) {
        String url = "https://fe-api.zhaopin.com/c/i/sou?start=180&pageSize=90&cityId=763&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Java%E5%BC%80%E5%8F%91&kt=3";
        Connection connect = Jsoup.connect(url).timeout(30000);
        connect.header("authority", "fe-api.zhaopin.com");
        connect.header("accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        connect.header("path",
                "/c/i/sou?start=180&pageSize=90&cityId=763&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Java%E5%BC%80%E5%8F%91&kt=3&=0");
        connect.header("accept-encoding", "gzip, deflate, br");
        connect.header("accept-language", "zh-CN,zh;q=0.9");
        connect.header("cache-control", "no-cache");
        connect.header("upgrade-insecure-requests", "1");
        connect.header("user-agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36")
                .ignoreContentType(true);
        connect.method(Method.GET);
        try {
            Response response = connect.execute();
            System.out.println(response.body());
        } catch (IOException e1) {
            e1.printStackTrace();
        }

    }

 

2、獲得單個職位詳情,在爬取中遇到html亂碼問題

第一版:HTML亂碼

public static void main(String[] args) {
        String url = "https://jobs.zhaopin.com/CC322742114J00246383604.htm";
        Connection connect = Jsoup.connect(url).timeout(30000);
        connect.header("Host", "jobs.zhaopin.com");
        connect.header("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        connect.header("path",
                "/c/i/sou?start=180&pageSize=90&cityId=763&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Java%E5%BC%80%E5%8F%91&kt=3&=0");
        connect.header("Accept-Encoding", "gzip, deflate, br");
        connect.header("Accept-Language", "zh-CN,zh;q=0.9");
        connect.header("Cache-Control", "no-cache");
        connect.header("Connection", "keep-alive");
        connect.header("upgrade-insecure-requests", "1");
        connect.header("user-agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36")
                .ignoreContentType(true);
        connect.method(Method.GET);
        try {
            Response response = connect.execute();
            Document parse = response.parse();
            System.out.println(parse.toString());
            String html = parse.html();
            int index = html.indexOf("__INITIAL_STATE__=");不會使用正則
            String msglist = html.substring(index, html.indexOf("</script>", index)).replace("__INITIAL_STATE__=", "");
            System.out.println(msglist);
        } catch (IOException e1) {
            e1.printStackTrace();
        }

    }

第二版:網上找的解決亂碼方法 ,但是無效

public static void main(String[] args) throws IOException {
        String urlstr = "http://jobs.zhaopin.com/CC322742114J00246383604.htm";

        URL url = new URL(urlstr);
        HttpURLConnection connection = (HttpURLConnection) url.openConnection();
        connection.setRequestMethod("GET");
        connection.addRequestProperty("Host", "jobs.zhaopin.com");
        connection.addRequestProperty("Accept",
                "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
        connection.addRequestProperty("Accept-Language", "zh-CN,zh;q=0.9");
        connection.addRequestProperty("Cache-Control", "no-cache");
        connection.addRequestProperty("Connection", "keep-alive");
        connection.addRequestProperty("upgrade-insecure-requests", "1");
        connection.addRequestProperty("user-agent",
                "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36");
        try {
            /*第一種 字符亂碼  無效*/
            Document parse = Jsoup.parse(connection.getInputStream(), "UTF-8", urlstr);

            String docStr = parse.toString();
            /*第二種 字符亂碼  無效*/
            String str = new String(docStr.getBytes("ISO8859-1"), "UTF-8");
            parse = Jsoup.parse(str);
        } catch (IOException e1) {
            e1.printStackTrace();
        }
    }

最終解決方法:把第一版中的 connect.header("Accept-Encoding", "gzip, deflate, br"); 這一行註釋掉,就不會出現亂碼。原因應該是:請求頭聲明支持 br(Brotli)之後,服務器返回了 Brotli 壓縮的內容,而 jsoup 不會解壓 Brotli,把壓縮後的字節直接當作文本解析,結果就是亂碼。

爲什麼會出現第二版:當時以爲 Connection connect = Jsoup.connect(url) 這種寫法無法在請求時指定響應的編碼,所以改用 HttpURLConnection 取得輸入流,再交給 Jsoup.parse 並指定編碼。

 

爬蟲是有時效性的,以上有錯誤,望指出!!!

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章