1、獲取列表
需要分析正確的接口url
請求參數:
start=180
&pageSize=90
&cityId=763
&salary=0,0
&workExperience=-1
&education=-1
&companyType=-1
&employmentType=-1
&jobWelfareTag=-1
&kw=Java%E5%BC%80%E5%8F%91 這裏漢字需要轉碼
&kt=3 沒有這個參數,請求失敗,不知道有什麼用,望告知!!
public static void main(String[] args) {
    // Fetch the job-list JSON from zhaopin's search API.
    String url = "https://fe-api.zhaopin.com/c/i/sou?start=180&pageSize=90&cityId=763&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Java%E5%BC%80%E5%8F%91&kt=3";
    Connection connect = Jsoup.connect(url).timeout(30000);
    connect.header("authority", "fe-api.zhaopin.com");
    connect.header("accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    // FIX: dropped the bogus "path" header — ":path" is an HTTP/2 pseudo-header
    // managed by the client; it must not be sent as a regular request header.
    // FIX: do NOT advertise "br" — Jsoup cannot decompress Brotli, so a br-encoded
    // response comes back as garbled bytes (the mojibake noted at the end of this
    // post). gzip/deflate are decoded transparently by Jsoup.
    connect.header("accept-encoding", "gzip, deflate");
    connect.header("accept-language", "zh-CN,zh;q=0.9");
    connect.header("cache-control", "no-cache");
    connect.header("upgrade-insecure-requests", "1");
    connect.header("user-agent",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36")
            // The API returns application/json, so Jsoup must be told to accept it.
            .ignoreContentType(true);
    connect.method(Method.GET);
    try {
        Response response = connect.execute();
        System.out.println(response.body());
    } catch (IOException e1) {
        e1.printStackTrace();
    }
}
2、獲得單個職位詳情,在爬取中遇到html亂碼問題
第一版:HTML亂碼
public static void main(String[] args) {
String url = "https://jobs.zhaopin.com/CC322742114J00246383604.htm";
Connection connect = Jsoup.connect(url).timeout(30000);
connect.header("Host", "jobs.zhaopin.com");
connect.header("Accept",
"text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
connect.header("path",
"/c/i/sou?start=180&pageSize=90&cityId=763&salary=0,0&workExperience=-1&education=-1&companyType=-1&employmentType=-1&jobWelfareTag=-1&kw=Java%E5%BC%80%E5%8F%91&kt=3&=0");
connect.header("Accept-Encoding", "gzip, deflate, br");
connect.header("Accept-Language", "zh-CN,zh;q=0.9");
connect.header("Cache-Control", "no-cache");
connect.header("Connection", "keep-alive");
connect.header("upgrade-insecure-requests", "1");
connect.header("user-agent",
"Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36")
.ignoreContentType(true);
connect.method(Method.GET);
try {
Response response = connect.execute();
Document parse = response.parse();
System.out.println(parse.toString());
String html = parse.html();
int index = html.indexOf("__INITIAL_STATE__=");不會使用正則
String msglist = html.substring(index, html.indexOf("</script>", index)).replace("__INITIAL_STATE__=", "");
System.out.println(msglist);
} catch (IOException e1) {
e1.printStackTrace();
}
}
第二版:網上找的解決亂碼方法 ,但是無效
public static void main(String[] args) throws IOException {
    // Same detail page fetched with raw HttpURLConnection instead of Jsoup.connect.
    String urlstr = "http://jobs.zhaopin.com/CC322742114J00246383604.htm";
    URL url = new URL(urlstr);
    HttpURLConnection connection = (HttpURLConnection) url.openConnection();
    connection.setRequestMethod("GET");
    connection.addRequestProperty("Host", "jobs.zhaopin.com");
    connection.addRequestProperty("Accept",
            "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8");
    connection.addRequestProperty("Accept-Language", "zh-CN,zh;q=0.9");
    connection.addRequestProperty("Cache-Control", "no-cache");
    connection.addRequestProperty("Connection", "keep-alive");
    connection.addRequestProperty("upgrade-insecure-requests", "1");
    connection.addRequestProperty("user-agent",
            "Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/72.0.3626.96 Safari/537.36");
    try {
        // FIX: pass null as the charset so Jsoup detects it from the Content-Type
        // header / <meta charset> instead of blindly assuming UTF-8.
        // FIX: removed the new String(getBytes("ISO8859-1"), "UTF-8") round-trip —
        // re-encoding an already-decoded String cannot repair mojibake; it only
        // corrupts text further. The real cause was the "br" Accept-Encoding in the
        // Jsoup version, which this variant never sends.
        Document parse = Jsoup.parse(connection.getInputStream(), null, urlstr);
        // FIX: the original parsed the page and then discarded the result, so it
        // could never show whether the decoding worked.
        System.out.println(parse.toString());
    } catch (IOException e1) {
        e1.printStackTrace();
    } finally {
        // Release the underlying socket.
        connection.disconnect();
    }
}
最終解決方法:把第一個版本中的 connect.header("Accept-Encoding", "gzip, deflate, br"); 註釋掉,就不會出現亂碼。原因:Jsoup 只能自動解壓 gzip/deflate,不支持 br(Brotli);一旦請求頭聲明接受 br,服務器就可能返回 Brotli 壓縮的響應,Jsoup 無法解壓,讀出來自然是亂碼。
爲什麼會出現第二版:當時誤以爲 Jsoup 的 Connection connect = Jsoup.connect(url) 無法在請求時指定編碼,所以才改用 HttpURLConnection 手動處理字節流;實際上編碼並不是問題所在。
爬蟲是有時效性的,以上有錯誤,望指出!!!