前段時間,按照上面的要求,需要做一個職場黑名單的項目。我負責的部分是數據採集,也就是針對各大招聘網站,按照地區或者其它條件劃分,採集HR的郵箱信息並入庫。由於採集的網站較多,所以把部分公用的方法放在一個類中,方便調用。下面是對51job的採集,代碼如下:
package org.hr.integrity.crawl;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;
import org.apache.commons.httpclient.NameValuePair;
import org.hr.util.ConnectionUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;
/**
 * Crawls 51job search-result pages and collects the e-mail addresses found on
 * the job-posting pages they link to.
 *
 * <p>All results are kept in static fields ({@link #col}, {@link #list}), so
 * they accumulate across calls and across instances. The class does no
 * synchronization of its own.
 *
 * @author 72414
 */
public class JobsHref {
    /** Login form fields. Never assigned or read anywhere in this class. */
    NameValuePair[] data = null;

    /** URLs of job-posting pages that have already been fetched. */
    static List<String> col = new ArrayList<String>();

    /** Shared HTTP helper. */
    static Example ex = new Example();

    /** De-duplicated e-mail addresses collected so far. */
    static Set<String> list = new HashSet<String>();

    /** Compiled once; the expression itself is unchanged from the original code. */
    private static final Pattern EMAIL_PATTERN =
            Pattern.compile("[a-zA-Z0-9\\.\\-\\_]+?@[a-zA-Z0-9\\.\\-\\_]+\\.[a-zA-Z]{2,3}");

    /**
     * Address that must never be collected.
     * NOTE(review): this literal looks like a redaction placeholder in this copy
     * of the source -- confirm the real address that should be excluded.
     */
    private static final String EXCLUDED_EMAIL = "[email protected]";

    /** Prefix shared by every 51job result-list URL that is worth following. */
    private static final String LIST_URL_PREFIX = "https://search.51job.com/list/";

    /**
     * Scans {@code body} for e-mail addresses and adds every one that is not the
     * excluded address to {@link #list}.
     *
     * <p>Every match is examined. Stopping at the first match would lose all
     * addresses on a page whose first match is the excluded one.
     *
     * @param body page text to scan; {@code null} is treated as "no addresses"
     * @return {@code true} if at least one non-excluded address was found
     */
    public boolean getEmail(String body) {
        if (body == null) {
            return false;
        }
        boolean found = false;
        Matcher m = EMAIL_PATTERN.matcher(body);
        while (m.find()) {
            String email = m.group();
            if (!EXCLUDED_EMAIL.equals(email)) {
                found = true;
                System.out.println("email:" + email);
                list.add(email);
            }
        }
        return found;
    }

    /**
     * Extracts the job-posting links from one search-result page, fetches each
     * posting that has not been fetched before and harvests its e-mail addresses.
     *
     * @param body  HTML of a search-result page; {@code null} or empty is a no-op
     * @param data1 not used by this method; kept for interface compatibility
     * @return the shared list {@link #col} of all posting URLs fetched so far
     * @throws Exception declared for compatibility with existing callers
     */
    public List<String> getHref(String body, NameValuePair[] data1) throws Exception {
        if (body == null || "".equals(body)) {
            return col;
        }
        Document doc = Jsoup.parse(body);
        for (Element row : doc.select("[class=el]")) {
            // "t1" cells hold the job title and its link.
            for (Element title : row.getElementsByClass("t1")) {
                Element link = title.getElementsByTag("a").first();
                if (link == null) {
                    // A title cell without an anchor (e.g. a header row) must not abort the crawl.
                    continue;
                }
                String href = link.attr("href");
                if (href.indexOf("https://") < 0 || col.contains(href)) {
                    continue;
                }
                col.add(href);
                // Second level of the breadth-first crawl: the posting page itself.
                String context = Example.getPostResponseWithHttpClient(href, "GBK");
                getEmail(context);
            }
        }
        return col;
    }

    /**
     * Accepts only URLs that point at a 51job result list.
     *
     * @param url candidate URL; may be {@code null}
     * @return the trimmed URL if it starts with
     *         {@code https://search.51job.com/list/}, otherwise the empty string
     */
    public static String getURLValidate2(String url) {
        if (url == null) {
            return "";
        }
        String trimmed = url.trim();
        // Literal prefix comparison: the dots in the host name must match dots only.
        return trimmed.startsWith(LIST_URL_PREFIX) ? trimmed : "";
    }

    /**
     * Collects the pagination links of a search-result page.
     *
     * @param body  HTML of a search-result page; {@code null} or empty yields an empty list
     * @param data1 not used by this method; kept for interface compatibility
     * @return the distinct result-list URLs in document order
     * @throws Exception declared for compatibility with existing callers
     */
    public List<String> getHref1(String body, NameValuePair[] data1) throws Exception {
        LinkedList<String> pageLinks = new LinkedList<String>();
        if (body == null || "".equals(body)) {
            return pageLinks;
        }
        Set<String> seen = new HashSet<String>();
        Document doc = Jsoup.parse(body);
        for (Element anchor : doc.select("div.p_in>ul>li>a")) {
            String href = getURLValidate2(anchor.attr("href"));
            // A non-empty result is already known to start with the https list prefix.
            if (!"".equals(href) && seen.add(href)) {
                pageLinks.add(href);
            }
        }
        return pageLinks;
    }

    /**
     * Entry point: crawls the first result page and every page it links to,
     * then stores the collected addresses through {@code ConnectionUtil}.
     *
     * @param args ignored
     * @throws Exception if a crawl step or the file export fails
     */
    public static void main(String[] args) throws Exception {
        JobsHref js = new JobsHref();
        // NOTE(review): hard-coded credentials; getHref/getHref1 ignore this argument.
        // Move them to configuration or drop them.
        NameValuePair data1[] = {
            new NameValuePair("loginname", "[email protected]"),
            new NameValuePair("password", "dir13652") };
        // The query is split right after "&" so that "&deg" can never again be
        // rewritten into a degree sign by an HTML-entity-decoding tool.
        String startUrl =
            "http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=010000%2C00&district=000000&funtype=0000&industrytype=00&issuedate=3&providesalary=99&keywordtype=2&curr_page=1&lang=c&stype=2&postchannel=0000&workyear=99&cotype=99&"
            + "degreefrom=99&jobterm=01&lonlat=0%2C0&radius=-1&ord_field=0&list_type=0&fromType=14";
        // Result page listing the jobs published for the region.
        String body = Example.getGetResponseWithHttpClient(startUrl, "GBK");
        js.getHref(body, data1);
        List<String> page = js.getHref1(body, data1);
        for (String pageUrl : page) {
            String result = Example.getGetResponseWithHttpClient(pageUrl, "GBK");
            js.getHref(result, data1);
        }
        // NOTE(review): printEmialList() writes Example.list, while this class collects
        // into JobsHref.list -- confirm which list is meant to be exported.
        ex.printEmialList();
        ConnectionUtil cu = new ConnectionUtil();
        for (String str : list) {
            cu.addEmail(str.trim());
        }
        System.out.println("運行完成!");
    }
}
下面是公用的代碼部分:
package org.hr.integrity.crawl;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
/**
 * HTTP helper shared by the crawlers: fetches pages with Commons HttpClient,
 * re-decodes them into the requested encoding and harvests e-mail addresses.
 *
 * <p>All state is static: one connection manager and one list of collected
 * addresses. The class does no synchronization of its own.
 */
public class Example {
    // Connection manager shared by every request, and its settings.
    private static MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
    private static int connectionTimeOut = 20000;
    private static int socketTimeOut = 10000;
    private static int maxConnectionPerHost = 5;
    private static int maxTotalConnections = 40;

    /** Becomes {@code true} once {@link #SetPara()} has configured the manager. */
    private static boolean initialed = false;

    /** Every e-mail address collected by {@link #getEmail(String)}. */
    static List<String> list = new LinkedList<String>();

    /** Compiled once; the expression itself is unchanged from the original code. */
    private static final Pattern EMAIL_PATTERN =
            Pattern.compile("[a-zA-Z0-9\\.\\-\\_]+?@[a-zA-Z0-9\\.\\-\\_]+\\.[a-zA-Z]{2,3}");

    /** Applies the timeout and pool-size settings to the shared connection manager. */
    public static void SetPara() {
        manager.getParams().setConnectionTimeout(connectionTimeOut);
        manager.getParams().setSoTimeout(socketTimeOut);
        manager.getParams().setDefaultMaxConnectionsPerHost(maxConnectionPerHost);
        manager.getParams().setMaxTotalConnections(maxTotalConnections);
        initialed = true;
    }

    /** Configures the connection manager on first use only. */
    private static void ensureConfigured() {
        if (!initialed) {
            SetPara();
        }
    }

    /** Reads {@code in} to the end, terminating every line with '\n'; always closes the reader. */
    private static String readAll(BufferedReader in) throws IOException {
        try {
            StringBuilder text = new StringBuilder();
            String line;
            while ((line = in.readLine()) != null) {
                text.append(line).append("\n");
            }
            return text.toString();
        } finally {
            in.close();
        }
    }

    /**
     * Fetches {@code url} with HTTP GET, following redirects.
     *
     * @param url    address to fetch
     * @param encode charset name the page text should be re-decoded with
     * @return the page text, or the empty string if the request or decoding failed
     */
    public static String getGetResponseWithHttpClient(String url, String encode) {
        HttpClient client = new HttpClient(manager);
        ensureConfigured();
        GetMethod get = new GetMethod(url);
        // Browser-compatible cookie handling; the original author set it to silence cookie warnings.
        get.getParams().setParameter("http.protocol.cookie-policy", CookiePolicy.BROWSER_COMPATIBILITY);
        get.setFollowRedirects(true);
        try {
            client.executeMethod(get);
            // The body is streamed rather than read with getResponseBodyAsString(),
            // which the original author advises against for unknown pages.
            String charset = get.getResponseCharSet();
            String raw = readAll(new BufferedReader(
                    new InputStreamReader(get.getResponseBodyAsStream(), charset)));
            return ConverterStringCode(raw, charset, encode);
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        } finally {
            get.releaseConnection();
        }
    }

    /**
     * Appends one address, followed by a comma, to {@code 1_1email.txt}.
     *
     * <p>The separator keeps consecutive addresses from running together and
     * matches the format written by {@link #printEmialList()}.
     *
     * @param email address to append
     * @throws Exception if the file cannot be opened or written
     */
    public static void addEmail(String email) throws Exception {
        FileOutputStream fos = new FileOutputStream(new File("1_1email.txt"), true);
        try {
            fos.write((email + ",").getBytes());
        } finally {
            fos.close();
        }
    }

    /**
     * Appends every address in {@link #list}, comma-terminated, to {@code email.txt}.
     *
     * @throws IOException if the file cannot be opened or written
     */
    void printEmialList() throws IOException {
        FileOutputStream fos = new FileOutputStream(new File("email.txt"), true);
        try {
            System.out.println("生成email");
            for (String address : list) {
                fos.write((address + ",").getBytes());
            }
        } finally {
            fos.close();
        }
    }

    /** Executes {@code post} without following redirects and returns the re-decoded body, or "" on failure. */
    private static String executePost(PostMethod post, String encode) {
        HttpClient client = new HttpClient(manager);
        ensureConfigured();
        // Browser-compatible cookie handling; the original author set it to silence cookie warnings.
        post.getParams().setParameter("http.protocol.cookie-policy", CookiePolicy.BROWSER_COMPATIBILITY);
        post.setFollowRedirects(false);
        try {
            client.executeMethod(post);
            String charset = post.getResponseCharSet();
            String raw = readAll(new BufferedReader(
                    new InputStreamReader(post.getResponseBodyAsStream(), charset)));
            return ConverterStringCode(raw, charset, encode);
        } catch (Exception e) {
            e.printStackTrace();
            return "";
        } finally {
            post.releaseConnection();
        }
    }

    /**
     * Fetches {@code url} with a body-less HTTP POST; redirects are not followed.
     *
     * @param url    address to fetch
     * @param encode charset name the page text should be re-decoded with
     * @return the page text, or the empty string if the request or decoding failed
     */
    public static String getPostResponseWithHttpClient(String url, String encode) {
        return executePost(new PostMethod(url), encode);
    }

    /**
     * Scans {@code body} for e-mail addresses; every address not yet in
     * {@link #list} is added to it and appended to the text file via
     * {@link #addEmail(String)}.
     *
     * <p>Every match is examined, not just the first one.
     *
     * @param body page text to scan; {@code null} is treated as "no addresses"
     * @return {@code true} if the text contains at least one address, new or not
     */
    public static boolean getEmail(String body) {
        if (body == null) {
            return false;
        }
        boolean found = false;
        Matcher m = EMAIL_PATTERN.matcher(body);
        while (m.find()) {
            found = true;
            String email = m.group();
            if (!list.contains(email)) {
                list.add(email);
                try {
                    // Stored in a text file for now; the database insert happens later.
                    addEmail(email);
                } catch (Exception e) {
                    // A failed file write must not stop the scan of the remaining matches.
                    e.printStackTrace();
                }
            }
        }
        return found;
    }

    /**
     * Submits a form with HTTP POST (redirects are not followed), harvests the
     * e-mail addresses of the response and logs the URL when any were present.
     *
     * @param url           address to post to
     * @param encode        charset name the page text should be re-decoded with
     * @param nameValuePair form fields sent as the request body
     * @return the page text, or the empty string if the request or decoding failed
     * @throws Exception declared for compatibility with existing callers
     */
    public static String getPostResponseWithHttpClient(String url,
            String encode, NameValuePair[] nameValuePair) throws Exception {
        PostMethod post = new PostMethod(url);
        post.setRequestBody(nameValuePair);
        String result = executePost(post, encode);
        // A failed request yields "", which contains no address, so nothing is logged for it.
        if (getEmail(result)) {
            System.out.println("hasemailurl:" + url);
        }
        return result;
    }

    /**
     * Re-decodes {@code source}: encodes it with {@code srcEncode} and decodes
     * the resulting bytes with {@code destEncode}.
     *
     * <p>The original author notes that ISO-8859-1 is the default charset the
     * response is read with.
     *
     * @return the re-decoded text, or the empty string if {@code source} is
     *         {@code null} or either charset name is unsupported
     */
    private static String ConverterStringCode(String source, String srcEncode,
            String destEncode) {
        if (source == null) {
            return "";
        }
        try {
            return new String(source.getBytes(srcEncode), destEncode);
        } catch (UnsupportedEncodingException e) {
            e.printStackTrace();
            return "";
        }
    }
}
上面的代碼是先爬取能獲取到的頁面,爬到的郵箱先放入一個集合裏面,爬完之後再寫入數據庫。下面是ConnectionUtil.java中插入數據庫的代碼片段:
/**
 * Inserts one e-mail address into the {@code hrintegrity.email} table.
 *
 * <p>A new connection is opened from {@code connStr} for every call and closed
 * again before returning, together with the statement. Failures are printed
 * and reported through the return value; they are not rethrown.
 *
 * @param em the address to insert
 * @return {@code true} if exactly one row was inserted, {@code false} otherwise
 * @author yuyu
 */
public boolean addEmail(String em){
    boolean result = false;
    PreparedStatement stmts = null;
    try {
        conn = DriverManager.getConnection(connStr);
        String sqlInset = "insert into hrintegrity.email(email) values(?)";
        stmts = conn.prepareStatement(sqlInset);
        stmts.setString(1, em);
        // TODO (from the original author): check whether the address already exists in the table.
        int i = stmts.executeUpdate();// number of affected rows
        if(i == 1){
            result = true;
        }
    } catch (Exception e) {
        e.printStackTrace();
    }finally{
        // Close the statement first, then the connection; each is guarded so that a
        // failed getConnection() does not raise a second, misleading exception here.
        if (stmts != null) {
            try {
                stmts.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
        if (conn != null) {
            try {
                conn.close();
            } catch (Exception e) {
                e.printStackTrace();
            }
        }
    }
    return result;
}
上面就是一個獲取51job的郵箱的完整代碼,除了51job外,其它招聘網站的獲取方式大同小異,如智聯,不同點就是在Example.java中調用的方法不同,而且在採集數據的時候select的標籤不一樣,需要自己一個一個去嘗試。
有問題可以在留言中一起交流。