獲取招聘網站下的HR-Email信息

        前段時間,按照上面的要求,需要做一個職場黑名單的項目,負責的部分是數據採集,也就是通過對各大招聘網站,按照地區或者其它劃分,採集HR的郵箱信息入庫,由於採集的網站較多,所以把部分公用的方法放在一個類中,方便調用,下面是對51job的採集,代碼如下:

package org.hr.integrity.crawl;

import java.util.regex.Matcher;
import java.util.regex.Pattern;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.ArrayList;
import java.util.Iterator;
import java.util.Set;

import org.apache.commons.httpclient.NameValuePair;
import org.hr.util.ConnectionUtil;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.nodes.Element;
import org.jsoup.select.Elements;

/**
 * Crawls 51job search-result pages: follows the job-posting links on each result
 * page, scans every posting for an e-mail address and finally hands the collected
 * addresses to {@code ConnectionUtil}.
 *
 * <p>Results are accumulated in the static collections {@link #col} and
 * {@link #list}, so every instance shares them. They are a plain
 * {@code ArrayList}/{@code HashSet} with no synchronization.
 *
 * @author 72414
 */
public class JobsHref {

	/** E-mail-like token; compiled once because it is applied to every fetched page. */
	private static final Pattern EMAIL_PATTERN =
			Pattern.compile("[a-zA-Z0-9\\.\\-\\_]+?@[a-zA-Z0-9\\.\\-\\_]+\\.[a-zA-Z]{2,3}");

	/** A paging link is only followed when it starts with this prefix. */
	private static final String LIST_URL_PREFIX = "https://search.51job.com/list/";

	NameValuePair[] data = null;

	/** Job-posting URLs that have already been fetched (in discovery order). */
	static List<String> col = new ArrayList<String>();

	static Example ex = new Example();

	/** Collected e-mail addresses; a set, so duplicates are dropped. */
	static Set<String> list = new HashSet<String>();

	/**
	 * Looks for the first e-mail address in {@code body} and records it in {@link #list}.
	 *
	 * @param body page text to scan; {@code null} is treated as "no address"
	 * @return {@code true} if an address was found and recorded, otherwise {@code false}
	 */
	public boolean getEmail(String body) {
		if (body == null) {
			return false;
		}
		Matcher matcher = EMAIL_PATTERN.matcher(body);
		if (!matcher.find()) {
			return false;
		}
		String email = matcher.group();
		// NOTE(review): this literal contains '[' and a space, which EMAIL_PATTERN can never
		// produce, so the exclusion never triggers. It looks like a redacted placeholder for
		// a real address - confirm the intended value.
		if (email.equals("[email protected]")) {
			return false;
		}
		System.out.println("email:" + email);
		list.add(email);
		return true;
	}

	/**
	 * Extracts the job-posting links from one search-result page, fetches every link
	 * not seen before and scans the fetched page for an e-mail address.
	 *
	 * @param body  HTML of a search-result page; {@code null} or empty is a no-op
	 * @param data1 not used by this method; kept so existing callers still compile
	 * @return the shared list {@link #col} of all posting URLs fetched so far
	 * @throws Exception declared for compatibility with existing callers
	 */
	public List<String> getHref(String body, NameValuePair[] data1) throws Exception {
		if (body == null || "".equals(body)) {
			return col;
		}
		Document doc = Jsoup.parse(body);
		for (Element row : doc.select("[class=el]")) {
			// "t1" is the job-title cell of a result row
			for (Element title : row.getElementsByClass("t1")) {
				Element link = title.getElementsByTag("a").first();
				if (link == null) {
					// a title cell without an anchor has nothing to follow
					continue;
				}
				String href = link.attr("href");
				if (href.indexOf("https://") < 0 || col.contains(href)) {
					continue;
				}
				col.add(href);
				// second level of the breadth-first crawl: the posting page itself
				String context = Example.getPostResponseWithHttpClient(href, "GBK");
				getEmail(context);
			}
		}
		return col;
	}

	/**
	 * Accepts a URL only if it is a 51job job-list page.
	 *
	 * @param url candidate link; may be {@code null}
	 * @return the trimmed URL when it starts with
	 *         {@code https://search.51job.com/list/}, otherwise the empty string
	 */
	public static String getURLValidate2(String url) {
		if (url == null) {
			return "";
		}
		// trim before testing so surrounding whitespace cannot break the prefix check
		String trimmed = url.trim();
		return trimmed.startsWith(LIST_URL_PREFIX) ? trimmed : "";
	}

	/**
	 * Collects the distinct paging links of a search-result page.
	 *
	 * @param body  HTML of a search-result page; {@code null} or empty yields an empty list
	 * @param data1 not used by this method; kept so existing callers still compile
	 * @return paging URLs accepted by {@link #getURLValidate2(String)}, without duplicates,
	 *         in document order
	 * @throws Exception declared for compatibility with existing callers
	 */
	public List<String> getHref1(String body, NameValuePair[] data1) throws Exception {
		List<String> pageLinks = new ArrayList<String>();
		if (body == null || "".equals(body)) {
			return pageLinks;
		}
		Document doc = Jsoup.parse(body);
		for (Element anchor : doc.select("div.p_in>ul>li>a")) {
			String href = getURLValidate2(anchor.attr("href"));
			if (href.length() > 0 && !pageLinks.contains(href)) {
				pageLinks.add(href);
			}
		}
		return pageLinks;
	}

	/**
	 * Entry point: crawls the first result page and every paging link found on it,
	 * then writes the collected addresses out through {@code Example} and
	 * {@code ConnectionUtil}.
	 *
	 * @param args ignored
	 * @throws Exception if crawling or persisting fails
	 */
	public static void main(String[] args) throws Exception {
		JobsHref js = new JobsHref();

		// getHref/getHref1 ignore their NameValuePair argument, so no login form
		// (and no hard-coded credentials) is built here.
		NameValuePair[] data1 = new NameValuePair[0];

		// HTML of the first page of job results for the selected region
		String body = Example.getGetResponseWithHttpClient(
				"http://search.51job.com/jobsearch/search_result.php?fromJs=1&jobarea=010000%2C00&district=000000&funtype=0000&industrytype=00&issuedate=3&providesalary=99&keywordtype=2&curr_page=1&lang=c&stype=2&postchannel=0000&workyear=99&cotype=99&degreefrom=99&jobterm=01&lonlat=0%2C0&radius=-1&ord_field=0&list_type=0&fromType=14"
				, "GBK");
		js.getHref(body, data1);

		// remaining result pages reachable from the first one
		List<String> page = js.getHref1(body, data1);
		for (String pageUrl : page) {
			String result = Example.getGetResponseWithHttpClient(pageUrl, "GBK");
			js.getHref(result, data1);
		}
		ex.printEmialList();

		ConnectionUtil cu = new ConnectionUtil();
		for (String str : list) {
			cu.addEmail(str.trim());
		}

		System.out.println("運行完成!");
	}
}

          下面是公用的代碼部分(Example.java):

package org.hr.integrity.crawl;

import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.UnsupportedEncodingException;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.regex.Matcher;
import java.util.regex.Pattern;

import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.MultiThreadedHttpConnectionManager;
import org.apache.commons.httpclient.NameValuePair;
import org.apache.commons.httpclient.cookie.CookiePolicy;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;

public class Example {
	

	// 獲得ConnectionManager,設置相關參數
	private static MultiThreadedHttpConnectionManager manager = new MultiThreadedHttpConnectionManager();
	private static int connectionTimeOut = 20000;
	private static int socketTimeOut = 10000;
	private static int maxConnectionPerHost = 5;
	private static int maxTotalConnections = 40;
	// 標誌初始化是否完成的flag
	private static boolean initialed = true;//設置值爲true,2018年6月7日 10:28:09
	static List<String> list=new LinkedList<String>();//總郵箱list
	
	// 初始化ConnectionManger的方法
	public static void SetPara() {
		manager.getParams().setConnectionTimeout(connectionTimeOut);
		manager.getParams().setSoTimeout(socketTimeOut);
		manager.getParams().setDefaultMaxConnectionsPerHost(
				maxConnectionPerHost);
		manager.getParams().setMaxTotalConnections(maxTotalConnections);
		initialed = true;
	}

	// 通過get方法獲取網頁內容
	public static String getGetResponseWithHttpClient(String url, String encode) {
		HttpClient client = new HttpClient(manager);
		if (initialed) {
			Example.SetPara();
		}
		GetMethod get = new GetMethod(url);
		get.getParams().setParameter("http.protocol.cookie-policy",CookiePolicy.BROWSER_COMPATIBILITY);//去除警告
		get.setFollowRedirects(true);
		String result = null;
		StringBuffer resultBuffer = new StringBuffer();
		try {
			client.executeMethod(get);
			// 在目標頁面情況未知的條件下,不推薦使用getResponseBodyAsString()方法
			//String strGetResponseBody = post.getResponseBodyAsString();
			BufferedReader in = new BufferedReader(new InputStreamReader(get
					.getResponseBodyAsStream(), get.getResponseCharSet()));
			String inputLine = null;
			while ((inputLine = in.readLine()) != null) {
				resultBuffer.append(inputLine);
				resultBuffer.append("\n");
			}
			in.close();
			result = resultBuffer.toString();
			// iso-8859-1 is the default reading encode
			result = Example.ConverterStringCode(resultBuffer
					.toString(), get.getResponseCharSet(), encode);
		
		} catch (Exception e) {
			e.printStackTrace();
			result = "";
		} finally {
			get.releaseConnection();
			
		}
		return result;
	}
	
	
	@SuppressWarnings("resource")
	public static void addEmail(String email) throws Exception{
		  FileOutputStream fos = new FileOutputStream(new File("1_1email.txt"),true);  
		  fos.write(email.getBytes()); 
	}
	
	
	void printEmialList()throws IOException{
		
		FileOutputStream fos = new FileOutputStream(new File("email.txt"),true);  
		Iterator<String> it=list.iterator();
		System.out.println("生成email");
		while(it.hasNext()){
			String ema=it.next()+",";
			fos.write(ema.getBytes()); 
		}
		 fos.close();
	}
	
	
	public static String getPostResponseWithHttpClient(String url, String encode) {
		HttpClient client = new HttpClient(manager);
		if (initialed) {
			HttpClientExample.SetPara();
		}
		PostMethod post = new PostMethod(url);
		post.getParams().setParameter("http.protocol.cookie-policy",CookiePolicy.BROWSER_COMPATIBILITY);//去除警告
		post.setFollowRedirects(false);
		StringBuffer resultBuffer = new StringBuffer();
		String result = null;
		try {
			client.executeMethod(post);
			BufferedReader in = new BufferedReader(new InputStreamReader(post
					.getResponseBodyAsStream(), post.getResponseCharSet()));
			String inputLine = null;
			while ((inputLine = in.readLine()) != null) {
				resultBuffer.append(inputLine);
				resultBuffer.append("\n");
			}
			in.close();
			// iso-8859-1 is the default reading encode
			result = Example.ConverterStringCode(resultBuffer
					.toString(), post.getResponseCharSet(), encode);
			
		} catch (Exception e) {
			e.printStackTrace();
			result = "";
		} finally {
			post.releaseConnection();
		
		}
		return result;
	}
	
	public static  boolean getEmail(String body){
		boolean flag=false;
		try{
			Pattern p = Pattern.compile("[a-zA-Z0-9\\.\\-\\_]+?@[a-zA-Z0-9\\.\\-\\_]+\\.[a-zA-Z]{2,3}"); 
		   	Matcher m  =p.matcher(body);
	   	if(m.find()){
	   		flag=true;
	   		String email=m.group();
   		//System.out.println("SSSS:"+email);
   		if(!list.contains(email)){
   	    	list.add(email);
   	    	addEmail(email);//將得到的Email加入數據庫,這裏先加入文本里面
   				}
	   		}
		 }
		 catch(Exception e){
			 
			 e.printStackTrace();
		 }
    	return flag;
	}
	
	
	public static String getPostResponseWithHttpClient (String url,
			String encode, NameValuePair[] nameValuePair) throws Exception {
			HttpClient client = new HttpClient(manager);
			if (initialed) {//
				HttpClientExample.SetPara();//初始化ConnectionManger的方法
				}
			PostMethod post = new PostMethod(url);
			post.setRequestBody(nameValuePair);//將表單所有的值設置到PostMethod中
			post.getParams().setParameter(//去除警告
				"http.protocol.cookie-policy",CookiePolicy.BROWSER_COMPATIBILITY);
			post.setFollowRedirects(false);//設置此類是否應該自動執行http重定向
			String result = null;
			StringBuffer resultBuffer = new StringBuffer();
			try {
				client.executeMethod(post);    
			BufferedReader in = new BufferedReader(new InputStreamReader(post
					.getResponseBodyAsStream(), post.getResponseCharSet()));
			String inputLine = null;
			while ((inputLine = in.readLine()) != null) {
				resultBuffer.append(inputLine);
				resultBuffer.append("\n");
			}
			in.close();
			// iso-8859-1 is the default reading encode
			result = Example.ConverterStringCode(resultBuffer.toString(), post.getResponseCharSet(), encode);
			//System.out.println("result:"+result.length());
			if(getEmail(result)){//驗證網址
				System.out.println("hasemailurl:"+url);
			}
		} catch (Exception e) {
			e.printStackTrace();
			result = "";
		} finally {
			post.releaseConnection();
			
		}
		return result;
	}

	private static String ConverterStringCode(String source, String srcEncode,
			String destEncode) {
		if (source != null) {
			try {
				return new String(source.getBytes(srcEncode), destEncode);
			} catch (UnsupportedEncodingException e) {
				// TODO Auto-generated catch block
				e.printStackTrace();
				return "";
			}
		} else {
			return "";
		}
	}
	
}

              上面的代碼是先爬取能獲取到的頁面,爬到的郵箱先放入一個Set集合(HashSet,可自動去重)裏面,爬完之後再逐條寫入數據庫。下面是ConnectionUtil.java中插入到數據庫的片段代碼:

/**
	 * Inserts one e-mail address into the {@code hrintegrity.email} table.
	 *
	 * <p>Opens a connection from {@code connStr} for this call and closes both the
	 * statement and the connection before returning. Failures are printed, not
	 * rethrown.
	 *
	 * @param em address to insert; {@code null} or blank is rejected without touching the database
	 * @return {@code true} if exactly one row was inserted, otherwise {@code false}
	 * @author yuyu
	 */
	public boolean addEmail(String em){

		if (em == null || em.trim().length() == 0) {
			return false;
		}

		boolean result = false;
		PreparedStatement stmts = null;

		try {

			conn = DriverManager.getConnection(connStr);

			String sqlInset = "insert into hrintegrity.email(email) values(?)";

			stmts = conn.prepareStatement(sqlInset);
			stmts.setString(1, em);

			// TODO: check whether the address already exists in the table before inserting

			// executeUpdate returns the number of affected rows
			result = stmts.executeUpdate() == 1;

		} catch (Exception e) {

			e.printStackTrace();

		} finally {

			// close the statement first, then the connection; either may be null
			// when the failure happened before it was created
			if (stmts != null) {
				try {
					stmts.close();
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
			if (conn != null) {
				try {
					conn.close();
				} catch (Exception e) {
					e.printStackTrace();
				}
			}
		}

		return result;
	}

        上面就是一個獲取51job的郵箱的完整代碼,除了51job外,其它招聘網站的獲取方式大同小異,如智聯,不同點就是在Example.java中調用的方法不同,而且在採集數據的時候select的標籤不一樣,需要自己一個一個去嘗試。

        有問題可以在留言中一起交流。

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章