某票務平臺的信息採集

package com.crawler.maoyan.age.sex.index;

import java.io.IOException;
import java.io.UnsupportedEncodingException;
import java.text.SimpleDateFormat;
import java.util.Date;
import java.util.Iterator;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.json.simple.JSONArray;
import org.json.simple.JSONObject;
import org.json.simple.parser.JSONParser;
import org.json.simple.parser.ParseException;
import org.jsoup.Jsoup;
import org.jsoup.nodes.Document;
import org.jsoup.select.Elements;

import com.maoyan.movie.contents.MContents;
import com.maoyan.movie.html.MovieHtml;
import com.maoyan.movie.html.PrecessHtml;
import com.maoyan.movie.ttf.encode.DownParseTTF;
import com.maoyan.mysql.configure.DBConfig;
import com.maoyan.mysql.configure.DbAttribute;
import com.maoyan.mysql.manage.ContentToMySQL;
import com.maoyan.mysql.manage.IFRepetition;
import com.maoyan.mysql.manage.UpdateData;

/**
 * Collects the audience gender ratio and the audience age ratio of movies
 * from the Maoyan box-office pages and hands them to the project's
 * persistence helpers ({@code ContentToMySQL} / {@code UpdateData}).
 *
 * <p>The movie name and the platform id travel between the methods of this
 * class as a single string of the form {@code "<page title>,<platform id>"}
 * (see {@link #getMovieNameId(String)}).
 *
 * @author E-mail: [email protected]
 * @date created: 2017-01-17 11:46:10 AM
 * @jdk version: jdk1.7.0_79
 */
public class AgeSexIndex {

	/** Prefix of every per-movie page; the movie id and a page suffix are appended. */
	private static final String BRANCH_URL = "http://piaofang.maoyan.com/movie/";

	/** Suffix of the page that carries the gender/age charts. */
	private static final String WANT_INDEX_SUFFIX = "/wantindex";

	/** Format of the task timestamp written into every record. */
	private static final String TASK_TIME_PATTERN = "yyyy-MM-dd HH:mm:ss SSS";

	/** Matches every character that is not an ASCII digit; compiled once and reused. */
	private static final Pattern NON_DIGITS = Pattern.compile("[^0-9]");

	static {
		// Runs once when the class is loaded, i.e. before main() touches any DB helper.
		DBConfig.initPropertis("./config/config.properties");
	}

	/**
	 * Entry point: walks over the '#'-separated movie ids configured in
	 * {@code DbAttribute.maoyanMovieID} and stores the gender and age ratios
	 * of each movie.
	 *
	 * @param args unused
	 */
	public static void main(String[] args) {
		AgeSexIndex ageSexIndex = new AgeSexIndex();
		DbAttribute dbAttribute = new DbAttribute();
		try {
			// Re-decodes the configured value from ISO-8859-1 bytes as UTF-8.
			// NOTE(review): presumably compensates for a properties file that was
			// loaded with the wrong charset - confirm against DbAttribute/DBConfig.
			String movieIdKey = new String(dbAttribute.maoyanMovieID.getBytes("ISO-8859-1"), "utf-8");
			String[] movieIds = movieIdKey.split("#");
			for (String movieId : movieIds) {
				if (movieId.trim().isEmpty()) {
					// e.g. a leading '#' or "##" in the configuration; there is no page to fetch.
					continue;
				}
				String ageSexURL = BRANCH_URL + movieId + WANT_INDEX_SUFFIX;
				System.out.println(ageSexURL);

				String movieNameId = ageSexIndex.getMovieNameId(ageSexURL);
				if (splitNameAndId(movieNameId) == null) {
					// getMovieNameId() returns "" when the title page could not be fetched.
					// Skip this movie instead of letting one failure abort the whole batch.
					System.err.println("skip " + ageSexURL + ": movie name/id lookup failed");
					continue;
				}

				Document document = ageSexIndex.getDocument(ageSexURL);
				ageSexIndex.exeSexIndex(document, movieNameId);
				ageSexIndex.exeAgeIndex(document, movieNameId);
			}
		} catch (UnsupportedEncodingException e) {
			e.printStackTrace();
		}
		System.out.println("over");
	}

	/**
	 * Extracts the gender ratio of a movie from its page and saves it,
	 * updating the existing row when {@code IFRepetition.sexRepetition}
	 * reports that one is already present.
	 *
	 * @param document    parsed "/wantindex" page of the movie
	 * @param movieNameId {@code "<movie name>,<platform id>"}; the text after the
	 *                    last comma is taken as the id, so names may contain commas
	 * @throws IllegalArgumentException if {@code movieNameId} has no comma or an empty id
	 */
	public void exeSexIndex(Document document, String movieNameId) {
		String[] nameAndId = requireNameAndId(movieNameId);

		MContents mContents = new MContents();
		ContentToMySQL contentToMySQL = new ContentToMySQL();
		IFRepetition ifRepetition = new IFRepetition();
		UpdateData updateData = new UpdateData();

		mContents.setTaskTime(currentTaskTime());
		// movie name
		mContents.setMovieName(nameAndId[0]);
		// Maoyan id of the movie
		mContents.setPlatformID(nameAndId[1]);

		// Male ratio: first i.cs element inside the stacked-column description.
		// NOTE(review): when the element is missing, text() is "" and a bare "%" is stored.
		Elements maleElements = document.select("section div.stackcolumn div.stackcolumn-desc i.cs");
		mContents.setMaleRate(maleElements.eq(0).text() + "%");

		// Female ratio: first i.cs element inside the right-hand description paragraph.
		Elements femaleElements = document
				.select("div.stackcolumn div.stackcolumn-desc p.stackcolumn-desc-right i.cs");
		mContents.setFemaRate(femaleElements.eq(0).text() + "%");

		if (!ifRepetition.sexRepetition(mContents)) {
			contentToMySQL.saveGenderRate(mContents);
		} else {
			updateData.updateSex(mContents);
		}
	}

	/**
	 * Extracts the age distribution of a movie from the JSON embedded in the
	 * page's {@code script#pageData} element and saves it, updating the
	 * existing row when {@code IFRepetition.ageRepetition} reports that one is
	 * already present.
	 *
	 * <p>Nothing is saved when the page carries no usable JSON (missing
	 * script, missing {@code ageRatesChart.series}, or unparsable content);
	 * the problem is reported on {@code System.err} instead.
	 *
	 * @param document    parsed "/wantindex" page of the movie
	 * @param movieNameId {@code "<movie name>,<platform id>"}; the text after the
	 *                    last comma is taken as the id, so names may contain commas
	 * @throws IllegalArgumentException if {@code movieNameId} has no comma or an empty id
	 */
	public void exeAgeIndex(Document document, String movieNameId) {
		String[] nameAndId = requireNameAndId(movieNameId);

		MContents mContents = new MContents();
		ContentToMySQL contentToMySQL = new ContentToMySQL();
		IFRepetition ifRepetition = new IFRepetition();
		UpdateData updateData = new UpdateData();

		mContents.setTaskTime(currentTaskTime());
		// movie name
		mContents.setMovieName(nameAndId[0]);
		// Maoyan id of the movie
		mContents.setPlatformID(nameAndId[1]);

		// Render the selected elements once; the JSON payload is everything
		// from the first '{' to the last '}' of that markup.
		String pageData = document.select("body script#pageData").toString();
		int beginIndex = pageData.indexOf("{");
		int endIndex = pageData.lastIndexOf("}");
		if (beginIndex < 0 || endIndex < beginIndex) {
			System.err.println("no pageData JSON found for movie " + nameAndId[1] + "; age ratio not saved");
			return;
		}
		String ageJson = pageData.substring(beginIndex, endIndex + 1);

		try {
			JSONObject root = (JSONObject) new JSONParser().parse(ageJson);
			// data of the age-ratio chart
			JSONObject ageChart = (JSONObject) root.get("ageRatesChart");
			JSONArray series = ageChart == null ? null : (JSONArray) ageChart.get("series");
			if (series == null) {
				System.err.println("pageData of movie " + nameAndId[1]
						+ " has no ageRatesChart.series; age ratio not saved");
				return;
			}

			for (Object seriesItem : series) {
				// exact age-ratio values of this series
				JSONArray points = (JSONArray) ((JSONObject) seriesItem).get("points");
				if (points == null) {
					continue;
				}
				for (Object pointItem : points) {
					JSONObject point = (JSONObject) pointItem;
					Object age = point.get("xValue");      // age bracket label
					Object ageRate = point.get("yValue");  // ratio of that bracket
					if (age == null || ageRate == null) {
						continue;
					}
					assignAgeBucket(mContents, age.toString(), ageRate.toString());
				}
			}

			if (!ifRepetition.ageRepetition(mContents)) {
				contentToMySQL.saveAgeRate(mContents);
			} else {
				updateData.updateAge(mContents);
			}
		} catch (ParseException e) {
			e.printStackTrace();
		}
	}

	/**
	 * Fetches the movie's main page and combines its title with the platform id.
	 *
	 * <p>The name comes from a different page than the ratios: the URL with
	 * the "/wantindex" suffix removed (a URL without that suffix is used as
	 * is). The platform id is the concatenation of all digits found in
	 * {@code ageSexIndexURL}.
	 *
	 * @param ageSexIndexURL URL of the movie's "/wantindex" page
	 * @return {@code "<page title>,<platform id>"}, or an empty string when the
	 *         page could not be fetched
	 */
	public String getMovieNameId(String ageSexIndexURL) {
		int suffixIndex = ageSexIndexURL.indexOf(WANT_INDEX_SUFFIX);
		String movieURL = suffixIndex >= 0 ? ageSexIndexURL.substring(0, suffixIndex) : ageSexIndexURL;

		// Maoyan id of the movie: every non-digit of the whole URL is dropped, so
		// digits anywhere else in the URL (host, port, query) would end up in the id.
		String platformId = NON_DIGITS.matcher(ageSexIndexURL).replaceAll("");
		try {
			Document movieNameDocu = Jsoup.connect(movieURL).get();
			return movieNameDocu.title() + "," + platformId;
		} catch (IOException e) {
			e.printStackTrace();
			return "";
		}
	}

	/**
	 * Builds the document that the gender and age ratios are read from.
	 *
	 * <p>The raw HTML is obtained from {@code MovieHtml.getHtml}, passed to
	 * {@code DownParseTTF.parseTTF} (per the original comment: downloads and
	 * parses the TTF file), rewritten by {@code PrecessHtml.precSourceHtml}
	 * with the resulting code, and finally parsed with jsoup.
	 *
	 * @param ageSexIndexURL URL of the movie's "/wantindex" page
	 * @return the parsed, pre-processed page
	 */
	public Document getDocument(String ageSexIndexURL) {
		MovieHtml movieHtml = new MovieHtml();
		DownParseTTF downParseTTF = new DownParseTTF();
		PrecessHtml precessHtml = new PrecessHtml();

		String sourceHtml = movieHtml.getHtml(ageSexIndexURL).toString();
		String ttfCode = downParseTTF.parseTTF(sourceHtml);
		String precSourceHtml = precessHtml.precSourceHtml(sourceHtml, ttfCode);
		return Jsoup.parse(precSourceHtml);
	}

	/** Returns the current time formatted as the task timestamp. */
	private static String currentTaskTime() {
		// SimpleDateFormat is not thread-safe, so a fresh instance is used per call.
		return new SimpleDateFormat(TASK_TIME_PATTERN).format(new Date());
	}

	/**
	 * Splits {@code "<name>,<id>"} at the LAST comma, so a name that itself
	 * contains commas stays intact.
	 *
	 * @return {@code {name, id}}, or {@code null} when the input is null, has
	 *         no comma, or has nothing after the last comma
	 */
	private static String[] splitNameAndId(String movieNameId) {
		if (movieNameId == null) {
			return null;
		}
		int comma = movieNameId.lastIndexOf(',');
		if (comma < 0 || comma == movieNameId.length() - 1) {
			return null;
		}
		return new String[] { movieNameId.substring(0, comma), movieNameId.substring(comma + 1) };
	}

	/** Like {@link #splitNameAndId(String)} but rejects malformed input with an exception. */
	private static String[] requireNameAndId(String movieNameId) {
		String[] nameAndId = splitNameAndId(movieNameId);
		if (nameAndId == null) {
			throw new IllegalArgumentException(
					"movieNameId must look like \"<name>,<id>\" but was: " + movieNameId);
		}
		return nameAndId;
	}

	/**
	 * Stores one chart point in the matching age field of {@code mContents}.
	 *
	 * <p>Label and ratio are concatenated and the known label is then
	 * stripped, so any text surrounding the label survives into the stored
	 * value. Every label that matches none of the first five brackets falls
	 * into the last field. Note that the field names are offset from the
	 * page's brackets (e.g. "20~24" is stored via {@code setF21to25}).
	 *
	 * NOTE(review): the two textual labels are written in Traditional Chinese;
	 * if the page serves Simplified Chinese labels they will not match and the
	 * point ends up in the fallback branch - confirm against live data.
	 */
	private static void assignAgeBucket(MContents mContents, String age, String ageRate) {
		String ageAgeRate = age + ageRate;
		if (ageAgeRate.contains("20歲以下")) {
			mContents.setF16to20(ageAgeRate.replace("20歲以下", ""));
		} else if (ageAgeRate.contains("20~24")) {
			mContents.setF21to25(ageAgeRate.replace("20~24", ""));
		} else if (ageAgeRate.contains("25~29")) {
			mContents.setF26to30(ageAgeRate.replace("25~29", ""));
		} else if (ageAgeRate.contains("30~34")) {
			mContents.setF31to35(ageAgeRate.replace("30~34", ""));
		} else if (ageAgeRate.contains("35~39")) {
			mContents.setF36to40(ageAgeRate.replace("35~39", ""));
		} else {
			mContents.setF41to45(ageAgeRate.replace("40歲以上", ""));
		}
	}

}






 

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章