使用httpclient獲取其他網站數據(含解析驗證碼)

    **使用httpclient獲取其他網站數據**

使用httpclient模擬瀏覽器請求網站加載個人訴訟記錄信息接口;
總結:
1. 系統上線後若部署在 linux 上,會透過 python 命令識別驗證碼:先將驗證碼圖片保存到本地,識別完成後再刪除;這需要一個 python 腳本,代碼貼在文末。
2. 如果是在 windows 系統上運行該系統,則提供了一個 OCR 的封裝類,直接調用即可識別驗證碼;這裏需要使用一個工具類,地址:http://download.csdn.net/download/qq_23339149/9617921

接口類:    
import com.alibaba.fastjson.JSONObject;
import com.aweb.platform.util.StringUtils;
import com.dbn.sysmodule.util.IdcardUtils;
import org.apache.commons.httpclient.HttpClient;
import org.apache.commons.httpclient.methods.GetMethod;
import org.apache.commons.httpclient.methods.PostMethod;
import org.apache.log4j.Logger;
import org.jsoup.Jsoup;
import org.jsoup.select.Elements;
import org.springframework.beans.factory.annotation.Value;

import java.io.*;
import java.net.URLEncoder;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.List;
import java.util.Map;

/**
 * Fetches a person's litigation records from an external web site by imitating a
 * browser session with commons-httpclient.
 *
 * <p>Flow of one query: load the index page, download the captcha image, recognise it
 * by running {@code python /tmp/captcha.py <image>}, then POST the search form and turn
 * the returned HTML table into a JSON string. A rejected captcha is retried a limited
 * number of times.
 *
 * Created by warming on 2016/8/31 with IntelliJ IDEA.
 */
public class OuterInformationServiceImpl implements com.dbn.remote.service.OuterInformationService {
    private static final Logger log = Logger.getLogger(OuterInformationServiceImpl.class);

    /**
     * Text looked for in the response page to detect a rejected captcha.
     * NOTE(review): this literal must match the text the site actually returns — confirm.
     */
    private static final String CAPTCHA_ERROR_MARKER = "驗證碼錯誤";

    /** Maximum number of additional attempts after the first request. */
    private static final int MAX_CAPTCHA_RETRIES = 5;

    /** Charset used to URL-encode the form fields. */
    private static final String FORM_CHARSET = "UTF-8";

    @Value("${zhiXingIndexUrl}")
    private String zhiXingIndexUrl;
    @Value("${verificationCodeUrl}")
    private String verificationCodeUrl;
    @Value("${zhiXingSearchUrl}")
    private String zhiXingSearchUrl;
    @Value("${zhiXingSearchUserAgent}")
    private String zhiXingSearchUserAgent;

    /**
     * Queries the litigation records of one person.
     *
     * @param pName   name of the person; the query is skipped when {@code StringUtils.checkStr} rejects it
     * @param cardNum ID card number; the query is skipped when {@code IdcardUtils.validateCard} rejects it
     * @return a JSON string of the form {@code {"data":[{header: cell, ...}, ...]}};
     *         {@code "查詢失敗"} when the captcha was still rejected after all retries;
     *         {@code null} when the input is rejected or the page contains no table
     * @throws Exception wrapping any failure that occurred during the query
     */
    public String getPersonLitigationRecords(String pName, String cardNum) throws Exception {
        String jsonStr = null;
        try {
            if (StringUtils.checkStr(pName) && IdcardUtils.validateCard(cardNum)) {
                HttpClient client = new HttpClient();
                loadIndex(client);
                String htmlResponse = getRecords(client, pName, cardNum);
                int retries = 0;
                // Re-check the marker after every attempt so that a success on the very
                // last retry is not thrown away.
                while (htmlResponse.contains(CAPTCHA_ERROR_MARKER) && retries < MAX_CAPTCHA_RETRIES) {
                    htmlResponse = getRecords(client, pName, cardNum);
                    retries++;
                }
                log.info("驗證碼解析錯誤次數:" + retries);
                if (htmlResponse.contains(CAPTCHA_ERROR_MARKER)) {
                    return "查詢失敗";
                }
                jsonStr = getJsonStrByHtml(htmlResponse);
                log.info("查詢結果:" + jsonStr);
            }
        } catch (Exception e) {
            log.error("查詢個人訴訟記錄錯誤!", e);
            throw new Exception("查詢個人訴訟記錄錯誤!", e);
        }
        return jsonStr;
    }

    /**
     * Performs one search attempt: downloads a fresh captcha, recognises it and posts
     * the search form.
     *
     * @return the body of the search response (empty when it could not be read)
     */
    private String getRecords(HttpClient client, String pName, String cardNum) throws Exception {
        // The captcha is stored on disk because the recognition script works on a file.
        String fileName = "/tmp/" + System.currentTimeMillis() + getRandom() + ".jpeg";

        GetMethod captchaRequest = new GetMethod(verificationCodeUrl);
        captchaRequest.addRequestHeader("User-Agent", zhiXingSearchUserAgent);
        try {
            client.executeMethod(captchaRequest);
            try (FileOutputStream fout = new FileOutputStream(fileName)) {
                byte[] image = captchaRequest.getResponseBody();
                if (image != null) {
                    fout.write(image);
                } else {
                    log.warn("將驗證碼寫入本地失敗!");
                }
            } catch (IOException e) {
                // Best effort: recognition will fail and the caller retries.
                log.error("將驗證碼寫入本地失敗!", e);
            }
        } finally {
            captchaRequest.releaseConnection();
        }

        // exec() also removes the image file. Its output ends with a line break, which
        // must not leak into the form body, so all whitespace is removed.
        String rawCode = exec(fileName);
        String code = rawCode == null ? "" : rawCode.replaceAll("\\s+", "");
        log.info("解析驗證碼爲::" + code);

        // On Windows the captcha bytes can instead be handed to
        // ParseJPEG_withOCR.getRecogniseStr(InputStream).

        PostMethod post = new PostMethod(zhiXingSearchUrl);
        post.setRequestHeader("Content-Type", "application/x-www-form-urlencoded");
        post.setRequestHeader("Referer", zhiXingIndexUrl);
        post.addRequestHeader("User-Agent", zhiXingSearchUserAgent);
        post.setRequestBody("searchCourtName=" + encode("全國法院(包含地方各級法院)")
                + "&selectCourtId=1&selectCourtArrange=1&pname=" + encode(pName)
                + "&cardNum=" + encode(cardNum)
                + "&j_captcha=" + encode(code));

        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        try {
            client.executeMethod(post);
            try (InputStream is = post.getResponseBodyAsStream()) {
                if (is != null) {
                    byte[] buffer = new byte[4096];
                    int read;
                    while ((read = is.read(buffer)) != -1) {
                        baos.write(buffer, 0, read);
                    }
                }
            } catch (IOException e) {
                log.error("獲取查詢返回頁面失敗!", e);
            }
        } finally {
            post.releaseConnection();
        }
        // NOTE(review): decoded with the platform default charset, as before — confirm it
        // matches the charset of the response page.
        return baos.toString();
    }

    /** URL-encodes one form value with {@link #FORM_CHARSET}. */
    private static String encode(String value) throws UnsupportedEncodingException {
        return URLEncoder.encode(value, FORM_CHARSET);
    }

    /**
     * Converts the table found in the HTML page into a JSON string.
     *
     * <p>Every row except one (presumably the header row — confirm) becomes a map from
     * header text to cell text; the last header column is not copied.
     *
     * @return the JSON string, or {@code null} when the page is blank or holds no table cells
     */
    private String getJsonStrByHtml(String htmlResponse) throws Exception {
        if (!StringUtils.checkStr(htmlResponse)) {
            return null;
        }
        org.jsoup.nodes.Document doc = Jsoup.parse(htmlResponse);
        Elements trs = doc.getElementsByTag("tr");
        Elements ths = doc.getElementsByTag("th");
        Elements tds = doc.getElementsByTag("td");
        int rowCount = trs.size();
        int headerCount = ths.size();
        if (rowCount == 0 || headerCount == 0 || tds.size() == 0) {
            return null;
        }

        List<Object> rows = new ArrayList<>();
        for (int row = 0; row < rowCount - 1; row++) {
            Map<String, Object> cells = new HashMap<>();
            for (int col = 0; col < headerCount - 1; col++) {
                // All <td> elements are addressed as one flat list, headerCount cells per row.
                cells.put(ths.get(col).text(), tds.get(headerCount * row + col).text());
            }
            rows.add(cells);
        }
        JSONObject data = new JSONObject();
        data.put("data", rows);
        return data.toJSONString();
    }

    /** Requests the index page first, the way a browser would before searching. */
    private void loadIndex(HttpClient client) throws IOException {
        GetMethod get = new GetMethod(zhiXingIndexUrl);
        get.addRequestHeader("User-Agent", zhiXingSearchUserAgent);
        try {
            client.executeMethod(get);
        } finally {
            get.releaseConnection();
        }
        log.info("首頁加載完成");
    }

    /**
     * Runs {@code python /tmp/captcha.py <fileName>} and returns what the script printed.
     * The image file is deleted afterwards, whether or not recognition succeeded.
     *
     * @param fileName path of the captcha image
     * @return the script's standard output, each line terminated by {@code "\n"},
     *         or {@code null} when the script could not be run
     */
    public String exec(String fileName) {
        log.info("驗證文件名稱:" + fileName);
        if (fileName == null) {
            return null;
        }
        try {
            // Arguments are passed as a list so the file name is never re-tokenised.
            ProcessBuilder builder = new ProcessBuilder("python", "/tmp/captcha.py", fileName);
            // Do not leave stderr unread: a full pipe would block the script.
            builder.redirectError(ProcessBuilder.Redirect.INHERIT);
            Process process = builder.start();
            StringBuilder output = new StringBuilder();
            try (BufferedReader reader = new BufferedReader(
                    new InputStreamReader(process.getInputStream()))) {
                String line;
                while ((line = reader.readLine()) != null) {
                    log.debug(line);
                    output.append(line).append("\n");
                }
            }
            process.waitFor();
            return output.toString();
        } catch (InterruptedException e) {
            Thread.currentThread().interrupt();
            log.error("Interrupted while waiting for captcha recognition: " + fileName, e);
        } catch (IOException e) {
            log.error("Failed to run captcha recognition for: " + fileName, e);
        } finally {
            // Remove the generated captcha image.
            File image = new File(fileName);
            if (image.exists() && !image.delete()) {
                log.warn("Could not delete captcha image: " + fileName);
            }
        }
        return null;
    }

    /**
     * Returns a random three-digit number.
     *
     * @return a value in the range 100..999 inclusive
     */
    public int getRandom() {
        return 100 + (int) (Math.random() * 900);
    }
}

工具類:

import com.asprise.util.ocr.OCR;

import javax.imageio.ImageIO;
import java.awt.image.BufferedImage;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;


public class ParseJPEG_withOCR {
    public static String getRecogniseStr(InputStream imageFile) {
        String s = "";
        try {
            BufferedImage image = ImageIO.read(imageFile);
            int width = image.getTileWidth();
            int height = image.getTileHeight();
            image = image.getSubimage(0, 0, width, height);
            s = new OCR().recognizeEverything(image);
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(" 圖片識別失敗! ");
        }
        return s;
    }
    public static String getRecogniseStrByFile(File imageFile) {
        String s = "";
        try {
            BufferedImage image = ImageIO.read(imageFile);
            int width = image.getTileWidth();
            int height = image.getTileHeight();
            image = image.getSubimage(0, 0, width, height);
            s = new OCR().recognizeEverything(image);
        } catch (IOException e) {
            e.printStackTrace();
            System.out.println(" 圖片識別失敗! ");
        }
        return s;
    }

    public static void main(String[] args) {
//        for (int i = 0; i < 100; i++) {
//            String code = getRecogniseStrByFile(new File("D:\\pic\\download/" + i + ".jpeg"));
//            System.out.println(code);
//        }
    }

}

python腳本(文件名命名爲:captcha.py),需放在 linux 的 /tmp 目錄下(驗證碼圖片也保存在該目錄);執行命令 python /tmp/captcha.py <fileName> 即可返回識別結果。pytesseract 類庫可
自行搜索下載:

from PIL import Image
import sys
import pytesseract

def output(imgfile):
    """Print the text recognised in the image at ``imgfile``.

    The image is converted to 8-bit grayscale (mode ``'L'``) and passed to
    pytesseract with the ``-psm 7`` configuration string.
    """
    grayscale = Image.open(imgfile).convert('L')
    text = pytesseract.image_to_string(grayscale, config='-psm 7')
    print(text)


if __name__ == "__main__":
    # The image path is the first command-line argument; fail with a usage
    # message instead of an IndexError traceback when it is missing.
    if len(sys.argv) < 2:
        sys.exit("usage: python captcha.py <image_file>")
    output(sys.argv[1])
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章