前言:
在很多爬蟲的自動化項目中,我們難免會遇到一些需要處理驗證碼的問題,其中一種解決方案就是手動輸入,但這樣效率很低,有沒有什麼方法能幫助我們自動識別呢?當然有,那就是打碼平臺,這裏我推薦打碼狗平臺,識別率很高,並且便宜,但是還是希望大家(也包括我)以後能通過深度學習的方式,建立起自己的一套識別體系,廢話不多說,那麼就開始吧!!!
不想看分段介紹的話,直接拉到最下面,修改配置即可食用
簡單介紹打碼狗平臺(沒有廣告錢)
可以先看一下下面這個介紹,然後大家註冊一下,充值積分,特別便宜,我記得我充的那1元錢用了好幾個月都還沒用完
Python代碼實現
這裏我創建了一個 captchaRecognize 類,下面將分別對類中每個部分進行講解
初始化
因爲打碼平臺沒有反爬機制,所以只簡單加了一個 User-Agent 上去,另外加了一個判斷條件,判斷是否有有效的 session 傳入
def __init__(self, s=None):
    """Store the HTTP session and the headers sent to the solving platform.

    Args:
        s: An existing ``requests`` session to reuse. When ``None`` (the
            default) a new session is created.
    """
    # The body already handles a missing session, so the argument is optional.
    self.s = requests.session() if s is None else s
    # Only a User-Agent is sent with the platform requests.
    self.headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3754.400 QQBrowser/10.5.4034.400',
    }
獲取打碼平臺UserKey
這個 UserKey 唯一標識了你的身份
# Fetch the UserKey from the captcha-solving platform
def get_userKey(self):
    """Log in to the solving platform and return the response text.

    The module-level ``username`` and ``password`` are sent as query
    parameters to the platform's login endpoint.

    Returns:
        The decoded response body on success, or ``None`` when the request
        fails or the server answers with an HTTP error status.
    """
    login_url = "http://www.damagou.top/apiv1/login.html"
    # Passing the credentials via ``params`` lets requests URL-encode them,
    # so special characters in the password cannot corrupt the query string.
    credentials = {"username": username, "password": password}
    try:
        r = requests.get(login_url, params=credentials,
                         headers=self.headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException as err:
        # ``err.response`` is None when no response was received at all
        # (e.g. connection failure); report the exception itself then.
        status = getattr(err.response, "status_code", None)
        print("Can't Get Userkey ", err if status is None else status)
        return None
    r.encoding = r.apparent_encoding
    print("Dmagou Being Processing")
    return r.text
獲取驗證碼
def get_captcha_pic(self):
    """Download the captcha image with the stored session.

    The request goes to the module-level ``url_captcha`` using the
    module-level ``headers``.

    Returns:
        The raw response body as bytes, or ``None`` when the request fails
        or the server answers with an HTTP error status.
    """
    try:
        r = self.s.get(url_captcha, headers=headers, timeout=10)
        # Without this check an HTTP error page would be returned as if it
        # were image data.
        r.raise_for_status()
    except requests.RequestException as err:
        print("Can't Get Captcha ", err)
        return None
    return r.content
識別驗證碼
下面的 type 參數對應打碼狗平臺上的驗證碼類型,可以自己選擇不同的類型
def get_english_captcha(self, captcha, userkey):
    """Submit a captcha image to the platform's recognize endpoint.

    Args:
        captcha: The captcha image as bytes; it is base64-encoded before
            being posted.
        userkey: The UserKey sent in the ``userkey`` form field.

    Returns:
        The decoded response body on success, or ``None`` when the request
        fails or the server answers with an HTTP error status.
    """
    postUrl = 'http://www.damagou.top/apiv1/recognize.html'
    postData = {
        "image": base64.b64encode(captcha),
        "userkey": userkey,
        # Captcha type code understood by the platform.
        "type": "1001",
    }
    try:
        r = requests.post(postUrl, data=postData,
                          headers=self.headers, timeout=10)
        r.raise_for_status()
    except requests.RequestException as err:
        # Report the failure instead of swallowing it silently.
        print("破解驗證碼失敗", err)
        return None
    r.encoding = r.apparent_encoding
    return r.text
全部代碼
給出全部的代碼
import random
import base64
import requests
# URL the captcha image is fetched from (placeholder value).
url_captcha = "這裏是獲取驗證碼的地址"

# Account used to log in to the captcha-solving platform (placeholders).
username = "打碼平臺賬戶"
password = "打碼平臺密碼"

# Headers sent when requesting the captcha URL; the Host value is a
# placeholder that has to be filled in.
headers = {
    "Host": "xxxx自己填寫",
    "Connection": "keep-alive",
    "Cache-Control": "max-age=0",
    "Upgrade-Insecure-Requests": "1",
    "User-Agent": (
        "Mozilla/5.0 (Windows NT 10.0; WOW64) "
        "AppleWebKit/537.36 (KHTML, like Gecko) "
        "Chrome/70.0.3538.25 Safari/537.36 "
        "Core/1.70.3754.400 QQBrowser/10.5.4034.400"
    ),
}
class captchaRecognize:
    """Fetch a captcha image and have it recognized through damagou.top.

    Calling an instance runs the whole pipeline: download the captcha with
    the stored session, log in to the platform to obtain a UserKey, then
    post the image to the recognize endpoint.
    """

    # Seconds to wait for any single HTTP request before giving up.
    timeout = 10

    def __init__(self, s=None):
        """Store the HTTP session and the headers sent to the platform.

        Args:
            s: An existing ``requests`` session to reuse. When ``None``
                (the default) a new session is created.
        """
        self.s = requests.session() if s is None else s
        # Only a User-Agent is sent with the platform requests.
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.25 Safari/537.36 Core/1.70.3754.400 QQBrowser/10.5.4034.400',
        }

    def get_captcha_pic(self):
        """Download the captcha image with the stored session.

        The request goes to the module-level ``url_captcha`` using the
        module-level ``headers``.

        Returns:
            The raw response body as bytes, or ``None`` when the request
            fails or the server answers with an HTTP error status.
        """
        try:
            r = self.s.get(url_captcha, headers=headers, timeout=self.timeout)
            # Without this check an HTTP error page would be returned as
            # if it were image data.
            r.raise_for_status()
        except requests.RequestException as err:
            print("Can't Get Captcha ", err)
            return None
        return r.content

    # Fetch the UserKey from the captcha-solving platform
    def get_userKey(self):
        """Log in to the solving platform and return the response text.

        The module-level ``username`` and ``password`` are sent as query
        parameters to the platform's login endpoint.

        Returns:
            The decoded response body on success, or ``None`` when the
            request fails or the server answers with an HTTP error status.
        """
        login_url = "http://www.damagou.top/apiv1/login.html"
        # Passing the credentials via ``params`` lets requests URL-encode
        # them, so special characters cannot corrupt the query string.
        credentials = {"username": username, "password": password}
        try:
            r = requests.get(login_url, params=credentials,
                             headers=self.headers, timeout=self.timeout)
            r.raise_for_status()
        except requests.RequestException as err:
            # ``err.response`` is None when no response was received at
            # all (e.g. connection failure); report the exception then.
            status = getattr(err.response, "status_code", None)
            print("Can't Get Userkey ", err if status is None else status)
            return None
        r.encoding = r.apparent_encoding
        print("Dmagou Being Processing")
        return r.text

    def get_english_captcha(self, captcha, userkey):
        """Submit a captcha image to the platform's recognize endpoint.

        Args:
            captcha: The captcha image as bytes; it is base64-encoded
                before being posted.
            userkey: The UserKey sent in the ``userkey`` form field.

        Returns:
            The decoded response body on success, or ``None`` when the
            request fails or the server answers with an HTTP error status.
        """
        postUrl = 'http://www.damagou.top/apiv1/recognize.html'
        postData = {
            "image": base64.b64encode(captcha),
            "userkey": userkey,
            # Captcha type code understood by the platform.
            "type": "1001",
        }
        try:
            r = requests.post(postUrl, data=postData,
                              headers=self.headers, timeout=self.timeout)
            r.raise_for_status()
        except requests.RequestException as err:
            print("破解驗證碼失敗", err)
            return None
        r.encoding = r.apparent_encoding
        print("破解驗證碼成功")
        return r.text

    def __call__(self):
        """Run the full pipeline and return the recognition response.

        Returns:
            Whatever ``get_english_captcha`` returns, or ``None`` as soon
            as fetching the captcha or the UserKey has failed.
        """
        captcha = self.get_captcha_pic()
        if captcha is None:
            # Nothing to recognize; base64-encoding None would raise.
            return None
        userKey = self.get_userKey()
        if userKey is None:
            return None
        return self.get_english_captcha(captcha, userKey)
if __name__ == '__main__':
    # Bind the instance to its own name so the class is not shadowed, and
    # use the session as a context manager so it is closed afterwards.
    with requests.Session() as session:
        recognizer = captchaRecognize(session)
        print(recognizer())