import requests
from bs4 import BeautifulSoup
import random
class Proxyhandler(object):
    """Scrape free proxy IPs from xicidaili.com and keep only those that
    can successfully reach a test URL."""

    def __init__(self):
        # Pool of User-Agent strings; one is chosen at random per crawl so
        # requests look like they come from an ordinary browser.
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]
        self.proxy_url = 'https://www.xicidaili.com/nt/'  # proxy-list site; page number is appended
        self.proxy_list = []                              # harvested "ip:port" strings
        self.target_url = 'https://www.baidu.com/'        # URL used to validate each proxy
        self.time_out = 3                                 # per-request timeout, in seconds

    def get_proxy_list(self, page_num=10):
        """Crawl *page_num* pages of the proxy site, collect ip:port pairs,
        drop those that fail a test request, and return the survivors.

        Parameters
        ----------
        page_num : int
            Number of listing pages to crawl (default 10).

        Returns
        -------
        list[str] | None
            Validated "ip:port" strings, or None if the crawl itself failed.
        """
        # Pick a random User-Agent from the pool for this crawl.
        ua = random.choice(self.user_agent_list)
        header = {'User-Agent': ua}
        print('隨機產生的UA:%s' % ua)
        try:
            # Pages are addressed as <proxy_url>1, <proxy_url>2, ...
            # NOTE: use a distinct loop variable here; the original code
            # reused `i` for both the page loop and the row loop.
            for page in range(1, page_num + 1):
                url = self.proxy_url + str(page)
                r = requests.get(url, headers=header, timeout=self.time_out)
                r.raise_for_status()
                r.encoding = r.apparent_encoding
                soup = BeautifulSoup(r.text, 'html.parser')
                rows = soup.find_all('tr')
                # Skip the header row; columns 1 and 2 hold IP and port.
                for row in rows[1:]:
                    td = row.find_all('td')
                    if len(td) > 2:  # guard against malformed rows
                        self.proxy_list.append(td[1].text + ':' + td[2].text)
            # Validate each candidate. Build a fresh list instead of
            # removing entries from the list being iterated, which would
            # silently skip elements.
            valid_ips = []
            for ip in self.proxy_list:
                # Key must be the bare scheme 'https' — the original
                # 'https:' key made requests ignore the proxy entirely.
                proxy = {'https': 'https://' + ip}
                try:
                    # requests.get(...), not requests(...): the module
                    # itself is not callable. Bound by the same timeout.
                    requests.get(self.target_url, headers=header,
                                 proxies=proxy, timeout=self.time_out)
                    valid_ips.append(ip)
                except requests.RequestException:
                    # Proxy is dead or too slow — discard it.
                    continue
            self.proxy_list = valid_ips
            return self.proxy_list
        except Exception:
            # Crawl of the listing site failed; caller receives None.
            print('get error!')
if __name__ == '__main__':
    # Crawl and validate proxies, then persist them to ip.txt.
    proxy_handler = Proxyhandler()
    list_ip = proxy_handler.get_proxy_list()
    # get_proxy_list returns None when the crawl fails outright; only
    # write the file when we actually have results. One IP per line is
    # far more usable than the original str(list) repr dump.
    if list_ip:
        with open('ip.txt', 'w') as f:
            f.write('\n'.join(list_ip))
    # No explicit close needed: the `with` block closes the file (the
    # original `f.close` without parentheses was a no-op anyway).