Proxy IP address code

import requests
from bs4 import BeautifulSoup
import random


class Proxyhandler(object):
    def __init__(self):
        # List of User-Agent strings used to disguise requests as browser traffic
        self.user_agent_list = [
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
            "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
            "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
            "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24"]
        self.proxy_url = 'https://www.xicidaili.com/nt/'  # proxy source site to scrape
        self.proxy_list = []  # scraped proxies, stored as 'ip:port' strings
        self.target_url = 'https://www.baidu.com/'  # URL used to test each proxy
        self.time_out = 3  # request timeout in seconds

    # Scrape proxy IPs into the list and drop invalid ones; scrapes 10 pages by default
    def get_proxy_list(self, page_num=10):
        # Randomly pick a User-Agent from user_agent_list
        ua = random.choice(self.user_agent_list)
        header = {'User-Agent': ua}
        print('Randomly selected UA: %s' % ua)
        try:
            for i in range(page_num):
                url = self.proxy_url + str(i + 1)
                r = requests.get(url, headers=header, timeout=self.time_out)
                r.raise_for_status()
                r.encoding = r.apparent_encoding
                html = r.text
                soup = BeautifulSoup(html, 'html.parser')
                tr = soup.find_all('tr')
                # Skip the header row; each remaining row describes one proxy
                for j in range(1, len(tr)):
                    td = tr[j].find_all('td')
                    # Column 1 holds the IP address, column 2 the port
                    self.proxy_list.append(td[1].text + ':' + td[2].text)
            # Test each proxy against the target URL; iterate over a copy so that
            # removing entries does not skip items in the original list
            for ip in list(self.proxy_list):
                try:
                    proxy_host = 'https://' + ip
                    proxy = {'https': proxy_host}
                    requests.get(self.target_url, headers=header,
                                 proxies=proxy, timeout=self.time_out)
                # Drop proxies that raise an exception (unreachable or too slow)
                except Exception:
                    self.proxy_list.remove(ip)
                    continue
            # Return the list of working proxies
            return self.proxy_list
        except Exception as e:
            print('get error: %s' % e)

if __name__ == '__main__':
    # Instantiate Proxyhandler and fetch the validated proxy list
    proxy_handler = Proxyhandler()
    list_ip = proxy_handler.get_proxy_list()
    # The with-statement closes the file automatically, so no explicit close is needed
    with open('ip.txt', 'w') as f:
        f.write(str(list_ip))
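
For completeness, below is a minimal sketch of how the saved file might be consumed later, assuming ip.txt was produced by the script above (it contains a Python list literal of 'ip:port' strings). The helper name fetch_via_random_proxy and the example URL are illustrative assumptions, not part of the original code.

import ast
import random

import requests


def fetch_via_random_proxy(url, path='ip.txt', timeout=3):
    # Hypothetical helper: load the proxy list written by the script above,
    # which is stored as a Python list literal such as "['1.2.3.4:8080', ...]"
    with open(path) as f:
        proxies = ast.literal_eval(f.read())
    # Pick one proxy at random and route both HTTP and HTTPS traffic through it
    ip_port = random.choice(proxies)
    proxy = {'http': 'http://' + ip_port, 'https': 'https://' + ip_port}
    return requests.get(url, proxies=proxy, timeout=timeout)


if __name__ == '__main__':
    response = fetch_via_random_proxy('https://www.baidu.com/')
    print(response.status_code)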

 
