Python3 Crawler Study Notes, Part 1 (get, post, cookie, proxy, agent)

No.1 A first Python crawler exercise

from urllib import request
import chardet

if __name__ == '__main__':
    url = 'https://blog.csdn.net/m0_37355951/article/details/80457159'
    rsp = request.urlopen(url)
    html = rsp.read()
    ## Detect the page's encoding from the raw bytes
    cs = chardet.detect(html)
    print(cs)   ## {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}
    ## Decode with the detected encoding, falling back to utf-8
    html = html.decode(cs.get('encoding') or 'utf-8')
    # Print the response metadata
    print(rsp)
    print(rsp.geturl())
    print(rsp.info())
    print(rsp.getcode())  ## 200 on success
    ## The page content
    print(html)
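
An alternative worth knowing: the server usually declares its charset in the Content-Type response header, so chardet can often be skipped. A minimal sketch, assuming the header is actually present:

from urllib import request

url = 'https://blog.csdn.net/m0_37355951/article/details/80457159'
rsp = request.urlopen(url)
raw = rsp.read()

## rsp.info() returns the response headers; get_content_charset()
## pulls the charset out of the Content-Type header, if any
charset = rsp.info().get_content_charset() or 'utf-8'
print(charset)
print(raw.decode(charset))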

No.2 Simulating a GET request

from urllib import request,parse

if __name__ == '__main__':
    url = 'http://www.baidu.com/s?'
    wd = input('Input your keyword: ')

    ## Data to append to the URL
    qs = {
        "wd": wd
    }

    ## URL-encode the query string
    qs = parse.urlencode(qs)
    rsp = request.urlopen(url + qs)
    html = rsp.read().decode()
    print(html)
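
parse.urlencode also handles several parameters at once and percent-encodes non-ASCII values such as Chinese keywords. A small sketch (the pn parameter is assumed here to be Baidu's result-offset parameter; treat it as illustrative):

from urllib import parse

qs = parse.urlencode({'wd': '爬蟲', 'pn': 10})
print(qs)   ## wd=%E7%88%AC%E8%9F%B2&pn=10
print('http://www.baidu.com/s?' + qs)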

No.3 Simulating a POST request

'''
Simulating a POST request with the parse module
    1. Open the developer tools (F12) on Baidu Translate
    2. Type a single letter, e.g. "g"
    3. Under Network - All - Headers you can see that the Form Data is kw: g
'''

from urllib import request,parse
import json

'''
  Build the payload with data, then open the URL with urlopen.
    The response comes back as JSON;
    the result should be translations of "girl".
'''
baseurl = 'https://fanyi.baidu.com/sug'
# The form data, as a dict
data = {
    'kw': 'girl'
}
# Must be url-encoded with parse, then converted to bytes
data = parse.urlencode(data).encode()

rsp = request.urlopen(baseurl, data=data)
## Read and decode the response (utf-8 by default)
json_data = rsp.read().decode()
print(json_data)

# Convert the JSON string into a dict
json_data = json.loads(json_data)
print(json_data)

for item in json_data['data']:
    print(item['k'], '---', item['v'])
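
Many endpoints reject requests carrying urllib's default agent, so it is often safer to wrap the POST in a Request and attach headers (see No.5 below). A sketch, assuming fanyi.baidu.com still answers with the same JSON shape:

from urllib import request, parse
import json

baseurl = 'https://fanyi.baidu.com/sug'
data = parse.urlencode({'kw': 'boy'}).encode()

## A Request object lets us send the POST body and headers together
req = request.Request(baseurl, data=data, headers={
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36'
})
rsp = request.urlopen(req)
for item in json.loads(rsp.read().decode())['data']:
    print(item['k'], '---', item['v'])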


No.4 Using URLError

'''
Using URLError
to inspect failed requests
'''

from urllib import request,error

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    try:
        req = request.Request(url)
        rsp = request.urlopen(req)
        html = rsp.read().decode()
        print(html)
    except error.HTTPError as e:
        # HTTPError is a subclass of URLError, so catch it first
        print(e)
    except error.URLError as e:
        print(e)
    except Exception as e:
        print(e)
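
HTTPError carries more detail than a bare URLError. A sketch against a path that should return an error status (the /no-such-page path is made up; whether it really 404s depends on the server):

from urllib import request, error

try:
    request.urlopen('http://www.baidu.com/no-such-page')
except error.HTTPError as e:
    ## HTTPError adds status information on top of URLError
    print(e.code)      ## e.g. 404
    print(e.reason)    ## e.g. 'Not Found'
    print(e.headers)   ## headers of the error response
except error.URLError as e:
    ## URLError covers network-level failures (DNS, refused connection, ...)
    print(e.reason)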

No.5 Changing your User-Agent

Commonly used agents:
https://blog.csdn.net/rookie_is_me/article/details/81634048

Two approaches:

  1. headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
     req = request.Request(url=url, headers=headers)
  2. req = request.Request(url)
     req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
'''
Visit a URL with a changed User-Agent
'''
from urllib import request,error
if __name__ == '__main__':
    url = 'http://www.baidu.com'

    try:
        # Approach 1: pass a headers dict to Request
        headers = {}
        headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
        # req = request.Request(url=url, headers=headers)

        # Approach 2: add the header after creating the Request
        req = request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
        rsp = request.urlopen(req)
        html = rsp.read().decode()
        print(html)
    except error.URLError as e:
        print(e)
    except Exception as e:
        print(e)
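
To look less like a single script, the User-Agent can be picked at random per request. A minimal sketch (the second UA string below is illustrative, not taken from the list linked above):

import random
from urllib import request

agents = [
    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36',
    'Mozilla/5.0 (X11; Linux x86_64; rv:60.0) Gecko/20100101 Firefox/60.0',
]

req = request.Request('http://www.baidu.com')
## Rotate the agent on every request
req.add_header('User-Agent', random.choice(agents))
html = request.urlopen(req).read().decode()
print(len(html))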

No.6 Proxy servers

'''
Proxy servers
www.xicidaili.com
www.goubanjia.com
Steps:
    1. Set the proxy address
    2. Create a ProxyHandler
    3. Create an Opener
    4. Install the Opener
'''
from urllib import request,error,parse

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    # Set the proxy address (free proxies like this one go stale quickly)
    proxy = {'http': '117.169.104.102:80'}
    # Create the ProxyHandler
    proxy_handler = request.ProxyHandler(proxy)
    # Create an Opener
    opener = request.build_opener(proxy_handler)
    # Install the Opener globally: urlopen will use it from now on
    request.install_opener(opener)
    try:
        rsp = request.urlopen(url)
        html = rsp.read().decode()
        print(html)
    except error.URLError as e:
        print(e)
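
install_opener changes the global urlopen, which affects every later request. If the proxy should apply to only some requests, call opener.open directly instead. A sketch with a placeholder proxy address (substitute a live one):

from urllib import request, error

proxy_handler = request.ProxyHandler({'http': '127.0.0.1:8080'})
opener = request.build_opener(proxy_handler)

try:
    ## Only this request goes through the proxy; urlopen is untouched
    rsp = opener.open('http://www.baidu.com')
    print(rsp.getcode())
except error.URLError as e:
    print(e)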

No.7 Logging in to a site with a cookie

'''
Log in to Renren with a cookie:
copy the cookie from a logged-in browser session
'''

from urllib import request

if __name__ == '__main__':
    url = 'http://www.renren.com/894245278/profile'
    headers = {'Cookie': ' your own cookie here '}
    req = request.Request(url=url,headers=headers)
    rsp = request.urlopen(req)
    html = rsp.read().decode()
    print(html)

No.8 Automatically managing cookies (automatic login)

'''
Automatically manage cookies while crawling:
    CookieJar manages and stores cookies and attaches them to outgoing HTTP requests.
              Cookies live in memory; once the CookieJar instance is reclaimed, they are gone.
        FileCookieJar persists cookies to a file
            MozillaCookieJar creates a FileCookieJar compatible with the Mozilla cookies.txt format
            LWPCookieJar
'''
# Use a CookieJar to access Renren:
#   open the login page and log in with the username and password
#   then use the captured cookie to access a private page

from urllib import request,error,parse
from http import cookiejar

# Create a CookieJar instance
cookie = cookiejar.CookieJar()

# Create the cookie handler
cookie_handler = request.HTTPCookieProcessor(cookie)

# Create the HTTP handler
http_handler = request.HTTPHandler()

# Create the HTTPS handler
https_handler = request.HTTPSHandler()

# Build a single opener from all the handlers
opener = request.build_opener(http_handler, https_handler, cookie_handler)


def login():
    url = 'http://www.renren.com/PLogin.do'
    # The login form data
    data = {
        'email': 'your account',
        'password': 'your password'
    }
    # Encode the form data
    data = parse.urlencode(data).encode()
    req = request.Request(url, data=data)

    rsp = opener.open(req)

def getHomePage():
    url = 'http://www.renren.com/894245278/profile'
    # If login() has already run, the opener carries the session cookie automatically
    rsp = opener.open(url)
    html = rsp.read().decode()
    print(html)

if __name__ == '__main__':
    login()
    getHomePage()
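
The FileCookieJar subclasses mentioned above make the captured cookies survive between runs. A minimal sketch with MozillaCookieJar (the cookie.txt filename is arbitrary):

from urllib import request
from http import cookiejar

filename = 'cookie.txt'

## Persist cookies to a Mozilla-format cookies.txt file
cookie = cookiejar.MozillaCookieJar(filename)
opener = request.build_opener(request.HTTPCookieProcessor(cookie))
opener.open('http://www.baidu.com')
## ignore_discard also keeps session cookies that would normally be dropped
cookie.save(ignore_discard=True, ignore_expires=True)

## Later, even in another run, load them back before making requests
cookie2 = cookiejar.MozillaCookieJar()
cookie2.load(filename, ignore_discard=True, ignore_expires=True)
opener2 = request.build_opener(request.HTTPCookieProcessor(cookie2))
print(opener2.open('http://www.baidu.com').getcode())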