用urllib.request抓取網頁內容:常規、代理與cookie

傳送數據

urlopen(url, data=None, [timeout,]*, cafile=None, capath=None, cadefault=False, context=None)

from urllib.request import *
import urllib.parse
# Form-encode the payload and convert the resulting text to UTF-8 bytes,
# because urlopen() expects the request body as bytes rather than str.
data = urllib.parse.urlencode({'word': 'hello'}).encode('utf8')
data
b'word=hello'
# Supplying `data` turns the request into a POST carrying the form body.
# The with-block closes the connection as soon as the body has been read,
# instead of leaving the socket open until garbage collection.
with urlopen('http://httpbin.org/post', data=data) as response:
    print(response.read().decode("utf-8"))
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "word": "hello"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "10", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.7"
  }, 
  "json": null, 
  "origin": "120.236.174.136, 120.236.174.136", 
  "url": "https://httpbin.org/post"
}

超時情況

import socket
import urllib.error

# A 0.1 s budget is deliberately too small, so the request is expected to
# time out.
try:
    response = urlopen('http://httpbin.org/get', timeout=0.1)
except urllib.error.URLError as e:
    # A timeout while connecting/sending arrives wrapped in URLError.reason.
    if isinstance(e.reason, socket.timeout):
        print('Time Out')
    else:
        # Report any other failure instead of silently ignoring it.
        print(e.reason)
except socket.timeout:
    # A timeout while waiting for the server's reply is raised directly,
    # without the URLError wrapper.
    print('Time Out')
Time Out

將urlopen中的參數改成Request類

Request(url, data=None, headers={}, origin_req_host=None, unverifiable=False, method =None)

from urllib.parse import *
# Custom request headers: pretend to be an old browser and pin the Host.
headers = {
    'User-Agent': 'Mozilla/4.0(compatible;MSIE 5.5;Windows NT)',
    'Host': 'httpbin.org',
}
# Form fields to submit (named `form` so the builtin `dict` is not shadowed).
form = {
    'name': 'happy',
}
# The request body must be bytes, so encode the form-encoded text.
data = urlencode(form).encode('utf8')
# HTTP method names are case-sensitive, hence upper-case 'POST'.
request = Request('http://httpbin.org/post', data=data, headers=headers, method='POST')
# The with-block closes the connection once the body has been printed.
with urlopen(request) as response:
    print(response.read().decode("utf-8"))
{
  "args": {}, 
  "data": "", 
  "files": {}, 
  "form": {
    "name": "happy"
  }, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Content-Length": "10", 
    "Content-Type": "application/x-www-form-urlencoded", 
    "Host": "httpbin.org", 
    "User-Agent": "Mozilla/4.0(compatible;MSIE 5.5;Windows NT)"
  }, 
  "json": null, 
  "origin": "120.236.174.151, 120.236.174.151", 
  "url": "https://httpbin.org/post"
}

使用Handler添加代理

ProxyHandler的構造參數是一個字典,鍵名是協議類型,鍵值是代理鏈接。

常規代理

from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener
# Map each URL scheme to the proxy that should carry requests for it.
proxy_handler = ProxyHandler({
    "https": "https://223.243.254.191:65309",
    # NOTE(review): "http" is listed twice. A dict literal keeps only the
    # last value for a repeated key, so this first entry is discarded and
    # only the second "http" proxy is ever used.
    "http": "http://113.121.93.90:9999",
    "http": "http://123.163.27.237:9999"
})
# Build an opener that uses the proxy handler configured above.
opener = build_opener(proxy_handler)

遇到「由於連接方在一段時間後沒有正確答覆或連接的主機沒有反應,連接嘗試失敗」這種錯誤時,可以把timeout調大。

# Fetch an HTTPS URL through the opener. The large timeout leaves room for
# a slow proxy; the with-block closes the connection after the body is read.
try:
    with opener.open('https://httpbin.org/get', timeout=500) as response:
        print(response.read().decode("utf-8"))
except URLError as e:
    # Print the underlying cause of the failure.
    print(e.reason)
{
  "args": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.7"
  }, 
  "origin": "223.245.39.218, 223.245.39.218", 
  "url": "https://httpbin.org/get"
}

可以看到origin和代理ip並不是一模一樣的,不曉得是不是用了ip轉發

# Fetch a plain-HTTP URL through the opener; the with-block closes the
# connection once the body has been printed.
try:
    with opener.open('http://httpbin.org/get') as response:
        print(response.read().decode("utf-8"))
except URLError as e:
    # Print the underlying cause of the failure.
    print(e.reason)
{
  "args": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Cache-Control": "max-age=259200", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.7"
  }, 
  "origin": "123.163.27.237, 123.163.27.237", 
  "url": "https://httpbin.org/get"
}

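注意:上面的字典裏 "http" 這個鍵出現了兩次。Python 字典的鍵是唯一的,後寫的值會覆蓋先寫的值,所以實際生效的只有第二個代理(123.163.27.237),從輸出的 origin 也能看出這一點。ProxyHandler 並不會在多個代理之間自動挑選可用的那一個;如果想在多個代理之間切換,需要自己逐個建立 opener 去嘗試。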

需要用戶名密碼代理

格式爲 'username:password@ipaddress:port',前面再加上協議,例如 http://username:password@ipaddress:port

from urllib.error import URLError
from urllib.request import ProxyHandler, build_opener
# Proxy that requires authentication: the credentials go in front of the
# host, in the form scheme://username:password@host:port.
proxy_handler = ProxyHandler({
    # Made-up credentials and address (192.0.2.10 is a documentation-only
    # placeholder); substitute a real proxy before use.
    "http": "http://123:456@192.0.2.10:9999"
})
# Build an opener that uses the authenticated proxy configured above.
opener = build_opener(proxy_handler)

使用socks5協議看世界

import socks
import socket
# The server address below has been altered and will not work as-is.
socks.set_default_proxy(socks.SOCKS5, '22.20.18.92', 1080)
# Swap the global socket class for the SOCKS-aware one so that sockets
# created from now on use the default proxy set above.
socket.socket = socks.socksocket
try:
    # The with-block closes the connection once the body has been printed.
    with urlopen("http://httpbin.org/get") as response:
        print(response.read().decode("utf-8"))
except URLError as e:
    # Print the underlying cause of the failure.
    print(e.reason)
{
  "args": {}, 
  "headers": {
    "Accept-Encoding": "identity", 
    "Host": "httpbin.org", 
    "User-Agent": "Python-urllib/3.7"
  }, 
  "origin": "19.18.13.12, 19.16.13.12", 
  "url": "https://httpbin.org/get"
}

怒爬一波推特

from urllib.parse import *
# Route new sockets through the SOCKS5 proxy (same setup as the previous
# example; the address is a non-working placeholder).
socks.set_default_proxy(socks.SOCKS5, '22.20.18.92', 1080)
socket.socket = socks.socksocket
# Browser-like headers sent with the request.
headers = {
    'Origin': 'https://www.google.com',
    'Referer': 'https://www.google.com/',
    'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36'
}
# Building the Request does no network I/O, so it stays outside the try.
request = Request("https://twitter.com/home", headers=headers)
try:
    # The with-block closes the connection once the page has been printed.
    with urlopen(request) as response:
        print(response.read().decode("utf-8"))
except URLError as e:
    # Print the underlying cause of the failure.
    print(e.reason)
<!DOCTYPE html>
<html lang="en" data-scribe-reduced-action-queue="true">
  <head>
             <meta charset="utf-8">
          <script  nonce="cpkeY2499BhzH1QhKNytTw==">
            !function(){window.initErrorstack||(window.initErrorstack=[]),window.onerror=function(r,i,n,o,t){r.indexOf("Script error.")>-1||window.initErrorstack.push({errorMsg:r,url:i,lineNumber:n,column:o,errorObj:t})}}();
          </script>

        
      
      <script id="bouncer_terminate_iframe" nonce="cpkeY2499BhzH1QhKNytTw==">
        if (window.top != window) {
      window.top.postMessage({'bouncer': true, 'event': 'complete'}, '');
    }
      </script>
      <script id="ttft_boot_data" nonce="cpkeY2499BhzH1QhKNytTw==">
        window.ttftData={"transaction_id":"006fff6d002c1aa6.c6a409836c08a162\u003c:0088620e009a344f","server_request_start_time":1576248983417,"user_id":null,"is_ssl":true,"rendered_on_server":true,"is_tfe":true,"client":"macaw-swift","tfe_version":"tsa_a\/1.0.1\/20191126.1647.c3ada84","ttft_browser":"chrome"};!function(){function t(t,n){window.ttftData&&!window.ttftData[t]&&(window.ttftData[t]=n)}function n(){return o?Math.round(w.now()+w.timing.navigationStart):(new Date).getTime()}var w=window.performance,o=w&&w.now;window.ttft||(window.ttft={}),window.ttft.recordMilestone||(window.ttft.recordMilestone=t),window.ttft.now||(window.ttft.now=n)}();
      </script>
      <script id="swift_action_queue" nonce="cpkeY2499BhzH1QhKNytTw==">
        !function(){function e(e){if(e||(e=window.event),!e)return!1;if(e.timestamp=(new Date).getTime(),!e.target&&e.srcElement&&(e.target=e.srcElement),document.documentElement.getAttribute("data-scribe-reduced-action-queue"))for(var t=e.target;t&&t!=document.body;){if("A"t.tagName)return;t=t.parentNode}return i("all",o(e)),a(e)?(document.addEventListener||(e=o(e)),e.preventDefault=e.stopPropagation=e.stopImmediatePropagation=function(){},y?(v.push(e),i("captured",e)):i("ignored",e),!1):(i("direct",e),!0)}function t(e){n();for(var t,r=0;t=v[r];r++){var a=e(t.target),i=a.closest("a")[0];if("click"t.type&&i){var o=e.data(i,"events"),u=o&&o.click,c=!i.hostname.match(g)||!i.href.match(/#/);if(!u&&c){window.location=i.href;continue}}a.trigger(e.event.fix(t))}window.swiftActionQueue.wasFlushed=!0}function r(){for(var e in b)if("all"!=e)for(var t=b[e],r=0;r<t.length;r++)console.log("actionQueue",c(t[r]))}function n(){clearTimeout(w);for(var e,t=0;e=h[t];t++)document["on"+e]=null}function a(e){if(!e.target)return!1;var t=e.target,r=(t.tagName||"").toLowerCase();if(e.metaKey)return!1;if(e.shiftKey&&"a"==r)return!1;if(t.hostname&&!t.hostname.match(g))return!1;if(e.type.match(p)&&s(t))return!1;if("label"==r){var n=t.getAttribute("for");if(n){var a=document.getElementById(n);if(a&&f(a))return!1}else for(var i,o=0;i=t.childNodes[o];o++)if(f(i))return!1}return!0}function i(e,t){t.bucket=e,b[e].push(t)}function o(e){var t={};for(var r in e)t[r]=e[r];return t}function u(e){for(;e&&e!=document.body;){if("A"==e.tagName)return e;e=e.parentNode}}function c(e){var t=[];e.bucket&&t.push("["+e.bucket+"]"),t.push(e.type);var r,n,a=e.target,i=u(a),o="",c=e.timestamp&&e.timestamp-d;return"click"===e.type&&i?(r=i.className.trim().replace(/\s+/g,"."),n=i.id.trim(),o=/[^#]/.test(i.href)?" 
("+i.href+")":"",a='"'+i.innerText.replace(/\n+/g," ").trim()+'"'):(r=a.className.trim().replace(/\s+/g,"."),n=a.id.trim(),a=a.tagName.toLowerCase(),e.keyCode&&(a=String.fromCharCode(e.keyCode)+" : "+a)),t.push(a+o+(n&&"#"+n)+(!n&&r?"."+r:"")),c&&t.push(c),t.join(" ")}function f(e){var t=(e.tagName||"").toLowerCase();return"input"t&&"checkbox"e.getAttribute("type")}function s(e){var t=(e.tagName||"").toLowerCase();return"textarea"t||"input"t&&"text"e.getAttribute("type")||"true"e.getAttribute("contenteditable")}for(var m,d=(new Date).getTime(),l=1e4,g=/ (([)\.]+.)twitter.com$/,p=/^key/,h=["click","keydown","keypress","keyup"],v=[],w=null,y=!0,b={captured:[],ignored:[],direct:[],all:[]},k=0;m=h[k];k++)document["on"+m]=e;w=setTimeout(function(){y=!1},l),window.swiftActionQueue={buckets:b,flush:t,logActions:r,wasFlushed:!1}}();
      </script>
      <script id="composition_state" nonce="cpkeY2499BhzH1QhKNytTw==">
        !function(){function t(t){t.target.setAttribute("data-in-composition","true")}function n(t){t.target.removeAttribute("data-in-composition")}document.addEventListener&&(document.addEventListener("compositionstart",t,!1),document.addEventListener("compositionend",n,!1))}();
      </script>

使用Handler添加cookie

獲取網站cookie

  • cookiejar模塊的主要作用是提供可存儲cookie的對象,以便於與urllib模塊配合使用來訪問Internet資源。Cookiejar模塊非常強大,我們可以利用本模塊的CookieJar類的對象來捕獲cookie並在後續連接請求時重新發送,比如可以實現模擬登錄功能。
  • 該模塊主要的對象有CookieJar、FileCookieJar、MozillaCookieJar、LWPCookieJar。
  • 它們的關係:CookieJar → FileCookieJar → MozillaCookieJar、LWPCookieJar(箭頭表示「派生出」)
from http.cookiejar import *
from urllib.request import *
# In-memory jar that collects the cookies set by the server.
cookie = CookieJar()
# Handler that stores response cookies in the jar and sends them back on
# later requests made through the same opener.
handler = HTTPCookieProcessor(cookie)
opener = build_opener(handler)
response = opener.open('http://www.baidu.com')
# Only the cookies are needed here, not the page body, so release the
# connection right away.
response.close()
# Print every captured cookie as name=value.
for item in cookie:
    print(item.name + "=" + item.value)
BAIDUID=90FC91BD0AED82481D19AE5BA741B2C7:FG=1
BIDUPSID=90FC91BD0AED8248A84C636B264E077B
H_PS_PSSID=1448_21100_30211_30327_30284_26350_22160
PSTM=1576730391
delPer=0
BDSVRTM=0
BD_HOME=0

保存爲文件

  • ignore_discard的意思是即使cookies將被丟棄也將它保存下來
  • ignore_expires的意思是即使cookies已經過期也將它保存下來(load時則是連已過期的cookies也一併載入);另外,save時如果文件已經存在,會直接覆蓋原文件
# Jar backed by a Mozilla/Netscape-format cookies.txt file.
cookie = MozillaCookieJar('cookies.txt')
handler = HTTPCookieProcessor(cookie)
opener = build_opener(handler)
response = opener.open('http://www.baidu.com')
# Only the cookies are needed here, not the page body, so release the
# connection right away.
response.close()
# ignore_discard=True keeps cookies marked to be discarded (session cookies);
# ignore_expires=True keeps cookies that have already expired.
cookie.save(ignore_discard=True, ignore_expires=True)

# Read the saved file back into a fresh jar, with the same two flags so
# nothing is filtered out while loading.
cookie = MozillaCookieJar('cookies.txt')
cookie.load(ignore_discard=True, ignore_expires=True)
for item in cookie:
    print(item.name + "=" + item.value)
H_PS_PSSID=121849
PSTM=1576740535
delPer=0
BDSVRTM=0
BD_HOME=0
發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章