No.1 第一個python爬蟲練習
from urllib import request, parse
import chardet

if __name__ == '__main__':
    url = 'https://blog.csdn.net/m0_37355951/article/details/80457159'
    # Use the response as a context manager so the socket is always closed
    # (the original leaked the connection).
    with request.urlopen(url) as rsp:
        html = rsp.read()
        # Detect the page encoding from the raw bytes, e.g.
        # {'encoding': 'utf-8', 'confidence': 0.99, 'language': ''}
        cs = chardet.detect(html)
        print(cs)
        # Decode with the detected encoding, falling back to utf-8.
        # NOTE: chardet can report {'encoding': None}; dict.get's default does
        # not cover that case, so use `or` to avoid decode(None).
        html = html.decode(cs.get("encoding") or 'utf-8')
        # Response metadata.
        print(rsp)
        print(rsp.geturl())
        print(rsp.info())
        print(rsp.getcode())  # 200 on success
    # Page content.
    print(html)
No.2 模擬Get請求
from urllib import request, parse

if __name__ == '__main__':
    url = 'http://www.baidu.com/s?'
    wd = input('Input your keyword')
    # Query parameters to append to the URL.
    qs = {
        "wd": wd
    }
    # URL-encode the parameters (handles spaces and non-ASCII characters).
    qs = parse.urlencode(qs)
    # Context manager closes the response; the original leaked the socket.
    with request.urlopen(url + qs) as rsp:
        html = rsp.read().decode()
    print(html)
No.3 模擬post請求
'''
Simulate a POST request with the parse module.
How the form field was discovered:
1. Open the browser devtools (F12).
2. Type a single letter "g" into the translate box.
3. Network -> All -> Headers shows the FormData is kw:g
'''
from urllib import request, parse
import json

'''
Build the request body with `data`, then open the URL with urlopen.
The server returns a JSON result: suggested translations for "girl".
'''
baseurl = 'https://fanyi.baidu.com/sug'
# Form data as a plain dict.
data = {
    'kw': 'girl'
}
# urlencode to a query string, then encode to bytes (required for a POST body).
data = parse.urlencode(data).encode()
# Passing data= makes urlopen issue a POST; the context manager closes the
# response (the original leaked the socket).
with request.urlopen(baseurl, data=data) as rsp:
    # Read and decode the body (utf-8 by default).
    json_data = rsp.read().decode()
print(json_data)
# Parse the JSON string into a dict.
json_data = json.loads(json_data)
print(json_data)
for item in json_data['data']:
    print(item['k'], '---', item['v'])
No.4 UrlError的使用
'''
Demonstrate URLError handling: inspect what happens when a request fails.
'''
from urllib import request, error

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    try:
        req = request.Request(url)
        # Context manager closes the response; the original leaked the socket.
        with request.urlopen(req) as rsp:
            html = rsp.read().decode()
        print(html)
    except error.HTTPError as e:
        # HTTPError must come first: it is a subclass of URLError.
        print(e)
    except error.URLError as e:
        print(e)
    except Exception as e:
        # Last-resort boundary handler for this demo script.
        print(e)
No.5 更改自己的agent
常用的agent:
https://blog.csdn.net/rookie_is_me/article/details/81634048
兩種方式:
1.headers['User-Agent'] = "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
req = request.Request(url= url,headers=headers)
2.req = request.Request(url)
req.add_header('User-Agent','Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
'''
Visit a URL while overriding the default User-Agent header.
Two equivalent approaches exist:
1. Pass headers={'User-Agent': ...} to request.Request.
2. Build the Request, then call req.add_header(...).
This script uses approach 2; the original also built an unused headers dict
for approach 1, which was dead code and has been removed.
'''
from urllib import request, error

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    try:
        req = request.Request(url)
        req.add_header('User-Agent', 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36')
        # Context manager closes the response; the original leaked the socket.
        with request.urlopen(req) as rsp:
            html = rsp.read().decode()
        print(html)
    except error.URLError as e:
        print(e)
    except Exception as e:
        print(e)
No.6 代理服務器
'''
Route requests through a proxy server.
Free proxy lists: www.xicidaili.com, www.goubanjia.com
Steps:
1. Set the proxy address.
2. Create a ProxyHandler.
3. Build an opener.
4. Install the opener globally.
'''
from urllib import request, error, parse

if __name__ == '__main__':
    url = 'http://www.baidu.com'
    # 1. Proxy address, keyed by URL scheme.
    proxy = {'http': '117.169.104.102:80'}
    # 2. Handler that routes matching requests through the proxy.
    proxy_handler = request.ProxyHandler(proxy)
    # 3. Opener that uses the handler.
    opener = request.build_opener(proxy_handler)
    # 4. Install globally so plain request.urlopen goes through the proxy.
    request.install_opener(opener)
    try:
        # Context manager closes the response; the original leaked the socket.
        with request.urlopen(url) as rsp:
            html = rsp.read().decode()
        print(html)
    except error.URLError as e:
        print(e)
No.7 使用cookie登錄網站
'''
Access a logged-in page on renren.com by reusing a session cookie
copied from the browser after logging in manually.
'''
from urllib import request

if __name__ == '__main__':
    url = 'http://www.renren.com/894245278/profile'
    # Paste your own session cookie value here.
    headers = {'Cookie': ' 自己的cookie '}
    req = request.Request(url=url, headers=headers)
    # Context manager closes the response; the original leaked the socket.
    with request.urlopen(req) as rsp:
        html = rsp.read().decode()
    print(html)
No.8 自動配置cookie (自動登錄)訪問數據
'''
Automatic cookie handling while crawling.
CookieJar      manages cookies in memory and attaches them to outgoing HTTP
               requests; cookies vanish once the CookieJar instance is collected.
FileCookieJar  persists cookies to a file.
MozillaCookieJar  writes a cookies.txt compatible with Mozilla browsers.
LWPCookieJar   writes the libwww-perl format.
'''
# Plan: visit renren.com with a CookieJar-backed opener —
# log in through the login endpoint with username/password,
# then reuse the captured cookie to reach a private page.
from urllib import request, error, parse
from http import cookiejar

# In-memory cookie store shared by every request made through the opener.
cookie = cookiejar.CookieJar()
# Processor that saves incoming cookies and attaches them to outgoing requests.
cookie_handler = request.HTTPCookieProcessor(cookie)
# Plain HTTP handler.
http_handler = request.HTTPHandler()
# HTTPS handler.
https_handler = request.HTTPSHandler()
# Opener used by the functions below; carries the cookie jar across calls.
opener = request.build_opener(http_handler, https_handler, cookie_handler)
def login():
    """POST the login form so the shared opener's jar captures the session cookie."""
    url = 'http://www.renren.com/PLogin.do'
    # Login form fields (replace with real credentials).
    data = {
        'email': '賬號',
        'password': '密碼'
    }
    # urlencode to a query string, then encode to bytes for the POST body.
    data = parse.urlencode(data).encode()
    req = request.Request(url, data=data)
    # Close the response deterministically (the original leaked the socket);
    # the session cookie is already stored in the shared jar at this point.
    with opener.open(req):
        pass
def getHomePage():
    """Fetch and print the profile page using cookies captured by login()."""
    url = 'http://www.renren.com/894245278/profile'
    # If login() ran first, the shared opener automatically sends the
    # session cookie stored in its jar.
    # Close the response deterministically (the original leaked the socket).
    with opener.open(url) as rsp:
        html = rsp.read().decode()
    print(html)
if __name__ == '__main__':
    # Log in first so the shared opener's jar holds the session cookie,
    # then fetch the private page with it.
    login()
    getHomePage()