一、前期準備
- 爬取目標:【春晚鬼畜】趙本山:我就是念詩之王!【改革春風吹滿地】評論區
- 所需要的模塊
from fake_useragent import UserAgent #隨機請求頭
import requests #請求網頁
from requests.exceptions import RequestException #請求異常處理
import csv #將數據存儲到Excel
from datetime import datetime #時間轉換
from multiprocessing.dummy import Pool as pl #多線程
二、網頁分析
在獲取評論之前,首先要分析網頁結構,尋找所需要的數據,網頁中顯示評論共爲77674條評論,2385頁。
之後通過Chrome開發者工具尋找加載評論來源
最終發現加載評論鏈接爲:
https://api.bilibili.com/x/v2/reply?pn=1&type=1&oid=19390801
訪問鏈接後可發現所有評論是由json數據存儲在網頁中,鏈接中pn參數爲頁碼,共有77706條評論(因爲評論在一直增加),2385頁,每頁有20條評論(不包括樓層中的評論),本次主要爬取樓層評論,所以共有約2385*20=47700條評論。
三、數據獲取
本代碼爬取評論區8個數據,如下:
樓層 | 時間 | 用戶暱稱 | ID | 性別 | 等級 | 評論內容 | 點贊數 |
---|---|---|---|---|---|---|---|
- 對第一頁的數據進行獲取,得到json格式數據
ua = UserAgent(verify_ssl=False)  # skip SSL verification when downloading the user-agent list
headers = {'User-Agent': ua.random}  # randomized User-Agent header to look less like a bot
def get_page(url, headers):
    """Fetch *url* and return its decoded JSON payload.

    Returns None on a non-200 status code or on any requests-level
    failure, so callers must check the result before indexing into it.
    """
    try:
        # timeout keeps a pool worker from hanging forever on a stalled connection
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()  # 200 OK: body is the comment JSON
        return None
    except RequestException:
        # network / protocol error: report failure as None instead of raising
        return None
def main(url):
    """Tutorial step: download one comment page as JSON (result unused for now)."""
    page_json = get_page(url, headers)
- 對獲取到的數據進行分析篩選
def parse_page(html_json):
    """Pull the eight tracked fields out of every top-level reply on a page."""
    replies = html_json['data']['replies']
    # 'size' is the per-page reply count (normally 20)
    for idx in range(html_json['data']['page']['size']):
        reply = replies[idx]
        member = reply['member']
        info = {
            'floor': reply['floor'],                         # floor number of the comment
            'time': datetime.fromtimestamp(reply['ctime']),  # ctime is a Unix timestamp in seconds
            'nickname': member['uname'],                     # commenter nickname
            'id': reply['mid'],                              # commenter user ID
            'sex': member['sex'],                            # declared gender
            'level': member['level_info']['current_level'],  # account level
            'content': reply['content']['message'],          # comment text
            'like': reply['like'],                           # like count
        }
def main(url):
    """Download one comment page and parse it; skip pages that failed to download."""
    html_json = get_page(url, headers)
    # get_page returns None on failure; parsing None would raise TypeError
    if html_json is not None:
        parse_page(html_json)
- 將數據保存到csv文件中
def parse_page(html_json):
    """Extract the eight tracked fields from each top-level reply and persist them."""
    replies = html_json['data']['replies']
    # 'size' is the per-page reply count (normally 20)
    for idx in range(html_json['data']['page']['size']):
        reply = replies[idx]
        member = reply['member']
        info = {
            'floor': reply['floor'],                         # floor number of the comment
            'time': datetime.fromtimestamp(reply['ctime']),  # ctime is a Unix timestamp in seconds
            'nickname': member['uname'],                     # commenter nickname
            'id': reply['mid'],                              # commenter user ID
            'sex': member['sex'],                            # declared gender
            'level': member['level_info']['current_level'],  # account level
            'content': reply['content']['message'],          # comment text
            'like': reply['like'],                           # like count
        }
        write_file(info)  # write each row immediately so partial runs keep their data
def write_file(info):
    """Append one comment record as a CSV row.

    gb18030 encoding keeps Chinese text readable when the file is
    opened directly in Excel.
    NOTE(review): called concurrently from the thread pool with no lock,
    so rows from different pages may interleave — confirm acceptable.
    """
    with open('改革春風吹滿地.csv', 'a', encoding='gb18030', newline='') as f:
        csv.writer(f, dialect='excel').writerow(info.values())
    # the with-block closes the file; an explicit close() is redundant
- 使用多線程爬取所有頁面
if __name__ == '__main__':
    pool = pl(10)  # thread pool with 10 workers (original comment said 4 — the code uses 10)
    # URLs for every comment page; pn is the page number
    url = ['https://api.bilibili.com/x/v2/reply?&jsonp=jsonp&pn={}&type=1&oid=19390801&sort=0&_=1549711529949'.format(str(m)) for m in range(1,2408)]
    pool.map(main,url)  # fetch + parse every page across the pool
    pool.close()
    pool.join()
四、完整代碼
from fake_useragent import UserAgent
import requests
from requests.exceptions import RequestException
import csv
from datetime import datetime
from multiprocessing.dummy import Pool as pl
ua = UserAgent(verify_ssl=False)  # skip SSL verification when downloading the user-agent list
headers = {'User-Agent': ua.random}  # randomized User-Agent header to look less like a bot
def get_page(url, headers):
    """Fetch *url* and return its decoded JSON payload.

    Returns None on a non-200 status code or on any requests-level
    failure, so callers must check the result before indexing into it.
    """
    try:
        # timeout keeps a pool worker from hanging forever on a stalled connection
        response = requests.get(url, headers=headers, timeout=10)
        if response.status_code == 200:
            return response.json()  # 200 OK: body is the comment JSON
        return None
    except RequestException:
        # network / protocol error: report failure as None instead of raising
        return None
def parse_page(html_json):
    """Extract the eight tracked fields from each top-level reply and persist them."""
    replies = html_json['data']['replies']
    # 'size' is the per-page reply count (normally 20)
    for idx in range(html_json['data']['page']['size']):
        reply = replies[idx]
        member = reply['member']
        info = {
            'floor': reply['floor'],                         # floor number of the comment
            'time': datetime.fromtimestamp(reply['ctime']),  # ctime is a Unix timestamp in seconds
            'nickname': member['uname'],                     # commenter nickname
            'id': reply['mid'],                              # commenter user ID
            'sex': member['sex'],                            # declared gender
            'level': member['level_info']['current_level'],  # account level
            'content': reply['content']['message'],          # comment text
            'like': reply['like'],                           # like count
        }
        write_file(info)  # write each row immediately so partial runs keep their data
def write_file(info):
with open('改革春風吹滿地.csv','a',encoding='gb18030',newline='') as f:
writer = csv.writer(f,dialect='excel')
writer.writerow(info.values())#將獲取到的信息保存到csv文件中
f.close()
def main(url):
    """Download one comment page and parse it; skip pages that failed to download."""
    html_json = get_page(url, headers)
    # get_page returns None on failure; parsing None would raise TypeError
    if html_json is not None:
        parse_page(html_json)
if __name__ == '__main__':
    pool = pl(10)  # thread pool with 10 workers (original comment said 4 — the code uses 10)
    # URLs for every comment page; pn is the page number
    url = ['https://api.bilibili.com/x/v2/reply?&jsonp=jsonp&pn={}&type=1&oid=19390801&sort=0&_=1549711529949'.format(str(m)) for m in range(1,2408)]
    pool.map(main,url)  # fetch + parse every page across the pool
    pool.close()
    pool.join()
PS:其實應該設置一些請求延時或使用多個代理IP,否則會被B站封禁IP。我第一次爬取沒有問題,第二次爬到一部分鏈接時狀態碼就變成403了,不過還好只被封了十分鐘。