import requests
import re
import json
import csv


class BookSpider(object):
    def __init__(self):
        kw = "博士"  # search keyword
        self.base_url = ("https://search.bilibili.com/all?keyword=" + kw
                         + "&from_source=nav_search_new&page=")
        self.headers = {"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/76.0.3809.132 Safari/537.36"}
        self.data_list = []
    # 1. Build all the URLs (result pages are numbered from 1)
    def get_url_list(self):
        url_list = []
        for i in range(1, 51):
            url = self.base_url + str(i)
            url_list.append(url)
        return url_list
    # 2. Send the request
    def send_request(self, url):
        data = requests.get(url, headers=self.headers).content.decode()
        return data
    # 3. Parse the data and save it (despite the name, this method uses
    #    regular expressions rather than XPath)
    def parse_xpath_data(self, data):
        try:
            up_links = re.findall(r'<a href="//space\.bilibili\.com/.*?" target="_blank" class="up-name">', data)
            up_names = re.findall(r'<a href="//space\.bilibili\.com/.*?" target="_blank" class="up-name">.*?</a>', data)
            video_titles = re.findall(r'<a title=".*?"', data)
            video_links = re.findall(r'href="//www\.bilibili\.com/video/.*?"', data)
            up_dates = re.findall(r'<i class="icon-date"></i>.*?</span>', data, re.S)
            watch_nums = re.findall(r'<i class="icon-playtime"></i>.*?</span>', data, re.S)
            subtitle_nums = re.findall(r'<i class="icon-subtitle"></i>.*?</span>', data, re.S)
            for i in range(len(video_titles)):
                up_id = re.split(r'com/|\?from=search', up_links[i])[1]
                up_name = re.split(r'class="up-name">|</a>', up_names[i])[1]
                video_title = video_titles[i].split('"')[1]
                video_av = re.split(r'video/|\?from=search', video_links[i])[1]
                up_date = re.split(r'\n', up_dates[i])[1].strip()
                watch_num = re.split(r'\n', watch_nums[i])[1].strip()
                subtitle_num = re.split(r'\n', subtitle_nums[i])[1].strip()
                # collect the record so save_data() has something to dump
                self.data_list.append({
                    "up_id": up_id, "up_name": up_name,
                    "video_title": video_title, "video_av": video_av,
                    "up_date": up_date, "watch_num": watch_num,
                    "subtitle_num": subtitle_num,
                })
                # also append the row directly to a CSV as we go
                with open("06.csv", "a", encoding='utf-8-sig') as f:
                    f.write(up_id + "," + up_name + "," + video_title + ","
                            + video_av + "," + up_date + "," + watch_num + ","
                            + subtitle_num + "\n")
        except Exception as e:
            print("exception while parsing:", e)
    def save_data(self):
        # guard against an empty crawl, otherwise data_list[0] raises IndexError
        if not self.data_list:
            print("no data collected, nothing to save")
            return
        # 1. Dump the collected records to JSON
        with open("04book.json", "w", encoding='utf-8') as fd:
            json.dump(self.data_list, fd, ensure_ascii=False)
        json_fd = open("04book.json", "r", encoding='utf-8')
        csv_fd = open("04.csv", "w", encoding='utf-8', newline='')
        # 2. Extract the header row and the table body
        # (parse the JSON string back into a list of dicts)
        data_list = json.load(json_fd)
        sheet_title = data_list[0].keys()
        sheet_data = []
        for data in data_list:
            sheet_data.append(data.values())
        # 3. CSV writer
        writer = csv.writer(csv_fd)
        # 4. Write the header row
        writer.writerow(sheet_title)
        # 5. Write the body rows
        writer.writerows(sheet_data)
        # 6. Close both files
        json_fd.close()
        csv_fd.close()
    # Orchestrate the whole crawl
    def start(self):
        url_list = self.get_url_list()
        for url in url_list:
            data = self.send_request(url)
            self.parse_xpath_data(data)
        self.save_data()


BookSpider().start()
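
These patterns are brittle: any change to the search page's markup silently breaks them. A quick offline smoke test helps. The fragment below is hand-written and only approximates the 2019-era search-result HTML, so treat it as an illustration of how the findall/split pairs behave, not as the live page structure.

import re

# Hand-written fragment (an assumption, not the real page) used to check
# the patterns without hitting the site.
sample = (
    '<a title="博士生的一天" href="//www.bilibili.com/video/av12345?from=search"></a>'
    '<a href="//space.bilibili.com/6789?from=search" target="_blank" class="up-name">某UP主</a>'
    '<span><i class="icon-date"></i>\n2019-08-01\n</span>'
)

up_link = re.findall(r'<a href="//space\.bilibili\.com/.*?" target="_blank" class="up-name">', sample)[0]
print(re.split(r'com/|\?from=search', up_link)[1])   # 6789

title = re.findall(r'<a title=".*?"', sample)[0]
print(title.split('"')[1])                           # 博士生的一天

date = re.findall(r'<i class="icon-date"></i>.*?</span>', sample, re.S)[0]
print(re.split(r'\n', date)[1].strip())              # 2019-08-01

If any of these lookups raises an IndexError, the corresponding pattern no longer matches and needs updating.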
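
The method name parse_xpath_data suggests the extraction was once written with XPath. For comparison, here is a minimal XPath sketch, assuming lxml is installed; the fragment is again hand-written, and the class and attribute names mirror the regexes above rather than the live page (which may have changed since this post was written).

from lxml import etree

# Same hand-written fragment assumption as the regex demo above.
fragment = (
    '<li><a title="博士生的一天" href="//www.bilibili.com/video/av12345"></a>'
    '<a href="//space.bilibili.com/6789" class="up-name">某UP主</a></li>'
)
tree = etree.HTML(fragment)
print(tree.xpath('//a[@class="up-name"]/text()'))    # ['某UP主']
print(tree.xpath('//a[@title]/@title'))              # ['博士生的一天']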