豆瓣電影信息採集
1.思路
先實現單部電影信息的爬取,再爬取排行榜的 URL 列表,最後遍歷這個列表、對每個 URL 調用單部電影的爬取程序,就大功告成了。
2. 爬取單部電影信息的過程
技術:Python 3.7.2 + requests + BeautifulSoup4(使用 lxml 解析器)
def OneMovieInfo(url, timeout=10):
    """Fetch one Douban movie page, print its details and return them.

    The page is downloaded with browser-like request headers, parsed with
    BeautifulSoup (lxml parser) and four pieces of data are extracted via
    CSS selectors:

    * title    -- text of the first ``#content h1`` node, newlines removed
    * score    -- text of the first ``.rating_num`` node
    * info     -- text of the first ``#info`` node, outer newlines stripped
    * comments -- text of every ``.short`` node (the "pinglun" list)

    Args:
        url: Address of the movie detail page.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without it a stalled
            connection would block forever.

    Returns:
        A dict with the keys ``title``, ``score``, ``info`` and
        ``comments``. A field whose selector matches nothing is returned as
        an empty string (or an empty list for ``comments``) instead of
        raising ``IndexError``.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
    """
    # NOTE(review): the Cookie below looks like it was copied from a browser
    # session and will presumably go stale -- confirm before relying on it.
    # NOTE(review): 'br' is advertised in Accept-Encoding; requests can only
    # decode brotli when a brotli package is installed -- verify.
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8,application/signed-exchange;v=b3',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh,zh-TW;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'll="118099"; bid=oLMMQ8yWn4I; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1553687581%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DZo7zPOnH0SlfVKADj2cjWHUbGmPZadbaxVA16Uo7gohz5M6uUf2px5J6LIBc_CWc%26wd%3D%26eqid%3Df9f78bc3000b1696000000035c9b641c%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.1773706481.1553687581.1553687581.1553687581.1; __utmb=30149280.0.10.1553687581; __utmc=30149280; __utmz=30149280.1553687581.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.603638443.1553687581.1553687581.1553687581.1; __utmb=223695111.0.10.1553687581; __utmc=223695111; __utmz=223695111.1553687581.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __yadk_uid=rVLAl8dxM3Zxd3wGfI2BGiQvOthM6xuP; _vwo_uuid_v2=DF4DCE411E806F129A7A313A65BC63B65|071d6cd2c6aa78a4896e8f2d9377ee57; _pk_id.100001.4cf6=ab773b0cb7ee49c3.1553687581.1.1553687590.1553687581.',
        'Host': 'movie.douban.com',
        'Referer': 'https://movie.douban.com/',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    # Fail loudly on an error status instead of parsing an error page and
    # tripping over a missing element further down.
    response.raise_for_status()

    soup = BeautifulSoup(response.text, 'lxml')
    title = _first_text(soup, '#content h1').replace('\n', '')
    info = _first_text(soup, '#info').lstrip('\n').rstrip('\n')
    comments = [node.text for node in soup.select('.short')]
    score = _first_text(soup, '.rating_num')

    print(title)
    print(score)
    print(info)
    print(comments)

    return {
        'title': title,
        'score': score,
        'info': info,
        'comments': comments,
    }


def _first_text(soup, selector):
    """Return the text of the first node matching *selector*, or '' if none."""
    matches = soup.select(selector)
    return matches[0].text if matches else ''
3.爬取排行榜
這時候要注意:排行榜數據不再是單頁面的靜態 HTML,而是通過 XHR 異步請求加載進來的。用谷歌開發者工具分析後,確定了真正返回 JSON 的接口路徑,並據此重新構造代碼。
def All_Url_list(url, timeout=10):
    """Return the movie page URLs listed by a Douban chart JSON endpoint.

    The endpoint is requested with XHR-style headers
    (``X-Requested-With: XMLHttpRequest``), the response body is decoded as
    JSON, and the ``'url'`` field of every entry is collected.

    Args:
        url: Address of the JSON endpoint that serves the ranking data.
        timeout: Seconds to wait for the server before giving up. requests
            applies no timeout by default, so without it a stalled
            connection would block forever.

    Returns:
        A list with the ``'url'`` value of each entry, in response order.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: On connection problems or timeout.
        ValueError: If the response body is not valid JSON.
        KeyError: If an entry has no ``'url'`` field.
    """
    # NOTE(review): the Cookie below looks like it was copied from a browser
    # session and will presumably go stale -- confirm before relying on it.
    headers = {
        'Accept': '*/*',
        'Accept-Encoding': 'gzip, deflate, br',
        'Accept-Language': 'zh,zh-TW;q=0.9,en-US;q=0.8,en;q=0.7,zh-CN;q=0.6',
        'Connection': 'keep-alive',
        'Cookie': 'll="118099"; bid=oLMMQ8yWn4I; ap_v=0,6.0; _pk_ref.100001.4cf6=%5B%22%22%2C%22%22%2C1553687581%2C%22https%3A%2F%2Fwww.baidu.com%2Flink%3Furl%3DZo7zPOnH0SlfVKADj2cjWHUbGmPZadbaxVA16Uo7gohz5M6uUf2px5J6LIBc_CWc%26wd%3D%26eqid%3Df9f78bc3000b1696000000035c9b641c%22%5D; _pk_ses.100001.4cf6=*; __utma=30149280.1773706481.1553687581.1553687581.1553687581.1; __utmb=30149280.0.10.1553687581; __utmc=30149280; __utmz=30149280.1553687581.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __utma=223695111.603638443.1553687581.1553687581.1553687581.1; __utmb=223695111.0.10.1553687581; __utmc=223695111; __utmz=223695111.1553687581.1.1.utmcsr=baidu|utmccn=(organic)|utmcmd=organic; __yadk_uid=rVLAl8dxM3Zxd3wGfI2BGiQvOthM6xuP; _vwo_uuid_v2=DF4DCE411E806F129A7A313A65BC63B65|071d6cd2c6aa78a4896e8f2d9377ee57; _pk_id.100001.4cf6=ab773b0cb7ee49c3.1553687581.1.1553689627.1553687581.',
        'Host': 'movie.douban.com',
        'Referer': 'https://movie.douban.com/typerank?type_name=%E5%96%9C%E5%89%A7&type=24&interval_id=100:90&action=',
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/73.0.3683.86 Safari/537.36',
        'X-Requested-With': 'XMLHttpRequest',
    }

    response = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP errors directly rather than as a confusing JSON decode
    # failure on an error page.
    response.raise_for_status()

    entries = response.json()
    return [entry['url'] for entry in entries]
4. 最後在 main 入口裏套個循環就完事了
if __name__ == '__main__':
    # Script entry point: ask the chart endpoint for its movie page URLs
    # (the query string requests entries starting at 0, limit 20), then
    # scrape and print each movie page in turn.
    chart_url = 'https://movie.douban.com/j/chart/top_list?type=24&interval_id=100%3A90&action=&start=0&limit=20'
    for movie_url in All_Url_list(chart_url):
        OneMovieInfo(movie_url)
5.程序效果