一:爬取豆瓣電影top250地址
from bs4 import BeautifulSoup
from urllib.request import urlopen
import pandas as pd
import numpy as np
from pandas import DataFrame,Series
import re
def split(str,regular):
    """Split *str* on every match of the regular expression *regular*."""
    return re.compile(regular).split(str)
def trans_list(main_list,sub_list):
    """Flatten *sub_list* into *main_list* in place.

    *sub_list* must be an element of *main_list*; it is replaced by its
    own items at the same position, and the (mutated) *main_list* is
    returned.

    Fix: the original reversed *sub_list* in place (a surprising side
    effect on the caller's data) and performed one O(n) insert per
    element plus a final pop; a single slice assignment splices the
    items in one step with no argument mutation.
    """
    index = main_list.index(sub_list)
    main_list[index:index + 1] = sub_list
    return main_list
def extract_info(li_tag):
    """Convert one <li> movie entry (a bs4 tag) into a flat list of fields.

    Resulting layout after cleanup: [rank, MovieName, EnglishName,
    OtherName, Director, Actors, Year, Country, Genre, rating,
    rating-count, quote].  Missing name variants become np.NaN; a
    missing actors field becomes the string 'NaN'.
    """
    # .stripped_strings yields every text fragment with whitespace stripped
    info=[]
    for string in li_tag.stripped_strings:
        info.append(string)
    # Example of the raw strings collected (note the \xa0 separators):
    # info=['1', '肖申克的救贖', '/\xa0The Shawshank Redemption', '/\xa0月黑高飛(港) / 刺激1995(臺)',
    # '[可播放]', '導演: 弗蘭克·德拉邦特 Frank Darabont\xa0\xa0\xa0主演: 蒂姆·羅賓斯 Tim Robbins /...',
    # '1994\xa0/\xa0美國\xa0/\xa0犯罪 劇情', '9.6', '693081人評價', '希望讓人自由。']
    if '[可播放]' in info:
        index=info.index('[可播放]')
        info.pop(index) # drop the "playable" badge so the later indexes line up
    class_hd=li_tag.find('div',{'class':'hd'})
    # Two title <span>s instead of three means one alternate name is missing.
    if len(class_hd.a.find_all('span'))==2:
        if ' / ' in info[2]:
            # info[2] looks like an "other names" list, so the English
            # name is the missing one — put NaN in its slot.
            info.insert(2,np.NaN)
            info[3]=info[3][2:]
        else:
            info[2]=info[2][2:]
            info.insert(3,np.NaN) # the other-name slot is the missing one
    else:
        info[2]=info[2][2:] # strip leading '/\xa0' (\xa0 is U+00A0, NBSP — one char)
        info[3]=info[3][2:] # same prefix strip for the other alternate name
    # Split "導演: ...\xa0\xa0\xa0主演: ..." into [label, director, label, actors]
    Dir_and_Act=split(info[4],r':|\xa0\xa0\xa0')
    if len(Dir_and_Act)<4:
        Dir_and_Act.append('NaN') # no actors listed — pad with a placeholder
    Yea_Cou_Gen=split(info[5],r'\xa0/\xa0') # -> [year, country, genre]
    info[4]=Dir_and_Act
    info[5]=Yea_Cou_Gen
    info=trans_list(info,Dir_and_Act) # flatten both sub-lists into info
    info=trans_list(info,Yea_Cou_Gen)
    info.pop(4) # drop the '導演' (director) label
    info.pop(5) # drop the '主演' (actors) label
    return info # one movie row as a flat list
def collecting_data(url,database):
    """Download one Top-250 listing page and append one row per movie.

    Mutates *database* (a list of row-lists) in place and returns it,
    so the result looks like [[...], [...], ...].
    """
    page = BeautifulSoup(urlopen(url), 'lxml')
    listing = page.find_all('ol', {'class': 'grid_view'})[0]
    for entry in listing.find_all('li'):
        database.append(extract_info(entry))
    return database
def collect_all(url):
    """Scrape one listing page and return its movies as a DataFrame.

    Each movie becomes one row (DataFrame(rows) lays lists out row-wise).
    """
    rows = collecting_data(url, [])
    return pd.DataFrame(rows)
# main
# Listing pages look like: https://movie.douban.com/top250?start=<offset>&filter=
page=[]
for sequence in range(0,250,25):  # 10 pages, 25 movies per page
    url=r'https://movie.douban.com/top250?start=%d&filter=' %sequence
    page.append(collect_all(url))
# One concat over all page frames instead of re-concatenating inside a
# loop (the original rebuilt the whole frame once per page).
GeneralData=pd.concat(page,ignore_index=True)
# Column 0 is the ranking number scraped from the page — drop it.
GeneralData=GeneralData.drop(0,axis=1)
# Fixed column-name typo: 'Grenre' -> 'Genre'.
column=['MovieName','EnglishName','OtherName','Director',\
'Actors','Year','Country','Genre','Rating10','RatingNum',\
'Description']
GeneralData.columns=column
# Write with an explicit encoding; without it the saved file may not
# read back correctly on some platforms.
GeneralData.to_csv('MovieTop250.csv',encoding='utf-8')
GeneralData.to_csv('Movie.csv')
print("成功保存數據")
二:正則抓取
import requests
import re
from requests.exceptions import RequestException
def gethtml(url, timeout=10):
    """Fetch *url* and return the page HTML, or None on any failure.

    timeout: seconds before the request is aborted.  The original call
    had no timeout, so a stalled server could hang the script forever;
    the new parameter has a default, so existing callers are unchanged.
    A leftover debug print of the response object was also removed.
    """
    try:
        response = requests.get(url, timeout=timeout)
        # Only a 200 status counts as success.
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        # Covers connection errors, timeouts, invalid URLs, ...
        return None
def parse_html(html):
    """Yield one {'name', 'other'} dict per movie entry found in *html*.

    A generator: each tuple captured by the regex becomes a dict, with
    the surrounding " ' / " decoration stripped from the alternate name.
    """
    regex = re.compile('<a.*?>.*?title">(.*?)</span>.*?other">(.*?)</span>.*?</div>', re.S)
    matches = regex.findall(html)
    print(matches)
    print("-------------")
    for name, other in matches:
        yield {
            "name": name,
            "other": other.strip(" ' / ")
        }
def main():
    """Fetch the first Top-250 page and print each parsed movie entry."""
    url = "https://movie.douban.com/top250"
    html = gethtml(url)
    # gethtml returns None on network failure or a non-200 status; the
    # original passed that None straight to the regex and crashed with
    # a TypeError — skip parsing instead.
    if html is not None:
        for item in parse_html(html):
            print(item)
if __name__ == '__main__':
    main()
三:requests+xpath
通過檢查元素,copy XPath獲得xpath
# -*-coding:utf-8 -*-
import requests
from lxml import etree

# Single-movie page scrape via XPath expressions copied from browser
# devtools ("Copy XPath"); they are position-based and will break if
# Douban changes the page layout.
url = 'https://movie.douban.com/subject/1292052/'
data = requests.get(url).text
s = etree.HTML(data)
film = s.xpath('//*[@id="content"]/h1/span[1]/text()')
director = s.xpath('//*[@id="info"]/span[1]/span[2]/a/text()')
time = s.xpath('//*[@id="info"]/span[10]/text()')
runtime = s.xpath('//*[@id="info"]/span[13]/text()')
print("電影名稱:", film)
print("導演:", director)
print("上映時間:", time)
print("片長:", runtime)  # fixed: the label had a doubled colon ("片長::")
四:爬取豆瓣TOP250的圖書信息
# -*-coding:utf-8 -*-
from lxml import etree
import requests
import time

# Walk all 10 pages of the Douban book Top 250 (25 books per page) and
# print one line per book.
for page in range(10):
    page_url = 'https://book.douban.com/top250?start={}'.format(page * 25)
    html = requests.get(page_url).text
    tree = etree.HTML(html)
    tables = tree.xpath('//*[@id="content"]/div/div[1]/div/table')
    time.sleep(3)  # pause between pages to avoid hammering the server
    for table in tables:
        title = table.xpath("./tr/td[2]/div[1]/a/@title")[0]
        href = table.xpath("./tr/td[2]/div[1]/a/@href")[0]
        score = table.xpath("./tr/td[2]/div[2]/span[2]/text()")[0]
        num = table.xpath("./tr/td[2]/div[2]/span[3]/text()")[0].strip("(").strip().strip(")").strip()
        scrible = table.xpath("./tr/td[2]/p[2]/span/text()")
        if scrible:  # some books have no one-line quote
            print("書名:{},網頁:{},評分:{},評價人數:{},評價:{}\n".format(title, href, score, num, scrible[0]))
        else:
            print("{},{},{},{}\n".format(title, href, score, num))
五:爬取租房信息
1、單頁
# -*-coding:utf-8 -*-
from lxml import etree
import requests
import time

# Scrape one page of rental listings and print each one.
url = "http://sz.xiaozhu.com/"
data = requests.get(url).text
s = etree.HTML(data)
file = s.xpath('//*[@id="page_list"]/ul/li')
time.sleep(1)
for name in file:
    title = name.xpath('./div[2]/div/a/span/text()')[0]
    price = name.xpath('./div[2]/span[1]/i/text()')[0]
    scrible = name.xpath('./div[2]/div/em /text()')[0].strip()
    pic = name.xpath('./a/img/@lazy_src')[0]
    # BUG FIX: the builtin `print` function was passed as the price
    # argument instead of the scraped `price` value.
    print("標題:{},價格:{},描述:{},圖片:{}\n".format(title, price, scrible, pic))
2、多頁,儲存數據到本地
# -*-coding:utf-8 -*-
from lxml import etree
import requests
import time

# Scrape pages 1-9 of rental listings, printing each listing and also
# writing it to a local text file.
# Raw string for the path: '\P' in a normal literal is an invalid
# escape sequence (SyntaxWarning on modern Python); the raw string has
# the exact same runtime value.
with open(r'D:\PycharmProjects/test.txt', 'w', encoding='utf-8') as f:
    for a in range(1, 10):
        url = 'http://sz.xiaozhu.com/search-duanzufang-p{}-0/'.format(a)
        data = requests.get(url).text
        s = etree.HTML(data)
        file = s.xpath('//*[@id="page_list"]/ul/li')
        time.sleep(1)
        for name in file:
            title = name.xpath('./div[2]/div/a/span/text()')[0]
            price = name.xpath('./div[2]/span[1]/i/text()')[0]
            scrible = name.xpath('./div[2]/div/em /text()')[0].strip()
            pic = name.xpath('./a/img/@lazy_src')[0]
            # BUG FIX: `print` (the builtin) was passed instead of `price`
            # in the console output (the f.write line was already correct).
            print("標題:{},價格:{},描述:{},圖片:{}\n".format(title, price, scrible, pic))
            f.write("標題:{},價格:{},描述:{},圖片:{}\n".format(title, price, scrible, pic))
如果儲存格式是CSV的話會出現亂碼,首先要用記事本打開,然後另存爲 – 選擇編碼爲“ANSI”,再打開。
持續更新中!!!