# This script is for educational use only; contact the author for removal in case of infringement.
# Collects the URLs of the books on the Douban Top 250 list.
import csv
import re

import lxml
import requests
from bs4 import BeautifulSoup
from requests.exceptions import RequestException
url_lt = []
def get_one_page(url):
    """Fetch *url* and return the response body as text.

    Returns None on a non-200 status code, a timeout, or any other
    requests-level failure (the caller treats None as "page unavailable").
    """
    try:
        headers = {
            # Fix: the HTTP header name is "User-Agent"; the original
            # "User_Agent" key is not a valid header and was ignored by
            # the server, so the spoofed UA was never actually sent.
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.142 Safari/537.36"
        }
        response = requests.get(url, headers=headers, timeout=5)
        if response.status_code == 200:
            return response.text
        return None
    except RequestException:
        return None
def get_book_url_list(html):
    """Parse one Top-250 listing page and append each book's URL to url_lt.

    :param html: page source from get_one_page, or None if the fetch failed
                 (None is tolerated and simply skipped).

    Fix: the original regex capture ``href=(.*?)onclick`` included the
    surrounding quote characters, and ``.strip()`` removes only whitespace,
    so every stored URL carried literal double quotes. Reading the ``href``
    attribute directly yields the clean URL and is robust to attribute
    ordering.
    """
    if html is None:
        # get_one_page returns None on failure; nothing to parse.
        return
    soup = BeautifulSoup(html, 'lxml')
    for info in soup.find_all(class_='pl2'):
        link = info.find('a')
        if link is not None and link.get('href'):
            url_lt.append(link['href'].strip())
def main(offset):
    """Scrape one listing page at pagination *offset* and report progress.

    Fetches the page, harvests its book URLs into the module-level
    url_lt list, then prints the running total collected so far.
    """
    page_url = f'https://book.douban.com/top250?start={offset}'
    page_html = get_one_page(page_url)
    get_book_url_list(page_html)
    print(len(url_lt))
def write_csv(file, url_list):
    """Append *url_list* to *file* as CSV rows of (rank, book_url).

    Ranks are 1-based positions within url_list. NOTE: the file is opened
    in append mode and a header row is written on every call, so calling
    this more than once per file produces duplicate headers — the script
    calls it exactly once.
    """
    with open(file, 'a', encoding='utf-8', newline='') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=["rank", "book_url"])
        writer.writeheader()
        # enumerate(start=1) replaces the range(len(...)) index loop.
        for rank, book_url in enumerate(url_list, start=1):
            writer.writerow({"rank": rank, "book_url": book_url})
if __name__ == '__main__':
    # Fix: Douban's Top-250 list shows 25 books per page, paginated via
    # start=0, 25, ..., 225. The original loop passed offsets 0-9, which
    # re-fetched overlapping slices of the first page ten times instead
    # of walking all ten pages.
    for i in range(10):
        main(i * 25)
    write_csv("douban_TOP250_data.csv", url_lt)