import os
import requests
from lxml import etree
class Spider(object):
    """Download the images listed on doutula.com photo-list pages.

    The images of each list page are written into their own folder,
    ``img/第N頁``, where ``N`` is the running page counter ``self.offset``.
    """

    # Characters replaced in image titles before they are used as file names
    # (path separators and characters rejected by common filesystems).
    _INVALID_FILENAME_CHARS = '\\/:*?"<>|\r\n\t'
    # Template used to build the URL of the list page after the current one.
    _PAGE_URL = 'https://www.doutula.com/photo/list/?page={}'
    # Seconds to wait on any single HTTP request before giving up.
    _TIMEOUT = 10

    def __init__(self):
        """Set the browser-like request headers and start counting pages at 1."""
        self.headers = {
            'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/75.0.3770.100 Safari/537.36'
        }
        self.offset = 1

    @classmethod
    def _safe_filename(cls, title):
        """Return ``title`` made usable as a single file-name component.

        Invalid characters become ``_``; surrounding spaces and dots are
        stripped. An empty result falls back to ``'untitled'``.
        """
        cleaned = ''.join(
            '_' if ch in cls._INVALID_FILENAME_CHARS else ch for ch in title
        ).strip(' .')
        return cleaned or 'untitled'

    def _save_image(self, pic_url, save_name):
        """Download ``pic_url`` into ``save_name``; return True on success.

        A failed download or write is reported and skipped so that one bad
        image does not abort the rest of the page.
        """
        try:
            reply = requests.get(pic_url, headers=self.headers,
                                 timeout=self._TIMEOUT)
            # Do not store an HTTP error page under an image file name.
            reply.raise_for_status()
            with open(save_name, 'wb') as f:
                f.write(reply.content)
        except (requests.RequestException, OSError) as err:
            print('skipped {}: {}'.format(save_name, err))
            return False
        return True

    def _crawl_page(self, url):
        """Save every image found on one list page.

        Returns False when the page yields no image at all, which the caller
        treats as the end of the listing.
        """
        print('正在爬取第{}頁.....................'.format(self.offset))
        html_text = requests.get(url, headers=self.headers,
                                 timeout=self._TIMEOUT).text
        if not html_text.strip():
            return False
        page = etree.HTML(html_text)
        if page is None:
            return False

        # Collect (image URL, title) pairs first so the folder is only
        # created for pages that really contain images.
        entries = []
        for node in page.xpath('//div[@class="page-content text-center"]//a'):
            pic_urls = node.xpath('.//img/@data-original')
            if not pic_urls:
                continue  # anchor without a lazy-loaded image: nothing to save
            pic_url = pic_urls[0]
            titles = node.xpath('.//p/text()')
            if titles:
                title = titles[0]
            else:
                # No caption: name the file after the image URL instead.
                title = os.path.splitext(os.path.basename(pic_url))[0]
            entries.append((pic_url, title))
        if not entries:
            return False

        folder = 'img/' + '第{}頁'.format(self.offset)
        # makedirs also creates the parent 'img' directory on the first run.
        os.makedirs(folder, exist_ok=True)
        for pic_url, title in entries:
            save_name = folder + '/' + self._safe_filename(title) + '.jpg'
            print(save_name)
            self._save_image(pic_url, save_name)
        return True

    def get_data(self, url, max_pages=None):
        """Crawl ``url`` and then every following list page.

        After each page ``self.offset`` is incremented and the next page URL
        is built from it. Crawling stops at the first page that contains no
        images, or after ``max_pages`` pages when that limit is given.

        Args:
            url: URL of the first list page to crawl.
            max_pages: Optional upper bound on the number of pages to crawl;
                ``None`` (the default) means no limit.

        Raises:
            requests.RequestException: If a list page itself cannot be
                fetched (individual image failures are only reported).
        """
        crawled = 0
        # A loop rather than self-recursion: a long crawl must not run into
        # the interpreter's recursion limit.
        while max_pages is None or crawled < max_pages:
            if not self._crawl_page(url):
                break
            crawled += 1
            self.offset += 1
            url = self._PAGE_URL.format(self.offset)
if __name__ == '__main__':
    # Script entry point: crawl the photo list starting from its first page.
    start_url = 'https://www.doutula.com/photo/list/?page=1'
    crawler = Spider()
    crawler.get_data(start_url)
表情包分頁存儲
發表評論
所有評論
還沒有人評論，想成為第一個評論的人麼？請在上方評論欄輸入並且點擊發布。