使用scrapy對百度圖片抓取
這篇水文是爲那些剛剛開始寫爬蟲的朋友們準備的,已經在爬蟲坑裏呆了很久的老鳥請繞過(自己分析去!你個伸手黨)。
廢話不多說(這就說的不少了),打開百度圖片
在那個框框裏面輸入你想要下載的圖片名稱,例如白雲,蒼(老師),狗 。然後,我就輸入了“狗”。就是這個吊樣子
然後,打開檢查(F12)勾選日誌,選擇XHR
然後就開始擼代碼了,創建一個scrapy項目,不會的同學自行百度。然後,爬蟲這樣寫的:`# -*- coding: utf-8 -*-
import re
import scrapy
import json
from spiderframe.items import ImgsItem
from urllib.parse import quote
class ImageBaiduSpider(scrapy.Spider):
    """Crawl Baidu image search (the ``acjson`` JSON endpoint) for one keyword.

    The keyword is passed as a spider argument, e.g.
    ``scrapy crawl image_baidu -a category=dog``. Every result page yields one
    ``ImgsItem`` holding the keyword and the ``middleURL`` of each entry on
    that page; the next page is then requested until the offset reaches the
    ``displayNum`` reported by the response.
    """

    name = 'image_baidu'

    # Results requested per page (``rn``) and the step between page offsets.
    page_size = 30
    # Offset (``pn``) of the first request, unchanged from the original code.
    # NOTE(review): offsets 0-29 are never requested -- confirm this is intended.
    first_offset = 30

    # ``latest=&copyright=`` had been corrupted to ``latest=©right=`` (the
    # ``&copy`` prefix was rendered as an HTML entity); it is restored here.
    search_url = (
        "https://image.baidu.com/search/acjson?tn=resultjson_com&ipn=rj"
        "&ct=201326592&is=&fp=result&queryWord={category}&cl=2&lm=-1"
        "&ie=utf-8&oe=utf-8&adpicid=&st=-1&z=&ic=&hd=&latest=&copyright="
        "&word={category}&s=&se=&tab=&width=&height=&face=&istype=&qc=&nc=1"
        "&fr=&expermode=&force=&pn={offset}&rn={size}&gsm=1e&1560505489300="
    )

    def __init__(self, category=None, *args, **kwargs):
        """Store the search keyword given via ``-a category=...``."""
        super().__init__(*args, **kwargs)
        self.category = category

    def _page_request(self, offset):
        """Build the request for the result page that starts at *offset*."""
        url = self.search_url.format(
            category=quote(self.category),
            offset=offset,
            size=self.page_size,
        )
        # The offset travels in ``meta`` so ``parse`` does not have to recover
        # it from the URL text.
        return scrapy.Request(
            url=url,
            callback=self.parse,
            dont_filter=True,
            meta={'offset': offset},
        )

    def start_requests(self):
        """Yield the first search request, or nothing if no keyword was given."""
        if not self.category:
            # quote(None) would raise a TypeError; fail with a clear message.
            self.logger.error(
                "No search keyword given; run with -a category=<keyword>")
            return
        yield self._page_request(self.first_offset)

    def parse(self, response):
        """Yield one ``ImgsItem`` for the page and, if needed, the next request."""
        try:
            resp = json.loads(response.text)
        except ValueError:
            self.logger.error("Response from %s is not valid JSON", response.url)
            return
        if not isinstance(resp, dict):
            self.logger.error("Unexpected JSON payload from %s", response.url)
            return

        # Entries without a ``middleURL`` (or that are not objects) are skipped.
        img_urls = [
            entry["middleURL"]
            for entry in resp.get("data") or []
            if isinstance(entry, dict) and entry.get("middleURL")
        ]
        self.logger.debug("Found %d image URLs on %s", len(img_urls), response.url)

        item = ImgsItem()
        item["category"] = self.category
        item["image_urls"] = img_urls
        yield item

        if not img_urls:
            # An empty page is treated as the end of the results, so the spider
            # does not keep requesting pages up to ``displayNum``.
            return

        offset = response.meta.get('offset')
        if offset is None:
            # Fallback for responses whose request was not built by this spider.
            match = re.search(r'[?&]pn=(\d+)', response.url)
            if match is None:
                return
            offset = int(match.group(1))

        try:
            total = int(resp.get("displayNum"))
        except (TypeError, ValueError):
            # Without a usable total there is no stop condition; stop paging.
            return

        if offset < total:
            yield self._page_request(offset + self.page_size)
`
items 是這樣定義的:
# -*- coding: utf-8 -*-
# Define here the models for your scraped items
#
# See documentation in:
# https://doc.scrapy.org/en/latest/topics/items.html
import scrapy
class SpiderframeItem(scrapy.Item):
    """Generic item with a URL and its content; not used for image crawling."""

    url = scrapy.Field()
    content = scrapy.Field()
class ImgsItem(scrapy.Item):
    """Item used for image crawling."""

    # Search keyword the images belong to.
    category = scrapy.Field()
    # URLs of the images to download (a list).
    image_urls = scrapy.Field()
    # Default result field of Scrapy's images pipeline; nothing in this
    # project fills or reads it directly.
    images = scrapy.Field()
pipelines是這樣寫的
class ImagePipeline(ImagesPipeline):
    """Images pipeline that files every download under a per-category folder.

    Images are stored as ``<IMAGES_STORE>/<category>/<name>`` instead of the
    parent class's ``<IMAGES_STORE>/full/<name>``.
    """

    def get_media_requests(self, item, info):
        """Return the download requests for *item*, each tagged with the item.

        The item is attached to every request so that ``file_path`` can still
        find the category when it is called without an ``item`` argument.
        """
        # Materialise first: if the parent ever returned a lazy iterable, the
        # tagging loop would exhaust it before it is returned.
        requests = list(super().get_media_requests(item, info))
        for request in requests:
            request.item = item
        return requests

    def file_path(self, request, response=None, info=None, *, item=None):
        """Return the storage path of an image, relative to ``IMAGES_STORE``.

        ``item`` is keyword-only and optional, so the method works both when
        the caller passes the item and when it does not; in the latter case
        the item attached by ``get_media_requests`` is used. If no category is
        available the parent's default path is returned unchanged.
        """
        default_path = super().file_path(request, response, info)

        source = item if item is not None else getattr(request, 'item', None)
        category = source.get('category') if source is not None else None
        if not category:
            return default_path

        image_name = default_path.replace("full/", "")
        # The path is relative on purpose: the storage backend prefixes
        # IMAGES_STORE and creates missing directories itself, so this method
        # needs neither ``settings`` nor ``os.makedirs``.
        return "{}/{}".format(category, image_name)
setting設置是這樣的:
# Image download settings.
# Images are stored in "spiderframe/files/image" under the parent of the
# directory that contains this settings file.
_project_root = os.path.dirname(os.path.dirname(__file__))
IMAGES_STORE = os.path.join(_project_root, 'spiderframe/files/image')

# Enabled item pipelines and their order values; only ImagePipeline is
# relevant for image downloads.
ITEM_PIPELINES = {
    'spiderframe.pipelines.SpiderframePipeline': 300,
    'spiderframe.pipelines.RedisPipeline': 350,
    'spiderframe.pipelines.MySQLPipeline': 400,
    'spiderframe.pipelines.ImagePipeline': 450,
}
讓爬蟲跑起來就完事了,不會讓爬蟲跑起來的自行百度。
兩個小坑:一個是爬蟲傳進去的那個參數關鍵詞(例如“狗”)要先做URL編碼(代碼裏用的是quote);另一個是鏈接要用https,不能用http。
這兩個小坑要注意,不然返回的圖片URL無法下載。