使用 Scrapy 的 ImagesPipeline 爬取圖片時,運行報錯如下:
Traceback (most recent call last):
File "/home/lcy/.local/lib/python2.7/site-packages/twisted/internet/defer.py", line 653, in _runCallbacks
current.result = callback(current.result, *args, **kw)
File "/home/lcy/.local/lib/python2.7/site-packages/scrapy/pipelines/media.py", line 62, in process_item
requests = arg_to_iter(self.get_media_requests(item, info))
File "/home/lcy/.local/lib/python2.7/site-packages/scrapy/pipelines/images.py", line 147, in get_media_requests
return [Request(x) for x in item.get(self.images_urls_field, [])]
File "/home/lcy/.local/lib/python2.7/site-packages/scrapy/http/request/__init__.py", line 25, in __init__
self._set_url(url)
File "/home/lcy/.local/lib/python2.7/site-packages/scrapy/http/request/__init__.py", line 57, in _set_url
raise ValueError('Missing scheme in request url: %s' % self._url)
ValueError: Missing scheme in request url: h
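查閱相關文檔後瞭解到:ImagesPipeline 讀取的圖片 URL 欄位(由 IMAGES_URLS_FIELD 指定,此處為 'img')必須是一個 list。從上面的 traceback 可以看到,get_media_requests 會逐個迭代該欄位中的元素,並爲每個元素建立一個 Request;如果傳入的是字符串,它就會被逐個字符地迭代,第一個字符 'h' 被當成了 URL,於是報出 "Missing scheme in request url: h"。而我寫的正是一個字符串,所以報錯。因此只需要把傳入的 URL 改成 list 格式就行了。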
源碼附上:
修改前:
# -*- coding: utf-8 -*-
import scrapy
from imgspider.items import QiubaiPicItem
import sys
# Python 2 idiom: reload sys, then make UTF-8 the interpreter-wide default
# string encoding.
reload(sys)
sys.setdefaultencoding( "utf-8" )


# "Before" version of the spider: the variant that triggers
# "ValueError: Missing scheme in request url: h" in the images pipeline.
class QiubaipicSpider(scrapy.Spider):
    name = "qiubaiPic"
    allowed_domains = ["qiushibaike.com"]
    start_urls = ['http://qiushibaike.com/']

    # Entry callback for start_urls: schedules the picture listing page and
    # hands its response to parse_detail.
    def parse(self, response):
        # Pagination is commented out; only page 3 is requested below.
        # page_value=response.xpath('//*[@id="content-left"]/ul/li[8]/a/span/text()').extract()[0]
        # for page in range(1,int(page_value)):
        # url='http://www.qiushibaike.com/pic/page/'+str(page)
        # yield scrapy.Request(url,callback=self.parse_detail)
        url='http://www.qiushibaike.com/pic/page/3'
        yield scrapy.Request(url,callback=self.parse_detail)

    # Builds one QiubaiPicItem per matched article <div> and returns them
    # as a list.
    def parse_detail(self,response):
        item=[]
        divs=response.xpath('//*[@id="content-left"]/div[@class="article block untagged mb15"]')
        for div in divs:
            QiubaiPic=QiubaiPicItem()
            # [0] raises IndexError when the div has no matching <img>.
            src=div.xpath('div[@class="thumb"]/a/img/@src').extract()[0]
            # Drops the first two characters and prefixes 'http://' --
            # presumably src is protocol-relative ('//host/path'); verify.
            img_path='http://'+src[2:]
            # BUG (kept on purpose in this "before" listing): the field is a
            # plain str. The pipeline builds one Request per element of this
            # field, and iterating a str yields single characters, so the
            # first "URL" it sees is 'h'.
            QiubaiPic['img']=img_path
            item.append(QiubaiPic)
        return item
修改後:
# -*- coding: utf-8 -*-
import scrapy
from imgspider.items import QiubaiPicItem
import sys

# Python 2 idiom: reload sys, then make UTF-8 the interpreter-wide default
# string encoding. Kept unchanged so module import behaves as before.
reload(sys)
sys.setdefaultencoding( "utf-8" )


class QiubaipicSpider(scrapy.Spider):
    """Collect thumbnail image URLs from a qiushibaike.com picture page.

    Each produced item stores its URLs as a *list* under ``img``: the images
    pipeline iterates that field and creates one request per element, so a
    bare string would be consumed character by character.
    """

    name = "qiubaiPic"
    allowed_domains = ["qiushibaike.com"]
    start_urls = ['http://qiushibaike.com/']

    def parse(self, response):
        """Schedule the picture listing page for ``parse_detail``."""
        # Pagination is commented out; only page 3 is requested below.
        # page_value=response.xpath('//*[@id="content-left"]/ul/li[8]/a/span/text()').extract()[0]
        # for page in range(1,int(page_value)):
        # url='http://www.qiushibaike.com/pic/page/'+str(page)
        # yield scrapy.Request(url,callback=self.parse_detail)
        url = 'http://www.qiushibaike.com/pic/page/3'
        yield scrapy.Request(url, callback=self.parse_detail)

    def parse_detail(self, response):
        """Build one ``QiubaiPicItem`` per article thumbnail on the page.

        Returns:
            A list of items. Every item's ``img`` field is its own
            single-element list holding the absolute image URL.
        """
        items = []
        divs = response.xpath('//*[@id="content-left"]/div[@class="article block untagged mb15"]')
        for div in divs:
            srcs = div.xpath('div[@class="thumb"]/a/img/@src').extract()
            if not srcs:
                # Article block without a thumbnail: skip it instead of
                # failing the whole page with an IndexError on [0].
                continue
            pic = QiubaiPicItem()
            # A fresh list per item. Sharing one growing list between all
            # items would make every item carry every URL on the page.
            # urljoin resolves protocol-relative ('//host/path'), relative
            # and already-absolute src values against the page URL.
            pic['img'] = [response.urljoin(srcs[0])]
            items.append(pic)
        return items
# -*- coding: utf-8 -*-
# Scrapy settings for the 'imgspider' project.
import random

BOT_NAME = 'imgspider'
SPIDER_MODULES = ['imgspider.spiders']
NEWSPIDER_MODULE = 'imgspider.spiders'

# Browser User-Agent strings; one is required for the requests.
# Every entry must be followed by a comma: adjacent string literals are
# silently concatenated into a single (invalid) header value otherwise.
USER_AGENT_LIST = [
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SE 2.X MetaSr 1.0; SE 2.X MetaSr 1.0; .NET CLR 2.0.50727; SE 2.X MetaSr 1.0)",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; 360SE)",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/22.0.1207.1 Safari/537.1",
    "Mozilla/5.0 (X11; CrOS i686 2268.111.0) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.57 Safari/536.11",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1092.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.6 (KHTML, like Gecko) Chrome/20.0.1090.0 Safari/536.6",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/19.77.34.5 Safari/537.1",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.9 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.0) AppleWebKit/536.5 (KHTML, like Gecko) Chrome/19.0.1084.36 Safari/536.5",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_8_0) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1063.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1062.0 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.1) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.1 Safari/536.3",
    "Mozilla/5.0 (Windows NT 6.2) AppleWebKit/536.3 (KHTML, like Gecko) Chrome/19.0.1061.0 Safari/536.3",
    "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
    "Mozilla/5.0 (Windows NT 6.2; WOW64) AppleWebKit/535.24 (KHTML, like Gecko) Chrome/19.0.1055.1 Safari/535.24",
]

# Chosen once, when this module is imported -- not once per request.
# The list above is a non-empty literal of non-empty strings, so the pick is
# always usable and no fallback branch is needed.
ua = random.choice(USER_AGENT_LIST)
USER_AGENT = ua
# Parenthesised form prints the same text under Python 2 and Python 3.
print(ua)

# Whether to obey robots.txt.
ROBOTSTXT_OBEY = False
# Maximum number of concurrent requests (a request limit, not a thread count).
CONCURRENT_REQUESTS = 32
# Download delay, in seconds.
DOWNLOAD_DELAY = 3
# Cookie handling switch; disabling is recommended.
COOKIES_ENABLED = False

# See http://scrapy.readthedocs.org/en/latest/topics/item-pipeline.html
ITEM_PIPELINES = {'scrapy.pipelines.images.ImagesPipeline': 1}
# Item field that holds the list of image URLs to download.
IMAGES_URLS_FIELD = 'img'
# Directory where downloaded images are stored.
IMAGES_STORE = r'/home/lcy/pics'
LOG_FILE = "scrapy.log"