基礎爬蟲參考:http://www.cnblogs.com/xin-xin/p/4297852.html
建議閱讀這個,寫的很清晰易懂
說明文檔:
直接運行,會將糗事百科第1頁到第20頁的笑話內容爬下來;
例如:
糗事百科有很多笑話,這裏我沒有處理;
# -*- coding:utf-8 -*-
from scrapy import Selector
import urllib2
import sys
from time import sleep
# NOTE: Python 2-only hack that switches the interpreter-wide default string
# encoding to UTF-8. Presumably it is here so the extracted text can be
# written to the output file without an explicit .encode() -- confirm before
# removing.
reload(sys)
sys.setdefaultencoding('utf-8')

# Listing pages are addressed as BASE_URL + <page number>.
BASE_URL = 'http://www.qiushibaike.com/8hr/page/'
# Browser-like User-Agent sent with every request.
HEADERS = {'User-Agent': 'Mozilla/4.0 (compatible; MSIE 5.5; Windows NT)'}
# File that receives the scraped text; opened in append mode for each page.
OUTPUT_PATH = r'C:\Users\Wang Zuo\Desktop\test.txt'
# Inclusive range of pages to download.
FIRST_PAGE = 1
LAST_PAGE = 20
# Pause between two consecutive page requests.
DELAY_SECONDS = 0.5

# Download every page in [FIRST_PAGE, LAST_PAGE], extract the text nodes of
# all <div class="content"> elements and append them to OUTPUT_PATH.
for page in range(FIRST_PAGE, LAST_PAGE + 1):
    url = BASE_URL + str(page)
    try:
        request = urllib2.Request(url, headers=HEADERS)
        response = urllib2.urlopen(request)
    except urllib2.URLError as e:
        # HTTPError carries a status code; a plain URLError only a reason.
        if hasattr(e, 'code'):
            print(e.code)
        if hasattr(e, 'reason'):
            print(e.reason)
        # No response is available for this page: skip it instead of falling
        # through and using an undefined (or stale) response object.
        sleep(DELAY_SECONDS)
        continue

    # Always release the connection, even if reading the body fails.
    try:
        html = response.read()
    finally:
        response.close()

    # Selector usage reference:
    # http://scrapy-chs.readthedocs.io/zh_CN/latest/topics/selectors.html
    sel = Selector(text=html, type="html")
    # Use XPath to select the text inside every <div class="content">.
    texts = sel.xpath('//div[@class = "content"]/text()').extract()
    with open(OUTPUT_PATH, 'a') as f:
        for text in texts:
            f.write(text)

    sleep(DELAY_SECONDS)