博客地址:http://blog.whattoc.com/2013/09/19/nodejs_api_http_2/
詳解Node.js API系列 Http模塊(2) CNodejs爬蟲實現
簡單爬蟲設計
var http = require('http');

// Download the cnodejs.org front page and print it once the whole body has arrived.
http
  .get("http://cnodejs.org/", function onResponse(res) {
    var pieces = []; // Buffer chunks in arrival order
    var total = 0;   // combined byte length of all chunks

    // The body is delivered as a series of chunks; collect each one.
    res.on('data', function onChunk(piece) {
      pieces.push(piece);
      total += piece.length;
    });

    // No more chunks: join them into one Buffer and print it as a string.
    res.on('end', function onEnd() {
      var body = Buffer.concat(pieces, total);
      console.log(body.toString());
    });
  })
  .on('error', function onError(err) {
    // Request-level failure reported by the client request object.
    console.log("Got error: " + err.message);
  });
http.get(options, callback)
- http://cnodejs.org/ 爬行目標地址。
- res.on('data') 監聽data事件,每收到一段數據觸發一次。
- res.on('end') 監聽end事件,表示數據獲取完畢。
- Buffer.concat(chunks, size); 連接多次data的buff。
- data.toString() 將data二進制數據轉換成utf-8的字符串,如果頁面是GBK的時候,請使用iconv模塊進行轉換,原生Node.js不支持GBK。
設計目標
- 制定爬蟲的url規則
- 分析頁面信息
- 清洗沒用數據
- 存儲有用數據
制定爬蟲的url規則
觀察 http://cnodejs.org/ 的url規則:http://cnodejs.org/?page=頁數。根據規則,不難想出處理的思路:首先,獲取單個page頁裏面每個帖子(topic)的路徑,再通過路徑爬取帖子的頁面內容。採用迭代器模式是最方便的:next()做page的頁面索引,hasNext()判斷page頁是否超出了有效範圍,超出範圍則停止索引。下面是僞代碼:
/**
 * Pseudo-code iterator over list pages.
 *
 * @param {String} start_url base URL that the page number is appended to
 */
function Urls(start_url) {
  // Base URL supplied by the caller.
  this.start_url = start_url;
  // Current page number; iteration starts at 0.
  this.page = 0;
  // Topic page location; empty until something fills it in.
  this.targetPage = '';
}
/**
 * Pseudo-code: step to the next page and return the data fetched for it.
 *
 * @return {*} null when hasNext() is falsy; otherwise whatever
 *             request.get(this.targetPage) yields
 */
Urls.prototype.next = function(){
var data;
// Stop iterating once hasNext() reports nothing further.
if (!this.hasNext()) {
return null;
}
this.page += 1;
// NOTE(review): `request` is not defined in this snippet - presumably a
// synchronous HTTP helper; confirm before treating this as runnable code.
data = request.get(this.targetPage) // fetch the topic page
return data;
}
/**
 * Pseudo-code placeholder: should report whether the current page exists.
 * As written the body only builds the URL and has no return statement,
 * so it yields undefined.
 */
Urls.prototype.hasNext = function(){
// URL of the current list page: start_url followed by the page number,
// e.g. http://cnodejs.org/?page=1 when start_url ends in "?page=".
var url = this.start_url + this.page;
// TODO: fetch `url`; return true if the page was fetched successfully, otherwise false.
// TODO: obtain the topic page URL from the fetched page.
}
// main
// Walk the pages until hasNext() reports there are none left, printing each result.
// The constructor stores its argument as the base URL that page numbers are
// appended to, so it must be given one (it was previously omitted).
var urls = new Urls('http://cnodejs.org/?page=');
while (urls.hasNext()) {
  console.log(urls.next());
}
分析頁面數據
分析頁面的過程,主要工作是分析頁面的元素,提取出目標的內容,例如正文和評論等。這裏我們需要採用第三方庫cheerio,該模塊採取類似jQuery方式的DOM選擇器,通過DOM選擇器來實現信息提取。
npm install cheerio
項目地址:https://github.com/MatthewMueller/cheerio
官方demo例子
// cheerio demo: load an HTML fragment, edit it with jQuery-style calls, serialize it.
var cheerio = require('cheerio');
var $ = cheerio.load('<h2 class="title">Hello world</h2>');

// Replace the text of the titled heading.
$('h2.title').text('Hello there!');

// Add an extra class to every <h2>.
$('h2').addClass('welcome');

// Serialize the modified document back to markup.
$.html();
//=> <h2 class="title welcome">Hello there!</h2>
提取cnodejs的topics 鏈接
// `data` holds the page data of a topic-list page.
var $ = cheerio.load(data);

// Select the anchors inside `.cell .topic_wrap`; each one links to a topic.
var topics = $('.cell .topic_wrap a');

// Print the href attribute of every matched anchor.
for (var i = 0; i < topics.length; i++) {
  console.log(topics[i].attribs['href']);
}
result:
/topic/52386d26101e574521a12ccd
/topic/5232cd39101e57452106ce5a
/topic/52390cdb101e574521b1e252
/topic/521b1dcabee8d3cb128c56dd
/topic/5238c6d2101e574521aaca13
/topic/52380b4e101e57452193617c
內容信息提取
提取cnodejs帖子內容和標題
// Parse the topic page; `$` is declared locally instead of leaking as a global.
var $ = cheerio.load(data);

// The topic container holds both the heading and the body.
var topic = $('.inner.topic');

// Title: text of the <h3> directly under the topic container.
console.log(topic.children('h3').text());

// Article body: text of the `.topic_content` child.
var content = topic.children('.topic_content').text();
console.log(content);
清洗沒用的數據
由於爬取的內容,可能帶有html標籤或者表情方面的信息,可能跟目標內容不符合,通過這個環節來過濾,這裏向大家推薦一個模塊 validator,該模塊可以過濾xss攻擊,字符串裏面的空格,判斷內容的屬性等,詳細可以到項目地址學習https://github.com/chriso/node-validator
安裝
npm install validator
demo例子
// Demo of the validator module's check()/sanitize() API.
// Each statement is an independent illustration; the ones commented "Throws"
// raise when executed.
var check = require('validator').check,
    sanitize = require('validator').sanitize;

//Validate
check('test@email.com').len(6, 64).isEmail();        //Methods are chainable
check('abc').isInt();                                //Throws 'Invalid integer'
check('abc', 'Please enter a number').isInt();       //Throws 'Please enter a number'
check('abcdefghijklmnopzrtsuvqxyz').is(/^[a-z]+$/);

//Set a message per validator
check('foo', {
    isNumeric: 'This is not a number',
    contains: 'The value doesn\'t have a 0 in it'
}).isNumeric().contains('0');

//Referencing validator args from the message
check('foo', 'The message needs to be between %1 and %2 characters long (you passed "%0")').len(2, 6);

//Sanitize / Filter
var int = sanitize('0123').toInt();                  //123
var bool = sanitize('true').toBoolean();             //true
var str = sanitize(' \t\r hello \n').trim();         //'hello'
var str = sanitize('aaaaaaaaab').ltrim('a');         //'b'
// NOTE(review): `large_input_str` is not defined in this snippet; it stands for
// any untrusted input string.
var str = sanitize(large_input_str).xss();
// The input must be entity-encoded for the decode to have anything to do.
var str = sanitize('&lt;a&gt;').entityDecode();      //'<a>'
過濾剛纔爬取的內容,主要是過濾空格
var topic = $('.inner.topic');
// Title: read the <h3> text and keep the trimmed value. Previously the result
// of trim() was thrown away, and `title` was an implicit global.
var title = sanitize(topic.children('h3').text()).trim();
存儲有用數據
鹹魚白菜各有所需,對於有用的數據,可以存成文本,也可以存到數據庫。本次實例爲了足夠精簡,不採用數據庫存儲,而是採用文本的方式,以json的格式記錄數據。
一個爬蟲的流程完成了,我們來重新看看實現代碼
vi lib/url.js
var http = require('http');
var cheerio = require('cheerio');
var sanitize = require('validator').sanitize;
var async = require('async');
var fs = require('fs');
// Site root that scraped topic hrefs are appended to.
var BASE_URL = 'http://cnodejs.org';

// Namespace object for the crawler's HTTP helper(s).
var scrapy = {};
/**
 * Download a URL and hand the complete response body to a callback.
 *
 * Example:
 *
 *     scrapy.get('http://www.baidu.com', function (err, body) {
 *       // body is a Buffer holding the page
 *     });
 *
 * @param {String} url address to fetch, e.g. http://www.baidu.com
 * @param {Function} cb called as cb(null, Buffer) once the response ends,
 *                      or cb(error, null) when the request emits 'error'
 */
scrapy.get = function (url, cb) {
  var request = http.get(url, function onResponse(res) {
    var parts = []; // chunks in arrival order
    var total = 0;  // combined byte length of the chunks

    res.on('data', function onChunk(part) {
      parts.push(part);
      total += part.length;
    });

    // Body complete: join the chunks and report success.
    res.on('end', function onEnd() {
      cb(null, Buffer.concat(parts, total));
    });
  });

  // Request-level failure.
  request.on('error', function onError(err) {
    cb(err, null);
  });
};
/**
 * Iterator over the site's topic-list pages.
 *
 * @param {String} startUrl list-page URL prefix; the page number is appended to it
 */
function Urls(startUrl) {
  // URL prefix supplied by the caller.
  this.startUrl = startUrl;
  // Page number to request next; starts at 0.
  this.page = 0;
  // Anchors found on the most recently checked list page ('' until one is loaded).
  this.homePage = '';
}
/**
 * Fetch every topic on the current list page, then advance the page counter.
 *
 * The callback is always invoked exactly once. Previously it was never called
 * when no further page existed, and errors were silently dropped.
 *
 * @param {Function} cb called as cb(err) on failure, cb(null, []) when there is
 *                      no further page, or cb(null, topics) with the fetched
 *                      topic pages as HTML strings
 */
Urls.prototype.next = function (cb) {
  var self = this;

  self.hasNext(function (err, hasMore) {
    if (err) {
      return cb(err);
    }
    if (!hasMore) {
      // An empty list rather than null, so callers can iterate the result unconditionally.
      return cb(null, []);
    }

    self.homeParse(function (parseErr, topics) {
      // Only genuine Error objects count as failures; any other value in the
      // first slot is ignored, as it was before.
      if (parseErr instanceof Error) {
        return cb(parseErr);
      }
      self.page += 1;
      cb(null, topics);
    });
  });
};
/**
 * Load the current list page and report whether it contains any topic links.
 *
 * Side effect: stores the matched anchors in this.homePage.
 *
 * @param {Function} cb called as cb(err, false) when the download fails,
 *                      otherwise cb(null, Boolean) - true when at least one
 *                      topic link was found
 */
Urls.prototype.hasNext = function (cb) {
  var self = this;
  var url = self.startUrl + self.page;

  scrapy.get(url, function (err, data) {
    // A failed request delivers no data; report it instead of crashing on data.toString().
    if (err) {
      return cb(err, false);
    }

    // Local `$` - the parsed document must not leak into the global scope.
    var $ = cheerio.load(data.toString());
    self.homePage = $('.cell .topic_wrap a');

    // A page with no topic anchors marks the end of the listing.
    return cb(null, self.homePage.length !== 0);
  });
};
/**
 * Download every topic linked from the list page stored in this.homePage.
 *
 * Uses async.map, whose iterator receives each item and whose final callback
 * is (err, results). The earlier async.filter call treated the item as an
 * index and the results as an error. Results keep the order of the links on
 * the page.
 *
 * @param {Function} cb called as cb(err, topics); topics is an array of topic
 *                      page HTML strings
 */
Urls.prototype.homeParse = function (cb) {
  var links = this.homePage;
  var topicUrls = [];

  // Copy the hrefs into a plain array so async iterates over real URLs.
  for (var i = 0; i < links.length; i++) {
    topicUrls.push(BASE_URL + links[i].attribs['href']);
  }

  async.map(topicUrls, function (topicUrl, done) {
    scrapy.get(topicUrl, function (err, body) {
      // Pass download failures on instead of calling toString() on null.
      if (err) {
        return done(err);
      }
      done(null, body.toString());
    });
  }, cb);
};
/**
 * Extract the title and body text from a topic page.
 *
 * @param {String} html topic page markup
 * @return {Object} { title, content } - both passed through sanitize(...).trim()
 */
Urls.prototype.parseTopic = function (html) {
  // Local `$` - previously an implicit global, which throws in strict mode.
  var $ = cheerio.load(html);
  var topic = $('.inner.topic');

  return {
    title: sanitize(topic.children('h3').text()).trim(),
    content: sanitize(topic.children('.topic_content').text()).trim()
  };
};
/**
 * Persist the scraped items: serialize them to JSON and synchronously write
 * the text to result.txt.
 *
 * @param {Array} items parsed topic records
 */
Urls.prototype.Pipeline = function (items) {
  fs.writeFileSync('result.txt', JSON.stringify(items));
};
exports = module.exports = Urls
vi app.js
var Urls = require('./lib/url.js');
var async = require('async');
// List-page URL prefix; the crawler appends the page number.
var startUrl = 'http://cnodejs.org/?page=';
var urls = new Urls(startUrl);

// Crawl one list page, parse every topic on it and store the results.
urls.next(function (err, topics) {
  // Report a failed crawl instead of ignoring it and parsing missing data.
  if (err) {
    console.log("Got error: " + err.message);
    return;
  }

  // parseTopic is synchronous, so a plain map replaces the async.filter call
  // that was only being used as a loop.
  var items = (topics || []).map(function (topic) {
    return urls.parseTopic(topic);
  });

  urls.Pipeline(items);
});