目標:抓取 http://www.cnbaowen.net/news/list-3720-1.html 頁面右側列表中的文章內容,並保存到名爲 6.1 的數據庫的 spider_baowen 表中。
我使用的是 Windows 系統。
下載
1) https://github.com/owner888/phpspider
2) https://pan.baidu.com/s/10n9ZOUQBlrJzOQx0ShOmMQ 提取碼:b2zc
創建數據庫與相關表
-- Target table for the articles scraped by the PHP spider script in this guide.
-- content is TEXT because the script stores up to 1000 characters of article
-- HTML, which does not fit in a 200-character column.
-- site_url holds the URL of the scraped article page, so it defaults to NULL
-- rather than '0' and its column comment describes a URL.
-- No AUTO_INCREMENT seed is set: a freshly created table should start at 1.
CREATE TABLE `spider_baowen` (
`id` int(10) unsigned NOT NULL AUTO_INCREMENT,
`title` varchar(200) CHARACTER SET utf8mb4 DEFAULT NULL,
`content` text CHARACTER SET utf8mb4,
`site_url` varchar(200) CHARACTER SET utf8mb4 DEFAULT NULL COMMENT '文章來源網址',
`site_id` int(5) DEFAULT NULL COMMENT '站點id',
PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB DEFAULT CHARSET=utf8 COLLATE=utf8_bin;
php代碼 放在demo目錄下 命名爲test_baowenwang.php
<?php
// composer下載方式
// 先使用composer命令下載:
// composer require owner888/phpspider
// 引入加載器
//require './vendor/autoload.php';
// GitHub下載方式
require_once __DIR__ . '/../autoloader.php';
use phpspider\core\phpspider;
use phpspider\core\log;
use phpspider\core\selector;
/* Do NOT delete this comment */
/* 不要刪除這段註釋 */
// Spider configuration: which site to crawl, where to log, how to export,
// and which fields to extract from each content page.
$configs = [
    'name' => '保溫網',
    'domains' => [
        'www.cnbaowen.net',
    ],
    'log_file' => 'data/test_baowenwang.log',
    'log_type' => 'warn,debug',
    'tasknum' => 5,
    'max_depth' => 1,
    // Extracted rows are written to the spider_baowen table.
    'export' => [
        'type' => 'db',
        'table' => 'spider_baowen',
    ],
    'db_config' => [
        'host' => '127.0.0.1',
        'port' => '3306',
        'user' => 'root',
        'pass' => 'root',
        'name' => '6.1',
    ],
    'scan_urls' => [
        'http://www.cnbaowen.net/news/list-3720-1.html',
    ],
    // Dots are escaped so that "." only matches a literal dot; unescaped, the
    // patterns would also accept URLs such as ".../show-1xhtml".
    // NOTE(review): assumes phpspider wraps these patterns in its own regex
    // delimiters before matching - confirm against the library.
    'content_url_regexes' => [
        'http://www\.cnbaowen\.net/news/show-\d+\.html',
    ],
    'list_url_regexes' => [
        'http://www\.cnbaowen\.net/news/list-3720-\d+\.html',
    ],
    'fields' => [
        [
            'name' => 'title',
            'selector' => "//h1[@id='title']",
            'required' => true,
        ],
        [
            'name' => 'content',
            'selector' => "//div[@id='content']",
            'required' => true,
        ],
        // site_url and site_id have no selector; their values are supplied by
        // the on_extract_field callback.
        [
            'name' => 'site_url',
        ],
        [
            'name' => 'site_id',
        ],
    ],
];
// Build the spider from the configuration array.
$spider = new phpspider($configs);

// Start hook: queue list pages 1 and 2 of category 3720.
$spider->on_start = function ($spider) {
    foreach (range(1, 2) as $pageNo) {
        $spider->add_url("http://www.cnbaowen.net/news/list-3720-{$pageNo}.html");
    }
};
// List-page hook: pull the article links out of the right-hand box of a list
// page and queue each one once. Always returns false, which tells the spider
// not to discover further URLs from the current page on its own.
$spider->on_list_page = function ($page, $content, $spider) {
    // Right-hand box of the page. For the middle column the selector would be
    // //span[@class='f_r'] instead.
    $content = selector::select($content, "//div[@class='box_body thumb']");
    if (is_array($content)) {
        $content = implode('', $content);
    }
    // NOTE(review): presumably the selector yields a non-string value when
    // nothing matches - confirm against phpspider. Either way there is nothing
    // to scan, and a non-string must not reach preg_match_all().
    if (!is_string($content) || $content === '') {
        return false;
    }

    // Dots are escaped so they only match a literal ".".
    $regex = '#http://www\.cnbaowen\.net/news/show-\d+\.html#';
    // preg_match_all() returns 0 for no match and false on error; both mean
    // there is nothing to queue.
    if (!preg_match_all($regex, $content, $out)) {
        return false;
    }

    foreach (array_unique($out[0]) as $url) {
        $spider->add_url($url);
    }

    return false;
};
// Content-page hook: unconditionally returns false.
// NOTE(review): presumably, as with the list-page hook, false tells the spider
// not to discover further URLs from this page - confirm against phpspider.
$spider->on_content_page = function ($page, $html, $crawler) {
    return false;
};
// Field hook: post-processes each extracted field before it is exported.
//   content  - drops the right-floated div, rewrites every link target to '#',
//              removes images, then keeps the first 1000 characters
//   site_url - the URL of the page being processed
//   site_id  - always 1
//   other    - returned unchanged
$spider->on_extract_field = function ($fieldname, $data, $page) {
    if ($fieldname == 'content') {
        $html = preg_replace("/<div style=\"float:right[\s\S]*?div>/", "", $data);
        $html = preg_replace('/<a .*?href="(.*?)".*?>/is', "<a href='#'>", $html);
        $html = preg_replace('/<img.*?>/is', "", $html);

        return mb_substr($html, 0, 1000);
    }

    if ($fieldname == 'site_url') {
        return $page['url'];
    }

    if ($fieldname == 'site_id') {
        return 1;
    }

    return $data;
};
// Run the spider with the configuration and callbacks set up above.
$spider->start();
打開命令行,用 cd /d 切換到 demo 目錄(即 test_baowenwang.php 所在的目錄),然後執行:
php -f test_baowenwang.php
查看數據庫表,完畢;