phpspider 完整使用技巧 含代碼

目標:抓取 http://www.cnbaowen.net/news/list-3720-1.html 頁面右側欄目中的文章(標題與內容),保存到名為「6.1」的數據庫的 spider_baowen 表中。

我用的是 Windows 系統。

 

下載

1) https://github.com/owner888/phpspider

2) https://pan.baidu.com/s/10n9ZOUQBlrJzOQx0ShOmMQ    提取碼:b2zc

創建數據庫與相關表

 CREATE TABLE `spider_baowen` (
  `id` int(10) unsigned NOT NULL AUTO_INCREMENT,
  `title` varchar(200) CHARACTER SET utf8mb4 DEFAULT NULL,
  `content` varchar(200) CHARACTER SET utf8mb4 DEFAULT NULL,
  `site_url` varchar(200) CHARACTER SET utf8mb4 DEFAULT '0' COMMENT '文章類型 1行業資訊 2技術資料',
  `site_id` int(5) DEFAULT NULL COMMENT '站點id',
  PRIMARY KEY (`id`) USING BTREE
) ENGINE=InnoDB AUTO_INCREMENT=5 DEFAULT CHARSET=utf8 COLLATE=utf8_bin;

PHP 代碼:放在 phpspider 的 demo 目錄下,命名爲 test_baowenwang.php

<?php
// composer下載方式
// 先使用composer命令下載:
// composer require owner888/phpspider
// 引入加載器
//require './vendor/autoload.php';

// GitHub下載方式
require_once __DIR__ . '/../autoloader.php';

use phpspider\core\phpspider;
use phpspider\core\log;
use phpspider\core\selector;

/* Do NOT delete this comment */
/* 不要刪除這段註釋 */

// Crawl configuration: what to fetch from www.cnbaowen.net, how to classify
// the URLs, which fields to extract, and where to export the result.
$configs = [
    'name' => '保溫網',
    'domains' => [
        'www.cnbaowen.net',
    ],
    'log_file' => 'data/test_baowenwang.log',
    'log_type' => 'warn,debug',
    'tasknum' => 5,
    'max_depth' => 1,
    // Export every extracted record into the `spider_baowen` table.
    'export' => [
        'type' => 'db',
        'table' => 'spider_baowen',
    ],

    // Connection settings for the target database (its name is literally "6.1").
    'db_config' => [
        'host' => '127.0.0.1',
        'port' => '3306',
        'user' => 'root',
        'pass' => 'root',
        'name' => '6.1',
    ],
    // Entry point of the crawl.
    'scan_urls' => [
        'http://www.cnbaowen.net/news/list-3720-1.html',
    ],
    // URL patterns are regular expressions (note the \d+), so the dots are
    // escaped to match a literal "." instead of any character.
    'content_url_regexes' => [
        'http://www\.cnbaowen\.net/news/show-\d+\.html',
    ],
    'list_url_regexes' => [
        'http://www\.cnbaowen\.net/news/list-3720-\d+\.html',
    ],

    // Fields of one record. `title` and `content` are read from the page via
    // XPath; `site_url` and `site_id` have no selector and are filled in by
    // the on_extract_field callback.
    'fields' => [
        [
            'name' => "title",
            'selector' => "//h1[@id='title']",
            'required' => true,
        ],
        [
            'name' => 'content',
            'selector' => "//div[@id='content']",
            'required' => true,
        ],
        [
            'name' => 'site_url',
        ],
        [
            'name' => 'site_id',
        ],
    ],
];

// Build the spider from the $configs array.
$spider = new phpspider($configs);

// Start hook: queue list pages 1 and 2 (list-3720-1.html, list-3720-2.html).
$spider->on_start = function ($spider) {
    foreach (range(1, 2) as $pageNo) {
        $spider->add_url("http://www.cnbaowen.net/news/list-3720-{$pageNo}.html");
    }
};
/**
 * List-page hook: queue only the article links found in the right-hand column.
 *
 * @param mixed     $page    Page descriptor supplied by the spider (unused here).
 * @param string    $content Raw HTML of the list page.
 * @param phpspider $spider  Spider instance used to queue the discovered URLs.
 *
 * @return bool Always false, which tells the spider not to discover further
 *              URLs on this page by itself.
 */
$spider->on_list_page = function ($page, $content, $spider) {
    // Alternative selector for the articles in the middle column:
    // $content = selector::select($content, "//span[@class='f_r']");

    // Narrow the HTML down to the right-hand column.
    $sidebar = selector::select($content, "//div[@class='box_body thumb']");

    // Several matching nodes come back as an array; merge them into one string.
    if (is_array($sidebar)) {
        $sidebar = implode('', $sidebar);
    }
    // Anything else that is not a string (presumably false/null when nothing
    // matched - TODO confirm against selector::select) means there are no links.
    if (!is_string($sidebar)) {
        return false;
    }

    // Dots are escaped so that only real ".html" article URLs match.
    $articlePattern = '#http://www\.cnbaowen\.net/news/show-\d+\.html#';

    // preg_match_all yields 0 for no match and false on a PCRE error; either
    // way there is nothing to queue.
    if (!preg_match_all($articlePattern, $sidebar, $matches)) {
        return false;
    }

    foreach (array_unique($matches[0]) as $url) {
        $spider->add_url($url);
    }

    return false;
};


// Content-page hook: does nothing with the page and always returns false.
// NOTE(review): presumably the same "do not discover further URLs on this
// page" signal that the list-page hook returns - confirm against phpspider.
$spider->on_content_page = function ($page, $html, $spider) {
    return false;
};

/**
 * Field hook: post-process a single extracted field before it is returned.
 *
 * - `site_url` becomes the URL of the page being processed.
 * - `site_id`  is the constant 1.
 * - `content`  is stripped of the right-floated <div>, has every opening <a>
 *              tag replaced by a neutral `<a href='#'>`, loses all <img> tags,
 *              and is cut to at most 1000 characters.
 * - every other field is returned unchanged.
 *
 * @param string $fieldname Name of the field being extracted.
 * @param mixed  $data      Value extracted for that field.
 * @param array  $page      Page descriptor; only its 'url' entry is read.
 *
 * @return mixed The value for the field.
 */
$spider->on_extract_field = function ($fieldname, $data, $page) {
    if ($fieldname === 'site_url') {
        return $page['url'];
    }
    if ($fieldname === 'site_id') {
        return 1;
    }
    if ($fieldname !== 'content') {
        return $data;
    }

    // Clean-up rules, applied one after another in this order.
    $rules = [
        "/<div style=\"float:right[\s\S]*?div>/" => "",
        '/<a .*?href="(.*?)".*?>/is' => "<a href='#'>",
        '/<img.*?>/is' => "",
    ];

    $html = $data;
    foreach ($rules as $pattern => $replacement) {
        $html = preg_replace($pattern, $replacement, $html);
    }

    return mb_substr($html, 0, 1000);
};

// Launch the crawl.
$spider->start();

打開命令行,進入 demo 目錄(即 test_baowenwang.php 所在的目錄):cd /d <demo 目錄的路徑>,然後執行:

php -f test_baowenwang.php

 

查看數據庫表,完畢;

發表評論
所有評論
還沒有人評論,想成為第一個評論的人麼? 請在上方評論欄輸入並且點擊發布.
相關文章