lxwphp
<?php

/**
 * Demo spider for Jianshu: starts from one collection page, follows list and
 * article links, and exports each article's title and body to a MySQL table
 * through phpspider's "db" exporter.
 *
 * The original author marked this as a working example of inserting scraped
 * data into the database.
 */

require_once __DIR__ . '/../autoloader.php';

use phpspider\core\phpspider;

/* Do NOT delete this comment */
/* 不要删除这段注释 */ // Kept verbatim from the upstream demo; why it must not be removed is not documented here.

// Spider configuration passed to the phpspider constructor.
$configs = array(
    'name' => '简书',
    'log_show' => true,
    // presumably the number of concurrent crawl tasks -- confirm against the phpspider docs
    'tasknum' => 8,
    // Database connection settings used by the "db" export below.
    'db_config' => array(
        // Database server address.
        'host' => '127.0.0.1',
        // Database server port.
        'port' => 3306,
        // Login account (placeholder; replace with a real user).
        'user' => '***',
        // Password (placeholder; replace with the real password).
        'pass' => '**',
        // Database name; it can be anything, but it must match the name of the actual database.
        'name' => 'lxw_db',
    ),
    'export' => array(
        'type' => 'db',
        // Target table for the exported rows.
        'table' => 'jianshu',
    ),
    // presumably the maximum number of attempts per URL -- confirm against the phpspider docs
    'max_try' => 5,

    // Domains the crawl is restricted to.
    // NOTE(review): 'jianshu' is not a host name; 'jianshu.com' may have been intended -- confirm.
    'domains' => array('jianshu', 'www.jianshu.com'),
    // Entry point of the crawl.
    'scan_urls' => array('https://www.jianshu.com/c/V2CqjW?utm_medium=index-collections&utm_source=desktop'),
    // Pattern for list pages.
    // NOTE(review): \d+ only matches digits, while the entry URL above uses the
    // alphanumeric slug "V2CqjW"; verify that this pattern matches the intended pages.
    'list_url_regexes' => array("https://www.jianshu.com/c/\d+"),
    // Pattern for content (article) pages.
    // NOTE(review): same digits-only caveat as the list pattern -- confirm against real article URLs.
    'content_url_regexes' => array("https://www.jianshu.com/p/\d+"),

    // Fields to extract from each content page; the names double as the table columns.
    'fields' => array(
        array(
            'name' => 'title',
            // XPath: every <h1> element in the page.
            'selector' => "//h1",
            'required' => true,
        ),
        array(
            'name' => 'content',
            // XPath: every <article> element in the page.
            'selector' => "//article",
            'required' => true,
        ),
    ),
);

$spider = new phpspider($configs);
$spider->start();

s

爬取[糗事百科]
<?php

/**
 * Demo spider for Qiushibaike: starts from the site home page, follows list
 * and article links, and exports title, author, content and avatar of each
 * article to a MySQL table through phpspider's "db" exporter.
 *
 * User: lxw
 * Date: 2020-12-29
 */

require_once __DIR__ . '/../autoloader.php';

use phpspider\core\phpspider;

/* Do NOT delete this comment */
/* 不要删除这段注释 */ // Kept verbatim from the upstream demo; why it must not be removed is not documented here.

// Spider configuration passed to the phpspider constructor.
$configs = array(
    'name' => '糗事百科',
    'log_show' => true,
    // presumably the number of concurrent crawl tasks -- confirm against the phpspider docs
    'tasknum' => 1,
    // Domains the crawl is restricted to.
    'domains' => array(
        'qiushibaike.com',
        'www.qiushibaike.com',
    ),
    // Entry point of the crawl.
    'scan_urls' => array(
        'http://www.qiushibaike.com',
    ),
    // Pattern for content (article) pages.
    'content_url_regexes' => array(
        "http://www.qiushibaike.com/article/\d+",
    ),
    // Pattern for list pages.
    'list_url_regexes' => array(
        "http://www.qiushibaike.com/8hr/page/\d+",
    ),

    // presumably the maximum number of attempts per URL -- confirm against the phpspider docs
    'max_try' => 5,
    // Database connection settings used by the "db" export below.
    'db_config' => array(
        'host' => '127.0.0.1',
        'port' => 3306,
        // Login account and password are placeholders; replace with real credentials.
        'user' => '**',
        'pass' => '**',
        'name' => 'lxw_db',
    ),
    'export' => array(
        'type' => 'db',
        // Target table for the exported rows.
        'table' => '360ky',
    ),
    // Fields to extract from each content page.
    'fields' => array(
        array(
            // Article title.
            'name' => "title",
            'selector' => "//h1[contains(@class,'article-title')]",
            'required' => true,
        ),
        array(
            // Article author.
            'name' => "author",
            'selector' => "//span[contains(@class,'side-user-name')]",
            'required' => true,
        ),
        array(
            // Article content.
            'name' => "content",
            'selector' => "//*[@id='single-next-link']",
            'required' => true,
        ),
        array(
            // Author avatar; this field uses a CSS selector instead of XPath.
            'name' => "headimg",
            'selector_type' => 'css',
            'selector' => ".side-left-userinfo>img",
            'required' => true,
        ),
    ),
);

$spider = new phpspider($configs);
$spider->start();

 

分类:

技术点:

相关文章:

  • 2021-11-30
  • 2022-12-23
  • 2022-01-01
  • 2022-12-23
  • 2022-12-23
  • 2021-07-25
猜你喜欢
  • 2022-03-14
  • 2021-12-21
  • 2021-12-04
  • 2021-04-19
  • 2022-12-23
  • 2021-08-17
  • 2021-10-05
相关资源
相似解决方案