<?php
/**
 * Working example: crawl Jianshu with phpspider and export the extracted
 * fields to a database table.
 *
 * This is a flat script: it builds the spider configuration array, creates
 * the spider and starts it immediately.
 */
require_once __DIR__ . '/../autoloader.php';
use phpspider\core\phpspider;
/* Do NOT delete this comment */
/* 不要删除这段注释 */ // Marker comments kept verbatim; the original author did not know why they must not be removed.

// Spider configuration passed to the phpspider constructor.
$configs = [
    'name' => '简书',
    'log_show' => true,
    'tasknum' => 8,
    // Database connection settings.
    'db_config' => [
        // Database server address.
        'host' => '127.0.0.1',
        // Database server port.
        'port' => 3306,
        // Login account (placeholder; replace with a real user).
        'user' => '***',
        // Password (placeholder; replace with the real password).
        'pass' => '**',
        // Database name. Any name works, but it must match the name of the
        // database that actually exists on the server.
        'name' => 'lxw_db',
    ],
    // Export target: write the extracted fields into a database table.
    'export' => [
        'type' => 'db',
        // Destination table.
        'table' => 'jianshu',
    ],
    'max_try' => 5,
    // Domains the crawl is restricted to.
    // NOTE(review): 'jianshu' is not a full host name; presumably
    // 'jianshu.com' was intended. Confirm before changing.
    'domains' => ['jianshu', 'www.jianshu.com'],
    // Entry point(s) of the crawl.
    'scan_urls' => ['https://www.jianshu.com/c/V2CqjW?utm_medium=index-collections&utm_source=desktop'],
    // Pattern for list pages.
    // NOTE(review): \d+ only matches numeric ids, while the entry URL above
    // uses the alphanumeric id "V2CqjW"; verify this pattern matches the
    // list pages you expect.
    'list_url_regexes' => ['https://www.jianshu.com/c/\d+'],
    // Pattern for content (article) pages.
    // NOTE(review): same caveat as above if article ids are not purely numeric.
    'content_url_regexes' => ['https://www.jianshu.com/p/\d+'],
    // Fields to extract from each content page; per the original author these
    // correspond to the columns of the export table.
    'fields' => [
        [
            'name' => 'title',
            // XPath: every <h1> element on the page.
            'selector' => '//h1',
            'required' => true,
        ],
        [
            'name' => 'content',
            // XPath: every <article> element on the page.
            'selector' => '//article',
            'required' => true,
        ],
    ],
];

$spider = new phpspider($configs);
$spider->start();
爬取[糗事百科]
<?php
/**
 * Example: crawl Qiushibaike with phpspider and export the extracted fields
 * to a database table.
 *
 * User: lxw
 * Date: 2020-12-29
 *
 * This is a flat script: it builds the spider configuration array, creates
 * the spider and starts it immediately.
 */
require_once __DIR__ . '/../autoloader.php';
use phpspider\core\phpspider;
/* Do NOT delete this comment */
/* 不要删除这段注释 */ // Marker comments kept verbatim; the original author did not know why they must not be removed.

// Spider configuration passed to the phpspider constructor.
$configs = [
    'name' => '糗事百科',
    'log_show' => true,
    'tasknum' => 1,
    // Domains the crawl is restricted to.
    'domains' => [
        'qiushibaike.com',
        'www.qiushibaike.com',
    ],
    // Entry point(s) of the crawl.
    'scan_urls' => [
        'http://www.qiushibaike.com',
    ],
    // Pattern for content (article) pages.
    'content_url_regexes' => [
        'http://www.qiushibaike.com/article/\d+',
    ],
    // Pattern for list pages.
    'list_url_regexes' => [
        'http://www.qiushibaike.com/8hr/page/\d+',
    ],
    'max_try' => 5,
    // Database connection settings (user/pass are placeholders).
    'db_config' => [
        'host' => '127.0.0.1',
        'port' => 3306,
        'user' => '**',
        'pass' => '**',
        'name' => 'lxw_db',
    ],
    // Export target: write the extracted fields into a database table.
    'export' => [
        'type' => 'db',
        // Destination table.
        // NOTE(review): the original comment mentioned "jianshu" while the
        // value is '360ky'; this looks copied from another example. Confirm
        // the intended table name.
        'table' => '360ky',
    ],
    // Fields to extract from each content page.
    'fields' => [
        [
            // Article title.
            'name' => 'title',
            'selector' => "//h1[contains(@class,'article-title')]",
            'required' => true,
        ],
        [
            // Article author.
            'name' => 'author',
            'selector' => "//span[contains(@class,'side-user-name')]",
            'required' => true,
        ],
        [
            // Article content.
            'name' => 'content',
            'selector' => "//*[@id='single-next-link']",
            'required' => true,
        ],
        [
            // Author avatar image, selected with a CSS selector instead of XPath.
            'name' => 'headimg',
            'selector_type' => 'css',
            'selector' => '.side-left-userinfo>img',
            'required' => true,
        ],
    ],
];

$spider = new phpspider($configs);
$spider->start();