ryt103114

1、爬虫相关的包

(1)const request =  require('superagent'); // 处理get post put delete head 请求  轻量级http请求库,模仿浏览器登陆

(2)const cheerio = require('cheerio'); // 加载html
(3)const fs = require('fs'); // 加载文件系统模块 将数据存到一个文件中的时候会用到

        fs.writeFile('saveFiles/zybl.txt', content, (error1) => { // 将文件存起来 参数依次为:文件路径、要存的内容、错误回调
             if (error1) throw error1;
             // console.log(' text save ');
         });

(4)const fs      = require('graceful-fs'); // 将文件存为xlsx

       const writeStream = fs.createWriteStream('saveFiles/trader.xlsx'); //新建xlsx文件

       writeStream.write(title);//向xlsx里面写入内容

(5)const Promise = require('bluebird'); //异步处理

(6)const Nightmare = require('nightmare');//一个高层次的浏览器自动化库  先要安装phantomjs 然后再装nightmare

(7)const co        = require('co');

2、爬虫代码 

'use strict';

const co = require('co');
const fs = require('fs');
const Nightmare = require('nightmare'); // browser automation; can drive a visible window

// Listing page the crawl starts from.
const url = 'http://sports.qq.com/isocce/';

/**
 * Shared failure handler: prints whatever it is given to the console.
 *
 * @param {*} err - The error (or any value) to report.
 * @returns {void}
 */
const onError = (err) => {
  console.log(err);
};

/**
 * Opens one page in its own Nightmare browser, reads the rendered <body>
 * markup and saves it to `../../saveFiles/<title>.html`.
 *
 * Navigation and evaluation failures reject the returned promise. A failed
 * file write is only logged (best effort), matching the original callback.
 *
 * @param {{title: string, url: string}} pageUrl - Link descriptor with the
 *   page address (`url`) and the text used as the output file name (`title`).
 * @returns {Promise<void>} Settles after the write callback has run and the
 *   browser has been closed.
 */
const getHtml = function (pageUrl) {
  const pageScraper = new Nightmare(); // dedicated browser for this page

  return co(function* run() {
    try {
      yield pageScraper.goto(pageUrl.url).wait();
      console.log('222222' + pageUrl.url);

      const content = yield pageScraper.evaluate(() => {
        const temp = document.querySelector('body').innerHTML;
        return temp;
      });
      console.log('子页面链接');
      console.dir(content);

      // A path separator inside the title would point the write at a
      // non-existent sub-directory, so neutralise it.
      const fileName = String(pageUrl.title).replace(/[\\/]/g, '_') + '.html';

      // fs.writeFile returns undefined, which co refuses to yield; wrap the
      // callback API in a promise so the generator really waits for the write.
      yield new Promise((resolve) => {
        fs.writeFile('../../saveFiles/' + fileName, content, (err) => {
          console.log('存文件.......');
          if (err) {
            console.log(err);
          } else {
            console.log('Save pageUrl content to ' + fileName);
          }
          resolve();
        });
      });
    } finally {
      // Always shut the browser down, otherwise one process leaks per page.
      yield pageScraper.end();
    }
  });
};

/**
 * Entry point of the crawler.
 *
 * 1. Opens the listing page in a visible browser window.
 * 2. Clicks `#feed-laliga > a` five times, waiting 2 s before each click
 *    (presumably the "load more" link - confirm against the page).
 * 3. Collects the title and href of every `#feed-laliga h3 > a` anchor.
 * 4. Closes the listing browser, then downloads each linked page in turn
 *    through `getHtml`.
 *
 * Any failure while building the list is reported through `onError`; a
 * failure on a single article is reported and the remaining ones still run.
 */
co(function* run() {
  const scraper = new Nightmare({
    show: true,
  }); // visible browser window
  let links = [];

  try {
    yield scraper
      .goto(url) // address of the listing page
      .wait();

    for (let i = 0; i < 5; i += 1) {
      yield scraper.wait(2000)
        .click('#feed-laliga > a');
    }

    links = yield scraper
      .evaluate(() => {
        const anchors = document.querySelectorAll('#feed-laliga h3 > a');
        const list = [];
        for (const each of anchors) {
          console.log('each');
          console.log(each);
          list.push({
            title: each.innerText,
            url: each.href,
          });
        }
        return list;
      });
  } finally {
    // Close the listing browser even when navigation or evaluation failed.
    yield scraper.end();
  }

  console.log('这里');
  console.dir(links);

  // Fetch pages one after another: each download is awaited, so no timers
  // are needed for pacing and no promise is left without a handler.
  for (const link of links) {
    if (link !== null && link.url !== 'javascript:void(0)') {
      try {
        yield getHtml(link);
      } catch (err) {
        onError(err);
      }
    }
  }
}).catch(onError);

 

分类:

技术点:

相关文章:

  • 2021-08-29
  • 2021-12-11
  • 2021-12-15
  • 2021-07-29
  • 2021-12-15
  • 2021-12-01
  • 2021-11-18
  • 2021-12-05
猜你喜欢
  • 2021-10-23
  • 2021-12-11
  • 2021-12-11
  • 2021-12-01
  • 2021-11-27
  • 2021-04-02
相关资源
相似解决方案