【发布时间】:2019-11-20 16:38:11
【问题描述】:
我正在抓取一个网站,并且我有一系列链接:
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Abercorn',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Longueuil',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Sainte-Anne-De-Bellevue',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Shawinigan',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Chateauguay',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Mont-Laurier',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Georges',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Sherbrooke',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Chicoutimi',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Montreal',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Henri-De-Levis',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Stukely-Sud',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Drummondville',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Montreal-Est',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Hubert',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Trois-Rivieres',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Gatineau',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Montreal-Nord',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Jerome',
"http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Val-D'or",
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Granby',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Montreal-Ouest',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Lambert',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Verdun',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Lachine',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Quebec',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Laurent',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Warwick',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Lasalle',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Rigaud',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Leonard',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Westmount',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Laval',
'http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Roxboro'
但是当我发出请求时,其中一些链接返回错误 403 - Forbidden。
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Verdun
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Granby
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Lambert
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Val-D'or
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Lasalle
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Laval
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Warwick
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Quebec
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Westmount
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Roxboro
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Laurent
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Rigaud
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Saint-Leonard
Error null Forbidden http://www.adventistdirectory.org/SearchResults.aspx?CtryCode=CA&StateProv=QC&City=Lachine
当我使用链接较少的列表时,它可以完美运行。
这是我的代码:
const request = require('request');
const cheerio = require('cheerio');
function readChurches(cities){
const churches = []
for (let index = 0; index < cities[0].length; index++){
const city = cities[0][index];
churches.push(new Promise((resolve, reject) => {
const church = []
let options = {
url: city,
headers: {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
};
request(options, (error, response, html) => {
if(!error && response.statusCode == 200) {
const $ = cheerio.load(html);
const $$ = cheerio.load($('table').find('tbody').eq(1).find('tr').eq(1).find('td').eq(1).html())
$$('a').each((i, el) => {
const item = $(el).attr('href')
if(item != undefined){
if(item.includes('ViewEntity')) {
church.push(`http://www.adventistdirectory.org${item}`);
}
}
});
resolve(church);
} else {
console.log('Error',error,response.statusMessage,city)
reject(error)
}
});
}))
}
return Promise.all(churches);
}
如何绕过错误 403?。因为当我尝试在浏览器上打开链接时它可以工作,但当我使用 javascript 函数时却不起作用。
--- 新更新 ---
我已更改为代码。我添加了一个 try catch 块
function readChurches(cities){
const churches = []
for (let index = 0; index < cities[0].length; index++){
const city = cities[0][index];
churches.push(new Promise((resolve, reject) => {
const church = []
let options = {
url: city,
headers: {
'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'
}
};
try {
request(options, (error, response, html) => {
if(!error && response.statusCode == 200) {
const $ = cheerio.load(html);
const $$ = cheerio.load($('table').find('tbody').eq(1).find('tr').eq(1).find('td').eq(1).html())
$$('a').each((i, el) => {
const item = $(el).attr('href')
if(item != undefined){
if(item.includes('ViewEntity')) {
church.push(`http://www.adventistdirectory.org${item}`);
}
}
});
resolve(church);
}
});
} catch (error) {
console.log('Error',error,city)
reject(error)
}
}))
}
return churches
}
还创建了这个函数,由@chrispytoes 提供
async function doStuff(churches) {
const results = [];
for(let i in churches) {
try {
console.log(churches[i])
results.push(await churches[i]);
sleep(5000);
} catch (error) {
console.log(error)
}
}
return results
}
我正在运行它:
async function run(){
let provinces = []
provinces.push(`http://www.adventistdirectory.org/BrowseStateProv.aspx?CtryCode=CA&StateProv=QC`)
let cities = await readCities(provinces);
const churches = await readChurches(cities);
const stuff = await doStuff(churches)
console.log('Churches: ', stuff);
console.log('End')
} catch (error) {
console.log('Error', error)
}
}
我在我的控制台上得到了这个:
Promise { <pending> }
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=19653'
]
}
Promise { <pending> }
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=19637'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=54633'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=31155'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=15271'
]
}
Promise { <pending> }
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=30783'
]
}
Promise { <pending> }
Promise { <pending> }
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=15265'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=15255'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=15251'
]
}
Promise { <pending> }
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=15247'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=19645'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=32838'
]
}
Promise {
[
'http://www.adventistdirectory.org/ViewEntity.aspx?EntityID=29973'
]
}
Promise { <pending> }
它没有到达console.log('Churches: ', stuff);
【问题讨论】:
-
您可能会受到速率限制(尽管这不是真正正确的状态代码) - 尝试在请求之间休眠?
-
我已经这样做了,但它不起作用
-
@BrunoLopesBacelar 您是否尝试过我的建议,一次提出一个请求?那仍然可能太快了,您可能不得不进一步限制它。当您自己执行一次请求时,请求是否有效?
-
@chrispytoes 我在描述中添加了更多信息。使用你的想法没有奏效。我仍然没有收到回复。
-
@BrunoLopesBacelar 您正在记录实际的承诺本身,而不是结果。检查我更新的答案。
标签: javascript node.js web-scraping request cheerio