【问题标题】:AJAX pagination Web crawl in Python using Scrapy(使用 Scrapy 在 Python 中抓取 AJAX 分页的网站)
【发布时间】:2013-12-23 12:14:32
【问题描述】:

我正在使用 Python 和 Scrapy 抓取具有 AJAX 分页的网站。 我能够抓取第一页。

但是,第二页及之后的页面是通过 AJAX 加载的,当我继续抓取这些页面时,无法获得它们的链接。

请指导我如何获取 AJAX 页面的链接。我正在使用 BeautifulSoup 库进行网页抓取。

class SItenameSpider(CrawlSpider):
    """Crawl spider that follows '/trends/' and '/keynote/' links.

    Every response matched by ``rules`` is handed to :meth:`parse_item`,
    which turns the scraped product tuples into ``SitenameItem`` objects.
    """

    start_urls = []
    rules = (
        Rule(
            SgmlLinkExtractor(allow=('/trends/', '/keynote/')),
            callback='parse_item',
        ),
    )

    def parse_item(self, response):
        """Build one ``SitenameItem`` per tuple scraped for this response.

        Args:
            response: The response object passed in by the crawl rule; only
                its ``url`` attribute is read here.

        Returns:
            A list of populated ``SitenameItem`` objects (possibly empty).
        """
        print('Hi, crawling this page! %s' % response.url)

        # NOTE(review): 'site url' is a placeholder left in the snippet;
        # presumably the real target (e.g. response.url) belongs here - confirm.
        extract_tuple_list = site_product_crawl.parse_product_page('site url')
        items = []

        for extract_tuple in extract_tuple_list:
            item = SitenameItem()
            item['site_id'] = extract_tuple[0]
            item['name'] = extract_tuple[1]
            item['price'] = extract_tuple[2]
            item['rating'] = extract_tuple[3]
            item['num_reviews'] = extract_tuple[4]
            # Index 5 of the tuple is skipped; the category comes from
            # ``cat_code`` instead.
            # NOTE(review): ``cat_code`` and ``date_created`` are not defined
            # in the visible code - they must be provided elsewhere.
            item['category'] = cat_code
            item['url'] = response.url
            item['date'] = date_created
            item['description'] = extract_tuple[6]
            items.append(item)
        return items

from bs4 import BeautifulSoup as bsoup
import requests
import pprint
import re

def return_html(url):
    """Fetch *url* with ``requests`` and return the response body as text.

    This is a best-effort helper: any exception raised while fetching is
    printed and swallowed so the caller can decide how to continue.

    Args:
        url: Address of the page to download.

    Returns:
        The response text, or ``None`` if the request raised an exception.
    """
    try:
        return requests.get(url).text
    except Exception as e:
        # print(...) with a single argument behaves the same on Python 2 and 3.
        print(e)
        return None

def _to_utf8(value):
    """Return *value* encoded as UTF-8; ``None`` and byte strings pass through."""
    if value is None or isinstance(value, bytes):
        return value
    return value.encode('utf-8')


def _node_text(node):
    """Return the whitespace-stripped text of a parsed node, or ``None`` if absent."""
    if node is None:
        return None
    return node.get_text().strip()


def parse_product_page(prod_url):
    """Scrape a single product page into a one-element list of field tuples.

    The tuple layout is ``(number, name, price, avg_rating, num_reviews,
    category, description)``. Text fields are UTF-8 encoded exactly once;
    a field whose element is missing from the page is ``None``. ``price`` is
    always ``'0'`` and ``category`` is always ``None`` because neither is
    extracted from the page.

    Args:
        prod_url: Full URL of the product page, including the scheme and at
            least two path segments (they form the product number).

    Returns:
        ``[record]`` on success, or ``[]`` when the page could not be fetched.

    Raises:
        IndexError: If *prod_url* has no ``://`` or fewer than two path
            segments after the host.
    """
    html = return_html(prod_url)
    if html is None:
        # return_html has already printed the failure; there is nothing to parse.
        return []
    soup = bsoup(html)

    prod_price = '0'  # the price is not available on site so it was put 0
    prod_category = None

    # Product number: the first two path segments after the host, joined by '_'.
    path_parts = prod_url.split('://')[1].split('/')
    prod_number = _to_utf8(path_parts[1] + '_' + path_parts[2].strip())
    print(prod_number)

    description = _node_text(soup.find('div', {'class': 'articleText'}))
    if description is not None:
        description = description.replace('<br/>', '')
    print(_to_utf8(description))

    title_div = soup.find('div', id='titleSection')
    prod_name = _node_text(title_div.h2 if title_div is not None else None)
    print(_to_utf8(prod_name))

    num_reviews = _node_text(soup.find('span', itemprop='votes'))
    if num_reviews is not None:
        # Drop thousands separators, e.g. "1,234" -> "1234".
        num_reviews = num_reviews.replace(',', '')

    stat_box = soup.find('span', {'class': 'featuredstatbox'})
    rating_node = None
    if stat_box is not None:
        rating_node = stat_box.find('span', itemprop='rating')
    avg_rating = _node_text(rating_node)

    if description is not None:
        # Flatten to one line and double single quotes
        # (presumably for SQL string escaping - confirm against the consumer).
        description = description.replace('\n', '').replace("'", "''")

    # Encode at the boundary only; the original encoded prod_name twice,
    # which breaks on non-ASCII names.
    record = (
        prod_number,
        _to_utf8(prod_name),
        prod_price,
        _to_utf8(avg_rating),
        _to_utf8(num_reviews),
        prod_category,
        _to_utf8(description),
    )
    tuple_list = [record]

    pprint.pprint(tuple_list)
    return tuple_list

def main():
    """Run the product-page parser once against the placeholder target."""
    target = 'sitename'
    parse_product_page(target)


# Only run the parser when this file is executed as a script.
if __name__ == "__main__":
    main()

【问题讨论】:

    标签: python ajax web-scraping beautifulsoup


    【解决方案1】:

    即使是通过 AJAX 加载的页面,也必须向某个 URL 发送请求。如果您使用 Chrome,可以在 Chrome 开发者工具的"网络"(Network)选项卡中找到该 URL,然后直接抓取它。

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2017-09-22
      • 2021-01-13
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      相关资源
      最近更新 更多