【问题标题】:requests + bs4 no results from pages(requests + bs4 页面没有结果)
【发布时间】:2017-03-05 09:02:12
【问题描述】:

这里是可以从https://www.gabar.org/membersearchresults.cfm获取信息的代码

但无法从 https://www.gabar.org/membersearchresults.cfm?start=1&id=70FFBD1B-9C8E-9913-79DBB8B989DED6C1 获取信息

from bs4 import BeautifulSoup
import requests
import traceback


# Anchor tags pointing at member-detail pages; appended to by all_results().
links_to_visit = []
# Anchor tags whose text is 'Next'; appended to by all_results(), read by start().
navigation_links = []  # for testing next button

# Prefix that start() joins with a 'Next' anchor's href to build the next URL.
base_url = 'https://www.gabar.org'


def make_soup(link):
    """Fetch *link* and return its body parsed as a BeautifulSoup tree.

    The body is parsed with Python's built-in 'html.parser'.
    """
    # Module-level requests.get(): no Session object is reused between calls.
    r = requests.get(link)
    soup = BeautifulSoup(r.content, 'html.parser')
    return soup


def all_results(url):
    """Scan one results page and record the anchors of interest.

    Fetches *url* through make_soup(), looks inside the first
    <div class="cs_control"> and sorts its anchors into the module-level
    lists: anchors whose text is 'Next' go to navigation_links, anchors
    whose href contains '/MemberSearchDetail.cfm?ID=' go to links_to_visit,
    and everything else is ignored. The parsed page, the anchors and the
    running totals are printed along the way. Returns None.
    """
    global links_to_visit
    global navigation_links
    soup = make_soup(url)
    print(soup)
    div = soup.find('div', {'class': 'cs_control'})
    # NOTE(review): if the page has no such div, `div` is None and the next
    # line raises AttributeError.
    links = div.find_all('a')
    print(links)
    for link in links:
        try:
            if link.text == 'Next':  # prev, next, new search
                navigation_links.append(link)
                print('got it')
            elif not '/MemberSearchDetail.cfm?ID=' in link.get('href'):
                pass  # I dont need that link
            else:
                links_to_visit.append(link)
        except:
            # Any error raised for a single anchor is printed and the loop
            # moves on to the next anchor.
            traceback.print_exc()
    print(len(links_to_visit))
    print(links_to_visit)
    #print(links_to_visit[-1].get('href'))


def start():
    """Repeatedly fetch the page behind the most recent 'Next' anchor.

    Reads navigation_links[-1] on every iteration, so all_results() must
    have stored at least one 'Next' anchor before this is called. Each
    iteration advances the counter by 25 and calls all_results() on
    base_url + the anchor's href; the loop ends once the counter reaches
    60716.
    """
    flag = 1  # assigned here and in the loop, but never read
    page = 1
    # NOTE(review): 60716 is presumably the total number of records - confirm.
    while page < 60716:
        flag = 0
        if navigation_links[-1].text == 'Next':
            flag = 1
            next_link = navigation_links[-1]
            #print(next_link.get('href'))
        # NOTE(review): 25 is presumably the number of results per page - confirm.
        page += 25
        print(base_url + next_link.get('href'))
        all_results(base_url + next_link.get('href'))
        print('page is:', page)

if __name__ == '__main__':
    # Scan the first results page to seed the lists, then follow 'Next' links.
    all_results('https://www.gabar.org/membersearchresults.cfm')
    start()

如果我想获得完整的结果,我需要了解或做什么?

【问题讨论】:

    标签: beautifulsoup python-requests python-3.5


    【解决方案1】:

    您需要了解的是,HTTP 请求不仅仅是一个 URL。在这种情况下,搜索结果仅对执行搜索的会话可用,因此只有当您是该会话的“所有者”时才能翻页。大多数网站使用您需要随 HTTP 请求一起发送的会话 cookie 来识别会话。

    这可能是一个巨大的麻烦,但幸运的是,Python 的 requests 库通过 requests.session 为你处理了这一切。不要使用 requests.get(url),而是先初始化一个会话 session = requests.session(),然后在后续请求中使用 session.get(url)。这样会自动为您保留 cookie,并且在许多方面表现得像一个真正的浏览器。

    您可以在 requests 官方文档的 “Session Objects” 一节中阅读更多关于 requests.session 工作原理的信息。

    最后但同样重要的是,这是修复后的代码 =)

    from bs4 import BeautifulSoup
    import requests
    import traceback
    
    
    # Anchor tags pointing at member-detail pages; appended to by all_results().
    links_to_visit = []
    # Anchor tags whose text is 'Next'; appended to by all_results(), read by start().
    navigation_links = []  # for testing next button
    # One session shared by every request made in make_soup(), so that cookies
    # are preserved across requests.
    session = requests.session()
    
    # Prefix that start() joins with a 'Next' anchor's href to build the next URL.
    base_url = 'https://www.gabar.org'
    
    
    def make_soup(link, timeout=30):
        """Fetch *link* through the shared session and parse the response.

        The module-level ``session`` is used instead of a bare
        ``requests.get(link)`` so that cookies are preserved across requests.

        Args:
            link: URL to fetch.
            timeout: Seconds to wait for the server before giving up. Without
                a timeout a stalled connection can block indefinitely.

        Returns:
            The response body parsed with 'html.parser' as a BeautifulSoup
            tree.

        Raises:
            requests.exceptions.HTTPError: The server answered with a 4xx or
                5xx status.
            requests.exceptions.Timeout: No response arrived within *timeout*
                seconds.
        """
        r = session.get(link, timeout=timeout)
        # Fail loudly on an error status instead of parsing an error page as
        # if it were a results page.
        r.raise_for_status()
        return BeautifulSoup(r.content, 'html.parser')
    
    
    def all_results(url):
        """Scan one results page and record the anchors of interest.

        Fetches *url* through make_soup(), looks inside the first
        <div class="cs_control"> and sorts its anchors into the module-level
        lists: anchors whose text is 'Next' go to navigation_links, anchors
        whose href contains '/MemberSearchDetail.cfm?ID=' go to
        links_to_visit, and everything else (including anchors without an
        href) is ignored. The parsed page, the anchors and the running totals
        are printed along the way.

        No ``global`` statements are needed: both lists are only mutated in
        place, never rebound.

        Args:
            url: URL of the results page to scan.

        Raises:
            ValueError: The page contains no <div class="cs_control">.
        """
        soup = make_soup(url)
        print(soup)
        div = soup.find('div', {'class': 'cs_control'})
        if div is None:
            # find() returns None when nothing matches; report that directly
            # rather than failing with an AttributeError on None below.
            raise ValueError('no <div class="cs_control"> found at ' + url)
        links = div.find_all('a')
        print(links)
        for link in links:
            if link.text == 'Next':  # prev, next, new search
                navigation_links.append(link)
                print('got it')
                continue
            href = link.get('href')
            if href is None:
                # An anchor without href cannot be a detail link; skip it
                # explicitly instead of relying on a caught TypeError.
                continue
            if '/MemberSearchDetail.cfm?ID=' in href:
                links_to_visit.append(link)
        print(len(links_to_visit))
        print(links_to_visit)
    
    
    def start(limit=60716, step=25):
        """Follow 'Next' links page by page until the results run out.

        Each iteration takes the most recently stored 'Next' anchor, fetches
        the page behind it with all_results() and advances the counter by
        *step*. The loop stops when the counter reaches *limit*, when no
        'Next' anchor has been stored at all, or when the page just fetched
        added no new 'Next' anchor. The last case means the final page was
        reached, and continuing would only refetch the same URL.

        Args:
            limit: Upper bound for the counter. The default is presumably the
                total number of records in the search - confirm.
            step: Amount added to the counter per page. The default is
                presumably the number of results per page - confirm.
        """
        page = 1
        while page < limit and navigation_links:
            # all_results() only ever stores anchors whose text is 'Next', so
            # the newest entry is always the link to follow.
            next_link = navigation_links[-1]
            known = len(navigation_links)
            page += step
            url = base_url + next_link.get('href')
            print(url)
            all_results(url)
            print('page is:', page)
            if len(navigation_links) == known:
                # No new 'Next' anchor on that page: it was the last one.
                break
    
    if __name__ == '__main__':
        # Scan the first results page to seed the lists (this request also goes
        # through the shared session), then follow 'Next' links.
        all_results('https://www.gabar.org/membersearchresults.cfm')
        start()
    

    【讨论】:

      猜你喜欢
      • 1970-01-01
      • 1970-01-01
      • 1970-01-01
      • 2021-02-14
      • 1970-01-01
      • 1970-01-01
      • 2011-07-18
      • 2020-03-01
      • 1970-01-01
      相关资源
      最近更新 更多