【发布时间】:2013-11-01 07:11:29
【问题描述】:
我正在尝试从 Google Scholar 的搜索结果中抓取 PDF 链接。我尝试根据 URL 的变化设置页面计数器,但是在前八个输出链接之后,我得到了重复的链接作为输出。
#!/usr/bin/env python
from mechanize import Browser
from BeautifulSoup import BeautifulSoup
from bs4 import BeautifulSoup
import urllib2
import requests
#modifying the url as per page
urlCounter = 0
while urlCounter <=30:
urlPart1 = "http://scholar.google.com/scholar?start="
urlPart2 = "&q=%22entity+resolution%22&hl=en&as_sdt=0,4"
url = urlPart1 + str(urlCounter) + urlPart2
page = urllib2.Request(url,None,{"User-Agent":"Mozilla/5.0 (X11; U; Linux i686) Gecko/20071127 Firefox/2.0.0.11"})
resp = urllib2.urlopen(page)
html = resp.read()
soup = BeautifulSoup(html)
urlCounter = urlCounter + 10
recordCount = 0
while recordCount <=9:
recordPart1 = "gs_ggsW"
finRecord = recordPart1 + str(recordCount)
recordCount = recordCount+1
#printing the links
for link in soup.find_all('div', id = finRecord):
linkstring = str(link)
soup1 = BeautifulSoup(linkstring)
for link in soup1.find_all('a'):
print(link.get('href'))
【问题讨论】:
标签: python web-scraping urllib2 google-scholar