按照建议使用 POST 请求的问题在于,该请求需要一个授权令牌,而该令牌有到期时间。您可以在 Chrome 或 Firefox 中查看这个 POST 请求:右键单击页面 -> 选择 Inspect -> 选择 Network,然后在页面上选择 Industry,点击出现的 POST 请求并打开 Cookies,其中有一个名为 password_grant_custom.client.expires 的 cookie,它的值是授权失效时间的时间戳。
但是,您可以使用 Selenium 从所有页面上抓取数据。
首先安装 Selenium:
在 Linux 上运行 `sudo pip3 install selenium`,在 Windows 上运行 `pip install selenium`
然后下载驱动程序:https://sites.google.com/a/chromium.org/chromedriver/downloads ,
选择与您的 Chrome 版本相匹配的版本,并将其从 zip 文件中解压出来。
注意:在 Windows 上,您需要把 chromedriver 的路径添加到下面这行代码中:
driver = webdriver.Chrome(options=options)
在 Linux 上将 chromedriver 复制到 /usr/local/bin/chromedriver
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
from bs4 import BeautifulSoup
import time
# Scrape every "Ticker" cell of the Financial Services insider summary,
# paging through the results until the "next" button stops being clickable.

# Start with the driver maximised to see the drop down menus properly
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)

ticker = []
try:
    driver.get('https://www.gurufocus.com/insider/summary')

    # Set the page size to 100 to reduce page loads.
    # find_element(By.XPATH, ...) is used because the find_element_by_*
    # helpers were removed in Selenium 4.3.
    driver.find_element(By.XPATH, "//span[contains(text(),'40 / Page')]").click()
    # until() returns the located element, so it can be clicked directly
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((
            By.XPATH,
            "//div[contains(text(),'100')]"))
    ).click()

    # Wait for the page to load and don't overload the server
    time.sleep(2)

    # Select Industry
    driver.find_element(By.XPATH, "//span[contains(text(),'Industry')]").click()

    # Select Financial Services
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((
            By.XPATH,
            "//span[contains(text(),'Financial Services')]"))
    ).click()

    while True:
        # Wait for the page to load and don't overload the server
        time.sleep(6)

        # Parse the HTML and collect the text of every ticker cell
        soup = BeautifulSoup(driver.page_source, 'html.parser')
        ticker.extend(
            tk.text
            for tk in soup.find_all(
                'td', {'class': 'table-stock-info', 'data-column': 'Ticker'})
        )

        try:
            # Move to the next page
            WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'btn-next'))
            ).click()
        except TimeoutException:
            # No more pages so break
            break
finally:
    # Always close the browser, even if a step above raised
    driver.quit()

print(len(ticker))
print(ticker)
输出
4604
['PUB ', 'ARES ', 'EIM ', 'CZNC ', 'SSB ', 'CNA ', 'TURN ', 'FNF ', 'EGIF ', 'NWPP etc...
更新
如果您想从所有页面上抓取全部数据和/或写入 csv,请使用 pandas:
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time
# Scrape the full Financial Services insider table from every result page
# and write the combined rows to Financial_Services.csv.

# Start with the driver maximised to see the drop down menus properly
options = webdriver.ChromeOptions()
options.add_argument("--start-maximized")
driver = webdriver.Chrome(options=options)

columns = [
    'Ticker', 'Links', 'Company', 'Price1', 'Insider Name', 'Insider Position',
    'Date', 'Buy/Sell', 'Insider Trading Shares', 'Shares Change', 'Price2',
    'Cost(000)', 'Final Share', 'Price Change Since Insider Trade (%)',
    'Dividend Yield %', 'PE Ratio', 'Market Cap ($M)', 'None'
]
# Seed with an empty frame carrying the column labels; each page's table is
# collected here and concatenated once at the end. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
frames = [pd.DataFrame(columns=columns)]

try:
    driver.get('https://www.gurufocus.com/insider/summary')

    # Set the page size to 100 to reduce page loads.
    # find_element(By.XPATH, ...) is used because the find_element_by_*
    # helpers were removed in Selenium 4.3.
    driver.find_element(By.XPATH, "//span[contains(text(),'40 / Page')]").click()
    # until() returns the located element, so it can be clicked directly
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((
            By.XPATH,
            "//div[contains(text(),'100')]"))
    ).click()

    # Wait for the page to load and don't overload the server
    time.sleep(2)

    # Select Industry
    driver.find_element(By.XPATH, "//span[contains(text(),'Industry')]").click()

    # Select Financial Services
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((
            By.XPATH,
            "//span[contains(text(),'Financial Services')]"))
    ).click()

    while True:
        # Wait for the page to load and don't overload the server
        time.sleep(6)

        # Parse the HTML: keep the first table with class "data-table"
        frames.append(
            pd.read_html(driver.page_source, attrs={'class': 'data-table'})[0]
        )

        try:
            # Move to the next page
            WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'btn-next'))
            ).click()
        except TimeoutException:
            # No more pages so break
            break
finally:
    # Always close the browser, even if a step above raised
    driver.quit()

df = pd.concat(frames, ignore_index=True)

# Write to csv
df.to_csv("Financial_Services.csv", encoding='utf-8', index=False)
针对评论进行了更新:
首先从 https://github.com/mozilla/geckodriver/releases 下载 Firefox 驱动程序 geckodriver 并解压。同样,在 Windows 上,您需要将 geckodriver 的路径添加到 driver = webdriver.Firefox() 中;在 Linux 上,则将 geckodriver 复制到 /usr/local/bin/geckodriver。
from selenium import webdriver
from selenium.webdriver.common.by import By
import selenium.webdriver.support.expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from selenium.common.exceptions import TimeoutException
import pandas as pd
import time
# Scrape the Financial Services insider table with Firefox, stopping after
# page_limit pages (or earlier if there is no further page), and write the
# combined rows to Financial_Services.csv.

driver = webdriver.Firefox()

columns = [
    'Ticker', 'Links', 'Company', 'Price1', 'Insider Name', 'Insider Position',
    'Date', 'Buy/Sell', 'Insider Trading Shares', 'Shares Change', 'Price2',
    'Cost(000)', 'Final Share', 'Price Change Since Insider Trade (%)',
    'Dividend Yield %', 'PE Ratio', 'Market Cap ($M)', 'None'
]
# Seed with an empty frame carrying the column labels; each page's table is
# collected here and concatenated once at the end. DataFrame.append was
# removed in pandas 2.0 and copied the whole frame on every call.
frames = [pd.DataFrame(columns=columns)]

page_limit = 5
page = 0

try:
    # Start with the driver maximised to see the drop down menus properly
    driver.maximize_window()
    driver.get('https://www.gurufocus.com/insider/summary')

    # Set the page size to 100 to reduce page loads.
    # find_element(By.XPATH, ...) is used because the find_element_by_*
    # helpers were removed in Selenium 4.3.
    driver.find_element(By.XPATH, "//span[contains(text(),'40 / Page')]").click()
    # until() returns the located element, so it can be clicked directly
    WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((
            By.XPATH,
            "//div[contains(text(),'100')]"))
    ).click()

    # Wait for the page to load and don't overload the server
    time.sleep(2)

    # Select Industry
    driver.find_element(By.XPATH, "//span[contains(text(),'Industry')]").click()

    # Select Financial Services
    WebDriverWait(driver, 5).until(
        EC.presence_of_element_located((
            By.XPATH,
            "//span[contains(text(),'Financial Services')]"))
    ).click()

    while True:
        # Wait for the page to load and don't overload the server
        time.sleep(6)

        # Parse the HTML: keep the first table with class "data-table"
        frames.append(
            pd.read_html(driver.page_source, attrs={'class': 'data-table'})[0]
        )

        # Stop after page limit is reached.
        page += 1
        if page >= page_limit:
            break

        try:
            # Move to the next page
            WebDriverWait(driver, 5).until(
                EC.element_to_be_clickable((By.CLASS_NAME, 'btn-next'))
            ).click()
        except TimeoutException:
            # No more pages so break
            break
finally:
    # Always close the browser, even if a step above raised
    driver.quit()

df = pd.concat(frames, ignore_index=True)

# Write to csv
df.to_csv("Financial_Services.csv", encoding='utf-8', index=False)