如何从使用 js 填充的网站中抓取数据？答案

【问题标题】：How to scrape data from website which is populated using js?如何从使用 js 填充的网站中抓取数据？
【发布时间】：2021-01-07 06:17:07
【问题描述】：

我正在尝试从 sharechat.com 抓取帖子数据（喜欢、分享、图像等），但问题是我无法找到使用 Selenium 的帖子的图像 URL，因为我怀疑它使用 Javascript 来填充它。

我尝试使用 Selenium 来查找最外层的 HTML（显示的 HTML），并且我获得了所有其他帖子信息，例如喜欢的数量、分享、评论等，但我无法获取并存储图像，因为我找不到它的网址。

我这样做是为了进行情绪分析和推荐趋势的社交网络研究，所以我希望将帖子数据连同标签和喜欢、分享的数量等一起抓取。我只是在抓取图像的标签和 URL 时失败了.

Here 是您需要运行的 geckodriver 文件。
这里是my code：

import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Asker's original attempt: open the ShareChat trending page in Firefox,
# scroll it, and write per-post text fields to a text file.
serviceurl = 'https://sharechat.com/trending/Hindi'

files = "dataset_link_1.txt"
# NOTE(review): file() is a Python 2 builtin; on Python 3 this branch raises
# NameError when the output file does not exist yet.  The open(..., 'w')
# below creates/truncates the file anyway.
if not os.path.exists(files):
    file(files, 'w').close()
enter = open(files,'w');

url = serviceurl
# NOTE(review): the path is not a raw string; \C, \S and \g are not recognised
# escape sequences, so the backslashes are kept literally.
driver = webdriver.Firefox(executable_path='D:\CHIT CHAT\Scrapper\geckodriver');
driver.maximize_window() #For maximizing window
driver.get(url);
driver.implicitly_wait(3) # implicit wait of 3 seconds for element lookups
# Poll document.readyState until the browser reports 'complete'.
while driver.execute_script("return document.readyState") != 'complete':
    pass;

# i is substituted into the XPaths below as the 1-based index of the post's
# div inside the feed <section>.
for i in range(1,20):

    SCROLL_PAUSE_TIME = 0.5

    # Get scroll height
    last_height = driver.execute_script("return document.body.scrollHeight")

    while True:
        # Scroll down to bottom
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        # Wait to load page
        time.sleep(SCROLL_PAUSE_TIME)
        # Calculate new scroll height and compare with last scroll height
        new_height = driver.execute_script("return document.body.scrollHeight")
        if new_height == last_height:
            break
        last_height = new_height


        # Everything below is still inside the while loop, after the break
        # check, so it runs only on iterations where the page height changed.
        # NOTE(review): .text.encode('utf-8') yields bytes; on Python 3 the
        # "%s" formatting below would embed the b'...' repr in the file.
        var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/a/div[3]/div[1]"%(i)).text.encode('utf-8')
        print(var) #No of watches
        enter.write("Total No of views:\n%s\n" %(var));

        var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/a/div[1]/div[1]/span"%(i)).text.encode('utf-8')
        print(var) #Title
        enter.write("Title:\n%s\n" %(var));

        var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[1]/a/div[2]/div/div[2]"%(i)).text.encode('utf-8')
        print(var) #owner bio
        enter.write("Writer's Bio:\n%s\n" %(var));

        var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[1]/a/div[2]/div/div[1]/strong"%(i)).text.encode('utf-8')
        print(var) #owner's name
        enter.write("Writer's Name:\n%s\n" %(var));

        var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[2]/div/button[2]/div/span"%(i)).text.encode('utf-8')
        print(var) #comments
        enter.write("Total Comments:\n%s\n" %(var));

        var = driver.find_element_by_xpath("/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[2]/div/button[1]/div/span"%(i)).text.encode('utf-8')
        print(var) #whatsapp
        enter.write("Whatsapp Share:\n%s\n" %(var));

        print()
        # driver.save_screenshot("captcha_%s.png"%(i))

    # Navigate one step back in the browser history after each index.
    driver.back()

# Shut the browser down and close the output file.
driver.quit()
enter.close()

【问题讨论】：

欢迎来到 SO。你能分享一下你试过的代码和你得到的错误吗？
@supputuri 感谢您的热烈欢迎 :) 这是我到目前为止编写的代码，没有报错，只是我所做的任何尝试似乎都无法获取帖子图像的 URL。 ideone.com/ZzESLQ

标签： python selenium selenium-webdriver web-scraping

【解决方案1】：

这里是重构后的代码。最后添加了标签和图像逻辑。

import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Scrape the ShareChat trending feed: scroll the page to load posts, then
# write the text fields, tags and image/post links of every feed card to a
# UTF-8 text file.  Uses the Selenium 3 API that the rest of this script uses.
serviceurl = 'https://sharechat.com/trending/Hindi'

files = "dataset_link_1.txt"
url = serviceurl

SCROLL_PAUSE_TIME = 0.5  # seconds to wait after each scroll before re-measuring
SCROLL_ROUNDS = 19       # same number of passes as the former range(1, 20)

# XPath matching every post card in the feed.
FEED_CARD_XPATH = "//section[@class='post-batch']//div[contains(@class,'feedCard')]"
# XPath suffix (relative to one card) of the link to the post itself.
POST_LINK_SUFFIX = "//a[@class='D(b)']"

# (label written to the file, XPath suffix relative to one card,
#  attribute to read -- None means "use the element's visible text").
CARD_FIELDS = (
    ("Total No of views:", "//div[contains(@class,'lhcaption')]/div[1]", None),
    ("Title:", "//span[contains(@class,'darkText')]", None),
    ("Writer's Bio:", "//div[contains(@class,'Pstart')]//div[contains(@class,'darkTextSecondary')]", None),
    ("Writer Name:", "//strong", None),
    ("Number of comments:", "//button[@aria-label='Click to comment']//span", None),
    ("Whatsapp Share:", "//button[@aria-label='Click to share']//span", None),
    ("Tags:", "//div[contains(@class,'primaryDark')]", None),
    ("Owner Image link:", "//img", 'src'),
)


def _card_value(browser, card_number, suffix, attribute=None):
    """Return one field of the card_number-th feed card (1-based).

    The first element matching ``suffix`` below that card is used; its
    ``attribute`` is returned, or its visible text when ``attribute`` is None.
    Returns '' when the card has no such element or the value is missing, so
    one incomplete card does not abort the whole scrape.
    """
    matches = browser.find_elements_by_xpath(
        "(%s)[%d]%s" % (FEED_CARD_XPATH, card_number, suffix))
    if not matches:
        return ''
    first = matches[0]
    value = first.get_attribute(attribute) if attribute else first.text
    return value or ''


# Text is written as str to a UTF-8 file; encoding it to bytes first would put
# the b'...' repr into the file on Python 3.
with open(files, 'w', encoding='utf-8') as enter:
    driver = webdriver.Firefox(executable_path=r'D:\CHIT CHAT\Scrapper\geckodriver')
    try:
        driver.maximize_window()  # For maximizing window
        driver.get(url)
        driver.implicitly_wait(3)  # implicit wait of 3 seconds for element lookups
        while driver.execute_script("return document.readyState") != 'complete':
            time.sleep(0.1)  # poll instead of spinning at full speed

        # Post links already written; every round sees all cards currently in
        # the page, so without this each post would be recorded once per round.
        seen_links = set()

        for _ in range(SCROLL_ROUNDS):
            # Scroll to the bottom until the page height stops growing.
            last_height = driver.execute_script("return document.body.scrollHeight")
            while True:
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                time.sleep(SCROLL_PAUSE_TIME)
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

            card_count = len(driver.find_elements_by_xpath(FEED_CARD_XPATH))
            for card_number in range(1, card_count + 1):
                post_link = _card_value(driver, card_number, POST_LINK_SUFFIX, 'href')
                # Cards without a post link cannot be de-duplicated and are
                # written on every round.
                if post_link and post_link in seen_links:
                    continue
                seen_links.add(post_link)

                for label, suffix, attribute in CARD_FIELDS:
                    value = _card_value(driver, card_number, suffix, attribute)
                    print(value)
                    enter.write("%s\n%s\n" % (label, value))

                print(post_link)
                enter.write("post image link:\n%s\n" % (post_link))
    finally:
        # Close the browser even when scraping fails part-way.
        driver.quit()

如果您想将文件下载到其他文件夹，请使用下面的代码。

# Build a Firefox profile and apply the download-related preferences in order.
# NOTE(review): this snippet does not show the profile being handed to the
# driver; folderList=2 presumably selects the directory given in
# browser.download.dir -- confirm against the Firefox preference docs.
profile = webdriver.FirefoxProfile()
for pref_name, pref_value in (
    ("browser.download.folderList", 2),
    ("browser.download.manager.showWhenStarting", False),
    ("browser.download.dir", 'Here goes your folder where you want to download'),
    ("browser.helperApps.neverAsk.saveToDisk", "application/x-gzip"),
):
    profile.set_preference(pref_name, pref_value)

下载文件后，只需使用以下命令将文件重命名为所需的名称。

# Rename the downloaded file.  download_file_name and desired_name are not
# defined in this snippet and must be supplied by the caller.
os.rename(download_file_name,desired_name) # either argument may include a directory path

【讨论】：

您好，谢谢！它不运行。我已经更新了我的代码并添加了指向 geckodriver 文件的链接。只需将第 18 行替换为您保存 geckodriver 的 URL，您就可以在本地测试您的代码。再次感谢您的光临 :)
检查更新的代码现在应该是完美的（测试逻辑）。
如果您认为问题已解决，请点击左侧“否决”按钮下方的复选标记接受我的回答。
它存储帖子链接但不存储帖子内容（图像/视频）。其余的工作！我看到你的方法比我的好。我仍然不明白如何使用它的链接抓取帖子内容，请告诉我，以便我接受你的回答:)
您认为获取帖子内容的最佳方法是模拟点击最右下角的下载按钮吗？或者它不是一个好方法？

【解决方案2】：

我更改了 Web 驱动程序路径和范围变量。如果您创建一个文件夹 C:\Py，我下面的代码将输出一个名为 PageSource_StackOverflowQ2.txt 的文本文件，其中包含图像 src 路径。

我在 html 中遇到了很多关于二进制字符的问题，所以可能有更好的方法来做到这一点，但希望这会帮助你到达你想要去的地方。

如果图像路径中连续出现这 9 个字符（`" title="`），我的代码就会在该处截断路径。

import sys
import csv
import os
import time
import urllib
import datetime
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
# Scrape the first ShareChat trending post and dump the image src paths found
# in the page source to C:\Py\PageSource_StackOverflowQ2.txt.
# The C:\Py folder must already exist.  Uses the Selenium 3 API.
serviceurl = 'https://sharechat.com/trending/Hindi'
url = serviceurl

SCROLL_PAUSE_TIME = 0.5  # seconds to wait after each scroll before re-measuring

DATASET_FILE = 'C:\\Py\\dataset_link_1.txt'
PAGE_SOURCE_RAW = 'C:\\Py\\PageSource_StackOverflowQ.txt'     # UTF-8 page source, appended to
PAGE_SOURCE_ASCII = 'C:\\Py\\PageSource_StackOverflowQ1.txt'  # same text with non-ASCII bytes dropped
IMAGE_SOURCES = 'C:\\Py\\PageSource_StackOverflowQ2.txt'      # one image src per line

TITLE_MARKER = '" title="'  # the 9 characters that end the src value
SRC_OFFSET = 9              # len('img src="'); assumes the segment starts with it

# (label written to the dataset file, absolute XPath with %s = post index).
POST_FIELDS = (
    ("Total No of views:", "/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/a/div[3]/div[1]"),
    ("Title:", "/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/a/div[1]/div[1]/span"),
    ("Writer's Bio:", "/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[1]/a/div[2]/div/div[2]"),
    ("Writer's Name:", "/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[1]/a/div[2]/div/div[1]/strong"),
    ("Total Comments:", "/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[2]/div/button[2]/div/span"),
    ("Whatsapp Share:", "/html/body/div/div[1]/div/main/div[1]/div[2]/div/section/div[%s]/div/div/div[2]/div/button[1]/div/span"),
)


def _image_sources(markup_line):
    """Yield the text between the 'img src="' prefix and each '" title="'.

    The line is split on "><"; only segments containing "img src" are
    examined, and one value is yielded per occurrence of the title marker.
    """
    for segment in markup_line.split("><"):
        if "img src" not in segment:
            continue
        marker_at = segment.find(TITLE_MARKER)
        while marker_at != -1:
            yield segment[SRC_OFFSET:marker_at]
            marker_at = segment.find(TITLE_MARKER, marker_at + 1)


def _dump_image_sources(page_source):
    """Append the page source to disk and rewrite the image-src list.

    Three files are involved: the raw UTF-8 source (appended to), an
    ASCII-only copy of the whole raw file (rewritten), and the list of image
    src values extracted from that copy (rewritten).
    """
    with open(PAGE_SOURCE_RAW, 'ab') as raw_file:
        raw_file.write(page_source.encode("utf-8"))

    # Dropping non-ASCII bytes lets the copy be read back as plain text.
    with open(PAGE_SOURCE_RAW, "rb") as raw_file, open(PAGE_SOURCE_ASCII, "w") as ascii_file:
        for raw_line in raw_file:
            ascii_file.write(raw_line.decode('ascii', errors='ignore'))

    with open(PAGE_SOURCE_ASCII) as ascii_file, open(IMAGE_SOURCES, "w") as image_file:
        for markup_line in ascii_file:
            for image_src in _image_sources(markup_line):
                image_file.write(image_src)
                image_file.write('\n')
                print(image_src)


# Text is written as str to a UTF-8 file; encoding it to bytes first would put
# the b'...' repr into the file on Python 3.
with open(DATASET_FILE, 'w+', encoding='utf-8') as enter:
    driver = webdriver.Firefox(executable_path=r'C:\Py\geckodriver.exe')
    try:
        driver.maximize_window()  # For maximizing window
        driver.get(url)
        driver.implicitly_wait(3)  # implicit wait of 3 seconds for element lookups
        while driver.execute_script("return document.readyState") != 'complete':
            time.sleep(0.1)  # poll instead of spinning at full speed

        # Only the first post is processed; widen the range for more.
        for i in range(1, 2):
            # Get scroll height
            last_height = driver.execute_script("return document.body.scrollHeight")

            while True:
                # Scroll down to bottom
                driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
                # Wait to load page
                time.sleep(SCROLL_PAUSE_TIME)
                # Calculate new scroll height and compare with last scroll height
                new_height = driver.execute_script("return document.body.scrollHeight")
                if new_height == last_height:
                    break
                last_height = new_height

                # As in the original layout, the scrape below runs only on
                # iterations where the page height changed.
                for label, xpath_template in POST_FIELDS:
                    value = driver.find_element_by_xpath(xpath_template % (i)).text
                    print(value)
                    enter.write("%s\n%s\n" % (label, value))

                _dump_image_sources(driver.page_source)

                print()

            driver.back()
    finally:
        # Close the browser even when scraping fails part-way.
        driver.quit()

【讨论】：