【问题标题】:Using a python web crawler to scrape twitter accounts使用 python 网络爬虫抓取 Twitter 帐户
【发布时间】:2023-03-30 22:59:02
【问题描述】:

我正在为我的 A-Level 计算机科学课程编写这个程序,并且我正在尝试让一个爬虫从给定用户的关注者(followers)/正在关注(following)列表中抓取所有找到的用户。

脚本开头如下:

import requests
# import database as db
from bs4 import BeautifulSoup

debug = True  # When True, skip the prompt and use a fixed starting profile


def getStartNode():  # Get the Twitter profile of the starting node
    """Set the module-level start-node URLs.

    Populates three globals that other functions read:
    ``startNodeLink`` (the profile URL), ``startNodeFollowers`` and
    ``startNodeFollowing`` (the profile URL with ``/followers`` and
    ``/following`` appended).

    When ``debug`` is false the profile URL is read from the terminal;
    otherwise a predetermined profile is used to save time during development.
    """
    global startNodeFollowing  # Declared global for use in external functions
    global startNodeFollowers
    global startNodeLink
    if not debug:
        entered = input("Enter a link to the starting users Twitter profile\n[URL]: ")
        # Strip surrounding whitespace (a trailing space is needed to enter a
        # link in some terminals) instead of blindly dropping the last
        # character, which truncated URLs typed without that space. A trailing
        # slash is removed too so the appended paths do not contain "//".
        startNodeLink = entered.strip().rstrip("/")
    else:
        startNodeLink = "https://twitter.com/ckjellberg03"
    startNodeFollowers = startNodeLink + "/followers"
    startNodeFollowing = startNodeLink + "/following"

而爬虫就在这里:

def spider():  # Web Crawler
    """Download the start node's followers page and print the profile links in it.

    Calls ``getStartNode()`` to populate the start-node globals, fetches
    ``startNodeFollowers`` with ``requests``, parses the response with
    BeautifulSoup and prints the ``href`` of every anchor whose ``class``
    attribute equals the profile-link class string below.
    """
    getStartNode()
    print("\nUsing:", startNodeLink)

    urlFollowers = startNodeFollowers
    sourceCode = requests.get(urlFollowers)
    plainText = sourceCode.text  # Source code of the URL (urlFollowers) in plain text format
    soup = BeautifulSoup(plainText, 'lxml')  # BeautifulSoup object to search through plainText

    # NOTE(review): this only sees the HTML returned by the initial request. If
    # the followers list is rendered client-side by JavaScript or requires a
    # login, no anchor will carry this class and the loop prints nothing.
    profileLinkClass = 'css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l'
    for link in soup.find_all('a', {'class': profileLinkClass}):  # 'a' is an HTML anchor
        # The attribute name must be the string 'href'; the previous
        # link.get(href) referenced the local before it was assigned.
        href = link.get('href')
        print(href)  # Display everything found (development purposes)

我很确定,在 /followers 页面的源代码中,链接到各个用户 Twitter 个人资料的锚点所使用的 class 标识符是“css-4rbku5 css-18t94o4 css-1dbjc4n r-1loqt21 r-1wbh5a2 r-dnmrzs r-1ny4l3l”,但运行后没有打印出任何结果。

有什么建议可以指引我正确的方向吗?

谢谢!

【问题讨论】:

    标签: python web web-crawler


    【解决方案1】:

    抓取 Twitter 非常困难(相信我,我已经尝试了各种方法)。你可以使用 Twitter API,但它对可抓取的信息有限制(例如,你不能只获取关注者的名字)。如果使用 Twitter API,可以参考以下代码:

    from TwitterAPI import TwitterAPI, TwitterPager
    import tweepy
    from tweepy import Cursor
    from datetime import datetime, date, time, timedelta
    
    # Placeholder credentials: replace with the values issued for your app.
    consumer_key = 'consumer key'
    consumer_secret = 'consumer secret'
    token = 'token'
    token_secret = 'token secret'

    # Build an authenticated Tweepy client from the four credentials above.
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(token, token_secret)
    api = tweepy.API(auth)

    account_list = ['POTUS44']

    for target in account_list:
        print("Getting data for " + target)
        # The handle is passed by keyword rather than positionally.
        # NOTE(review): newer Tweepy releases reportedly accept only keyword
        # arguments here - confirm against the installed version.
        item = api.get_user(screen_name=target)
        print("name: " + item.name)
        print("screen_name: " + item.screen_name)
        print("description: " + item.description)
        print("statuses_count: " + str(item.statuses_count))
        print("friends_count: " + str(item.friends_count))
        print("followers_count: " + str(item.followers_count))

        # Account age and average posting rate. This section used to appear
        # twice in a row, which printed the same two lines twice.
        tweets = item.statuses_count
        account_created_date = item.created_at
        # Give "now" the same tzinfo as created_at so the subtraction works for
        # both naive and timezone-aware timestamps.
        # NOTE(review): assumes any tzinfo on created_at is UTC - confirm.
        now = datetime.utcnow().replace(tzinfo=account_created_date.tzinfo)
        delta = now - account_created_date
        account_age_days = delta.days
        print("Account age (in days): " + str(account_age_days))
        if account_age_days > 0:
            print("Average tweets per day: " + "%.2f" % (float(tweets) / float(account_age_days)))

        # Walk the timeline, collecting hashtags and mentioned screen names,
        # and stop after the first status older than 30 days.
        hashtags = []
        mentions = []
        tweet_count = 0
        end_date = now - timedelta(days=30)
        for status in Cursor(api.user_timeline, screen_name=target).items():
            tweet_count += 1
            entities = getattr(status, "entities", None) or {}
            for ent in entities.get("hashtags") or ():
                hashtag = ent.get("text") if ent is not None else None
                if hashtag is not None:
                    hashtags.append(hashtag)
            for ent in entities.get("user_mentions") or ():
                name = ent.get("screen_name") if ent is not None else None
                if name is not None:
                    mentions.append(name)
            if status.created_at < end_date:
                break
    

    【讨论】:

      【解决方案2】:

      下面是不使用 API 的做法。部分困难在于需要在 User-Agent 中使用合适的浏览器标识:

      import html
      import re, requests
      
      # HTTP headers passed to requests.get() below; the User-Agent value is
      # the client identity the server sees for each request.
      headers = { 'User-Agent': 'UCWEB/2.0 (compatible; Googlebot/2.1; +google.com/bot.html)'}
      
      
      def cleanhtml(raw_html):
          """Return *raw_html* with every ``<...>`` tag removed.

          Only the tags are dropped; the text between them, including any
          HTML entities, is returned unchanged.
          """
          # re.DOTALL lets "." cross line breaks so a tag whose attributes are
          # wrapped over several lines is stripped as well.
          return re.sub(r'<.*?>', '', raw_html, flags=re.DOTALL)
      
      # Captures the inner HTML of each tweet-text paragraph in a profile page.
      # Compiled once here rather than on every loop iteration.
      tweet_pattern = re.compile(r'<p class="TweetTextSize.*?tweet-text.*?>(.*?)</p>')
      banner = "============================\n\n"

      # Collect the pieces in a list and join once at the end instead of
      # growing a string with repeated "+=".
      parts = []
      for user in ['billgates']:
          parts.append(banner)
          parts.append(user + "\n\n")
          parts.append(banner)
          url_twitter = 'https://twitter.com/%s' % user
          resp = requests.get(url_twitter, headers=headers)  # Send request
          for x in tweet_pattern.findall(resp.text):
              x = cleanhtml(x)
              # Decode every HTML entity, not just &#39; / &quot; / &nbsp;.
              # &nbsp; decodes to U+00A0, which is mapped to a plain space as
              # before.
              x = html.unescape(x).replace("\xa0", " ")
              parts.append(x)
              parts.append("\n\n")
              parts.append("---")
              parts.append("\n\n")
      content = "".join(parts)
      

      【讨论】:

        猜你喜欢
        • 1970-01-01
        • 1970-01-01
        • 2021-02-28
        • 2015-05-12
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 1970-01-01
        • 2021-05-08
        相关资源
        最近更新 更多