【问题标题】:How to append more than 200 downloaded tweets to dataframe?如何将超过 200 条下载的推文附加到数据框?
【发布时间】:2020-12-24 01:53:21
【问题描述】:

我正在使用循环下载超过 twitter 的速率上限;但是,当我尝试附加列表时,它会返回一个空数据框。

我的功能如下:

在:

import pandas as pd
import numpy as np
import tweepy
from datetime import timedelta

def get_tweets(handle):
    batch_count_for_tweet_downloads = 200
    try:
        alltweets = []
        tweets = api_twitter.user_timeline(screen_name=handle,
                                           count=batch_count_for_tweet_downloads,
                                           exclude_replies=True,
                                           include_rts=False,
                                           lang="en",
                                           tweet_mode="extended")
        # ---GET MORE THAN 200 TWEETS
        alltweets.extend(tweets)
        oldest = alltweets[-1].id - 1
        oldest_datetime = pd.to_datetime(str(pd.to_datetime(oldest))[:-10]).strftime("%Y-%m-%d %H:%M:%S")
        print(f"Getting Tweets For " + handle + ", After: " + oldest_datetime)
        while len(tweets) > 0:
            tweets = api_twitter.user_timeline(screen_name=handle, count=batch_count_for_tweet_downloads, max_id=oldest)
            alltweets.extend(tweets)
            oldest = alltweets[-1].id - 1
            print("Count: " + f"...{len(alltweets)} " + handle + " Tweets Downloaded")
        #---
        df = pd.DataFrame(data=[tweets.user.screen_name for tweets in alltweets], columns=['Handle'])
        df['Tweets'] = np.array([tweets.full_text for tweets in alltweets])
        df['Date'] = np.array([tweets.created_at - timedelta(hours=4) for tweets in alltweets])
        df['Len'] = np.array([len(tweets.full_text) for tweets in alltweets])
        df['Like_count'] = np.array([tweets.favorite_count for tweets in alltweets])
        df['RT_count'] = np.array([tweets.retweet_count for tweets in alltweets])
        total_tweets.extend(alltweets)
        print(handle + " Total Tweets Extracted: {}".format(len(alltweets)))
    except:
        pass
    return df

如您所见,我需要一些帮助来将循环合并到函数中。

最好的方法是什么?

提前感谢您的帮助。

编辑 1:(我的代码现在的样子)

在:

import tweepy
import pandas as pd
import numpy as np
from datetime import timedelta

handles = ['@MrML16419203', '@d00tn00t']

consumerKey = 'x'
consumerSecret = 'x'
accessToken = 'x'
accessTokenSecret = 'x'

authenticate = tweepy.OAuthHandler(consumerKey, consumerSecret)
authenticate.set_access_token(accessToken, accessTokenSecret)
api_twitter = tweepy.API(authenticate, wait_on_rate_limit=True)

total_tweets = []
def get_tweets(handle):
    batch_count_for_tweet_downloads = 200
    try:
        alltweets = []
        tweets = api_twitter.user_timeline(screen_name=handle,
                                           count=batch_count_for_tweet_downloads,
                                           exclude_replies=True,
                                           include_rts=False,
                                           lang="en",
                                           tweet_mode="extended")
        alltweets.extend(tweets)
        oldest = alltweets[-1].id - 1
        oldest_datetime = pd.to_datetime(str(pd.to_datetime(oldest))[:-10]).strftime("%Y-%m-%d %H:%M:%S")
        print(f"Getting Tweets For " + handle + ", After: " + oldest_datetime)
        while len(tweets) > 0:
            tweets = api_twitter.user_timeline(screen_name=handle, count=batch_count_for_tweet_downloads, max_id=oldest)
            alltweets.extend(tweets)
            if len(alltweets) > 0:
                oldest = alltweets[-1].id - 1
            else:
                pass
            print("Count: " + f"...{len(alltweets)} " + handle + " Tweets Downloaded")
        print('---Total Downloaded: ' + str(len(alltweets)) + ' for ' + handle + '---')
        df = pd.DataFrame(data=[tweets.user.screen_name for tweets in alltweets], columns=['Handle'])
        df['Tweets'] = np.array([tweets.full_text for tweets in alltweets])
        df['Date'] = np.array([tweets.created_at - timedelta(hours=4) for tweets in alltweets])
        df['Len'] = np.array([len(tweets.full_text) for tweets in alltweets])
        df['Like_count'] = np.array([tweets.favorite_count for tweets in alltweets])
        df['RT_count'] = np.array([tweets.retweet_count for tweets in alltweets])

        print([tweets.favorite_count for tweets in alltweets])
        print(np.array([tweets.favorite_count for tweets in alltweets]))

        total_tweets.extend(alltweets)
        print("----------Total Tweets Extracted: {}".format(df.shape[0]) + "----------")
    except:
        pass
    return df
df = pd.DataFrame()
for handle in handles:
    df_new = get_tweets(handle)
    df = pd.concat((df, df_new))
print(df)

输出:

Getting Tweets For @MrML16419203, After: 2011-03-19 07:03:53
Count: ...136 @MrML16419203 Tweets Downloaded
---Total Downloaded: 136 for @MrML16419203---
[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
----------Total Tweets Extracted: 136----------
Getting Tweets For @d00tn00t, After: 2009-11-27 19:18:58
Count: ...338 @d00tn00t Tweets Downloaded
Count: ...530 @d00tn00t Tweets Downloaded
Count: ...546 @d00tn00t Tweets Downloaded
Count: ...546 @d00tn00t Tweets Downloaded
---Total Downloaded: 546 for @d00tn00t---
           Handle   Tweets                Date  Len  Like_count  RT_count
0    MrML16419203   132716 2020-09-02 02:18:28  6.0         0.0       0.0
1    MrML16419203   432881 2020-09-02 02:04:23  6.0         0.0       0.0
2    MrML16419203   973625 2020-09-02 02:04:09  6.0         0.0       0.0
3    MrML16419203  1234567 2020-09-02 01:55:10  7.0         0.0       0.0
4    MrML16419203   225865 2020-09-02 01:27:11  6.0         0.0       0.0
..            ...      ...                 ...  ...         ...       ...
541      d00tn00t      NaN                 NaT  NaN         NaN       NaN
542      d00tn00t      NaN                 NaT  NaN         NaN       NaN
543      d00tn00t      NaN                 NaT  NaN         NaN       NaN
544      d00tn00t      NaN                 NaT  NaN         NaN       NaN
545      d00tn00t      NaN                 NaT  NaN         NaN       NaN

[682 rows x 6 columns]

正如您所见,对于推文少于 200 条的句柄,数据框会被填充。但是,不适用于包含超过 200 条推文的句柄。

【问题讨论】:

  • 可以添加缺少的导入吗?
  • 您是否看到“Total Tweets Extracted”打印并且数字大于 0?
  • 我不明白 当我尝试附加列表时它返回一个空数据帧。 你是说return df 返回一个没有行或列的数据帧吗?什么是“附加列表”部分?哪个列表?
  • 顺便说一句,您的 try/except 需要一些工作。如果您在创建数据框之前遇到异常,则 df 未定义,您将获得异常。也许返回一个罐装数据框或None 会更好。当然,捕捉所有异常是有风险的,你会错过合法的错误。当你没有得到你想要的回报但压制所有的例外时......这让人惊叹!
  • @anon01 添加了导入

标签: python pandas dataframe twitter tweepy


【解决方案1】:

对于偶然发现此问题的任何人,我都可以使用它:

def get_tweets(screen_name):
batch_count_for_tweet_downloads = 200
try:
    alltweets = []
    tweets = api_twitter.user_timeline(screen_name=screen_name,
                                       count=batch_count_for_tweet_downloads,
                                       exclude_replies=True,
                                       include_rts=False,
                                       lang="en")
    alltweets.extend(tweets)
    oldest = alltweets[-1].id - 1
    oldest_datetime = pd.to_datetime(str(pd.to_datetime(oldest))[:-10]).strftime("%Y-%m-%d %H:%M:%S")
    print(f"Getting Tweets For " + handle + ", After: " + oldest_datetime)
    while len(tweets) > 0:
        tweets = api_twitter.user_timeline(screen_name=screen_name, count=batch_count_for_tweet_downloads,
                                           max_id=oldest)
        alltweets.extend(tweets)
        if len(alltweets) > 0:
            oldest = alltweets[-1].id - 1
        else:
            pass
        print("Count: " + f"...{len(alltweets)} " + handle + " Tweets Downloaded")
    outtweets = [
        [tweet.user.screen_name, tweet.text, tweet.created_at, len(tweet.text),
         tweet.favorite_count, tweet.retweet_count] for tweet in alltweets]
    df_tweet_function = pd.DataFrame(outtweets,
                                     columns=['Handle', 'Tweets', 'Date', 'Len', 'Like_count', 'RT_count'])
    print('----------Total Downloaded: ' + str(len(alltweets)) + ' for ' + handle + '----------')
except tweepy.error.TweepError:
    pass
return df_tweet_function

df = pd.DataFrame() 如果 name == 'ma​​in': 对于手柄中的手柄: get_tweets(句柄) df = df.append(get_tweets(handle)) print("----------------总推文提取:{}".format(df.shape[0]) + "------------- --")

【讨论】:

    猜你喜欢
    • 2012-12-07
    • 1970-01-01
    • 1970-01-01
    • 2021-11-25
    • 2022-01-20
    • 2021-11-23
    • 1970-01-01
    • 1970-01-01
    相关资源
    最近更新 更多