如何使用 Python 将 Twitter 上的推文收集到 JSON 文件中？答案

【问题标题】：How to collect Tweets in a JSON file on Twitter using Python?（如何使用 Python 将 Twitter 上的推文收集到 JSON 文件中？）
【发布时间】：2018-04-25 11:57:57
【问题描述】：

我正在构建一个程序，在 1-2 个月内从特定国家（基于坐标）收集指定数量的推文（没有特定主题标签，只是随机帖子）。

例如，我正在收集 200 条来自美国的推文/状态更新，这些推文/状态更新是在 9 月到 10 月之间发布的。

我这样做是因为我想收集这些推文并对其进行情绪分析，以查看来自指定国家/地区的推文总体上是负面的还是正面的。

我遇到的问题是我不知道如何“过滤”随机推文/状态更新，因为这类推文没有主题标签。此外，我不确定 Twitter 是否允许我收集 2 个月前的推文。有什么建议吗？

代码

 import tweepy
from tweepy import OAuthHandler
import json
import datetime as dt
import time
import os
import sys


'''
I created a twitter account for anyone to use if they want to test the code!
I used Python 3 and tweepy version 3.5.0.
'''

def load_api():
    ''' Authorize the user and return the tweepy API client.

        Returns:
            The tweepy.API instance built from the OAuth handler with
            wait_on_rate_limit=True.
    '''

    # NOTE(review): the credentials are hard-coded placeholders; real keys
    # should not be committed or shared publicly.
    consumer_key = 'nn'
    consumer_secret = 'nn'
    access_token = 'nn'
    access_secret = 'nnn'
    auth = OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_token, access_secret)

    # Return the configured client itself. Building a second, plain
    # tweepy.API(auth) here would discard the wait_on_rate_limit setting.
    return tweepy.API(auth, wait_on_rate_limit=True)


def tweet_search(api, query, max_tweets, max_id, since_id, geocode):
    ''' Search backwards from 'max_id' for up to 'max_tweets' tweets.

        Args:
            api: object exposing a tweepy-style search(**kwargs) method.
            query: the search string.
            max_tweets: maximum number of tweets to collect in this call.
            max_id: only tweets with an ID strictly below this are
                requested; a non-positive value (e.g. the -1 sentinel)
                means "no upper bound" and the parameter is omitted.
            since_id: only tweets with an ID above this are requested.
            geocode: 'lat,long,radius' string restricting the search
                area; a falsy value disables the restriction.

        Returns:
            A tuple (tweets, max_id): the list of collected status
            objects and the ID of the oldest tweet collected (or the
            unchanged 'max_id' if nothing was found).
    '''

    searched_tweets = []
    while len(searched_tweets) < max_tweets:
        remaining_tweets = max_tweets - len(searched_tweets)
        search_args = {
            'q': query,
            # the search endpoint returns at most 100 tweets per page
            'count': min(remaining_tweets, 100),
            'since_id': str(since_id),
        }
        if max_id > 0:
            # max_id is inclusive on the API side, so step below the
            # last tweet already collected to avoid duplicates
            search_args['max_id'] = str(max_id - 1)
        if geocode:
            search_args['geocode'] = geocode
        try:
            new_tweets = api.search(**search_args)
        except tweepy.TweepError:
            print('exception raised, waiting 15 minutes')
            print('(until:', dt.datetime.now()+dt.timedelta(minutes=15), ')')
            time.sleep(15*60)
            break # stop the loop; the caller decides whether to retry
        print('found',len(new_tweets),'tweets')
        if not new_tweets:
            print('no tweets found')
            break
        searched_tweets.extend(new_tweets)
        # results arrive newest first, so the last one is the oldest
        max_id = new_tweets[-1].id
    return searched_tweets, max_id


def get_tweet_id(api, date='', days_ago=9, query='a'):
    ''' Return the ID of a tweet usable as a search 'starting point'.

        Args:
            api: object exposing a tweepy-style search(**kwargs) method.
            date: optional date/datetime; when given, the search is
                bounded by the day after it and 'days_ago' is ignored.
            days_ago: how many days back from now to place the bound
                when no 'date' is given (default 9).
                NOTE(review): the standard Search API documents a
                7-day window -- confirm that 9 is actually reachable.
            query: required by the search call; defaults to a commonly
                used word.

        Returns:
            The ID of the first tweet returned by the search.

        Raises:
            IndexError: if the search returns no tweets.
    '''

    if date:
        # bound the search by the day after the given date
        td = date + dt.timedelta(days=1)
        count = 1
    else:
        # bound the search by the day that is 'days_ago' days back
        td = dt.datetime.now() - dt.timedelta(days=days_ago)
        count = 10
    tweet_date = '{0}-{1:0>2}-{2:0>2}'.format(td.year, td.month, td.day)
    tweets = api.search(q=query, count=count, until=tweet_date)
    if not tweets:
        # same exception type as indexing the empty result, but explicit
        raise IndexError('no tweets found until ' + tweet_date +
                         ' for query ' + repr(query))
    if not date:
        print('search limit (start/stop):', tweets[0].created_at)
    # both branches return the ID (previously only the days_ago branch did)
    return tweets[0].id


def write_tweets(tweets, filename):
    ''' Append the raw JSON of every tweet to 'filename', one per line. '''

    with open(filename, 'a') as out:
        # each status carries its original payload in the _json attribute
        out.writelines(json.dumps(status._json) + '\n' for status in tweets)


def main():
    ''' Repeatedly search for tweets per search phrase and append them
        to a JSON file named after the phrase and the searched day(s).

        For each phrase the function builds the output path, loads the
        API, determines the ID window (max_id down to since_id) and then
        calls tweet_search in a loop until 'time_limit' hours have
        passed or three consecutive searches return nothing. The search
        dates and search phrase can be changed below. '''



    ''' search variables: '''
    search_phrases = ['#PythonPleaseWork']
    time_limit = 1.0                           # runtime limit in hours
    max_tweets = 20                           # number of tweets requested per tweet_search call
    min_days_old, max_days_old = 1, 1          # search limits e.g., from 7 to 8
                                               # gives current weekday from last week,
                                               # min_days_old=0 will search from right now
                                               # NOTE(review): with equal values both IDs below are
                                               # looked up with the same days_ago, so the window
                                               # appears to be empty -- confirm intended values
    USA = '39.8,-95.583068847656,2500km'       # this geocode includes nearly all American
                                               # states (and a large portion of Canada),
                                               # so results are not limited to the USA


    # loop over search items,
    # creating a new file for each
    for search_phrase in search_phrases:

        print('Search phrase =', search_phrase)

        ''' other variables '''
        # the first word of the phrase names both the folder and the file prefix
        name = search_phrase.split()[0]
        json_file_root = name + '/'  + name
        os.makedirs(os.path.dirname(json_file_root), exist_ok=True)
        read_IDs = False

        # build the output file name from the searched day or day range
        if max_days_old - min_days_old == 1:
            # single-day search: the file is named after that day
            d = dt.datetime.now() - dt.timedelta(days=min_days_old)
            day = '{0}-{1:0>2}-{2:0>2}'.format(d.year, d.month, d.day)
        else:
            # otherwise the file is named '<first day>_to_<last day>'
            d1 = dt.datetime.now() - dt.timedelta(days=max_days_old-1)
            d2 = dt.datetime.now() - dt.timedelta(days=min_days_old)
            day = '{0}-{1:0>2}-{2:0>2}_to_{3}-{4:0>2}-{5:0>2}'.format(
                  d1.year, d1.month, d1.day, d2.year, d2.month, d2.day)
        json_file = json_file_root + '_' + day + '.json'
        if os.path.isfile(json_file):
            # an existing file means a previous run; resume from its last tweet
            print('Appending tweets to file named: ',json_file)
            read_IDs = True

        # authorize and load the twitter API
        api = load_api()

        # set the 'starting point' ID for tweet collection
        if read_IDs:
            # open the json file and take the ID from its last line
            # NOTE(review): lines[-1] raises IndexError if the file exists
            # but is empty -- confirm whether that can happen
            with open(json_file, 'r') as f:
                lines = f.readlines()
                max_id = json.loads(lines[-1])['id']
                print('Searching from the bottom ID in file')
        else:
            # get the ID of a tweet that is min_days_old
            if min_days_old == 0:
                # -1 is used as the "start from right now" marker
                max_id = -1
            else:
                max_id = get_tweet_id(api, days_ago=(min_days_old-1))
        # set the smallest ID to search for
        since_id = get_tweet_id(api, days_ago=(max_days_old-1))
        print('max id (starting point) =', max_id)
        print('since id (ending point) =', since_id)



        ''' tweet gathering loop  '''
        start = dt.datetime.now()
        end = start + dt.timedelta(hours=time_limit)
        # count = number of searches run; exitcount = consecutive empty searches
        count, exitcount = 0, 0
        while dt.datetime.now() < end:
            count += 1
            print('count =',count)
            # collect tweets and update max_id
            tweets, max_id = tweet_search(api, search_phrase, max_tweets,
                                          max_id=max_id, since_id=since_id,
                                          geocode=USA)
            # write tweets to file in JSON format
            if tweets:
                write_tweets(tweets, json_file)
                exitcount = 0
            else:
                exitcount += 1
                # give up on this phrase after three empty searches in a row
                if exitcount == 3:
                    if search_phrase == search_phrases[-1]:
                        # last phrase: terminate the whole script
                        sys.exit('Maximum number of empty tweet strings reached - exiting')
                    else:
                        # more phrases remain: move on to the next one
                        print('Maximum number of empty tweet strings reached - breaking')
                        break


# Run the collector only when this file is executed as a script.
if __name__ == "__main__":
    main()

【问题讨论】：

在问题中包含最小可行测试用例的代码，不要链接到外部站点。
@hardillb 完成，我认为代码有点太多了。谢谢！

标签： python json api twitter

【解决方案1】：

您无法使用 Search API 获取 2 个月的历史数据。

“Twitter 搜索 API 根据过去 7 天内发布的最新推文样本进行搜索。在参与之前，重要的是要知道 Search API 关注的是相关性而不是完整性。这意味着搜索结果中可能会丢失一些推文和用户。”

https://developer.twitter.com/en/docs/tweets/search/overview/basic-search

您可以将 Streaming api 与国家/地区过滤器一起使用，您可以使用一些停用词来代替主题标签。例如，对于美国，您可以使用 "the,and" ，对于法国，您可以使用 "le,la,et" 等。

此外，共享您的访问令牌也不是一个好主意。

【讨论】：

谢谢，您是否知道流式 API 是否允许获取超过 2 个月的数据（似乎不行）？另外，您知道我如何专门收集某一个国家的推文吗？
您无法使用流式 api 获取历史数据。您可以使用此处解释的过滤器：Basic stream parameters & Filtering tweets by location 注意：只有一小部分推文带有地理标记。有用于研究目的的可用数据集：Tweets from USA Event based collections