I collect Persian tweets by running the following Python code:
#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import sys
import tweepy
import json
import os
# Twitter API credentials; the "xxxx" values are presumably redacted
# placeholders that must be replaced with real keys before running.
consumer_key ="xxxx"
consumer_secret ="xxxx"
access_key = "xxxx"
access_secret = "xxxx"
# Build the OAuth handler from the credentials above, then the API client.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)
# Opened once in append mode; on_data() writes every received message to it.
save_file = open("Out.json", 'a')
# Term passed to the stream's track filter in start_stream(); empty in this listing.
t1 = u""
class CustomStreamListener(tweepy.StreamListener):
    """Stream listener that echoes every raw message and appends it to ``save_file``.

    Error and timeout callbacks report to stderr and return True so that the
    stream is not killed.
    """

    def __init__(self, api):
        """Initialise the base listener and remember the API client.

        api: the tweepy API object this listener is associated with.
        """
        # super() must name *this* class. Naming tweepy.StreamListener starts
        # the lookup after it in the MRO and skips the base initialiser.
        super(CustomStreamListener, self).__init__(api)
        # Kept explicit so self.api is exactly the object that was passed in.
        self.api = api

    def on_data(self, tweet):
        """Print one raw stream message and append it to ``save_file``.

        tweet: the raw JSON text of the message as handed over by tweepy. It
        is written unchanged, so JSON escape sequences in it are preserved
        and no separator is added between messages.
        """
        print(tweet)
        save_file.write(str(tweet))

    def on_error(self, status_code):
        """Report an error status code on stderr and keep the stream running."""
        sys.stderr.write(
            'Encountered error with status code: %s\n' % (status_code,))
        return True  # Don't kill the stream

    def on_timeout(self):
        """Report a timeout on stderr and keep the stream running."""
        sys.stderr.write('Timeout...\n')
        return True  # Don't kill the stream
def start_stream(retry_delay=5):
    """Run the filtered stream forever, reconnecting after any failure.

    retry_delay: seconds to wait before reconnecting after an exception, so
        that a persistent failure does not turn into a tight reconnect loop.

    Never returns normally. KeyboardInterrupt and SystemExit propagate, so
    the script can still be stopped.
    """
    import time  # local import: only the retry delay needs it

    while True:
        try:
            sapi = tweepy.streaming.Stream(auth, CustomStreamListener(api))
            sapi.filter(track=[t1])
        except Exception as exc:
            # Catch Exception rather than using a bare except: interrupts and
            # interpreter exit must not be swallowed, and the cause of the
            # restart should be visible instead of silently discarded.
            sys.stderr.write('Stream failed (%r); restarting\n' % (exc,))
            time.sleep(retry_delay)


start_stream()
{
"created_at": "Tue Feb 07 08:04:17 +0000 2017",
"id": 828877025049972737,
"id_str": "828877025049972737",
"text": "\u0644\u0637\u0641\u0627 \u0628\u0647 \u062d\u06cc\u0648\u0627\u0646\u0627\u062a \u063a\u06cc\u0631\u062e\u0627\u0646\u06af\u06cc \u063a\u0630\u0627\u00a0\u0646\u062f\u0647\u06cc\u062f https:\/\/t.co\/gFi5XCVQww https:\/\/t.co\/pQWPqbvJVF",
"display_text_range": [0, 58],
"source": "\u003ca href=\"http:\/\/publicize.wp.com\/\" rel=\"nofollow\"\u003eWordPress.com\u003c\/a\u003e",
"truncated": false,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"in_reply_to_screen_name": null,
...
"lang": "fa",
"timestamp_ms": "1486454657219"
}
The StreamListener.on_data()
method is passed the raw JSON data as received from Twitter. It is this data that contains valid JSON escape sequences.
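If you want to save UTF-8 data directly, that is, with the \uhhhh
escape sequences replaced by the actual Unicode code points, you'll have to re-encode the tweet: parse the JSON, then serialize it again. Use the technique from "Saving utf-8 texts in json.dumps as UTF8, not as \u escape sequence" (json.dumps() with ensure_ascii=False, followed by encoding to UTF-8) to save the data afterwards.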
Do note that writing multiple JSON objects to one file makes that file itself invalid JSON. You could produce JSON Lines output by appending a newline after each document (the standard json.dumps()
output does not produce newlines in the generated JSON document), then read those entries back one by one by parsing each line separately with json.loads().
So the important sections of your code should look like:
import json
save_file = open("Out.json", 'a')
class CustomStreamListener(tweepy.StreamListener):
    # ...
    def on_data(self, tweet):
        """Re-serialize one raw stream message and append it as a UTF-8 line.

        The raw JSON text is parsed and dumped again with ensure_ascii=False,
        so non-ASCII characters are written literally rather than as JSON
        escape sequences. The trailing newline keeps one document per line.
        """
        parsed = json.loads(tweet)
        utf8_doc = json.dumps(parsed, ensure_ascii=False).encode('utf8')
        save_file.write(utf8_doc + '\n')