farlay farlay - 4 years ago 208
JSON Question

Converting raw ASCII text with UTF-8 encoded characters represented by backslash escapes

I collect Persian tweets by running the following Python code:

#!/usr/bin/env python
# -*- coding: UTF-8 -*-
import json
import os
import sys
import time

import tweepy

# Twitter API credentials. The "xxxx" values are placeholders and must be
# replaced with real keys before the script can authenticate.
consumer_key ="xxxx"
consumer_secret ="xxxx"
access_key = "xxxx"
access_secret = "xxxx"

# Build the OAuth handler from the credentials, then an API client on top of
# it. Both objects are used later when the stream is created.
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)
api = tweepy.API(auth)

# Output file, opened in append mode so earlier contents are kept; the stream
# listener's on_data() writes every received tweet to this handle.
save_file = open("Out.json", 'a')

# Keyword placed in the `track` list given to the stream filter.
# NOTE(review): it is an empty string here - presumably the real search term
# was removed before posting; confirm and fill in before running.
t1 = u""

class CustomStreamListener(tweepy.StreamListener):
    """Stream listener that echoes each raw tweet and appends it to ``save_file``.

    The payload is written exactly as received, so non-ASCII characters stay
    in their JSON backslash-escaped form. Errors and timeouts are reported on
    stderr and the stream is asked to keep running.
    """

    def __init__(self, api):
        """Initialise the base listener and remember the API client.

        Args:
            api: the ``tweepy.API`` client built from the authenticated handler.
        """
        # super() must name *this* class. Naming tweepy.StreamListener starts
        # the method lookup after StreamListener, so its __init__ never runs.
        # The client is passed along so the base class does not replace it.
        super(CustomStreamListener, self).__init__(api)
        self.api = api

    def on_data(self, tweet):
        """Print one raw payload and append it unchanged to ``save_file``.

        Args:
            tweet: the raw data for one message as handed over by the stream.
        """
        # Single-argument print(...) prints the same text as the statement
        # form on Python 2.
        print(tweet)
        save_file.write(str(tweet))

    def on_error(self, status_code):
        """Report an error status on stderr.

        Args:
            status_code: the status code passed in by the stream.

        Returns:
            True, so the stream is not killed.
        """
        sys.stderr.write(
            'Encountered error with status code: %s\n' % (status_code,))
        return True  # Don't kill the stream

    def on_timeout(self):
        """Report a timeout on stderr.

        Returns:
            True, so the stream is not killed.
        """
        sys.stderr.write('Timeout...\n')
        return True  # Don't kill the stream

def start_stream(retry_delay=5):
    """Run the filtered stream forever, reconnecting whenever it stops.

    A new ``Stream`` with a fresh ``CustomStreamListener`` is created on every
    pass. Ordinary errors are reported on stderr and followed by a pause, so a
    persistent failure (bad credentials, network down) does not turn into a
    tight reconnect loop. ``KeyboardInterrupt`` and ``SystemExit`` are not
    caught, so the script can still be stopped with Ctrl-C.

    Args:
        retry_delay: seconds to wait before reconnecting after an error.
    """
    while True:
        try:
            sapi = tweepy.streaming.Stream(auth, CustomStreamListener(api))
            sapi.filter(track=[t1])
        except Exception as exc:
            # Catch Exception rather than using a bare except: the latter also
            # swallows KeyboardInterrupt/SystemExit and hides the failure.
            sys.stderr.write(
                'Stream failed (%r); reconnecting in %s s\n'
                % (exc, retry_delay))
            time.sleep(retry_delay)


start_stream()


It returns the tweet text as raw ASCII, with the non-ASCII characters represented by \uXXXX backslash escapes. I would like to change the code so that it saves the retrieved tweets to "Out.json" directly as UTF-8 encoded text. This is a sample of the current output:

{
"created_at": "Tue Feb 07 08:04:17 +0000 2017",
"id": 828877025049972737,
"id_str": "828877025049972737",
"text": "\u0644\u0637\u0641\u0627 \u0628\u0647 \u062d\u06cc\u0648\u0627\u0646\u0627\u062a \u063a\u06cc\u0631\u062e\u0627\u0646\u06af\u06cc \u063a\u0630\u0627\u00a0\u0646\u062f\u0647\u06cc\u062f https:\/\/t.co\/gFi5XCVQww https:\/\/t.co\/pQWPqbvJVF",
"display_text_range": [0, 58],
"source": "\u003ca href=\"http:\/\/publicize.wp.com\/\" rel=\"nofollow\"\u003eWordPress.com\u003c\/a\u003e",
"truncated": false,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"in_reply_to_screen_name": null,
...
"lang": "fa",
"timestamp_ms": "1486454657219"
}

Answer Source

The StreamListener.on_data() method is passed the raw JSON data as received from Twitter. It is this data that contains valid JSON escape sequences.

If you want to save UTF-8 data directly, with the \uhhhh escape sequences replaced by the actual Unicode characters, you'll have to re-encode the tweet: parse the JSON, then serialize it again as described in "Saving utf-8 texts in json.dumps as UTF8, not as \u escape sequence" (that is, json.dumps() with ensure_ascii=False) to save the data afterwards.

Do note that writing multiple JSON objects to one file makes that file itself invalid JSON. You could instead produce JSON Lines output by appending a newline after each object (the standard json.dumps() output does not contain newlines), and later read the entries back one line at a time, parsing each line with json.loads().

So the important sections of your code should look like:

import json

save_file = open("Out.json", 'a')

class CustomStreamListener(tweepy.StreamListener):
    # ...

    def on_data(self, tweet):
        """Re-encode one raw tweet and append it to ``save_file`` as a UTF-8 line.

        ``tweet`` is the raw JSON text for one message; it is parsed and then
        serialized again so that non-ASCII characters are stored as themselves
        instead of as backslash escape sequences.
        """
        # Parse the raw JSON; this turns \uhhhh escapes into real characters.
        tweet = json.loads(tweet)
        # ensure_ascii=False keeps non-ASCII characters as they are instead of
        # escaping them again on output.
        json_doc = json.dumps(tweet, ensure_ascii=False)
        # Encode to UTF-8 and terminate with a newline so each tweet occupies
        # exactly one line of the file (JSON Lines).
        save_file.write(json_doc.encode('utf8') + '\n')
Recommended from our users: Dynamic Network Monitoring from WhatsUp Gold from IPSwitch. Free Download