Let's get a first feel for the data extracted from each of the social networks and an understanding of the structure of the data coming from each of these sources.
In this section, we are going to establish a connection with the Twitter API. Twitter offers two connection modes: the REST API, which allows us to search historical tweets for a given search term or hashtag, and the streaming API, which delivers real-time tweets within the rate limits in place.
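To make the distinction concrete, here is a minimal sketch of both modes using the same twitter package we install below; the credentials are placeholders, and in this section we will only work with the REST search endpoint:

import twitter

# OAuth credentials obtained from the Twitter developer site (placeholders)
auth = twitter.oauth.OAuth('ACCESS_TOKEN', 'ACCESS_SECRET',
                           'CONSUMER_KEY', 'CONSUMER_SECRET')

# REST API: search already published tweets for a term or hashtag
rest_api = twitter.Twitter(auth=auth)
results = rest_api.search.tweets(q='ApacheSpark', count=10)

# Streaming API: receive tweets in real time as they are published
stream = twitter.TwitterStream(auth=auth)
for tweet in stream.statuses.filter(track='ApacheSpark'):
    print(tweet['text'])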
In order to get a better understanding of how to operate with the Twitter API, we will install the Twitter Python library, establish a connection programmatically via OAuth once we have our credentials, and then search for recent tweets for the query ApacheSpark and parse the results.
Let's go through it step by step:
First, install the Python Twitter library from the command line:

$ pip install twitter
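As an optional sanity check, you can confirm that the library is importable and see where it was installed:

$ python -c "import twitter; print(twitter.__file__)"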
Next, we create a TwitterAPI class that handles the authentication, the search, and the parsing of the results. On instantiation, self.auth gets the credentials from Twitter. It then creates a registered API as self.api. We have implemented two methods: the first one to search Twitter with a given query and the second one to parse the output to retrieve relevant information such as the tweet ID, the tweet text, and the tweet author. The code is as follows:

import twitter
import urlparse
from pprint import pprint as pp

class TwitterAPI(object):
    """
    TwitterAPI class allows the connection to Twitter via OAuth
    once you have registered with Twitter and received the
    necessary credentials
    """

    # initialize and get the twitter credentials
    def __init__(self):
        consumer_key = 'Provide your credentials'
        consumer_secret = 'Provide your credentials'
        access_token = 'Provide your credentials'
        access_secret = 'Provide your credentials'

        self.consumer_key = consumer_key
        self.consumer_secret = consumer_secret
        self.access_token = access_token
        self.access_secret = access_secret

        #
        # authenticate credentials with Twitter using OAuth
        self.auth = twitter.oauth.OAuth(access_token, access_secret,
                                        consumer_key, consumer_secret)
        # creates registered Twitter API
        self.api = twitter.Twitter(auth=self.auth)

    #
    # search Twitter with query q (e.g. "ApacheSpark") up to max_res results
    def searchTwitter(self, q, max_res=10, **kwargs):
        search_results = self.api.search.tweets(q=q, count=10, **kwargs)
        statuses = search_results['statuses']
        max_results = min(1000, max_res)

        # follow the next_results cursor until enough tweets are collected
        for _ in range(10):
            try:
                next_results = search_results['search_metadata']['next_results']
            except KeyError as e:
                break

            next_results = urlparse.parse_qsl(next_results[1:])
            kwargs = dict(next_results)
            search_results = self.api.search.tweets(**kwargs)
            statuses += search_results['statuses']

            if len(statuses) > max_results:
                break
        return statuses

    #
    # parse tweets as they are collected to extract id, creation
    # date, user id and name, tweet text, and expanded URLs
    def parseTweets(self, statuses):
        return [(status['id'],
                 status['created_at'],
                 status['user']['id'],
                 status['user']['name'],
                 status['text'],
                 url['expanded_url'])
                for status in statuses
                for url in status['entities']['urls']]

Note that searchTwitter pages through the search results using the next_results cursor returned in search_metadata, stopping after ten pages or once max_res tweets (capped at 1,000) have been collected.
Let's instantiate the class with the required authentication:

t = TwitterAPI()
q="ApacheSpark" tsearch = t.searchTwitter(q)
Let's analyze in detail one of the tweets retrieved:

pp(tsearch[1])
{u'contributors': None,
 u'coordinates': None,
 u'created_at': u'Sat Apr 25 14:50:57 +0000 2015',
 u'entities': {u'hashtags': [{u'indices': [74, 86], u'text': u'sparksummit'}],
               u'media': [{u'display_url': u'pic.twitter.com/WKUMRXxIWZ',
                           u'expanded_url': u'http://twitter.com/bigdata/status/591976255831969792/photo/1',
                           u'id': 591976255156715520,
                           u'id_str': u'591976255156715520',
                           u'indices': [143, 144],
                           u'media_url':
...(snip)...
 u'text': u'RT @bigdata: Enjoyed catching up with @ApacheSpark users & leaders at #sparksummit NYC: video clips are out http://t.co/qrqpP6cG9s http://t\u2026',
 u'truncated': False,
 u'user': {u'contributors_enabled': False,
           u'created_at': u'Sat Apr 04 14:44:31 +0000 2015',
           u'default_profile': True,
           u'default_profile_image': True,
           u'description': u'',
           u'entities': {u'description': {u'urls': []}},
           u'favourites_count': 0,
           u'follow_request_sent': False,
           u'followers_count': 586,
           u'following': False,
           u'friends_count': 2,
           u'geo_enabled': False,
           u'id': 3139047660,
           u'id_str': u'3139047660',
           u'is_translation_enabled': False,
           u'is_translator': False,
           u'lang': u'zh-cn',
           u'listed_count': 749,
           u'location': u'',
           u'name': u'Mega Data Mama',
           u'notifications': False,
           u'profile_background_color': u'C0DEED',
           u'profile_background_image_url': u'http://abs.twimg.com/images/themes/theme1/bg.png',
           u'profile_background_image_url_https': u'https://abs.twimg.com/images/themes/theme1/bg.png',
...(snip)...
           u'screen_name': u'MegaDataMama',
           u'statuses_count': 26673,
           u'time_zone': None,
           u'url': None,
           u'utc_offset': None,
           u'verified': False}}
Let's parse the retrieved tweets and extract the relevant information:

tparsed = t.parseTweets(tsearch)
pp(tparsed)
[(591980327784046592,
  u'Sat Apr 25 15:01:23 +0000 2015',
  63407360,
  u'Jos\xe9 Carlos Baquero',
  u'Big Data systems are making a difference in the fight against cancer. #BigData #ApacheSpark http://t.co/pnOLmsKdL9',
  u'http://tmblr.co/ZqTggs1jHytN0'),
 (591977704464875520,
  u'Sat Apr 25 14:50:57 +0000 2015',
  3139047660,
  u'Mega Data Mama',
  u'RT @bigdata: Enjoyed catching up with @ApacheSpark users & leaders at #sparksummit NYC: video clips are out http://t.co/qrqpP6cG9s http://t\u2026',
  u'http://goo.gl/eF5xwK'),
 (591977172589539328,
  u'Sat Apr 25 14:48:51 +0000 2015',
  2997608763,
  u'Emma Clark',
  u'RT @bigdata: Enjoyed catching up with @ApacheSpark users & leaders at #sparksummit NYC: video clips are out http://t.co/qrqpP6cG9s http://t\u2026',
  u'http://goo.gl/eF5xwK'),
 ... (snip)...
 (591879098349268992,
  u'Sat Apr 25 08:19:08 +0000 2015',
  331263208,
  u'Mario Molina',
  u'#ApacheSpark speeds up big data decision-making http://t.co/8hdEXreNfN',
  u'http://www.computerweekly.com/feature/Apache-Spark-speeds-up-big-data-decision-making')]
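Note that parseTweets iterates over the URL entities of each status, so tweets without any URL are dropped from the parsed output and tweets carrying several URLs produce one tuple per URL. A small illustration with hand-built status dictionaries (all field values here are made up) shows this behavior:

# two hypothetical statuses: one with a URL entity, one without
with_url = {
    'id': 1, 'created_at': 'Sat Apr 25 15:01:23 +0000 2015',
    'user': {'id': 42, 'name': 'Example User'},
    'text': 'Trying out #ApacheSpark http://t.co/xxxx',
    'entities': {'urls': [{'expanded_url': 'http://spark.apache.org'}]}
}
without_url = {
    'id': 2, 'created_at': 'Sat Apr 25 15:02:00 +0000 2015',
    'user': {'id': 43, 'name': 'Another User'},
    'text': 'No link in this tweet',
    'entities': {'urls': []}
}

# only the tweet carrying a URL shows up in the parsed output
pp(t.parseTweets([with_url, without_url]))
# [(1, 'Sat Apr 25 15:01:23 +0000 2015', 42, 'Example User',
#   'Trying out #ApacheSpark http://t.co/xxxx', 'http://spark.apache.org')]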