python 2.7 - Unable to download the Twitter sentiment corpus by Niek Sanders
I am following a tutorial on Twitter sentiment analysis and have downloaded the code from http://www.sananalytics.com/lab/twitter-sentiment/. Following the steps, I run install.py from the cmd prompt. It creates JSON files in the 'rawdata' folder, but when I view these JSON files, each one says:
{ "errors": [ { "message": "ssl required", "code": 92 } ] }
The install.py code follows:
#
# Sanders-Twitter Sentiment Corpus Install Script
# Version 0.1
#
# Pulls tweet data from Twitter because the ToS prevents distributing it
# directly.
#
# Right now we use unauthenticated requests, which are rate-limited to
# 150/hr. We use 125/hr to stay on the safe side.
#
# We could more than double the download speed by using authenticated OAuth
# logins, but for now that is too much of a pain to implement. Just let the
# script run over a weekend and you'll have the data.
#
#   - Niek Sanders
#     njs@sananalytics.com
#     Oct 20, 2011
#
# Excuse the ugly code. I threw this together as quickly as possible and I
# don't normally code in Python.
#
import csv, getpass, json, os, time, urllib


def get_user_params():

    user_params = {}

    # get user input params
    user_params['inlist']  = raw_input( '\nInput file [./corpus.csv]: ' )
    user_params['outlist'] = raw_input( 'Results file [./full-corpus.csv]: ' )
    user_params['rawdir']  = raw_input( 'Raw data dir [./rawdata/]: ' )

    # apply defaults
    if user_params['inlist'] == '':
        user_params['inlist'] = './corpus.csv'
    if user_params['outlist'] == '':
        user_params['outlist'] = './full-corpus.csv'
    if user_params['rawdir'] == '':
        user_params['rawdir'] = './rawdata/'

    return user_params


def dump_user_params( user_params ):

    # dump user params for confirmation
    print 'Input:    ' + user_params['inlist']
    print 'Output:   ' + user_params['outlist']
    print 'Raw data: ' + user_params['rawdir']
    return


def read_total_list( in_filename ):

    # read the total fetch list csv
    fp = open( in_filename, 'rb' )
    reader = csv.reader( fp, delimiter=',', quotechar='"' )

    total_list = []
    for row in reader:
        total_list.append( row )

    return total_list


def purge_already_fetched( fetch_list, raw_dir ):

    # list of tweet ids that still need downloading
    rem_list = []

    # check each tweet to see if we already have it
    for item in fetch_list:

        # check if json file exists
        tweet_file = raw_dir + item[2] + '.json'
        if os.path.exists( tweet_file ):

            # attempt to parse the json file
            try:
                parse_tweet_json( tweet_file )
                print '--> already downloaded #' + item[2]
            except RuntimeError:
                rem_list.append( item )
        else:
            rem_list.append( item )

    return rem_list


def get_time_left_str( cur_idx, fetch_list, download_pause ):

    tweets_left = len(fetch_list) - cur_idx
    total_seconds = tweets_left * download_pause

    str_hr = int( total_seconds / 3600 )
    str_min = int((total_seconds - str_hr*3600) / 60)
    str_sec = total_seconds - str_hr*3600 - str_min*60

    return '%dh %dm %ds' % (str_hr, str_min, str_sec)


def download_tweets( fetch_list, raw_dir ):

    # ensure the raw data directory exists
    if not os.path.exists( raw_dir ):
        os.mkdir( raw_dir )

    # stay within rate limits
    max_tweets_per_hr = 125
    download_pause_sec = 3600 / max_tweets_per_hr

    # download tweets
    for idx in range(0, len(fetch_list)):

        # current item
        item = fetch_list[idx]

        # print status
        trem = get_time_left_str( idx, fetch_list, download_pause_sec )
        print '--> downloading tweet #%s (%d of %d) (%s left)' % \
              (item[2], idx+1, len(fetch_list), trem)

        # pull data
        url = 'http://api.twitter.com/1/statuses/show.json?id=' + item[2]
        urllib.urlretrieve( url, raw_dir + item[2] + '.json' )

        # stay within Twitter API rate limits
        print '    pausing %d sec to obey Twitter API rate limits' % \
              (download_pause_sec)
        time.sleep( download_pause_sec )

    return


def parse_tweet_json( filename ):

    # read the tweet
    print 'opening: ' + filename
    fp = open( filename, 'rb' )

    # parse json
    try:
        tweet_json = json.load( fp )
    except ValueError:
        raise RuntimeError('error parsing json')

    # look for Twitter API error msgs
    if 'error' in tweet_json:
        raise RuntimeError('error in downloaded tweet')

    # extract creation date and tweet text
    return [ tweet_json['created_at'], tweet_json['text'] ]


def build_output_corpus( out_filename, raw_dir, total_list ):

    # open csv output file
    fp = open( out_filename, 'wb' )
    writer = csv.writer( fp, delimiter=',', quotechar='"', escapechar='\\',
                         quoting=csv.QUOTE_ALL )

    # write header row
    writer.writerow( ['topic','sentiment','tweetid','tweetdate','tweettext'] )

    # parse downloaded tweets
    missing_count = 0
    for item in total_list:

        # ensure tweet exists
        if os.path.exists( raw_dir + item[2] + '.json' ):

            try:
                # parse tweet
                parsed_tweet = parse_tweet_json( raw_dir + item[2] + '.json' )
                full_row = item + parsed_tweet

                # character encoding for output
                for i in range(0, len(full_row)):
                    full_row[i] = full_row[i].encode("utf-8")

                # write csv row
                writer.writerow( full_row )

            except RuntimeError:
                print '--> bad data in tweet #' + item[2]
                missing_count += 1

        else:
            print '--> missing tweet #' + item[2]
            missing_count += 1

    # indicate success
    if missing_count == 0:
        print '\nSuccessfully downloaded corpus!'
        print 'Output in: ' + out_filename + '\n'
    else:
        print '\nMissing %d of %d tweets!' % (missing_count, len(total_list))
        print 'Partial output in: ' + out_filename + '\n'

    return


def main():

    # get user parameters
    user_params = get_user_params()
    dump_user_params( user_params )

    # get fetch list
    total_list = read_total_list( user_params['inlist'] )
    fetch_list = purge_already_fetched( total_list, user_params['rawdir'] )

    # start fetching data from Twitter
    download_tweets( fetch_list, user_params['rawdir'] )

    # second pass for any failed downloads
    print '\nStarting second pass to retry failed downloads'
    fetch_list = purge_already_fetched( total_list, user_params['rawdir'] )
    download_tweets( fetch_list, user_params['rawdir'] )

    # build output corpus
    build_output_corpus( user_params['outlist'], user_params['rawdir'],
                         total_list )

    return


if __name__ == '__main__':
    main()
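Regarding the error itself: code 92 ("ssl required") means Twitter rejects API calls made over plain HTTP, and download_tweets() builds its URL with http://. Switching that one line to https is the obvious first step, though note that the v1 endpoint the script targets has also since been retired, so this change alone may not be enough; the API 1.1 + OAuth route in the answer below is the more durable fix. The one-line change, for reference:

# in download_tweets(): request over https so the call is not rejected
# with Twitter error code 92 ("ssl required")
url = 'https://api.twitter.com/1/statuses/show.json?id=' + item[2]
urllib.urlretrieve( url, raw_dir + item[2] + '.json' )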
For other wary travelers...
I noticed kubik888 didn't link to the updated code they found.
So:
a) If you are just interested in the data, here is a complete upload of the CSV I found on GitHub - https://raw.githubusercontent.com/zfz/twitter_corpus/master/full-corpus.csv
This seems to have the whole 6000+ tweets; after the "irrelevant" tweets are removed, it has 3000+ observations.
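If you go this route, here is a minimal sketch for loading the file and dropping the "irrelevant" rows. It assumes pandas is installed and that the header matches the one install.py writes (topic, sentiment, tweetid, tweetdate, tweettext) - check the file's actual first line, since the GitHub copy may capitalize the column names:

import pandas as pd

# load the pre-built corpus; the column names below are assumptions -
# verify them against the CSV's header before relying on this
df = pd.read_csv('full-corpus.csv')
print df.shape  # roughly 6000 rows

# keep only the labelled observations (~3000) by dropping "irrelevant" tweets
df = df[df['sentiment'] != 'irrelevant']
print df['sentiment'].value_counts()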
b) Alternatively, here is a repository with the complete code to fetch the data - an updated take on Niek Sanders' original version 0.1 with support for Twitter API 1.1 and OAuth:
https://github.com/aweiand/twittersentiment/blob/71c007948b8fb854b1df0b2a3a32d2629653e74b/gettwittercorpus/gettweets.py
It also has the full corpus in various formats: https://github.com/aweiand/twittersentiment/tree/71c007948b8fb854b1df0b2a3a32d2629653e74b/gettwittercorpus
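For anyone who would rather adapt the original script than use that repository wholesale, here is a minimal sketch of what the API 1.1 fetch looks like, using the requests and requests-oauthlib packages; the four credential strings are placeholders you obtain by registering an app with Twitter:

import requests
from requests_oauthlib import OAuth1

# placeholder credentials - substitute the values from your registered app
auth = OAuth1('CONSUMER_KEY', 'CONSUMER_SECRET',
              'ACCESS_TOKEN', 'ACCESS_TOKEN_SECRET')

def fetch_tweet(tweet_id):
    # https + /1.1/ + OAuth: the three things the 2011 script predates
    url = 'https://api.twitter.com/1.1/statuses/show.json'
    resp = requests.get(url, params={'id': tweet_id}, auth=auth)
    return resp.json()

tweet = fetch_tweet('TWEET_ID_HERE')  # replace with a real tweet id
print tweet.get('text')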
python-2.7 twitter sentiment-analysis