Related
I have written following code to stream data from Tweepy API. And I am getting data inside stream object. But unable to get streamp["user"]["followers_count"] but don't know how to get it. I also tried jsonLines = lines.flatMap(lambda json_str:json.loads(json_str)) but no help.
from __future__ import print_function
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
import json
sc = SparkContext()
ssc = StreamingContext(sc, 10) #Slide Interval of 10 sec
socket_stream = ssc.socketTextStream("localhost",4444)
stream = socket_stream.window(30) #window length 30 sec
stream.pprint()
ssc.start()
ssc.awaitTermination()
stream.pprint() gives me following JSON.
{"created_at":"Sat May 08 09:27:43 +0000 2021","id":1390961604079067137,"id_str":"1390961604079067137","text":"The return of the king! This style has so many parameters that have to be correct and Stone nails em all! - Drinkin\u2026 https:\/\/t.co\/uCogsfW8NC","source":"\u003ca href=\"https:\/\/untappd.com\" rel=\"nofollow\"\u003eUntappd\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":905439001,"id_str":"905439001","name":"Nahoj Morts","screen_name":"CraftBeerJunkie","location":null,"url":null,"description":null,"translator_type":"none","protected":false,"verified":false,"followers_count":15,"friends_count":113,"listed_count":0,"favourites_count":1,"statuses_count":1868,"created_at":"Fri Oct 26 06:36:01 +0000 2012","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/2765713047\/b92d58c569ad4739e67ef4d4e9a35780_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/2765713047\/b92d58c569ad4739e67ef4d4e9a35780_normal.jpeg","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null,"withheld_in_countries":[]},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"The return of the king! This style has so many parameters that have to be correct and Stone nails em all! - Drinking a Stone Sublimely Self-Righteous Black IPA by #StoneBrewing # Uggleberget \u2014 https:\/\/t.co\/a3XixD5teo","display_text_range":[0,217],"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/a3XixD5teo","expanded_url":"https:\/\/untp.beer\/s\/c1025322433","display_url":"untp.beer\/s\/c1025322433","indices":[194,217]}],"user_mentions":[{"screen_name":"StoneBrewing","name":"Stone Brewing","id":16331259,"id_str":"16331259","indices":[163,176]}],"symbols":[]}},"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[{"url":"https:\/\/t.co\/uCogsfW8NC","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/1390961604079067137","display_url":"twitter.com\/i\/web\/status\/1\u2026","indices":[117,140]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1620466063886"}
{"created_at":"Sat May 08 09:27:43 +0000 2021","id":1390961604334919683,"id_str":"1390961604334919683","text":"[HQ] 210508 #KrisWu #Wuyifan # GTSSC 2021 (Day 1) Cr.Fanbaobao(3) #PorscheRacerKrisWu \nhttps:\/\/t.co\/cEcAtzKNsM\u2026 https:\/\/t.co\/3B9uUS3mdu","display_text_range":[0,140],"source":"\u003ca href=\"https:\/\/mobile.twitter.com\" rel=\"nofollow\"\u003eTwitter Web App\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":1390961304316387331,"in_reply_to_status_id_str":"1390961304316387331","in_reply_to_user_id":620222301,"in_reply_to_user_id_str":"620222301","in_reply_to_screen_name":"kissmemyfan_","user":{"id":620222301,"id_str":"620222301","name":"KISSMEMYFAN1106\u2661","screen_name":"kissmemyfan_","location":null,"url":"https:\/\/www.youtube.com\/channel\/UCtYrDyYwqZyQfYtCz79zPHQ\/featured","description":"for #KrisWu Kris Wu Yifan since 120627 \u2740notify in case of emergency,pls dm\u2740 fb: https:\/\/www.facebook.com\/groups\/1051879991503131\/","translator_type":"regular","protected":false,"verified":false,"followers_count":91307,"friends_count":24,"listed_count":433,"favourites_count":14194,"statuses_count":95476,"created_at":"Wed Jun 27 16:34:02 +0000 2012","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"FFCC4D","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":true,"profile_link_color":"E81C4F","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/1252957871802707974\/RjNGNnly_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/1252957871802707974\/RjNGNnly_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/620222301\/1587563426","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null,"withheld_in_countries":[]},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"[HQ] 210508 #KrisWu #Wuyifan # GTSSC 2021 (Day 1) Cr.Fanbaobao(3) #PorscheRacerKrisWu \nhttps:\/\/t.co\/cEcAtzKNsM\nhttps:\/\/t.co\/PNbPqE88Y3 https:\/\/t.co\/8WPhfpHiyO","display_text_range":[0,134],"entities":{"hashtags":[{"text":"KrisWu","indices":[12,19]},{"text":"Wuyifan","indices":[20,28]},{"text":"PorscheRacerKrisWu","indices":[66,85]}],"urls":[{"url":"https:\/\/t.co\/cEcAtzKNsM","expanded_url":"https:\/\/wx1.sinaimg.cn\/large\/00688bwvly1gqb3c1cdbuj31l82bpe81.jpg","display_url":"wx1.sinaimg.cn\/large\/00688bwv\u2026","indices":[87,110]},{"url":"https:\/\/t.co\/PNbPqE88Y3","expanded_url":"https:\/\/wx4.sinaimg.cn\/large\/00688bwvly1gqb3c1mc9hj31ay1wb7ji.jpg","display_url":"wx4.sinaimg.cn\/large\/00688bwv\u2026","indices":[111,134]}],"user_mentions":[],"symbols":[],"media":[{"id":1390961423400857602,"id_str":"1390961423400857602","indices":[135,158],"media_url":"http:\/\/pbs.twimg.com\/media\/E02wC4IUcAI4F-O.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E02wC4IUcAI4F-O.jpg","url":"https:\/\/t.co\/8WPhfpHiyO","display_url":"pic.twitter.com\/8WPhfpHiyO","expanded_url":"https:\/\/twitter.com\/kissmemyfan_\/status\/1390961604334919683\/photo\/1","type":"photo","sizes":{"small":{"w":465,"h":680,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1024,"h":1498,"resize":"fit"},"medium":{"w":820,"h":1200,"resize":"fit"}}},{"id":1390961423417704450,"id_str":"1390961423417704450","indices":[135,158],"media_url":"http:\/\/pbs.twimg.com\/media\/E02wC4MVgAI8ZHZ.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E02wC4MVgAI8ZHZ.jpg","url":"https:\/\/t.co\/8WPhfpHiyO","display_url":"pic.twitter.com\/8WPhfpHiyO","expanded_url":"https:\/\/twitter.com\/kissmemyfan_\/status\/1390961604334919683\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":825,"h":1200,"resize":"fit"},"small":{"w":467,"h":680,"resize":"fit"},"large":{"w":1024,"h":1490,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":1390961423400857602,"id_str":"1390961423400857602","indices":[135,158],"media_url":"http:\/\/pbs.twimg.com\/media\/E02wC4IUcAI4F-O.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E02wC4IUcAI4F-O.jpg","url":"https:\/\/t.co\/8WPhfpHiyO","display_url":"pic.twitter.com\/8WPhfpHiyO","expanded_url":"https:\/\/twitter.com\/kissmemyfan_\/status\/1390961604334919683\/photo\/1","type":"photo","sizes":{"small":{"w":465,"h":680,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1024,"h":1498,"resize":"fit"},"medium":{"w":820,"h":1200,"resize":"fit"}}},{"id":1390961423417704450,"id_str":"1390961423417704450","indices":[135,158],"media_url":"http:\/\/pbs.twimg.com\/media\/E02wC4MVgAI8ZHZ.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E02wC4MVgAI8ZHZ.jpg","url":"https:\/\/t.co\/8WPhfpHiyO","display_url":"pic.twitter.com\/8WPhfpHiyO","expanded_url":"https:\/\/twitter.com\/kissmemyfan_\/status\/1390961604334919683\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":825,"h":1200,"resize":"fit"},"small":{"w":467,"h":680,"resize":"fit"},"large":{"w":1024,"h":1490,"resize":"fit"}}}]}},"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"KrisWu","indices":[12,19]},{"text":"Wuyifan","indices":[20,28]},{"text":"PorscheRacerKrisWu","indices":[66,85]}],"urls":[{"url":"https:\/\/t.co\/cEcAtzKNsM","expanded_url":"https:\/\/wx1.sinaimg.cn\/large\/00688bwvly1gqb3c1cdbuj31l82bpe81.jpg","display_url":"wx1.sinaimg.cn\/large\/00688bwv\u2026","indices":[87,110]},{"url":"https:\/\/t.co\/3B9uUS3mdu","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/1390961604334919683","display_url":"twitter.com\/i\/web\/status\/1\u2026","indices":[112,135]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"ht","timestamp_ms":"1620466063947"}
{"created_at":"Sat May 08 09:27:44 +0000 2021","id":1390961607572865031,"id_str":"1390961607572865031","text":"RT #aa86marat2: #\u0644\u0627_\u0644\u0644\u062a\u0637\u0639\u064a\u0645_\u0627\u0644\u0627\u062c\u0628\u0627\u0631\u064a\n\u0628\u0633\u0645 \u0627\u0644\u0644\u0647 \u062a\u0648\u0643\u0644\u062a \u0639\u0644\u0649 \u0627\u0644\u0644\u0647 \n\u0648\u0627\u0644\u0644\u0647 \u0643\u0631\u064a\u0645 \u064a\u0627\u0631\u0628 \u062a\u062a\u0642\u0641\u0644 \u0627\u0644\u064a\u0648\u0645 \u0628\u0648\u062c\u0648\u062f\u0627\u0644\u062e\u064a\u0631\u064a\u0646 \u0627\u0645\u062b\u0627\u0644\u0643\u0645\n \u064a\u0627\u0631\u0628 \u0633\u062e\u0631\u0644\u0647 \u0645\u0646 \u064a\u0642\u0641\u0644\u0647\u0627\n\u0633\u062c\u064a\u0646\u2026","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1250758860660527105,"id_str":"1250758860660527105","name":"om_mhamd99","screen_name":"Mhamd99Om","location":null,"url":null,"description":"\u064a\u0627\u0631\u0628 \u2728\ud83e\udd32\ud83c\udffb\u0628\u0627\u0631\u0643 \u0644\u064a \u0641\u064a\u0645\u0627 \u0623\u0639\u0637\u062a\u0646\u064a \u0648\u0641\u0631\u062c \u0647\u0645\u064a \u0628\u0642\u0636\u0627\u0621 \u062f\u064a\u0646 \u0648\u0627\u0639\u062a\u064a \u0639\u0644\u064a \u0630\u0644\u0643 \u0641\u0627\u062a\u0648\u0631\u0647 \u062a\u0646\u0641\u064a\u0630 1935092757","translator_type":"none","protected":false,"verified":false,"followers_count":903,"friends_count":389,"listed_count":0,"favourites_count":2174,"statuses_count":33294,"created_at":"Thu Apr 16 12:12:15 +0000 2020","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/1286520143431061504\/k2Jg3W0u_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/1286520143431061504\/k2Jg3W0u_normal.jpg","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null,"withheld_in_countries":[]},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Sat May 08 02:27:00 +0000 2021","id":1390855727116427270,"id_str":"1390855727116427270","text":"#\u0644\u0627_\u0644\u0644\u062a\u0637\u0639\u064a\u0645_\u0627\u0644\u0627\u062c\u0628\u0627\u0631\u064a\n\u0628\u0633\u0645 \u0627\u0644\u0644\u0647 \u062a\u0648\u0643\u0644\u062a \u0639\u0644\u0649 \u0627\u0644\u0644\u0647 \n\u0648\u0627\u0644\u0644\u0647 \u0643\u0631\u064a\u0645 \u064a\u0627\u0631\u0628 \u062a\u062a\u0642\u0641\u0644 \u0627\u0644\u064a\u0648\u0645 \u0628\u0648\u062c\u0648\u062f\u0627\u0644\u062e\u064a\u0631\u064a\u0646 \u0627\u0645\u062b\u0627\u0644\u0643\u0645\n \u064a\u0627\u0631\u0628 \u0633\u062e\u0631\u0644\u0647 \u0645\u0646 \u064a\u0642\u0641\u0644\u2026 https:\/\/t.co\/mpCKrtjMW4","display_text_range":[0,140],"source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":true,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1338279194892115974,"id_str":"1338279194892115974","name":"\u0639\u0628\u062f\u0627\u0644\u0631\u062d\u0645\u0646 \u0645\u062d\u0645\u062f 2","screen_name":"aa86marat2","location":null,"url":null,"description":null,"translator_type":"none","protected":false,"verified":false,"followers_count":382,"friends_count":82,"listed_count":1,"favourites_count":43,"statuses_count":16494,"created_at":"Mon Dec 14 00:26:54 +0000 2020","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/1356322484673519616\/7NSfbGye_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/1356322484673519616\/7NSfbGye_normal.jpg","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null,"withheld_in_countries":[]},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"extended_tweet":{"full_text":"#\u0644\u0627_\u0644\u0644\u062a\u0637\u0639\u064a\u0645_\u0627\u0644\u0627\u062c\u0628\u0627\u0631\u064a\n\u0628\u0633\u0645 \u0627\u0644\u0644\u0647 \u062a\u0648\u0643\u0644\u062a \u0639\u0644\u0649 \u0627\u0644\u0644\u0647 \n\u0648\u0627\u0644\u0644\u0647 \u0643\u0631\u064a\u0645 \u064a\u0627\u0631\u0628 \u062a\u062a\u0642\u0641\u0644 \u0627\u0644\u064a\u0648\u0645 \u0628\u0648\u062c\u0648\u062f\u0627\u0644\u062e\u064a\u0631\u064a\u0646 \u0627\u0645\u062b\u0627\u0644\u0643\u0645\n \u064a\u0627\u0631\u0628 \u0633\u062e\u0631\u0644\u0647 \u0645\u0646 \u064a\u0642\u0641\u0644\u0647\u0627\n\u0633\u062c\u064a\u0646 \u0639\u0645\u0631\u064756 \u0639\u0627\u0645\u0627 \u0645\u062a\u0632\u0648\u062c \u0644\u062f\u064a\u0647 \u0637\u0641\u0644 \u0645\u0633\u062c\u0648\u0646 \u0645\u0646\u0630\u0639\u0627\u0645\u064a\u0646 \u06484\u0623\u0634\u0647\u0631\n \u0645\u062a\u0628\u0642\u0649 \u0639\u0644\u064a\u0647 296200 \u0631\u064a\u0627\u0644\n\u0627\u0644\u0641\u0627\u062a\u0648\u0631\u0629\n1934638011\n\u0639\u0628\u0631 #\n\nhttps:\/\/t.co\/JjOHPoGLrJ\n\n\u0627\u0644\u062c\u0631\u0623\u0629_\u0641\u064a_\u0627\u0644\u0637\u0628\u0639\n\u0645\u0635\u0631\u0641_\u0627\u0644\u0631\u0627\u062c\u062d\u064a https:\/\/t.co\/mGuOo9qwZP","display_text_range":[0,276],"entities":{"hashtags":[{"text":"\u0644\u0627_\u0644\u0644\u062a\u0637\u0639\u064a\u0645_\u0627\u0644\u0627\u062c\u0628\u0627\u0631\u064a","indices":[0,20]}],"urls":[{"url":"https:\/\/t.co\/JjOHPoGLrJ","expanded_url":"https:\/\/Ehsan.sa\/referral\/29734F898E5CCB199DD92FD5CE8284C3B10081046BB13791AC72608BDD62B3D84020AC2B515E485C2E4B718A425C1A710438C7B814161E4197F6E7F4F73557AA","display_url":"Ehsan.sa\/referral\/29734\u2026","indices":[223,246]}],"user_mentions":[],"symbols":[],"media":[{"id":1390855708363698183,"id_str":"1390855708363698183","indices":[277,300],"media_url":"http:\/\/pbs.twimg.com\/media\/E01P5c7XsAcgFlQ.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E01P5c7XsAcgFlQ.jpg","url":"https:\/\/t.co\/mGuOo9qwZP","display_url":"pic.twitter.com\/mGuOo9qwZP","expanded_url":"https:\/\/twitter.com\/aa86marat2\/status\/1390855727116427270\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":554,"h":1200,"resize":"fit"},"large":{"w":720,"h":1560,"resize":"fit"},"small":{"w":314,"h":680,"resize":"fit"}}},{"id":1390855718069317649,"id_str":"1390855718069317649","indices":[277,300],"media_url":"http:\/\/pbs.twimg.com\/media\/E01P6BFXsBEhWDZ.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E01P6BFXsBEhWDZ.jpg","url":"https:\/\/t.co\/mGuOo9qwZP","display_url":"pic.twitter.com\/mGuOo9qwZP","expanded_url":"https:\/\/twitter.com\/aa86marat2\/status\/1390855727116427270\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":511,"h":680,"resize":"fit"},"large":{"w":901,"h":1200,"resize":"fit"},"medium":{"w":901,"h":1200,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":1390855708363698183,"id_str":"1390855708363698183","indices":[277,300],"media_url":"http:\/\/pbs.twimg.com\/media\/E01P5c7XsAcgFlQ.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E01P5c7XsAcgFlQ.jpg","url":"https:\/\/t.co\/mGuOo9qwZP","display_url":"pic.twitter.com\/mGuOo9qwZP","expanded_url":"https:\/\/twitter.com\/aa86marat2\/status\/1390855727116427270\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"medium":{"w":554,"h":1200,"resize":"fit"},"large":{"w":720,"h":1560,"resize":"fit"},"small":{"w":314,"h":680,"resize":"fit"}}},{"id":1390855718069317649,"id_str":"1390855718069317649","indices":[277,300],"media_url":"http:\/\/pbs.twimg.com\/media\/E01P6BFXsBEhWDZ.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E01P6BFXsBEhWDZ.jpg","url":"https:\/\/t.co\/mGuOo9qwZP","display_url":"pic.twitter.com\/mGuOo9qwZP","expanded_url":"https:\/\/twitter.com\/aa86marat2\/status\/1390855727116427270\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":511,"h":680,"resize":"fit"},"large":{"w":901,"h":1200,"resize":"fit"},"medium":{"w":901,"h":1200,"resize":"fit"}}}]}},"quote_count":0,"reply_count":1,"retweet_count":33,"favorite_count":2,"entities":{"hashtags":[{"text":"\u0644\u0627_\u0644\u0644\u062a\u0637\u0639\u064a\u0645_\u0627\u0644\u0627\u062c\u0628\u0627\u0631\u064a","indices":[0,20]}],"urls":[{"url":"https:\/\/t.co\/mpCKrtjMW4","expanded_url":"https:\/\/twitter.com\/i\/web\/status\/1390855727116427270","display_url":"twitter.com\/i\/web\/status\/1\u2026","indices":[117,140]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"ar"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"\u0644\u0627_\u0644\u0644\u062a\u0637\u0639\u064a\u0645_\u0627\u0644\u0627\u062c\u0628\u0627\u0631\u064a","indices":[16,36]}],"urls":[],"user_mentions":[{"screen_name":"aa86marat2","name":"\u0639\u0628\u062f\u0627\u0644\u0631\u062d\u0645\u0646 \u0645\u062d\u0645\u062f 2","id":1338279194892115974,"id_str":"1338279194892115974","indices":[3,14]}],"symbols":[]},"favorited":false,"retweeted":false,"filter_level":"low","lang":"ar","timestamp_ms":"1620466064719"}
{"created_at":"Sat May 08 09:27:44 +0000 2021","id":1390961607799296000,"id_str":"1390961607799296000","text":"RT #ariftgif: Tag ur bestie \/ fav person \/ # third person https:\/\/t.co\/L4pGn5nAKJ","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":812488479607169024,"id_str":"812488479607169024","name":"\ud83c\udfae","screen_name":"bjeyyy_","location":null,"url":null,"description":"always 20 hihihi","translator_type":"none","protected":false,"verified":false,"followers_count":100,"friends_count":108,"listed_count":0,"favourites_count":2519,"statuses_count":48348,"created_at":"Sat Dec 24 02:42:03 +0000 2016","utc_offset":null,"time_zone":null,"geo_enabled":true,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/1381271588394336256\/CnmfqTCN_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/1381271588394336256\/CnmfqTCN_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/812488479607169024\/1582382619","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null,"withheld_in_countries":[]},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Fri May 07 10:38:58 +0000 2021","id":1390617147198480388,"id_str":"1390617147198480388","text":"Tag ur bestie \/ fav person \/ # third person https:\/\/t.co\/L4pGn5nAKJ","display_text_range":[0,43],"source":"\u003ca href=\"http:\/\/twitter.com\/download\/android\" rel=\"nofollow\"\u003eTwitter for Android\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1188902968277385216,"id_str":"1188902968277385216","name":"arif | \u2606","screen_name":"ariftgif","location":"\ud835\udcb6\ud835\udcc3\ud835\udcbf\ud835\udcb6\ud835\udcc8\ud835\udcc2\ud835\udcb6\ud835\udcc7\ud835\udcb6","url":"https:\/\/vt.tiktok.com\/ZSEAw2WN\/","description":"ugly asf | a man with mullet hair","translator_type":"none","protected":false,"verified":false,"followers_count":2144,"friends_count":937,"listed_count":0,"favourites_count":20934,"statuses_count":19470,"created_at":"Mon Oct 28 19:39:09 +0000 2019","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":null,"contributors_enabled":false,"is_translator":false,"profile_background_color":"F5F8FA","profile_background_image_url":"","profile_background_image_url_https":"","profile_background_tile":false,"profile_link_color":"1DA1F2","profile_sidebar_border_color":"C0DEED","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/1387751035041505282\/f_Q5rJ_B_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/1387751035041505282\/f_Q5rJ_B_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1188902968277385216\/1619601431","default_profile":true,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null,"withheld_in_countries":[]},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"quote_count":49,"reply_count":176,"retweet_count":1451,"favorite_count":3773,"entities":{"hashtags":[],"urls":[],"user_mentions":[],"symbols":[],"media":[{"id":1390617137362792450,"id_str":"1390617137362792450","indices":[44,67],"media_url":"http:\/\/pbs.twimg.com\/media\/E0x26ysUcAIYLJd.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E0x26ysUcAIYLJd.jpg","url":"https:\/\/t.co\/L4pGn5nAKJ","display_url":"pic.twitter.com\/L4pGn5nAKJ","expanded_url":"https:\/\/twitter.com\/ariftgif\/status\/1390617147198480388\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":680,"h":667,"resize":"fit"},"medium":{"w":1080,"h":1060,"resize":"fit"},"large":{"w":1080,"h":1060,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":1390617137362792450,"id_str":"1390617137362792450","indices":[44,67],"media_url":"http:\/\/pbs.twimg.com\/media\/E0x26ysUcAIYLJd.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E0x26ysUcAIYLJd.jpg","url":"https:\/\/t.co\/L4pGn5nAKJ","display_url":"pic.twitter.com\/L4pGn5nAKJ","expanded_url":"https:\/\/twitter.com\/ariftgif\/status\/1390617147198480388\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":680,"h":667,"resize":"fit"},"medium":{"w":1080,"h":1060,"resize":"fit"},"large":{"w":1080,"h":1060,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"quote_count":0,"reply_count":0,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[],"urls":[],"user_mentions":[{"screen_name":"ariftgif","name":"arif | \u2606","id":1188902968277385216,"id_str":"1188902968277385216","indices":[3,12]}],"symbols":[],"media":[{"id":1390617137362792450,"id_str":"1390617137362792450","indices":[58,81],"media_url":"http:\/\/pbs.twimg.com\/media\/E0x26ysUcAIYLJd.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E0x26ysUcAIYLJd.jpg","url":"https:\/\/t.co\/L4pGn5nAKJ","display_url":"pic.twitter.com\/L4pGn5nAKJ","expanded_url":"https:\/\/twitter.com\/ariftgif\/status\/1390617147198480388\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":680,"h":667,"resize":"fit"},"medium":{"w":1080,"h":1060,"resize":"fit"},"large":{"w":1080,"h":1060,"resize":"fit"}},"source_status_id":1390617147198480388,"source_status_id_str":"1390617147198480388","source_user_id":1188902968277385216,"source_user_id_str":"1188902968277385216"}]},"extended_entities":{"media":[{"id":1390617137362792450,"id_str":"1390617137362792450","indices":[58,81],"media_url":"http:\/\/pbs.twimg.com\/media\/E0x26ysUcAIYLJd.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/E0x26ysUcAIYLJd.jpg","url":"https:\/\/t.co\/L4pGn5nAKJ","display_url":"pic.twitter.com\/L4pGn5nAKJ","expanded_url":"https:\/\/twitter.com\/ariftgif\/status\/1390617147198480388\/photo\/1","type":"photo","sizes":{"thumb":{"w":150,"h":150,"resize":"crop"},"small":{"w":680,"h":667,"resize":"fit"},"medium":{"w":1080,"h":1060,"resize":"fit"},"large":{"w":1080,"h":1060,"resize":"fit"}},"source_status_id":1390617147198480388,"source_status_id_str":"1390617147198480388","source_user_id":1188902968277385216,"source_user_id_str":"1188902968277385216"}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1620466064773"}
I am new to spark and already spent 3 days finding out how to get value from Nested JSON Inside DStream Object.
You can map the json string to a tuple of the values that you want to extract:
import json
def parse_json_string(str):
user = json.loads(str)["user"]
return (user["name"], user["followers_count"])
stream = socket_stream.window...
stream = stream.map(lambda str: parse_json_string(str))
stream.pprint()
ssc.start()
ssc.awaitTermination()
Output:
-------------------------------------------
Time: 2021-05-08 18:50:48
-------------------------------------------
('Nahoj Morts', 15)
('KISSMEMYFAN1106♡', 91307)
('om_mhamd99', 903)
('🎮', 100)
I would like to use a webservice who deliver a binary file with some data, I know the result but I don't know how I can decode this binary with a script.
Here is my binary :
https://pastebin.com/3vnM8CVk
0a39 0a06 3939 3831 3438 1206 4467 616d
6178 1a0b 6361 7264 6963 6f6e 5f33 3222
0d54 6865 204f 6c64 2047 7561 7264 2a02
....
Some part are in ASCII so it easy to catch, at the end of the file you got vehicle name in ASCII and some data, it should be kill/victory/battle/XP/Money data but I don't understand how I can decode these hexa value, I tried to compare 2 vehicles who got same kills but I don't see any match.
There is a way to decode these data ?
Thanks :)
Hello guys, after 1 year I started again to find a solution, so here is the structure of the packet I guessed : (the part between [ ] I still don't know what is it for)
[52 37 08 01 10] 4E [18] EA [01 25] AB AA AA 3E [28] D4 [01 30] EC [01 38] 88 01 [40] 91 05 [48] 9F CA 22 [50] F5 C2 9A 02 [5A 12]
| | | | | | | | |
Victories Victory Ratio| | Air target| Xp Money earned
| | | Ground Target
Battles Deaths Respawns
So here is the result :
Victory : 78
Battles : 234
Victory Ratio : ? (should be arround 33%)
Deaths : 212
Respawns : 236
Air Target : 136
Ground Target : 657
Xp : ? (should be arround 566.56k)
Money : ? (should be arround 4.63M)
Is there a special way to calculate the result of a long hex like this ?
F5 C2 9A 02 (should be arround 4.63M)
I tell you a bit more :
I know the result, but I don't know how to calculate it with these hex from the packet.
If I check a packet with a small amout of money or XP to be compatible with one hex :
[52 1E 08 01 10] 01 [18] [01 25] 00 00 80 3F [28] 01 [30] 01 [48] 24 [50] 6E [5A 09]
6E = 110 Money earned
24 = 36 XP earned
Another exemple :
[52 21 08 01 10] 02 [18] 03 [25] AB AA 2A 3F [28] 02 [30] 03 [40] 01 [48] 78 [50] C7 08 [5A 09]
XP earned = hex 78 = 120
Money earned = hex C7 08 = 705
How C7 08 can do 705 decimal ?
Here is the full content in case but I know how to isolate just these part I don't need to decode all these hex data :
https://pastebin.com/vAKPynNb
What you have asked is nothing but how to reverse engineer a binary file. Lot of threads already on SO
Reverse engineer a binary dictionary file to extract strings
Tools to help reverse engineer binary file formats
https://reverseengineering.stackexchange.com/questions/3495/what-tools-exist-for-excavating-data-structures-from-flat-binary-files
http://www.iwriteiam.nl/Ha_HTCABFF.html
The final take out on all is that no single solution for you, you need to spend effort to figure it out. There are tools to help you, but don't expect a magic wand tool to give you the structure/data.
Any kind of file read operation is done in text or binary format with basic file handlers. And some languages offer type reading of int, float etc. or arrays of them.
The complex operations behind these reading are almost always kept hidden from normal users. And then the user has to learn more when it comes to read/write operations of data structures.
In this case, OFFSET and SEEK are the words one must find value and act accordingly. whence data read, it must be converted to suitable data type too.
The following code shows basics for these operations to write data and read blocks to get numbers back. It is written in PHP as the OP has commented in the question he uses PHP.
Offset is calculated with these byte values to be 11: char: 1 byte, short: 2 bytes, int: 4 bytes, float: 4 bytes.
<?php
$filename = "testdata.dat";
$filehandle = fopen($filename, "w+");
$data=["test string","another test string",77,777,77777,7.77];
fwrite($filehandle,$data[0]);
fwrite($filehandle,$data[1]);
$numbers=array_slice($data,2);
fwrite($filehandle,pack("c1s1i1f1",...$numbers));
fwrite($filehandle,"end"); // gives 3 to offset
fclose($filehandle);
$filename = "testdata.dat";
$filehandle = fopen($filename, "rb+");
$offset=filesize($filename)-11-3;
fseek($filehandle,$offset);
$numberblock= fread($filehandle,11);
$numbersback=unpack("c1a/s1b/i1c/f1d",$numberblock);
var_dump($numbersback);
fclose($filehandle);
?>
Once this example understood, the rest is to find the data structure in the requested file. I had written another example but it uses assumptions. I leave the rest to readers to find what assumptions I made here. Be careful though: I know nothing about real structure and values will not be correct.
<?php
$filename = "testfile";
$filehandle = fopen($filename, "rb");
$offset=17827-2*41; //filesize minus 2 user area
fseek($filehandle,$offset);
print $user1 = fread($filehandle, 41);echo "<br>";
$user1pr=unpack("s1kill/s1victory/s1battle/s1XP/s1Money/f1Life",$user1);
var_dump($user1pr); echo "<br>";
fseek($filehandle,$offset+41);
print $user2 = fread($filehandle, 41);echo "<br>";
$user2pr=unpack("s1kill/s1victory/s1battle/i1XP/i1Money/f1Life",$user2);
var_dump($user2pr); echo "<br>";
echo "<br><br>";
$repackeduser2=pack("s3i2f1",$user2pr["kill"],$user2pr["victory"],
$user2pr["battle"],$user2pr["XP"],$user2pr["Money"],
$user2pr["Life"]
);
print $user2 . "<br>" .$repackeduser2;
print "<br>3*s1=6bytes, 2*i=6bytes, 1*f=*bytes (machine dependent)<br>";
print pack("s1",$user2pr["kill"]) ."<br>";
print pack("s1",$user2pr["victory"]) ."<br>";
print pack("s1",$user2pr["battle"]) ."<br>";
print pack("i1",$user2pr["XP"]) ."<br>";
print pack("i1",$user2pr["Money"]) ."<br>";
print pack("f1",$user2pr["Life"]) ."<br>";
fclose($filehandle);
?>
PS: pack and unpack uses machine dependent size for some data types such as int and float, so be careful with working them. Read Official PHP:pack and PHP:unpack manuals.
This looks more like the hexdump of a binary file. Some methods of converting hex to strings resulted in the same scrambled output. Only some lines are readable like this...
Dgamaxcardicon_32" The Old Guard
As #Tarun Lalwani said, you would have to know the structure of this data to get the in plaintext.
If you have access to the raw binary, you could try using strings https://serverfault.com/questions/51477/linux-command-to-find-strings-in-binary-or-non-ascii-file
What I'm trying to do here is to leave texts only from each tweet.
import org.apache.spark.{SparkConf, SparkContext}
import scala.io.Source
object shortTwitter {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("ShortTwitterAnalysis").setMaster("local[2]")
val sc = new SparkContext(sparkConf)
val text = sc.textFile("/home/tobby/data/shortTwitter.txt")
val counts = text
.map(_.toLowerCase)
.map(_.toString)
.map(_.replace("\t", ""))
.map(_.replace("\"", ""))
.map(_.replace("\n", ""))
.map(_.replaceAll("[\\p{C}]", ""))
.map(_.split("\"text\":\"")(1).split("\",\"source\":")(0))
counts.foreach(println)
}
}
But the last map function .map(_.split("\"text\":\"")(1).split("\",\"source\":")(0)) does not work. Do you have any advice?
Without the .map(_.split("\"text\":\"")(1).split("\",\"source\":")(0)) my tweets look like below :
{created_at:wed jul 16 23:58:19 +0000 2014,id:489559687189110784,id_str:489559687189110784,text:a rose by any other name would smell as sweet,source:\u003ca href=\https:\/\/twitter.com\/download\/android\ rel=\nofollow\\u003etwitter for android\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:621244372,id_str:621244372,name:\u2665,screen_name:ivunia_ontrinae,location:,url:null,description:me myself & i \u2764,protected:false,verified:false,followers_count:1023,friends_count:591,listed_count:1,favourites_count:1909,statuses_count:26770,created_at:thu jun 28 19:23:06 +0000 2012,utc_offset:-10800,time_zone:atlantic time (canada),geo_enabled:true,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:c0deed,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/378800000101658269\/ec0820565f0451a3ce7169c776fbe41f.jpeg,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/378800000101658269\/ec0820565f0451a3ce7169c776fbe41f.jpeg,profile_background_tile:true,profile_link_color:e62bb4,profile_sidebar_border_color:000000,profile_sidebar_fill_color:ddeef6,profile_text_color:333333,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/483373612749959168\/f3qpy_66_normal.jpeg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/483373612749959168\/f3qpy_66_normal.jpeg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/621244372\/1404758956,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweet_count:0,favorite_count:0,entities:{hashtags:[],trends:[],urls:[],user_mentions:[],symbols:[]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:medium,lang:en}
{created_at:wed jul 16 23:58:19 +0000 2014,id:489559687189110784,id_str:489559687189110784,text:a rose is a rose is a rose,source:\u003ca href=\https:\/\/twitter.com\/download\/android\ rel=\nofollow\\u003etwitter for android\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:621244372,id_str:621244372,name:\u2665,screen_name:ivunia_ontrinae,location:,url:null,description:me myself & i \u2764,protected:false,verified:false,followers_count:1023,friends_count:591,listed_count:1,favourites_count:1909,statuses_count:26770,created_at:thu jun 28 19:23:06 +0000 2012,utc_offset:-10800,time_zone:atlantic time (canada),geo_enabled:true,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:c0deed,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/378800000101658269\/ec0820565f0451a3ce7169c776fbe41f.jpeg,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/378800000101658269\/ec0820565f0451a3ce7169c776fbe41f.jpeg,profile_background_tile:true,profile_link_color:e62bb4,profile_sidebar_border_color:000000,profile_sidebar_fill_color:ddeef6,profile_text_color:333333,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/483373612749959168\/f3qpy_66_normal.jpeg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/483373612749959168\/f3qpy_66_normal.jpeg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/621244372\/1404758956,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweet_count:0,favorite_count:0,entities:{hashtags:[],trends:[],urls:[],user_mentions:[],symbols:[]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:medium,lang:en}
{created_at:wed jul 16 23:58:19 +0000 2014,id:489559687176945664,id_str:489559687176945664,text:love is like a rose the joy of all the earth,source:\u003ca href=\http:\/\/twitter.com\/download\/iphone\ rel=\nofollow\\u003etwitter for iphone\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:363819213,id_str:363819213,name:ivanna010394,screen_name:ivannacarrillo,location:,url:null,description:null,protected:false,verified:false,followers_count:243,friends_count:530,listed_count:0,favourites_count:26,statuses_count:5672,created_at:sun aug 28 18:58:49 +0000 2011,utc_offset:-14400,time_zone:eastern time (us & canada),geo_enabled:false,lang:es,contributors_enabled:false,is_translator:false,profile_background_color:642d8b,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/767201253\/661eb2d4915e9ee6566647dcbaab0186.jpeg,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/767201253\/661eb2d4915e9ee6566647dcbaab0186.jpeg,profile_background_tile:true,profile_link_color:ff0000,profile_sidebar_border_color:ffffff,profile_sidebar_fill_color:7ac3ee,profile_text_color:3d1957,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/455873054703648768\/_b4mf6o7_normal.jpeg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/455873054703648768\/_b4mf6o7_normal.jpeg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/363819213\/1402261141,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweeted_status:{created_at:wed jul 16 13:45:28 +0000 2014,id:489405458168709120,id_str:489405458168709120,text:our milan show is now sold out, thankyou :d tickets are still available for most of europe ! http:\/\/t.co\/arnh7pvoap http:\/\/t.co\/t5wzyocrtu,source:\u003ca href=\http:\/\/twitter.com\ rel=\nofollow\\u003etwitter web client\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:264107729,id_str:264107729,name:5 seconds of summer,screen_name:5sos,location:sydney, australia,url:http:\/\/www.facebook.com\/5secondsofsummer,description:4 aussies making music :) love the people who support us! our album is out :) http:\/\/po.st\/or93y4 | #ashton5sos #calum5sos #michael5sos #luke5sos,protected:false,verified:true,followers_count:3704204,friends_count:28660,listed_count:20024,favourites_count:1061,statuses_count:17297,created_at:fri mar 11 10:18:46 +0000 2011,utc_offset:36000,time_zone:sydney,geo_enabled:false,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:000000,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/483531430371147778\/0gzkh2zi.jpeg,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/483531430371147778\/0gzkh2zi.jpeg,profile_background_tile:false,profile_link_color:c21b1b,profile_sidebar_border_color:ffffff,profile_sidebar_fill_color:ddeef6,profile_text_color:333333,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/485730748574752768\/zm1ctcvv_normal.jpeg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/485730748574752768\/zm1ctcvv_normal.jpeg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/264107729\/1404117825,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweet_count:12648,favorite_count:31390,entities:{hashtags:[],trends:[],urls:[{url:http:\/\/t.co\/arnh7pvoap,expanded_url:http:\/\/5sos.com\/live,display_url:5sos.com\/live,indices:[93,115]}],user_mentions:[],symbols:[],media:[{id:489405457111715840,id_str:489405457111715840,indices:[116,138],media_url:http:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,url:http:\/\/t.co\/t5wzyocrtu,display_url:pic.twitter.com\/t5wzyocrtu,expanded_url:http:\/\/twitter.com\/5sos\/status\/489405458168709120\/photo\/1,type:photo,sizes:{small:{w:340,h:613,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:1081,resize:fit},large:{w:811,h:1461,resize:fit}}}]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:low,lang:en},retweet_count:0,favorite_count:0,entities:{hashtags:[],trends:[],urls:[{url:http:\/\/t.co\/arnh7pvoap,expanded_url:http:\/\/5sos.com\/live,display_url:5sos.com\/live,indices:[103,125]}],user_mentions:[{screen_name:5sos,name:5 seconds of summer,id:264107729,id_str:264107729,indices:[3,8]}],symbols:[],media:[{id:489405457111715840,id_str:489405457111715840,indices:[126,140],media_url:http:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,url:http:\/\/t.co\/t5wzyocrtu,display_url:pic.twitter.com\/t5wzyocrtu,expanded_url:http:\/\/twitter.com\/5sos\/status\/489405458168709120\/photo\/1,type:photo,sizes:{small:{w:340,h:613,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:1081,resize:fit},large:{w:811,h:1461,resize:fit}},source_status_id:489405458168709120,source_status_id_str:489405458168709120}]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:medium,lang:en}
{created_at:sat jan 16 12:00:47 +0000 2016,id:688330052233199616,id_str:688330052233199616,text:rt #nba2k: the battle of two young teams. tough season but one will emerge victorious. who will it be? lakers or 76ers? https:\/\/t.co\/nukkjq\u2026,source:\u003ca href=\http:\/\/twitter.com\ rel=\nofollow\\u003etwitter web client\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:4817727209,id_str:4817727209,name:mark lieyg,screen_name:_yungwiggins_,location:null,url:null,description:null,protected:false,verified:false,followers_count:3,friends_count:40,listed_count:0,favourites_count:0,statuses_count:39,created_at:sat jan 16 11:06:38 +0000 2016,utc_offset:-28800,time_zone:pacific time (us & canada),geo_enabled:false,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:f5f8fa,profile_background_image_url:,profile_background_image_url_https:,profile_background_tile:false,profile_link_color:2b7bb9,profile_sidebar_border_color:c0deed,profile_sidebar_fill_color:ddeef6,profile_text_color:333333,profile_use_background_image:true,profile_image_url:http:\/\/abs.twimg.com\/sticky\/default_profile_images\/default_profile_1_normal.png,profile_image_url_https:https:\/\/abs.twimg.com\/sticky\/default_profile_images\/default_profile_1_normal.png,default_profile:true,default_profile_image:true,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweeted_status: {created_at:sat jan 02 03:31:10 +0000 2016,id:683128371627200513,id_str:683128371627200513,text:the battle of two young teams. tough season but one will emerge victorious. who will it be? lakers or 76ers? https:\/\/t.co\/nukkjqqspa,source:\u003ca href=\http:\/\/percolate.com\ rel=\nofollow\\u003epercolate\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:15573174,id_str:15573174,name:nba 2k 2k16,screen_name:nba2k,location:novato, ca,url:http:\/\/www.2k.com,description:esrb rating: everyone 10+. #nba2k16 available now for playstation 4 & xbox one, playstation 3 & xbox 360 & pc http:\/\/2kgam.es\/buynba2k16,protected:false,verified:true,followers_count:948071,friends_count:1630,listed_count:3305,favourites_count:10,statuses_count:8162,created_at:wed jul 23 21:57:14 +0000 2008,utc_offset:-28800,time_zone:pacific time (us & canada),geo_enabled:true,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:000000,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/539865904528371712\/gnb-ggrq.png,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/539865904528371712\/gnb-ggrq.png,profile_background_tile:false,profile_link_color:ff0300,profile_sidebar_border_color:ffffff,profile_sidebar_fill_color:0d2b44,profile_text_color:408af2,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/606562975109890048\/sumjozun_normal.jpg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/606562975109890048\/sumjozun_normal.jpg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/15573174\/1433457451,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,is_quote_status:false,retweet_count:112,favorite_count:547,entities:{hashtags:[],urls:[],user_mentions:[],symbols:[],media:[{id:683128370796736512,id_str:683128370796736512,indices:[109,132],media_url:http:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,url:https:\/\/t.co\/nukkjqqspa,display_url:pic.twitter.com\/nukkjqqspa,expanded_url:http:\/\/twitter.com\/nba2k\/status\/683128371627200513\/photo\/1,type:photo,sizes:{large:{w:1024,h:419,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:245,resize:fit},small:{w:340,h:139,resize:fit}}}]},extended_entities:{media:[{id:683128370796736512,id_str:683128370796736512,indices:[109,132],media_url:http:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,url:https:\/\/t.co\/nukkjqqspa,display_url:pic.twitter.com\/nukkjqqspa,expanded_url:http:\/\/twitter.com\/nba2k\/status\/683128371627200513\/photo\/1,type:photo,sizes:{large:{w:1024,h:419,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:245,resize:fit},small:{w:340,h:139,resize:fit}}}]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:low,lang:en},is_quote_status:false,retweet_count:0,favorite_count:0,entities:{hashtags:[],urls:[],user_mentions:[{screen_name:nba2k,name:nba 2k 2k16,id:15573174,id_str:15573174,indices:[3,9]}],symbols:[],media:[{id:683128370796736512,id_str:683128370796736512,indices:[120,140],media_url:http:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,url:https:\/\/t.co\/nukkjqqspa,display_url:pic.twitter.com\/nukkjqqspa,expanded_url:http:\/\/twitter.com\/nba2k\/status\/683128371627200513\/photo\/1,type:photo,sizes:{large:{w:1024,h:419,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:245,resize:fit},small:{w:340,h:139,resize:fit}},source_status_id:683128371627200513,source_status_id_str:683128371627200513,source_user_id:15573174,source_user_id_str:15573174}]},extended_entities:{media:[{id:683128370796736512,id_str:683128370796736512,indices:[120,140],media_url:http:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,url:https:\/\/t.co\/nukkjqqspa,display_url:pic.twitter.com\/nukkjqqspa,expanded_url:http:\/\/twitter.com\/nba2k\/status\/683128371627200513\/photo\/1,type:photo,sizes:{large:{w:1024,h:419,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:245,resize:fit},small:{w:340,h:139,resize:fit}},source_status_id:683128371627200513,source_status_id_str:683128371627200513,source_user_id:15573174,source_user_id_str:15573174}]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:low,lang:en,timestamp_ms:1452945647663}
{created_at:wed jul 16 23:58:19 +0000 2014,id:489559687176945664,id_str:489559687176945664,text:at christmas i no more desire a rose than wish a snow in may’s new-fangled mirth,source:\u003ca href=\http:\/\/twitter.com\/download\/iphone\ rel=\nofollow\\u003etwitter for iphone\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:363819213,id_str:363819213,name:ivanna010394,screen_name:ivannacarrillo,location:,url:null,description:null,protected:false,verified:false,followers_count:243,friends_count:530,listed_count:0,favourites_count:26,statuses_count:5672,created_at:sun aug 28 18:58:49 +0000 2011,utc_offset:-14400,time_zone:eastern time (us & canada),geo_enabled:false,lang:es,contributors_enabled:false,is_translator:false,profile_background_color:642d8b,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/767201253\/661eb2d4915e9ee6566647dcbaab0186.jpeg,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/767201253\/661eb2d4915e9ee6566647dcbaab0186.jpeg,profile_background_tile:true,profile_link_color:ff0000,profile_sidebar_border_color:ffffff,profile_sidebar_fill_color:7ac3ee,profile_text_color:3d1957,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/455873054703648768\/_b4mf6o7_normal.jpeg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/455873054703648768\/_b4mf6o7_normal.jpeg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/363819213\/1402261141,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweeted_status:{created_at:wed jul 16 13:45:28 +0000 2014,id:489405458168709120,id_str:489405458168709120,text:our milan show is now sold out, thankyou :d tickets are still available for most of europe ! http:\/\/t.co\/arnh7pvoap http:\/\/t.co\/t5wzyocrtu,source:\u003ca href=\http:\/\/twitter.com\ rel=\nofollow\\u003etwitter web client\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:264107729,id_str:264107729,name:5 seconds of summer,screen_name:5sos,location:sydney, australia,url:http:\/\/www.facebook.com\/5secondsofsummer,description:4 aussies making music :) love the people who support us! our album is out :) http:\/\/po.st\/or93y4 | #ashton5sos #calum5sos #michael5sos #luke5sos,protected:false,verified:true,followers_count:3704204,friends_count:28660,listed_count:20024,favourites_count:1061,statuses_count:17297,created_at:fri mar 11 10:18:46 +0000 2011,utc_offset:36000,time_zone:sydney,geo_enabled:false,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:000000,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/483531430371147778\/0gzkh2zi.jpeg,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/483531430371147778\/0gzkh2zi.jpeg,profile_background_tile:false,profile_link_color:c21b1b,profile_sidebar_border_color:ffffff,profile_sidebar_fill_color:ddeef6,profile_text_color:333333,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/485730748574752768\/zm1ctcvv_normal.jpeg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/485730748574752768\/zm1ctcvv_normal.jpeg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/264107729\/1404117825,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweet_count:12648,favorite_count:31390,entities:{hashtags:[],trends:[],urls:[{url:http:\/\/t.co\/arnh7pvoap,expanded_url:http:\/\/5sos.com\/live,display_url:5sos.com\/live,indices:[93,115]}],user_mentions:[],symbols:[],media:[{id:489405457111715840,id_str:489405457111715840,indices:[116,138],media_url:http:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,url:http:\/\/t.co\/t5wzyocrtu,display_url:pic.twitter.com\/t5wzyocrtu,expanded_url:http:\/\/twitter.com\/5sos\/status\/489405458168709120\/photo\/1,type:photo,sizes:{small:{w:340,h:613,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:1081,resize:fit},large:{w:811,h:1461,resize:fit}}}]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:low,lang:en},retweet_count:0,favorite_count:0,entities:{hashtags:[],trends:[],urls:[{url:http:\/\/t.co\/arnh7pvoap,expanded_url:http:\/\/5sos.com\/live,display_url:5sos.com\/live,indices:[103,125]}],user_mentions:[{screen_name:5sos,name:5 seconds of summer,id:264107729,id_str:264107729,indices:[3,8]}],symbols:[],media:[{id:489405457111715840,id_str:489405457111715840,indices:[126,140],media_url:http:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,url:http:\/\/t.co\/t5wzyocrtu,display_url:pic.twitter.com\/t5wzyocrtu,expanded_url:http:\/\/twitter.com\/5sos\/status\/489405458168709120\/photo\/1,type:photo,sizes:{small:{w:340,h:613,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:1081,resize:fit},large:{w:811,h:1461,resize:fit}},source_status_id:489405458168709120,source_status_id_str:489405458168709120}]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:medium,lang:en}
{created_at:sat jan 16 12:00:48 +0000 2016,id:688330056410755072,id_str:688330056410755072,text:i was going to bake a cake and listen to the football. flour refund?,source:\u003ca href=\http:\/\/twitter.com\/download\/iphone\ rel=\nofollow\\u003etwitter for iphone\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:252303653,id_str:252303653,name:pete blackman,screen_name:peteblackman,location:null,url:null,description:null,protected:false,verified:false,followers_count:409,friends_count:903,listed_count:18,favourites_count:5664,statuses_count:22919,created_at:mon feb 14 22:44:37 +0000 2011,utc_offset:3600,time_zone:amsterdam,geo_enabled:false,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:c0deed,profile_background_image_url:http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png,profile_background_image_url_https:https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png,profile_background_tile:false,profile_link_color:0084b4,profile_sidebar_border_color:c0deed,profile_sidebar_fill_color:ddeef6,profile_text_color:333333,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/2600097910\/image_normal.jpg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/2600097910\/image_normal.jpg,default_profile:true,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,is_quote_status:false,retweet_count:0,favorite_count:0,entities:{hashtags:[],urls:[],user_mentions:[],symbols:[]},favorited:false,retweeted:false,filter_level:low,lang:en,timestamp_ms:1452945648659}
Or is there any other way but using split? I would really appreciate your tips.
The error is as below.
16/09/18 22:49:37 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
Hi hope I understand the question correctly, you are attempting to read a file and with the text mentioned above and then print the "text" mentioned in file containing json
If the above assumption is correct, here a simple code which would do this:
val matchingPattern = "(?i)(text:)(.+?)(,source:)".r
val tweets = scala.io.Source.fromPath("/home/tobby/data/shortTwitter.txt").getLines.reduceLeft(_+_)
matchingPattern.findAllIn(tweets).matchData foreach { m => println(m.group(2)) }
Hope it helps, if the above assumption is not correct please provide a sample input and expected output
I've been using the searchTwitter function from the Twitter REST API to retrieve a certain amount of tweets and I've dumped this to a TXT file.
The structure of this TXT file is:
"text" "favorited" "favoriteCount" "replyToSN" "created" "truncated" "replyToSID" "id" "replyToUID" "statusSource" "screenName" "retweetCount" "isRetweet" "retweeted" "longitude" "latitude"
"1" "RT #kobebryant: Last night was the final chapter to an incredible story. I walk away at peace knowing my love for the game & this city will…" FALSE 0 NA 2016-04-14 23:59:59 FALSE NA "720763566027096066" NA "Twitter for iPhone" "JtLONGWAY" 204125 TRUE FALSE NA NA
"2" "RT #kobebryant: Last night was the final chapter to an incredible story. I walk away at peace knowing my love for the game & this city will…" FALSE 0 NA 2016-04-14 23:59:59 FALSE NA "720763566014332928" NA "Twitter for Android" "Mr_Wizrd" 204125 TRUE FALSE NA NA
"3" "RT #MagicJohnson: I got a chance to get to know #kobebryant away from the court at the #Dodgers game! #ThankYouKobe #KB20 https://twitter.com/sVsW…" FALSE 0 NA 2016-04-14 23:59:59 FALSE NA "720763563783110661" NA "Twitter for iPhone" "TynashKobe" 777 TRUE FALSE NA NA
and I would like to have this as a JSON structure, i.e.
{"created_at":"Wed Apr 13 22:06:02 +0000 2016","id":720372500065071104,"id_str":"720372500065071104","text":"RT #STAPLESCenter: This is where #kobebryant will hold is final press conference tonight. #ThankYouKobe https:\/\/t.co\/1rTiq5eAS9","source":"\u003ca href=\"http:\/\/tweetlogix.com\" rel=\"nofollow\"\u003eTweetlogix\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":149681225,"id_str":"149681225","name":"SP","screen_name":"Mr_LayedBak","location":"West side of Detroit","url":null,"description":"Unfollow me if you're easily offended","protected":false,"verified":false,"followers_count":4326,"friends_count":597,"listed_count":105,"favourites_count":371,"statuses_count":227845,"created_at":"Sat May 29 23:21:29 +0000 2010","utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/736248613\/7d89d45f16e6c4e508a883aded1aac64.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/736248613\/7d89d45f16e6c4e508a883aded1aac64.jpeg","profile_background_tile":true,"profile_link_color":"141313","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"660A0A","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/719706881736974341\/XT8R51s8_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/719706881736974341\/XT8R51s8_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/149681225\/1452265608","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Wed Apr 13 21:37:36 +0000 2016","id":720365343500144640,"id_str":"720365343500144640","text":"This is where #kobebryant will hold is final press conference tonight. #ThankYouKobe https:\/\/t.co\/1rTiq5eAS9","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":28725783,"id_str":"28725783","name":"STAPLES Center","screen_name":"STAPLESCenter","location":"Los Angeles","url":"http:\/\/www.staplescenter.com","description":"Sports and Entertainment Center of the World located in downtown Los Angeles #LALIVE since 1999. Instagram: #staplescenterla","protected":false,"verified":true,"followers_count":82891,"friends_count":10907,"listed_count":862,"favourites_count":1905,"statuses_count":11024,"created_at":"Sat Apr 04 03:04:17 +0000 2009","utc_offset":-25200,"time_zone":"Pacific Time (US & Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/553367185700036609\/q6Kh8Ru8.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/553367185700036609\/q6Kh8Ru8.jpeg","profile_background_tile":true,"profile_link_color":"009999","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/2394735481\/7rom2fzqu1vwrq94yzll_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/2394735481\/7rom2fzqu1vwrq94yzll_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/28725783\/1416251684","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":264,"favorite_count":439,"entities":{"hashtags":[{"text":"ThankYouKobe","indices":[71,84]}],"urls":[],"user_mentions":[{"screen_name":"kobebryant","name":"Kobe Bryant","id":1059194370,"id_str":"1059194370","indices":[14,25]}],"symbols":[],"media":[{"id":720365333593260032,"id_str":"720365333593260032","indices":[85,108],"media_url":"http:\/\/pbs.twimg.com\/media\/Cf9AdElVAAA7BqM.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/Cf9AdElVAAA7BqM.jpg","url":"https:\/\/t.co\/1rTiq5eAS9","display_url":"pic.twitter.com\/1rTiq5eAS9","expanded_url":"http:\/\/twitter.com\/STAPLESCenter\/status\/720365343500144640\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":425,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1024,"h":1280,"resize":"fit"},"medium":{"w":600,"h":750,"resize":"fit"}}}]},"extended_entities":{"media":[{"id":720365333593260032,"id_str":"720365333593260032","indices":[85,108],"media_url":"http:\/\/pbs.twimg.com\/media\/Cf9AdElVAAA7BqM.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/Cf9AdElVAAA7BqM.jpg","url":"https:\/\/t.co\/1rTiq5eAS9","display_url":"pic.twitter.com\/1rTiq5eAS9","expanded_url":"http:\/\/twitter.com\/STAPLESCenter\/status\/720365343500144640\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":425,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1024,"h":1280,"resize":"fit"},"medium":{"w":600,"h":750,"resize":"fit"}}}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"ThankYouKobe","indices":[90,103]}],"urls":[],"user_mentions":[{"screen_name":"STAPLESCenter","name":"STAPLES Center","id":28725783,"id_str":"28725783","indices":[3,17]},{"screen_name":"kobebryant","name":"Kobe Bryant","id":1059194370,"id_str":"1059194370","indices":[33,44]}],"symbols":[],"media":[{"id":720365333593260032,"id_str":"720365333593260032","indices":[104,127],"media_url":"http:\/\/pbs.twimg.com\/media\/Cf9AdElVAAA7BqM.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/Cf9AdElVAAA7BqM.jpg","url":"https:\/\/t.co\/1rTiq5eAS9","display_url":"pic.twitter.com\/1rTiq5eAS9","expanded_url":"http:\/\/twitter.com\/STAPLESCenter\/status\/720365343500144640\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":425,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1024,"h":1280,"resize":"fit"},"medium":{"w":600,"h":750,"resize":"fit"}},"source_status_id":720365343500144640,"source_status_id_str":"720365343500144640","source_user_id":28725783,"source_user_id_str":"28725783"}]},"extended_entities":{"media":[{"id":720365333593260032,"id_str":"720365333593260032","indices":[104,127],"media_url":"http:\/\/pbs.twimg.com\/media\/Cf9AdElVAAA7BqM.jpg","media_url_https":"https:\/\/pbs.twimg.com\/media\/Cf9AdElVAAA7BqM.jpg","url":"https:\/\/twitter.com\/1rTiq5eAS9","display_url":"pic.twitter.com\/1rTiq5eAS9","expanded_url":"http:\/\/twitter.com\/STAPLESCenter\/status\/720365343500144640\/photo\/1","type":"photo","sizes":{"small":{"w":340,"h":425,"resize":"fit"},"thumb":{"w":150,"h":150,"resize":"crop"},"large":{"w":1024,"h":1280,"resize":"fit"},"medium":{"w":600,"h":750,"resize":"fit"}},"source_status_id":720365343500144640,"source_status_id_str":"720365343500144640","source_user_id":28725783,"source_user_id_str":"28725783"}]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1460585162546"}
I've been trying to load the TXT file with the read.csv(file, header = TRUE, sep ="") and the 1st problem I've found is that since the TXT is formed having the white space as separator for the header, then I get an error saying that there are more columns in the rows than in the header (of course as I'm trying to process also the text from the tweets).
If I don't specify the separator (i.e. read.csv(file)) and I dump the content in a dataframe, then I only get 1 column.
Any hint?
You could do something like
txt <- readLines("myfile.txt")
df <- read.table(text=sub("\\d+-\\d+-\\d+ \\d+:\\d+:\\d+", '"\\1"', txt), header=T)
library(jsonlite)
toJSON(df)
# [{"text":"RT #kobebryant: Last night was the final chapter to an incredible story. I walk ...
Problems arise, because the datetime column created is not wrapped in quotes. Thus, date and time get separated - and the number of header fields does not match anymore. (This simple approach may break if there are for example similar patterns in the textcolumn.)
I am trying to read a JSON file into Pandas. It's a relatively large file (41k records) mostly text.
{"sanders": [{"date": "February 8, 2016 Monday", "source": "Federal News
Service", "subsource": "MSNBC \"MSNBC Live\" Interview with Sen. Bernie
Sanders (I-VT), Democratic", "quotes": ["Well, it's not very progressive to
take millions of dollars from Wall Street as well.", "That's a very good
question, and I wish I could give her a definitive answer. QUOTE SHORTENED FOR
SPACE"]}, {"date": "February 7, 2016 Sunday", "source": "CBS News Transcripts", "subsource": "SHOW: CBS FACE THE NATION 10:30 AM EST", "quotes":
["Well, John -- John, I think that`s a media narrative that goes around and
around and around. I don`t accept that media narrative.", "Well, that`s what
she said about Barack Obama in 2008. "]},
I tried:
quotes = pd.read_json("/quotes.json")
I expected it to read in cleanly because it was file created in python. However, I got this error:
ValueError Traceback (most recent call last)
<ipython-input-19-c1acfdf0dbc6> in <module>()
----> 1 quotes = pd.read_json("/Users/kate/Documents/99Antennas/Client\
Files/Fusion/data/quotes.json")
/Users/kate/venv/lib/python2.7/site-packages/pandas/io/json.pyc in
read_json(path_or_buf, orient, typ, dtype, convert_axes, convert_dates,
keep_default_dates, numpy, precise_float, date_unit)
208 obj = FrameParser(json, orient, dtype, convert_axes,
convert_dates,
209 keep_default_dates, numpy, precise_float,
--> 210 date_unit).parse()
211
212 if typ == 'series' or obj is None:
/Users/kate/venv/lib/python2.7/site-packages/pandas/io/json.pyc in parse(self)
276
277 else:
--> 278 self._parse_no_numpy()
279
280 if self.obj is None:
/Users/kate/venv/lib/python2.7/site-packages/pandas/io/json.pyc in _
parse_no_numpy(self)
493 if orient == "columns":
494 self.obj = DataFrame(
--> 495 loads(json, precise_float=self.precise_float),
dtype=None)
496 elif orient == "split":
497 decoded = dict((str(k), v)
ValueError: Expected object or value
After reading the documentation and stackoverflow, I also tried adding convert_dates=False to the parameters, but that did not correct the problem. I would welcome suggestions as to how to handle this error.
Try removing the forward slash in the filename. If you run this python code from the same directory where the file is sitting, it should work.
quotes = pd.read_json("quotes.json")
SPKoder mentioned the forward slash. I was looking for an answer when I realized I hadn't added a / when combing filename and path (i.e. c:/path/herefile.json, instead of c:/path/here/file.json). Anyways the error I received was ...
ValueError: Expected object or value
Not a very intuitive error message, but that is what causes it.