What I'm trying to do here is to leave texts only from each tweet.
import org.apache.spark.{SparkConf, SparkContext}
import scala.io.Source
object shortTwitter {
def main(args: Array[String]): Unit = {
val sparkConf = new SparkConf().setAppName("ShortTwitterAnalysis").setMaster("local[2]")
val sc = new SparkContext(sparkConf)
val text = sc.textFile("/home/tobby/data/shortTwitter.txt")
val counts = text
.map(_.toLowerCase)
.map(_.toString)
.map(_.replace("\t", ""))
.map(_.replace("\"", ""))
.map(_.replace("\n", ""))
.map(_.replaceAll("[\\p{C}]", ""))
.map(_.split("\"text\":\"")(1).split("\",\"source\":")(0))
counts.foreach(println)
}
}
But the last map function .map(_.split("\"text\":\"")(1).split("\",\"source\":")(0)) does not work. Do you have any advice?
Without the .map(_.split("\"text\":\"")(1).split("\",\"source\":")(0)) my tweets look like below :
{created_at:wed jul 16 23:58:19 +0000 2014,id:489559687189110784,id_str:489559687189110784,text:a rose by any other name would smell as sweet,source:\u003ca href=\https:\/\/twitter.com\/download\/android\ rel=\nofollow\\u003etwitter for android\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:621244372,id_str:621244372,name:\u2665,screen_name:ivunia_ontrinae,location:,url:null,description:me myself & i \u2764,protected:false,verified:false,followers_count:1023,friends_count:591,listed_count:1,favourites_count:1909,statuses_count:26770,created_at:thu jun 28 19:23:06 +0000 2012,utc_offset:-10800,time_zone:atlantic time (canada),geo_enabled:true,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:c0deed,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/378800000101658269\/ec0820565f0451a3ce7169c776fbe41f.jpeg,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/378800000101658269\/ec0820565f0451a3ce7169c776fbe41f.jpeg,profile_background_tile:true,profile_link_color:e62bb4,profile_sidebar_border_color:000000,profile_sidebar_fill_color:ddeef6,profile_text_color:333333,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/483373612749959168\/f3qpy_66_normal.jpeg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/483373612749959168\/f3qpy_66_normal.jpeg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/621244372\/1404758956,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweet_count:0,favorite_count:0,entities:{hashtags:[],trends:[],urls:[],user_mentions:[],symbols:[]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:medium,lang:en}
{created_at:wed jul 16 23:58:19 +0000 2014,id:489559687189110784,id_str:489559687189110784,text:a rose is a rose is a rose,source:\u003ca href=\https:\/\/twitter.com\/download\/android\ rel=\nofollow\\u003etwitter for android\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:621244372,id_str:621244372,name:\u2665,screen_name:ivunia_ontrinae,location:,url:null,description:me myself & i \u2764,protected:false,verified:false,followers_count:1023,friends_count:591,listed_count:1,favourites_count:1909,statuses_count:26770,created_at:thu jun 28 19:23:06 +0000 2012,utc_offset:-10800,time_zone:atlantic time (canada),geo_enabled:true,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:c0deed,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/378800000101658269\/ec0820565f0451a3ce7169c776fbe41f.jpeg,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/378800000101658269\/ec0820565f0451a3ce7169c776fbe41f.jpeg,profile_background_tile:true,profile_link_color:e62bb4,profile_sidebar_border_color:000000,profile_sidebar_fill_color:ddeef6,profile_text_color:333333,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/483373612749959168\/f3qpy_66_normal.jpeg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/483373612749959168\/f3qpy_66_normal.jpeg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/621244372\/1404758956,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweet_count:0,favorite_count:0,entities:{hashtags:[],trends:[],urls:[],user_mentions:[],symbols:[]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:medium,lang:en}
{created_at:wed jul 16 23:58:19 +0000 2014,id:489559687176945664,id_str:489559687176945664,text:love is like a rose the joy of all the earth,source:\u003ca href=\http:\/\/twitter.com\/download\/iphone\ rel=\nofollow\\u003etwitter for iphone\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:363819213,id_str:363819213,name:ivanna010394,screen_name:ivannacarrillo,location:,url:null,description:null,protected:false,verified:false,followers_count:243,friends_count:530,listed_count:0,favourites_count:26,statuses_count:5672,created_at:sun aug 28 18:58:49 +0000 2011,utc_offset:-14400,time_zone:eastern time (us & canada),geo_enabled:false,lang:es,contributors_enabled:false,is_translator:false,profile_background_color:642d8b,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/767201253\/661eb2d4915e9ee6566647dcbaab0186.jpeg,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/767201253\/661eb2d4915e9ee6566647dcbaab0186.jpeg,profile_background_tile:true,profile_link_color:ff0000,profile_sidebar_border_color:ffffff,profile_sidebar_fill_color:7ac3ee,profile_text_color:3d1957,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/455873054703648768\/_b4mf6o7_normal.jpeg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/455873054703648768\/_b4mf6o7_normal.jpeg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/363819213\/1402261141,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweeted_status:{created_at:wed jul 16 13:45:28 +0000 2014,id:489405458168709120,id_str:489405458168709120,text:our milan show is now sold out, thankyou :d tickets are still available for most of europe ! http:\/\/t.co\/arnh7pvoap http:\/\/t.co\/t5wzyocrtu,source:\u003ca href=\http:\/\/twitter.com\ rel=\nofollow\\u003etwitter web client\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:264107729,id_str:264107729,name:5 seconds of summer,screen_name:5sos,location:sydney, australia,url:http:\/\/www.facebook.com\/5secondsofsummer,description:4 aussies making music :) love the people who support us! our album is out :) http:\/\/po.st\/or93y4 | #ashton5sos #calum5sos #michael5sos #luke5sos,protected:false,verified:true,followers_count:3704204,friends_count:28660,listed_count:20024,favourites_count:1061,statuses_count:17297,created_at:fri mar 11 10:18:46 +0000 2011,utc_offset:36000,time_zone:sydney,geo_enabled:false,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:000000,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/483531430371147778\/0gzkh2zi.jpeg,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/483531430371147778\/0gzkh2zi.jpeg,profile_background_tile:false,profile_link_color:c21b1b,profile_sidebar_border_color:ffffff,profile_sidebar_fill_color:ddeef6,profile_text_color:333333,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/485730748574752768\/zm1ctcvv_normal.jpeg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/485730748574752768\/zm1ctcvv_normal.jpeg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/264107729\/1404117825,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweet_count:12648,favorite_count:31390,entities:{hashtags:[],trends:[],urls:[{url:http:\/\/t.co\/arnh7pvoap,expanded_url:http:\/\/5sos.com\/live,display_url:5sos.com\/live,indices:[93,115]}],user_mentions:[],symbols:[],media:[{id:489405457111715840,id_str:489405457111715840,indices:[116,138],media_url:http:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,url:http:\/\/t.co\/t5wzyocrtu,display_url:pic.twitter.com\/t5wzyocrtu,expanded_url:http:\/\/twitter.com\/5sos\/status\/489405458168709120\/photo\/1,type:photo,sizes:{small:{w:340,h:613,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:1081,resize:fit},large:{w:811,h:1461,resize:fit}}}]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:low,lang:en},retweet_count:0,favorite_count:0,entities:{hashtags:[],trends:[],urls:[{url:http:\/\/t.co\/arnh7pvoap,expanded_url:http:\/\/5sos.com\/live,display_url:5sos.com\/live,indices:[103,125]}],user_mentions:[{screen_name:5sos,name:5 seconds of summer,id:264107729,id_str:264107729,indices:[3,8]}],symbols:[],media:[{id:489405457111715840,id_str:489405457111715840,indices:[126,140],media_url:http:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,url:http:\/\/t.co\/t5wzyocrtu,display_url:pic.twitter.com\/t5wzyocrtu,expanded_url:http:\/\/twitter.com\/5sos\/status\/489405458168709120\/photo\/1,type:photo,sizes:{small:{w:340,h:613,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:1081,resize:fit},large:{w:811,h:1461,resize:fit}},source_status_id:489405458168709120,source_status_id_str:489405458168709120}]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:medium,lang:en}
{created_at:sat jan 16 12:00:47 +0000 2016,id:688330052233199616,id_str:688330052233199616,text:rt #nba2k: the battle of two young teams. tough season but one will emerge victorious. who will it be? lakers or 76ers? https:\/\/t.co\/nukkjq\u2026,source:\u003ca href=\http:\/\/twitter.com\ rel=\nofollow\\u003etwitter web client\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:4817727209,id_str:4817727209,name:mark lieyg,screen_name:_yungwiggins_,location:null,url:null,description:null,protected:false,verified:false,followers_count:3,friends_count:40,listed_count:0,favourites_count:0,statuses_count:39,created_at:sat jan 16 11:06:38 +0000 2016,utc_offset:-28800,time_zone:pacific time (us & canada),geo_enabled:false,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:f5f8fa,profile_background_image_url:,profile_background_image_url_https:,profile_background_tile:false,profile_link_color:2b7bb9,profile_sidebar_border_color:c0deed,profile_sidebar_fill_color:ddeef6,profile_text_color:333333,profile_use_background_image:true,profile_image_url:http:\/\/abs.twimg.com\/sticky\/default_profile_images\/default_profile_1_normal.png,profile_image_url_https:https:\/\/abs.twimg.com\/sticky\/default_profile_images\/default_profile_1_normal.png,default_profile:true,default_profile_image:true,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweeted_status: {created_at:sat jan 02 03:31:10 +0000 2016,id:683128371627200513,id_str:683128371627200513,text:the battle of two young teams. tough season but one will emerge victorious. who will it be? lakers or 76ers? https:\/\/t.co\/nukkjqqspa,source:\u003ca href=\http:\/\/percolate.com\ rel=\nofollow\\u003epercolate\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:15573174,id_str:15573174,name:nba 2k 2k16,screen_name:nba2k,location:novato, ca,url:http:\/\/www.2k.com,description:esrb rating: everyone 10+. #nba2k16 available now for playstation 4 & xbox one, playstation 3 & xbox 360 & pc http:\/\/2kgam.es\/buynba2k16,protected:false,verified:true,followers_count:948071,friends_count:1630,listed_count:3305,favourites_count:10,statuses_count:8162,created_at:wed jul 23 21:57:14 +0000 2008,utc_offset:-28800,time_zone:pacific time (us & canada),geo_enabled:true,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:000000,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/539865904528371712\/gnb-ggrq.png,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/539865904528371712\/gnb-ggrq.png,profile_background_tile:false,profile_link_color:ff0300,profile_sidebar_border_color:ffffff,profile_sidebar_fill_color:0d2b44,profile_text_color:408af2,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/606562975109890048\/sumjozun_normal.jpg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/606562975109890048\/sumjozun_normal.jpg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/15573174\/1433457451,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,is_quote_status:false,retweet_count:112,favorite_count:547,entities:{hashtags:[],urls:[],user_mentions:[],symbols:[],media:[{id:683128370796736512,id_str:683128370796736512,indices:[109,132],media_url:http:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,url:https:\/\/t.co\/nukkjqqspa,display_url:pic.twitter.com\/nukkjqqspa,expanded_url:http:\/\/twitter.com\/nba2k\/status\/683128371627200513\/photo\/1,type:photo,sizes:{large:{w:1024,h:419,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:245,resize:fit},small:{w:340,h:139,resize:fit}}}]},extended_entities:{media:[{id:683128370796736512,id_str:683128370796736512,indices:[109,132],media_url:http:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,url:https:\/\/t.co\/nukkjqqspa,display_url:pic.twitter.com\/nukkjqqspa,expanded_url:http:\/\/twitter.com\/nba2k\/status\/683128371627200513\/photo\/1,type:photo,sizes:{large:{w:1024,h:419,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:245,resize:fit},small:{w:340,h:139,resize:fit}}}]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:low,lang:en},is_quote_status:false,retweet_count:0,favorite_count:0,entities:{hashtags:[],urls:[],user_mentions:[{screen_name:nba2k,name:nba 2k 2k16,id:15573174,id_str:15573174,indices:[3,9]}],symbols:[],media:[{id:683128370796736512,id_str:683128370796736512,indices:[120,140],media_url:http:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,url:https:\/\/t.co\/nukkjqqspa,display_url:pic.twitter.com\/nukkjqqspa,expanded_url:http:\/\/twitter.com\/nba2k\/status\/683128371627200513\/photo\/1,type:photo,sizes:{large:{w:1024,h:419,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:245,resize:fit},small:{w:340,h:139,resize:fit}},source_status_id:683128371627200513,source_status_id_str:683128371627200513,source_user_id:15573174,source_user_id_str:15573174}]},extended_entities:{media:[{id:683128370796736512,id_str:683128370796736512,indices:[120,140],media_url:http:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/cxr1okvusaamnu4.jpg,url:https:\/\/t.co\/nukkjqqspa,display_url:pic.twitter.com\/nukkjqqspa,expanded_url:http:\/\/twitter.com\/nba2k\/status\/683128371627200513\/photo\/1,type:photo,sizes:{large:{w:1024,h:419,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:245,resize:fit},small:{w:340,h:139,resize:fit}},source_status_id:683128371627200513,source_status_id_str:683128371627200513,source_user_id:15573174,source_user_id_str:15573174}]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:low,lang:en,timestamp_ms:1452945647663}
{created_at:wed jul 16 23:58:19 +0000 2014,id:489559687176945664,id_str:489559687176945664,text:at christmas i no more desire a rose than wish a snow in may’s new-fangled mirth,source:\u003ca href=\http:\/\/twitter.com\/download\/iphone\ rel=\nofollow\\u003etwitter for iphone\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:363819213,id_str:363819213,name:ivanna010394,screen_name:ivannacarrillo,location:,url:null,description:null,protected:false,verified:false,followers_count:243,friends_count:530,listed_count:0,favourites_count:26,statuses_count:5672,created_at:sun aug 28 18:58:49 +0000 2011,utc_offset:-14400,time_zone:eastern time (us & canada),geo_enabled:false,lang:es,contributors_enabled:false,is_translator:false,profile_background_color:642d8b,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/767201253\/661eb2d4915e9ee6566647dcbaab0186.jpeg,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/767201253\/661eb2d4915e9ee6566647dcbaab0186.jpeg,profile_background_tile:true,profile_link_color:ff0000,profile_sidebar_border_color:ffffff,profile_sidebar_fill_color:7ac3ee,profile_text_color:3d1957,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/455873054703648768\/_b4mf6o7_normal.jpeg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/455873054703648768\/_b4mf6o7_normal.jpeg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/363819213\/1402261141,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweeted_status:{created_at:wed jul 16 13:45:28 +0000 2014,id:489405458168709120,id_str:489405458168709120,text:our milan show is now sold out, thankyou :d tickets are still available for most of europe ! http:\/\/t.co\/arnh7pvoap http:\/\/t.co\/t5wzyocrtu,source:\u003ca href=\http:\/\/twitter.com\ rel=\nofollow\\u003etwitter web client\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:264107729,id_str:264107729,name:5 seconds of summer,screen_name:5sos,location:sydney, australia,url:http:\/\/www.facebook.com\/5secondsofsummer,description:4 aussies making music :) love the people who support us! our album is out :) http:\/\/po.st\/or93y4 | #ashton5sos #calum5sos #michael5sos #luke5sos,protected:false,verified:true,followers_count:3704204,friends_count:28660,listed_count:20024,favourites_count:1061,statuses_count:17297,created_at:fri mar 11 10:18:46 +0000 2011,utc_offset:36000,time_zone:sydney,geo_enabled:false,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:000000,profile_background_image_url:http:\/\/pbs.twimg.com\/profile_background_images\/483531430371147778\/0gzkh2zi.jpeg,profile_background_image_url_https:https:\/\/pbs.twimg.com\/profile_background_images\/483531430371147778\/0gzkh2zi.jpeg,profile_background_tile:false,profile_link_color:c21b1b,profile_sidebar_border_color:ffffff,profile_sidebar_fill_color:ddeef6,profile_text_color:333333,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/485730748574752768\/zm1ctcvv_normal.jpeg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/485730748574752768\/zm1ctcvv_normal.jpeg,profile_banner_url:https:\/\/pbs.twimg.com\/profile_banners\/264107729\/1404117825,default_profile:false,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,retweet_count:12648,favorite_count:31390,entities:{hashtags:[],trends:[],urls:[{url:http:\/\/t.co\/arnh7pvoap,expanded_url:http:\/\/5sos.com\/live,display_url:5sos.com\/live,indices:[93,115]}],user_mentions:[],symbols:[],media:[{id:489405457111715840,id_str:489405457111715840,indices:[116,138],media_url:http:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,url:http:\/\/t.co\/t5wzyocrtu,display_url:pic.twitter.com\/t5wzyocrtu,expanded_url:http:\/\/twitter.com\/5sos\/status\/489405458168709120\/photo\/1,type:photo,sizes:{small:{w:340,h:613,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:1081,resize:fit},large:{w:811,h:1461,resize:fit}}}]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:low,lang:en},retweet_count:0,favorite_count:0,entities:{hashtags:[],trends:[],urls:[{url:http:\/\/t.co\/arnh7pvoap,expanded_url:http:\/\/5sos.com\/live,display_url:5sos.com\/live,indices:[103,125]}],user_mentions:[{screen_name:5sos,name:5 seconds of summer,id:264107729,id_str:264107729,indices:[3,8]}],symbols:[],media:[{id:489405457111715840,id_str:489405457111715840,indices:[126,140],media_url:http:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,media_url_https:https:\/\/pbs.twimg.com\/media\/bsq3q5zieaakbgg.jpg,url:http:\/\/t.co\/t5wzyocrtu,display_url:pic.twitter.com\/t5wzyocrtu,expanded_url:http:\/\/twitter.com\/5sos\/status\/489405458168709120\/photo\/1,type:photo,sizes:{small:{w:340,h:613,resize:fit},thumb:{w:150,h:150,resize:crop},medium:{w:600,h:1081,resize:fit},large:{w:811,h:1461,resize:fit}},source_status_id:489405458168709120,source_status_id_str:489405458168709120}]},favorited:false,retweeted:false,possibly_sensitive:false,filter_level:medium,lang:en}
{created_at:sat jan 16 12:00:48 +0000 2016,id:688330056410755072,id_str:688330056410755072,text:i was going to bake a cake and listen to the football. flour refund?,source:\u003ca href=\http:\/\/twitter.com\/download\/iphone\ rel=\nofollow\\u003etwitter for iphone\u003c\/a\u003e,truncated:false,in_reply_to_status_id:null,in_reply_to_status_id_str:null,in_reply_to_user_id:null,in_reply_to_user_id_str:null,in_reply_to_screen_name:null,user:{id:252303653,id_str:252303653,name:pete blackman,screen_name:peteblackman,location:null,url:null,description:null,protected:false,verified:false,followers_count:409,friends_count:903,listed_count:18,favourites_count:5664,statuses_count:22919,created_at:mon feb 14 22:44:37 +0000 2011,utc_offset:3600,time_zone:amsterdam,geo_enabled:false,lang:en,contributors_enabled:false,is_translator:false,profile_background_color:c0deed,profile_background_image_url:http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png,profile_background_image_url_https:https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png,profile_background_tile:false,profile_link_color:0084b4,profile_sidebar_border_color:c0deed,profile_sidebar_fill_color:ddeef6,profile_text_color:333333,profile_use_background_image:true,profile_image_url:http:\/\/pbs.twimg.com\/profile_images\/2600097910\/image_normal.jpg,profile_image_url_https:https:\/\/pbs.twimg.com\/profile_images\/2600097910\/image_normal.jpg,default_profile:true,default_profile_image:false,following:null,follow_request_sent:null,notifications:null},geo:null,coordinates:null,place:null,contributors:null,is_quote_status:false,retweet_count:0,favorite_count:0,entities:{hashtags:[],urls:[],user_mentions:[],symbols:[]},favorited:false,retweeted:false,filter_level:low,lang:en,timestamp_ms:1452945648659}
Or is there any other way but using split? I would really appreciate your tips.
The error is as below.
16/09/18 22:49:37 ERROR TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
Hi hope I understand the question correctly, you are attempting to read a file and with the text mentioned above and then print the "text" mentioned in file containing json
If the above assumption is correct, here a simple code which would do this:
val matchingPattern = "(?i)(text:)(.+?)(,source:)".r
val tweets = scala.io.Source.fromPath("/home/tobby/data/shortTwitter.txt").getLines.reduceLeft(_+_)
matchingPattern.findAllIn(tweets).matchData foreach { m => println(m.group(2)) }
Hope it helps, if the above assumption is not correct please provide a sample input and expected output
I'm trying to iterate twitter data which is stored in a json file:
fname = 'test.json'
with open(fname, 'r') as f:
for line in f:
tweet = json.loads(line)['text']
print(tweet)
It prints the first tweet in the file just fine but when it iterates for a second time it gives me a JSONDecodeError:
JSONDecodeError: Expecting value: line 2 column 1 (char 1)
My JSON file is 650Mb is size approximately.
To get the twitter data I used the StreamListener from the Twitter API.
Here is a glimpse into my JSON file:
{"created_at":"Sun Apr 24 05:37:02 +0000 2016","id":724109877732204544,"id_str":"724109877732204544","text":"JONES RETURNS WITH A UNANIMOUS DECISION WIN IVER OVINCE SAINT PREUX! #UFC197 https:\/\/t.co\/KlfaAh9h21","source":"\u003ca href=\"http:\/\/instagram.com\" rel=\"nofollow\"\u003eInstagram\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":714389668633116672,"id_str":"714389668633116672","name":"Leon Doyle","screen_name":"TheLDPodcast","location":"Dublin, Ireland","url":"http:\/\/www.youtube.com","description":"A weekly\/bi-weekly podcast focused mainly around MMA, Boxing, fighting etc. With the occasional random topic.","protected":false,"verified":false,"followers_count":7,"friends_count":59,"listed_count":0,"favourites_count":3,"statuses_count":31,"created_at":"Mon Mar 28 09:52:24 +0000 2016","utc_offset":null,"time_zone":null,"geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"000000","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme1\/bg.png","profile_background_tile":false,"profile_link_color":"004455","profile_sidebar_border_color":"000000","profile_sidebar_fill_color":"000000","profile_text_color":"000000","profile_use_background_image":false,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/714390864030797824\/REXXKCvs_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/714390864030797824\/REXXKCvs_normal.jpg","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"UFC197","indices":[69,76]}],"urls":[{"url":"https:\/\/t.co\/KlfaAh9h21","expanded_url":"https:\/\/www.instagram.com\/p\/BEkk6Gewpqy\/","display_url":"instagram.com\/p\/BEkk6Gewpqy\/","indices":[77,100]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1461476222819"}
{"created_at":"Sun Apr 24 05:37:03 +0000 2016","id":724109879200366592,"id_str":"724109879200366592","text":"regrann from #ufc - #AndStill UFC flyweight champ #MightyMouseUFC! #UFC197\n\nPresented by\u2026 https:\/\/t.co\/zbE5CsFxMJ","source":"\u003ca href=\"http:\/\/instagram.com\" rel=\"nofollow\"\u003eInstagram\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1070221260,"id_str":"1070221260","name":"Will Manuel","screen_name":"TheWillManuel","location":"Kenai, AK","url":null,"description":"Alaskan. Paramedic. Firefighter. Industrial Security. Libertarian. 2nd Amendment. Liberty. BJJ & Muay Thai novice. #TeamRed #RedemptionMMA #BJJ #MuayThai #MMA","protected":false,"verified":false,"followers_count":437,"friends_count":573,"listed_count":32,"favourites_count":2516,"statuses_count":3184,"created_at":"Tue Jan 08 07:22:47 +0000 2013","utc_offset":-28800,"time_zone":"Alaska","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"C0DEED","profile_background_image_url":"http:\/\/pbs.twimg.com\/profile_background_images\/579042288040435713\/VeA-zI45.jpeg","profile_background_image_url_https":"https:\/\/pbs.twimg.com\/profile_background_images\/579042288040435713\/VeA-zI45.jpeg","profile_background_tile":true,"profile_link_color":"4A913C","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"DDEEF6","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/715188796615237632\/JvxeLz8D_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/715188796615237632\/JvxeLz8D_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1070221260\/1447179132","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"AndStill","indices":[22,31]},{"text":"UFC197","indices":[69,76]}],"urls":[{"url":"https:\/\/t.co\/zbE5CsFxMJ","expanded_url":"https:\/\/www.instagram.com\/p\/BEkk6a0QMeX\/","display_url":"instagram.com\/p\/BEkk6a0QMeX\/","indices":[92,115]}],"user_mentions":[{"screen_name":"ufc","name":"#UFC197","id":6446742,"id_str":"6446742","indices":[13,17]},{"screen_name":"MightyMouseUFC","name":"Demetrious Johnson","id":140845817,"id_str":"140845817","indices":[52,67]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1461476223169"}
{"created_at":"Sun Apr 24 05:37:03 +0000 2016","id":724109882341896192,"id_str":"724109882341896192","text":"RT #BESTFlGHTS: Jon Jones flips off Daniel Cormier at #UFC197 https:\/\/t.co\/S0pDvRWhfW","source":"\u003ca href=\"http:\/\/twitter.com\/download\/iphone\" rel=\"nofollow\"\u003eTwitter for iPhone\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1019191860,"id_str":"1019191860","name":"Paul","screen_name":"Paulie_Frat","location":"Mount Pocono, PA","url":null,"description":"...","protected":false,"verified":false,"followers_count":272,"friends_count":259,"listed_count":0,"favourites_count":1580,"statuses_count":1622,"created_at":"Tue Dec 18 07:10:12 +0000 2012","utc_offset":-14400,"time_zone":"Eastern Time (US & Canada)","geo_enabled":true,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_link_color":"009999","profile_sidebar_border_color":"EEEEEE","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/512140999444164608\/4H2fiOtg_normal.jpeg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/512140999444164608\/4H2fiOtg_normal.jpeg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1019191860\/1461422809","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"retweeted_status":{"created_at":"Sun Apr 24 05:12:13 +0000 2016","id":724103630702432256,"id_str":"724103630702432256","text":"Jon Jones flips off Daniel Cormier at #UFC197 https:\/\/t.co\/S0pDvRWhfW","source":"\u003ca href=\"http:\/\/bufferapp.com\" rel=\"nofollow\"\u003eBuffer\u003c\/a\u003e","truncated":false,"in_reply_to_status_id":null,"in_reply_to_status_id_str":null,"in_reply_to_user_id":null,"in_reply_to_user_id_str":null,"in_reply_to_screen_name":null,"user":{"id":1370712786,"id_str":"1370712786","name":"BEST FIGHTS","screen_name":"BESTFlGHTS","location":"MMA, Boxing, Street Fights","url":"http:\/\/snapchat.com\/add\/wshhfans","description":"Parody, we do not own the content posted DM's are open send me your fight","protected":false,"verified":false,"followers_count":156257,"friends_count":17861,"listed_count":83,"favourites_count":1,"statuses_count":6723,"created_at":"Sun Apr 21 22:43:19 +0000 2013","utc_offset":-25200,"time_zone":"Arizona","geo_enabled":false,"lang":"en","contributors_enabled":false,"is_translator":false,"profile_background_color":"131516","profile_background_image_url":"http:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_image_url_https":"https:\/\/abs.twimg.com\/images\/themes\/theme14\/bg.gif","profile_background_tile":true,"profile_link_color":"ABB8C2","profile_sidebar_border_color":"FFFFFF","profile_sidebar_fill_color":"EFEFEF","profile_text_color":"333333","profile_use_background_image":true,"profile_image_url":"http:\/\/pbs.twimg.com\/profile_images\/620356388833734657\/NvmkmGDk_normal.jpg","profile_image_url_https":"https:\/\/pbs.twimg.com\/profile_images\/620356388833734657\/NvmkmGDk_normal.jpg","profile_banner_url":"https:\/\/pbs.twimg.com\/profile_banners\/1370712786\/1460756748","default_profile":false,"default_profile_image":false,"following":null,"follow_request_sent":null,"notifications":null},"geo":null,"coordinates":null,"place":null,"contributors":null,"is_quote_status":false,"retweet_count":740,"favorite_count":624,"entities":{"hashtags":[{"text":"UFC197","indices":[38,45]}],"urls":[{"url":"https:\/\/t.co\/S0pDvRWhfW","expanded_url":"http:\/\/vine.co\/v\/iU5T53X6U7J","display_url":"vine.co\/v\/iU5T53X6U7J","indices":[46,69]}],"user_mentions":[],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en"},"is_quote_status":false,"retweet_count":0,"favorite_count":0,"entities":{"hashtags":[{"text":"UFC197","indices":[54,61]}],"urls":[{"url":"https:\/\/t.co\/S0pDvRWhfW","expanded_url":"http:\/\/vine.co\/v\/iU5T53X6U7J","display_url":"vine.co\/v\/iU5T53X6U7J","indices":[62,85]}],"user_mentions":[{"screen_name":"BESTFlGHTS","name":"BEST FIGHTS","id":1370712786,"id_str":"1370712786","indices":[3,14]}],"symbols":[]},"favorited":false,"retweeted":false,"possibly_sensitive":false,"filter_level":"low","lang":"en","timestamp_ms":"1461476223918"}
How can I solve this issue?
If your JSON file has exactly the same structure as the piece you are posting, the empty lines between tweets indeed cause a JSONDecodeError. If that's the problem, just check that the line is not empty before processing:
In [12]:
with open(fname, 'r') as f:
for line in f:
if (not line.strip()):
continue
tweet = json.loads(line)['text']
print(tweet)
Hope it helps.