I have an extract of tweets in JSON format. I have attached a sample of the data. I need to convert this JSON into a dataframe.
So far I managed to convert it using the "jsonlite" package:
json_data <- jsonlite::stream_in(file("myjsonfile.txt"))
But it does not load all the information contained in the tweets. For example I only see the user who retweeted but not who posted the tweet.
You can view the json file better using this website by copy pasting the file and selecting format: http://jsonviewer.stack.hu/
The data comes from the Twitter API (more information on this data is available here: https://dev.twitter.com/overview/api/tweets).
Thank you in advance for your time and help.
ML_Enthousiast
{"favorited": false, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "truncated": false, "in_reply_to_user_id_str": null, "coordinates": null, "retweeted": false, "text": "RT #Antoniotalks: Revenue streams for #OpenData companies!\n#Cloud #StartUp #SMM #AI #IoT #Fintech #BigData #deeplearning #Mpgvip\u2026 ", "retweet_count": 0, "filter_level": "low", "created_at": "Thu Jun 29 18:47:18 +0000 2017", "favorite_count": 0, "retweeted_status": {"favorited": false, "in_reply_to_status_id_str": null, "in_reply_to_user_id": null, "display_text_range": [0, 140], "truncated": true, "in_reply_to_user_id_str": null, "coordinates": null, "retweeted": false, "text": "Revenue streams for #OpenData companies!\n#Cloud #StartUp #SMM #AI #IoT #Fintech #BigData #deeplearning #Mpgvip\u2026 ", "retweet_count": 38, "filter_level": "low", "created_at": "Wed Jun 28 12:45:08 +0000 2017", "favorite_count": 48, "in_reply_to_screen_name": null, "extended_tweet": {"extended_entities": {"media": [{"media_url_https": "", "sizes": {"thumb": {"w": 150, "h": 150, "resize": "crop"}, "large": {"w": 1200, "h": 927, "resize": "fit"}, "medium": {"w": 1200, "h": 927, "resize": "fit"}, "small": {"w": 680, "h": 525, "resize": "fit"}}, "type": "photo", "expanded_url": "", "id": 880044388679901184, "media_url": "http://pbs.twimg.com/media/DDaLtXXXYAAI2eM.jpg", "id_str": "880044388679901184", "display_url": "pic.twitter.com/aw9HeukUYv", "indices": [139, 162], "url": ""}]}, "full_text": "Revenue streams for #OpenData companies!\n#Cloud #StartUp #SMM #AI #IoT #Fintech #BigData #deeplearning #Mpgvip #defstar5 #DataScience #CIO ", "entities": {"user_mentions": [], "hashtags": [{"text": "OpenData", "indices": [20, 29]}, {"text": "Cloud", "indices": [41, 47]}, {"text": "StartUp", "indices": [48, 56]}, {"text": "SMM", "indices": [57, 61]}, {"text": "AI", "indices": [62, 65]}, {"text": "IoT", "indices": [66, 70]}, {"text": "Fintech", "indices": [71, 79]}, {"text": "BigData", "indices": [80, 88]}, 
{"text": "deeplearning", "indices": [89, 102]}, {"text": "Mpgvip", "indices": [103, 110]}, {"text": "defstar5", "indices": [111, 120]}, {"text": "DataScience", "indices": [121, 133]}, {"text": "CIO", "indices": [134, 138]}], "media": [{"media_url_https": "", "sizes": {"thumb": {"w": 150, "h": 150, "resize": "crop"}, "large": {"w": 1200, "h": 927, "resize": "fit"}, "medium": {"w": 1200, "h": 927, "resize": "fit"}, "small": {"w": 680, "h": 525, "resize": "fit"}}, "type": "photo", "expanded_url": "", "id": 880044388679901184, "media_url": "", "id_str": "880044388679901184", "display_url": "pic.twitter.com/aw9HeukUYv", "indices": [139, 162], "url": ""}], "symbols": [], "urls": []}, "display_text_range": [0, 138]}, "in_reply_to_status_id": null, "source": "Buffer", "id_str": "880044392110796800", "entities": {"user_mentions": [], "hashtags": [{"text": "OpenData", "indices": [20, 29]}, {"text": "Cloud", "indices": [41, 47]}, {"text": "StartUp", "indices": [48, 56]}, {"text": "SMM", "indices": [57, 61]}, {"text": "AI", "indices": [62, 65]}, {"text": "IoT", "indices": [66, 70]}, {"text": "Fintech", "indices": [71, 79]}, {"text": "BigData", "indices": [80, 88]}, {"text": "deeplearning", "indices": [89, 102]}, {"text": "Mpgvip", "indices": [103, 110]}], "symbols": [], "urls": [{"display_url": "twitter.com/i/web/status/8\u2026", "indices": [112, 135], "expanded_url": "", "url": "8H"}]}, "lang": "en", "id": 880044392110796800, "is_quote_status": false, "geo": null, "user": {"screen_name": "Antoniotalks", "profile_background_image_url": "", "profile_image_url": "jpg", "follow_request_sent": null, "profile_background_tile": false, "id": 2445890839, "is_translator": false, "description": "A father & CEO of Recruitd (#imrecruitd). 
Helping companies magnify their #employer and #recruitment #brand and #jobseekers with the #skillstosucceed.", "listed_count": 198, "favourites_count": 398, "created_at": "Tue Apr 15 19:13:52 +0000 2014", "notifications": null, "profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png", "contributors_enabled": false, "profile_background_color": "C0DEED", "following": null, "friends_count": 6792, "protected": false, "default_profile": true, "profile_use_background_image": true, "name": "Antonio Giugno", "location": "London, England", "geo_enabled": true, "id_str": "2445890839", "utc_offset": -25200, "profile_banner_url": "0", "profile_text_color": "333333", "lang": "en-gb", "statuses_count": 4058, "profile_sidebar_fill_color": "DDEEF6", "default_profile_image": false, "profile_image_url_https": "4433/dVeGYfTX_normal.jpg", "profile_link_color": "1DA1F2", "url": "rnontubein", "verified": false, "profile_sidebar_border_color": "C0DEED", "followers_count": 6323, "time_zone": "Pacific Time (US & Canada)"}, "contributors": null, "possibly_sensitive": false, "place": null}, "in_reply_to_screen_name": null, "timestamp_ms": "1498762038396", "in_reply_to_status_id": null, "source": "Mobile Web (M2)", "id_str": "880497923150286848", "entities": {"user_mentions": [{"screen_name": "Antoniotalks", "id": 2445890839, "id_str": "2445890839", "name": "Antonio Giugno", "indices": [3, 16]}], "hashtags": [{"text": "OpenData", "indices": [38, 47]}, {"text": "Cloud", "indices": [59, 65]}, {"text": "StartUp", "indices": [66, 74]}, {"text": "SMM", "indices": [75, 79]}, {"text": "AI", "indices": [80, 83]}, {"text": "IoT", "indices": [84, 88]}, {"text": "Fintech", "indices": [89, 97]}, {"text": "BigData", "indices": [98, 106]}, {"text": "deeplearning", "indices": [107, 120]}, {"text": "Mpgvip", "indices": [121, 128]}], "symbols": [], "urls": [{"indices": [130, 130], "expanded_url": null, "url": ""}]}, "lang": "en", "id": 880497923150286848, "is_quote_status": 
false, "geo": null, "user": {"screen_name": "henrymbuguak", "profile_background_image_url": "://abs.twimg.com/images/themes/theme3/bg.gif", "profile_image_url": "://pbs.twimg.com/profile_images/822772556818239489/0yTbHCGj_normal.jpg", "follow_request_sent": null, "profile_background_tile": false, "id": 310697279, "is_translator": false, "description": "I enjoy coding. Visit my github project: :// ://github.com/henrymbuguak", "listed_count": 62, "favourites_count": 978, "created_at": "Sat Jun 04 05:55:09 +0000 2011", "notifications": null, "profile_background_image_url_https": "://abs.twimg.com/images/themes/theme3/bg.gif", "contributors_enabled": false, "profile_background_color": "EDECE9", "following": null, "friends_count": 2540, "protected": false, "default_profile": false, "profile_use_background_image": true, "name": "kiarie henry mbugua", "location": "Njoro, Kenya.", "geo_enabled": false, "id_str": "310697279", "utc_offset": 10800, "profile_banner_url": "://pbs.twimg.com/profile_banners/310697279/1484999353", "profile_text_color": "634047", "lang": "en", "statuses_count": 3775, "profile_sidebar_fill_color": "E3E2DE", "default_profile_image": false, "profile_image_url_https": "//pbs.twimg.com/profile_images/822772556818239489/0yTbHCGj_normal.jpg", "profile_link_color": "088253", "url": null, "verified": false, "profile_sidebar_border_color": "D3D2CF", "followers_count": 2141, "time_zone": "Nairobi"}, "contributors": null, "place": null}
If I read in your data using
indata <- jsonlite::read_json("myjsonfile.json")
then I get all the information contained in the JSON file. It is a nested list, so you may need to extract the information you want from one of the elements in the list:
> names(indata)
[1] "favorited" "in_reply_to_status_id_str"
[3] "in_reply_to_user_id" "truncated"
[5] "in_reply_to_user_id_str" "coordinates"
[7] "retweeted" "text"
[9] "retweet_count" "filter_level"
[11] "created_at" "favorite_count"
[13] "retweeted_status" "in_reply_to_screen_name"
[15] "timestamp_ms" "in_reply_to_status_id"
[17] "source" "id_str"
[19] "entities" "lang"
[21] "id" "is_quote_status"
[23] "geo" "user"
[25] "contributors" "place"
For example, the information about the user is (only a part is shown):
> indata$user
$screen_name
[1] "henrymbuguak"
$profile_background_image_url
[1] "://abs.twimg.com/images/themes/theme3/bg.gif"
$profile_image_url
[1] "://pbs.twimg.com/profile_images/822772556818239489/0yTbHCGj_normal.jpg"
$follow_request_sent
NULL
$profile_background_tile
[1] FALSE
$id
[1] 310697279
so you can get the user with indata$user$screen_name
Related
How can I import my JSON file into PostgreSQL using Python? My data looks like this:
{
"blocked_by": false,
"blocking": false,
"contributors_enabled": false,
"created_at": "Thu Dec 17 06:32:35 +0000 2020",
"default_profile": true,
"default_profile_image": false,
"description": "seo",
"entities": {
"description": {
"urls": []
}
},
"favourites_count": 12,
"follow_request_sent": false,
"followers_count": 56,
"following": false,
"friends_count": 1344,
"geo_enabled": false,
"has_extended_profile": true,
"id": 1339458394508374018,
"id_str": "1339458394508374018",
"is_translation_enabled": false,
"is_translator": false,
"lang": null,
"listed_count": 3,
"live_following": false,
"location": "",
"muting": false,
"name": "katheryn myle",
"notifications": false,
"profile_background_color": "F5F8FA",
"profile_background_image_url": null,
"profile_background_image_url_https": null,
"profile_background_tile": false,
"profile_image_url": "http://pbs.twimg.com/profile_images/1343124937389842432/30cfUmGe_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/1343124937389842432/30cfUmGe_normal.jpg",
"profile_link_color": "1DA1F2",
"profile_sidebar_border_color": "C0DEED",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": true,
"protected": false,
"screen_name": "KatherynMyle",
"status": {
"contributors": null,
"coordinates": null,
"created_at": "Tue Apr 05 10:17:04 +0000 2022",
"entities": {
"hashtags": [],
"symbols": [],
"urls": [
{
"display_url": "twitter.com/i/web/status/1\u2026",
"expanded_url": "https://twitter.com/i/web/status/1511286791319564293",
"indices": [
117,
140
],
"url": "shortened url"
}
],
"user_mentions": [
{
"id": 2247373052,
"id_str": "2247373052",
"indices": [
0,
12
],
"name": "GDevelop",
"screen_name": "GDevelopApp"
}
]
},
"favorite_count": 0,
"favorited": false,
"geo": null,
"id": 1511286791319564293,
"id_str": "1511286791319564293",
"in_reply_to_screen_name": "GDevelopApp",
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": 2247373052,
"in_reply_to_user_id_str": "2247373052",
"is_quote_status": false,
"lang": "en",
"place": null,
"retweet_count": 0,
"retweeted": false,
"source": "Twitter Web App",
"text": "#GDevelopApp Hello\nHow are you fine I am a link builder and we are selling links and posts on high-quality sites if\u2026 ",
"truncated": true
},
"statuses_count": 382,
"time_zone": null,
"translator_type": "none",
"url": null,
"utc_offset": null,
"verified": false,
"withheld_in_countries": []
I want to create a table that has the index (key) names as the column names. I also want only some of the keys from it, not all (like utc_offset, place, etc.), so I want to insert only the specific data I need into PostgreSQL.
https://drive.google.com/file/d/1OkJFfHyU4Eb-V3mrU2qJ_jwBaQQkunJa/view?usp=drivesdk this is the json file
Your question is very vague and I can't come up with a short answer except this one:
PostgreSQL does support JSON as a data type:
https://www.postgresql.org/docs/current/datatype-json.html
https://www.postgresql.org/docs/current/functions-json.html
You could just store the whole JSON document as a single JSON value and select/convert/whatever within PostgreSQL later on.
How to insert JSONB into Postgresql with Python?
I use jq 1.5 in a Windows environment to modify a given large JSON file, extracting a single array ("Offers") from that file:
'.Offers[] | ({Price: .AdultPriceEUR, Currency: .Currency, Link: .Deeplink, Tickettyp: .TicketClassIndex, Flightindex: .FlightIndex })'
After that I get an "unnamed" array. But for later processing it is necessary that the array keeps its old name ("Offers"). I checked the documentation and found the setpath function, but is there no easier way to keep the name on extraction?
Shortened example of the JSON file:
{"Airports": [
{
"Aliases": null,
"ContinentCode": "EU",
"ContinentGroup": 1,
"CountryCode": "DE",
"CountryName": "Germany",
"DST": "",
"DisplayName": "Hamburg (HAM) Germany",
"Iata": "HAM",
"IataLink": false,
"Icao": "EDDH",
"Latitude": 53.63215,
"Longitude": 10.0041609,
"MainCityCode": "HAM",
"MainCityDisplayName": "Hamburg (HAM) Germany",
"MainCityName": "Hamburg",
"Name": "Hamburg",
"Priority": 142,
"StateCode": null,
"StateName": null,
"TimeZone": -798214753
},
{
"Aliases": null,
"ContinentCode": "AS",
"ContinentGroup": 4,
"CountryCode": "TH",
"CountryName": "Thailand",
"DST": "",
"DisplayName": "Suvarnabhumi, Bangkok (BKK) Thailand",
"Iata": "BKK",
"IataLink": false,
"Icao": "VTBS",
"Latitude": 13.6922979,
"Longitude": 100.750694,
"MainCityCode": "BKK",
"MainCityDisplayName": "Bangkok (BKK) Thailand",
"MainCityName": "Bangkok",
"Name": "Suvarnabhumi",
"Priority": 1462,
"StateCode": null,
"StateName": null,
"TimeZone": -640089798
}], "Offers": [
{
"AdultPrice": 2977.6,
"AdultPriceEUR": 2977.6,
"AdultPriceExclTax": 0.0,
"Currency": "EUR",
"FeeIndexes": [
0,
1,
2,
3,
4,
5,
6
],
"FlightIndex": 0,
"IsPaymentIncluded": true,
"MobileDeepLink": null,
"PaymentMethods": [
"American Express",
"Diners Club",
"MasterCard Credit",
"MasterCard Debit",
"Paypal",
"Visa Credit",
"Visa Debit"
],
"Score": 2501.3,
"SegmentFares": null,
"SegmentKey": -1,
"TicketClassIndex": 1,
"TotalIsCalculated": false,
"TotalPrice": 2977.6,
"TotalPriceEUR": 2977.6,
"TotalPriceExclTax": 0.0
},
{
"AdultPrice": 4697.27,
"AdultPriceEUR": 4697.27,
"AdultPriceExclTax": 0.0,
"Currency": "EUR",
"FeeIndexes": [
0,
1,
2,
3,
4,
7,
8,
5,
6
],
"FlightIndex": 1,
"IsPaymentIncluded": true,
"MobileDeepLink": null,
"PaymentMethods": [
"American Express",
"Diners Club",
"MasterCard Credit",
"MasterCard Debit",
"Paypal",
"Sofortüberweisung",
"Überweisung",
"Visa Credit",
"Visa Debit"
],
"Score": 3438.64,
"SegmentFares": null,
"SegmentKey": -1,
"TicketClassIndex": 1,
"TotalIsCalculated": false,
"TotalPrice": 4697.27,
"TotalPriceEUR": 4697.27,
"TotalPriceExclTax": 0.0
}]
}
thanks
BR
Timo
Looks like you're looking at this:
jq '{Offers:[.Offers[] | {Price: .AdultPriceEUR, Currency: .Currency, Link: .Deeplink, Tickettyp: .TicketClassIndex, Flightindex: .FlightIndex }]}' file
It just creates a new object containing an Offers array with the content you want to put in it.
I have a nested JSON file. I want to read it into pandas in order to explore it, but I get errors. When I use the read_json method, I get: "Trailing data". It is valid JSON. How can I read it into pandas? (I tried different approaches, but none worked.) It looks like this:
{
"contributors": null,
"coordinates": null,
"created_at": "Fri May 26 08:54:00 +0000 2017",
"entities": {
"hashtags": [],
"media": [
{
"display_url": "pic.twitter.com/Pm28ORTePl",
"expanded_url": "",
"id": 868027417121751040,
"id_str": "868027417121751040",
"indices": [
94,
117
],
"media_url": "",
"sizes": {
"large": {
"h": 404,
"resize": "fit",
"w": 773
},
"medium": {
"h": 404,
"resize": "fit",
"w": 773
},
"small": {
"h": 355,
"resize": "fit",
"w": 680
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"type": "photo",
"url": ""
}
],
"symbols": [],
"urls": [
{
"display_url": "",
"expanded_url": "",
"indices": [
70,
93
],
"url": ""
}
],
"user_mentions": []
},
"extended_entities": {
"media": [
{
"display_url": "pic.twitter.com/Pm28ORTePl",
"expanded_url": "1",
"id": 868027417121751040,
"id_str": "868027417121751040",
"indices": [
94,
117
],
"media_url": "",
"media_url_https": "",
"sizes": {
"large": {
"h": 404,
"resize": "fit",
"w": 773
},
"medium": {
"h": 404,
"resize": "fit",
"w": 773
},
"small": {
"h": 355,
"resize": "fit",
"w": 680
},
"thumb": {
"h": 150,
"resize": "crop",
"w": 150
}
},
"type": "photo",
"url": ""
}
]
},
"favorite_count": 1,
"favorited": false,
"geo": null,
"id": 868027425757724672,
"id_str": "868027425757724672",
"in_reply_to_screen_name": null,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"is_quote_status": false,
"lang": "ru",
"place": null,
"possibly_sensitive": false,
"retweet_count": 0,
"retweeted": false,
"source": "Twitter Web Client",
"text": "\u041f\u0440\u043e\u043f\u0430\u0432\u0448\u0430\u044f \u0432 \u041a\u043e\u043a\u0448\u0435\u0442\u0430\u0443 \u0448\u043a\u043e\u043b\u044c\u043d\u0438\u0446\u0430 \u0436\u0438\u043b\u0430 \u0432 \u0437\u0430\u0431\u0440\u043e\u0448\u0435\u043d\u043d\u043e\u043c \u0434\u043e\u043c\u0435 \u0438 \u0431\u0440\u043e\u0434\u044f\u0436\u043d\u0438\u0447\u0430\u043b\u0430\n",
"truncated": false,
"user": {
"contributors_enabled": false,
"created_at": "Wed May 18 11:59:50 +0000 2011",
"default_profile": true,
"default_profile_image": false,
"description": "\u041a\u0430\u0437\u0430\u0445\u0441\u0442\u0430\u043d\u0441\u043a\u0438\u0439 \u0438\u043d\u0442\u0435\u0440\u043d\u0435\u0442-\u043f\u043e\u0440\u0442\u0430\u043b",
"entities": {
"description": {
"urls": []
},
"url": {
"urls": [
{
"display_url": "",
"expanded_url": "",
"indices": [
0,
22
],
"url": ""
}
]
}
},
"favourites_count": 87,
"follow_request_sent": false,
"followers_count": 17989,
"following": true,
"friends_count": 98,
"geo_enabled": true,
"has_extended_profile": false,
"id": 300811189,
"id_str": "300811189",
"is_translation_enabled": false,
"is_translator": false,
"lang": "ru",
"listed_count": 86,
"location": "\u0410\u043b\u043c\u0430\u0442\u044b",
"name": "",
"notifications": false,
"profile_background_color": "C0DEED",
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
"profile_background_tile": false,
"profile_banner_url": "https://pbs.twimg.com/profile_banners/300811189/1489117916",
"profile_image_url": "http://pbs.twimg.com/profile_images/840047424882298881/NxZSyfhM_normal.jpg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/840047424882298881/NxZSyfhM_normal.jpg",
"profile_link_color": "1DA1F2",
"profile_sidebar_border_color": "C0DEED",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": true,
"protected": false,
"screen_name": "",
"statuses_count": 53011,
"time_zone": "Quito",
"translator_type": "none",
"url": "",
"utc_offset": -18000,
"verified": false
}
}
Sorry, but your JSON is actually not valid, despite your saying it is.
This line:
"media_url": "": "",
Should probably be:
"media_url": "",
At which point, when I added the final bracket } that was outside of your code block, it validated as properly formed JSON.
I have two files consisting of json objects in two different locations on my hdfs and I need to join those two depending on a common field.
First file consists of tweet data and has 34 fields (I literally counted). It looks like:
{"contributors": null, "truncated": false, "text": "US Bank Loans And credit card capitol one business", "avl_brand_all": ["US Bank"], "is_quote_status": false , "in_reply_to_status_id": null, "id": 770150015968825344, "favorite_count": 0, "avl_num_sentences": 1, "source": "<a href=\"http://twitter.com\" rel=\"nofollow\">Twitter Web Client</ a>", "retweeted": false, "coordinates": null, "entities": {"symbols": [], "user_mentions": [], "hashtags": [], "urls": [{"url": "<link>": [51, 74], "expand ed_url": "http://usbanklogins.com/bank/", "display_url": "usbanklogins.com/bank/"}]}, "in_reply_to_screen_name": null, "in_reply_to_user_id": null, "avl_word_tags": [{"distance": 1, " word": "u", "pos": "OTHER"}, {"distance": 1, "word": "bank", "pos": "NOUN"}, {"distance": 1, "word": "loan", "pos": "NOUN"}, {"distance": 1, "word": "credit", "pos": "NOUN"}, {"distan ce": 1, "word": "card", "pos": "NOUN"}, {"distance": 1, "word": "capitol", "pos": "VERB"}, {"distance": 1, "word": "one", "pos": "OTHER"}, {"distance": 1, "word": "business", "pos": " NOUN"}], "avl_brand_1": "US Bank", "retweet_count": 0, "avl_lexicon_text": "us bank loans and credit card capitol one business", "id_str": "770150015968825344", "favorited": false, "a vl_sentences": ["us bank loans and credit card capitol one business"], "user": {"follow_request_sent": false, "has_extended_profile": false, "profile_use_background_image": true, "id" : 485610502, "verified": false, "profile_text_color": "0C3E53", "profile_image_url_https": "<link>", "profile _sidebar_fill_color": "FFF7CC", "geo_enabled": false, "entities": {"url": {"urls": [{"url": "link", "indices": [0, 22], "expanded_url": "http://www.seowithme.com", " display_url": "seowithme.com"}]}, "description": {"urls": []}}, "followers_count": 347, "profile_sidebar_border_color": "F2E195", "location": "", "default_profile_image": false, "id_s tr": "485610502", "is_translation_enabled": false, "utc_offset": null, "statuses_count": 117, "description": 
"seowithme", "friends_count": 959, "profile_link_color": "FF0000", "profil e_image_url": "http://pbs.twimg.com/profile_images/2334489262/qyznw08zjrgv3vlxtdvt_normal.jpeg", "notifications": false, "profile_background_image_url_https": "https://abs.twimg.com/i mages/themes/theme12/bg.gif", "profile_background_color": "BADFCD", "profile_background_image_url": "http://abs.twimg.com/images/themes/theme12/bg.gif", "screen_name": "sajanshrestha2 2", "lang": "en", "following": false, "profile_background_tile": false, "favourites_count": 2, "name": "sajan shrestha", "url": "<link>", "created_at": "Tue Feb 07 11: 40:39 +0000 2012", "contributors_enabled": false, "time_zone": null, "protected": false, "default_profile": false, "is_translator": false, "listed_count": 0}, "avl_num_paragraphs": 1, "geo": null, "in_reply_to_user_id_str": null, "possibly_sensitive": false, "lang": "en", "created_at": "Mon Aug 29 06:44:07 +0000 2016", "avl_source": "individual", "in_reply_to_stat us_id_str": null, "place": null, "metadata": {"iso_language_code": "en", "result_type": "recent"}, "avl_num_words": 8}
The second file has json objects each having only two fields. Looks like:
{"avl_syntaxnet_tags": [{"pos_tag": "PRP", "position": "1", "dep_rel": "dep", "parent": "3", "word": "us"}, {"pos_tag": "NN", "position": "2", "dep_rel": "nn", "parent": "3", "word": "bank"}, {"pos_tag": "NNS", "position": "3", "dep_rel": "nsubj", "parent": "7", "word": "loans"}, {"pos_tag": "CC", "position": "4", "dep_rel": "cc", "parent": "3", "word": "and"}, {" pos_tag": "NN", "position": "5", "dep_rel": "nn", "parent": "6", "word": "credit"}, {"pos_tag": "NN", "position": "6", "dep_rel": "conj", "parent": "3", "word": "card"}, {"pos_tag": " VBP", "position": "7", "dep_rel": "ROOT", "parent": "0", "word": "capitol"}, {"pos_tag": "CD", "position": "8", "dep_rel": "num", "parent": "9", "word": "one"}, {"pos_tag": "NN", "pos ition": "9", "dep_rel": "dobj", "parent": "7", "word": "business"}], "avl_lexicon_text": "us bank loans and credit card capitol one business"}
Now, there is a common field in both JSON objects named avl_lexicon_text, and I want to join these two objects using that common field.
I wrote the following Pig script for the join:
a = LOAD file1 as (a1, a2);
b = LOAD file2 as (b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13, b14, b15, b16, b17, b18, b19, b20, b21, b22, b23, b24, b25, b26, b27, b28, b29, b30, b31, b32, b33, b34);
x = JOIN b BY b19 FULL, a BY a2;
STORE x INTO '$SYNTAXNET_OUTPUT';
I checked b19 is the avl_lexicon_text field in b and a2 is the same in a. The results I get are really weird. When I dump x, I am not getting a new json_object that contains all the fields in a and b. I get all the objects in b followed by all the objects in a.
Can someone suggest the right way to do this?
EDIT: Also, is there a way I can do this without loading the schema? Because at some point in the future, if the format of any of the files changes (a new field gets added or an existing field gets deleted), I do not want to change the Pig script. Is there a way I can do the JOIN without referencing the field position, but by accessing the field name instead? Thanks!
The behavior is expected since you have specified a FULL outer join.
Remove FULL to only get matching records. See here for FULL outer join.
x = JOIN b BY b19, a BY a2;
Below is a json example of Twitter's tweet. It's a large json. What is the best library/method to parse it into a case class in scala?
For instance, in Play Framework 2.x it's possible to do that with its internal library by defining case classes and implicit conversions, but in this case I don't use Play. Should I?
spray-json seems to be the most popular Scala JSON library, but in this case it looks quite disappointing - the standard approach seems to be limited to 22 elements and uses pattern matching, which becomes ridiculous in the context of a multi-nested structure with hundreds of elements. Any ideas?
{
"created_at": "Sat Oct 24 06:44:34 +0000 2015",
"id": 657809891558576132,
"id_str": "657809891558576132",
"text": "RT #M23projects: Kara Walker \"Go to Hell or Atlanta, Whichever Come First\" #victoriamiro #London https://t.co/HapqKa4i0l https://t.co/95G…",
"source": "Twitter for iPhone",
"truncated": false,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"in_reply_to_screen_name": null,
"user": {
"id": 2792146884,
"id_str": "2792146884",
"name": "Tonbridge School Art",
"screen_name": "ArtTonSchool",
"location": "Tonbridge",
"url": null,
"description": "Tonbridge School is an independent day and boarding school for boys. Tweets by the Art Department.",
"protected": false,
"verified": false,
"followers_count": 187,
"friends_count": 288,
"listed_count": 10,
"favourites_count": 1069,
"statuses_count": 1764,
"created_at": "Fri Sep 05 15:37:43 +0000 2014",
"utc_offset": 3600,
"time_zone": "London",
"geo_enabled": true,
"lang": "en-gb",
"contributors_enabled": false,
"is_translator": false,
"profile_background_color": "C0DEED",
"profile_background_image_url": "http://abs.twimg.com/images/themes/theme1/bg.png",
"profile_background_image_url_https": "https://abs.twimg.com/images/themes/theme1/bg.png",
"profile_background_tile": false,
"profile_link_color": "0084B4",
"profile_sidebar_border_color": "C0DEED",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": true,
"profile_image_url": "http://pbs.twimg.com/profile_images/507921409738543104/V35eZACR_normal.jpeg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/507921409738543104/V35eZACR_normal.jpeg",
"profile_banner_url": "https://pbs.twimg.com/profile_banners/2792146884/1410119421",
"default_profile": true,
"default_profile_image": false,
"following": null,
"follow_request_sent": null,
"notifications": null
},
"geo": null,
"coordinates": null,
"place": null,
"contributors": null,
"retweeted_status": {
"created_at": "Sat Oct 24 02:27:06 +0000 2015",
"id": 657745100739506176,
"id_str": "657745100739506176",
"text": "Kara Walker \"Go to Hell or Atlanta, Whichever Come First\" #victoriamiro #London https://t.co/HapqKa4i0l https://t.co/95GaLC4XTo",
"source": "Twitter for iPhone",
"truncated": false,
"in_reply_to_status_id": null,
"in_reply_to_status_id_str": null,
"in_reply_to_user_id": null,
"in_reply_to_user_id_str": null,
"in_reply_to_screen_name": null,
"user": {
"id": 999716342,
"id_str": "999716342",
"name": "M23",
"screen_name": "M23projects",
"location": "New York",
"url": "http://M23.co",
"description": "M23's project space + itinerant program promotes new work by new artists. \nhttp://Instagram.com/m23projects",
"protected": false,
"verified": false,
"followers_count": 9150,
"friends_count": 7353,
"listed_count": 174,
"favourites_count": 1354,
"statuses_count": 4666,
"created_at": "Sun Dec 09 17:13:35 +0000 2012",
"utc_offset": -14400,
"time_zone": "Eastern Time (US & Canada)",
"geo_enabled": true,
"lang": "en",
"contributors_enabled": false,
"is_translator": false,
"profile_background_color": "547587",
"profile_background_image_url": "http://pbs.twimg.com/profile_background_images/884257252/e329bbc1b91d695862d5b23a209f2d34.jpeg",
"profile_background_image_url_https": "https://pbs.twimg.com/profile_background_images/884257252/e329bbc1b91d695862d5b23a209f2d34.jpeg",
"profile_background_tile": true,
"profile_link_color": "414A4D",
"profile_sidebar_border_color": "FFFFFF",
"profile_sidebar_fill_color": "DDEEF6",
"profile_text_color": "333333",
"profile_use_background_image": true,
"profile_image_url": "http://pbs.twimg.com/profile_images/458985956830236673/Z_4Bq9PJ_normal.jpeg",
"profile_image_url_https": "https://pbs.twimg.com/profile_images/458985956830236673/Z_4Bq9PJ_normal.jpeg",
"profile_banner_url": "https://pbs.twimg.com/profile_banners/999716342/1398650659",
"default_profile": false,
"default_profile_image": false,
"following": null,
"follow_request_sent": null,
"notifications": null
},
"geo": null,
"coordinates": null,
"place": null,
"contributors": null,
"is_quote_status": false,
"retweet_count": 2,
"favorite_count": 3,
"entities": {
"hashtags": [
{
"text": "London",
"indices": [
74,
81
]
}
],
"urls": [
{
"url": "https://t.co/HapqKa4i0l",
"expanded_url": "http://instagram.com/m23projects",
"display_url": "instagram.com/m23projects",
"indices": [
82,
105
]
}
],
"user_mentions": [
{
"screen_name": "victoriamiro",
"name": "Victoria Miro",
"id": 373924746,
"id_str": "373924746",
"indices": [
58,
71
]
}
],
"symbols": [],
"media": [
{
"id": 657745078413201408,
"id_str": "657745078413201408",
"indices": [
106,
129
],
"media_url": "http://pbs.twimg.com/media/CSDHqfeUkAA4a0Y.jpg",
"media_url_https": "https://pbs.twimg.com/media/CSDHqfeUkAA4a0Y.jpg",
"url": "https://t.co/95GaLC4XTo",
"display_url": "pic.twitter.com/95GaLC4XTo",
"expanded_url": "http://twitter.com/M23projects/status/657745100739506176/photo/1",
"type": "photo",
"sizes": {
"small": {
"w": 340,
"h": 255,
"resize": "fit"
},
"medium": {
"w": 600,
"h": 450,
"resize": "fit"
},
"thumb": {
"w": 150,
"h": 150,
"resize": "crop"
},
"large": {
"w": 1024,
"h": 768,
"resize": "fit"
}
}
}
]
},
"extended_entities": {
"media": [
{
"id": 657745078413201408,
"id_str": "657745078413201408",
"indices": [
106,
129
],
"media_url": "http://pbs.twimg.com/media/CSDHqfeUkAA4a0Y.jpg",
"media_url_https": "https://pbs.twimg.com/media/CSDHqfeUkAA4a0Y.jpg",
"url": "https://t.co/95GaLC4XTo",
"display_url": "pic.twitter.com/95GaLC4XTo",
"expanded_url": "http://twitter.com/M23projects/status/657745100739506176/photo/1",
"type": "photo",
"sizes": {
"small": {
"w": 340,
"h": 255,
"resize": "fit"
},
"medium": {
"w": 600,
"h": 450,
"resize": "fit"
},
"thumb": {
"w": 150,
"h": 150,
"resize": "crop"
},
"large": {
"w": 1024,
"h": 768,
"resize": "fit"
}
}
},
{
"id": 657745085275095040,
"id_str": "657745085275095040",
"indices": [
106,
129
],
"media_url": "http://pbs.twimg.com/media/CSDHq5CUwAAC-6a.jpg",
"media_url_https": "https://pbs.twimg.com/media/CSDHq5CUwAAC-6a.jpg",
"url": "https://t.co/95GaLC4XTo",
"display_url": "pic.twitter.com/95GaLC4XTo",
"expanded_url": "http://twitter.com/M23projects/status/657745100739506176/photo/1",
"type": "photo",
"sizes": {
"small": {
"w": 340,
"h": 453,
"resize": "fit"
},
"medium": {
"w": 600,
"h": 800,
"resize": "fit"
},
"thumb": {
"w": 150,
"h": 150,
"resize": "crop"
},
"large": {
"w": 768,
"h": 1024,
"resize": "fit"
}
}
},
{
"id": 657745085300277248,
"id_str": "657745085300277248",
"indices": [
106,
129
],
"media_url": "http://pbs.twimg.com/media/CSDHq5IVAAAn2YH.jpg",
"media_url_https": "https://pbs.twimg.com/media/CSDHq5IVAAAn2YH.jpg",
"url": "https://t.co/95GaLC4XTo",
"display_url": "pic.twitter.com/95GaLC4XTo",
"expanded_url": "http://twitter.com/M23projects/status/657745100739506176/photo/1",
"type": "photo",
"sizes": {
"small": {
"w": 340,
"h": 453,
"resize": "fit"
},
"medium": {
"w": 600,
"h": 800,
"resize": "fit"
},
"thumb": {
"w": 150,
"h": 150,
"resize": "crop"
},
"large": {
"w": 768,
"h": 1024,
"resize": "fit"
}
}
},
{
"id": 657745085275082752,
"id_str": "657745085275082752",
"indices": [
106,
129
],
"media_url": "http://pbs.twimg.com/media/CSDHq5CUkAAd0oL.jpg",
"media_url_https": "https://pbs.twimg.com/media/CSDHq5CUkAAd0oL.jpg",
"url": "https://t.co/95GaLC4XTo",
"display_url": "pic.twitter.com/95GaLC4XTo",
"expanded_url": "http://twitter.com/M23projects/status/657745100739506176/photo/1",
"type": "photo",
"sizes": {
"small": {
"w": 340,
"h": 255,
"resize": "fit"
},
"medium": {
"w": 600,
"h": 450,
"resize": "fit"
},
"thumb": {
"w": 150,
"h": 150,
"resize": "crop"
},
"large": {
"w": 1024,
"h": 768,
"resize": "fit"
}
}
}
]
},
"favorited": false,
"retweeted": false,
"possibly_sensitive": false,
"filter_level": "low",
"lang": "en"
},
"is_quote_status": false,
"retweet_count": 0,
"favorite_count": 0,
"entities": {
"hashtags": [
{
"text": "London",
"indices": [
91,
98
]
}
],
"urls": [
{
"url": "https://t.co/HapqKa4i0l",
"expanded_url": "http://instagram.com/m23projects",
"display_url": "instagram.com/m23projects",
"indices": [
99,
122
]
}
],
"user_mentions": [
{
"screen_name": "M23projects",
"name": "M23",
"id": 999716342,
"id_str": "999716342",
"indices": [
3,
15
]
},
{
"screen_name": "victoriamiro",
"name": "Victoria Miro",
"id": 373924746,
"id_str": "373924746",
"indices": [
75,
88
]
}
],
"symbols": [],
"media": [
{
"id": 657745078413201408,
"id_str": "657745078413201408",
"indices": [
123,
140
],
"media_url": "http://pbs.twimg.com/media/CSDHqfeUkAA4a0Y.jpg",
"media_url_https": "https://pbs.twimg.com/media/CSDHqfeUkAA4a0Y.jpg",
"url": "https://t.co/95GaLC4XTo",
"display_url": "pic.twitter.com/95GaLC4XTo",
"expanded_url": "http://twitter.com/M23projects/status/657745100739506176/photo/1",
"type": "photo",
"sizes": {
"small": {
"w": 340,
"h": 255,
"resize": "fit"
},
"medium": {
"w": 600,
"h": 450,
"resize": "fit"
},
"thumb": {
"w": 150,
"h": 150,
"resize": "crop"
},
"large": {
"w": 1024,
"h": 768,
"resize": "fit"
}
},
"source_status_id": 657745100739506176,
"source_status_id_str": "657745100739506176",
"source_user_id": 999716342,
"source_user_id_str": "999716342"
}
]
},
"extended_entities": {
"media": [
{
"id": 657745078413201408,
"id_str": "657745078413201408",
"indices": [
123,
140
],
"media_url": "http://pbs.twimg.com/media/CSDHqfeUkAA4a0Y.jpg",
"media_url_https": "https://pbs.twimg.com/media/CSDHqfeUkAA4a0Y.jpg",
"url": "https://t.co/95GaLC4XTo",
"display_url": "pic.twitter.com/95GaLC4XTo",
"expanded_url": "http://twitter.com/M23projects/status/657745100739506176/photo/1",
"type": "photo",
"sizes": {
"small": {
"w": 340,
"h": 255,
"resize": "fit"
},
"medium": {
"w": 600,
"h": 450,
"resize": "fit"
},
"thumb": {
"w": 150,
"h": 150,
"resize": "crop"
},
"large": {
"w": 1024,
"h": 768,
"resize": "fit"
}
},
"source_status_id": 657745100739506176,
"source_status_id_str": "657745100739506176",
"source_user_id": 999716342,
"source_user_id_str": "999716342"
},
{
"id": 657745085275095040,
"id_str": "657745085275095040",
"indices": [
123,
140
],
"media_url": "http://pbs.twimg.com/media/CSDHq5CUwAAC-6a.jpg",
"media_url_https": "https://pbs.twimg.com/media/CSDHq5CUwAAC-6a.jpg",
"url": "https://t.co/95GaLC4XTo",
"display_url": "pic.twitter.com/95GaLC4XTo",
"expanded_url": "http://twitter.com/M23projects/status/657745100739506176/photo/1",
"type": "photo",
"sizes": {
"small": {
"w": 340,
"h": 453,
"resize": "fit"
},
"medium": {
"w": 600,
"h": 800,
"resize": "fit"
},
"thumb": {
"w": 150,
"h": 150,
"resize": "crop"
},
"large": {
"w": 768,
"h": 1024,
"resize": "fit"
}
},
"source_status_id": 657745100739506176,
"source_status_id_str": "657745100739506176",
"source_user_id": 999716342,
"source_user_id_str": "999716342"
},
{
"id": 657745085300277248,
"id_str": "657745085300277248",
"indices": [
123,
140
],
"media_url": "http://pbs.twimg.com/media/CSDHq5IVAAAn2YH.jpg",
"media_url_https": "https://pbs.twimg.com/media/CSDHq5IVAAAn2YH.jpg",
"url": "https://t.co/95GaLC4XTo",
"display_url": "pic.twitter.com/95GaLC4XTo",
"expanded_url": "http://twitter.com/M23projects/status/657745100739506176/photo/1",
"type": "photo",
"sizes": {
"small": {
"w": 340,
"h": 453,
"resize": "fit"
},
"medium": {
"w": 600,
"h": 800,
"resize": "fit"
},
"thumb": {
"w": 150,
"h": 150,
"resize": "crop"
},
"large": {
"w": 768,
"h": 1024,
"resize": "fit"
}
},
"source_status_id": 657745100739506176,
"source_status_id_str": "657745100739506176",
"source_user_id": 999716342,
"source_user_id_str": "999716342"
},
{
"id": 657745085275082752,
"id_str": "657745085275082752",
"indices": [
123,
140
],
"media_url": "http://pbs.twimg.com/media/CSDHq5CUkAAd0oL.jpg",
"media_url_https": "https://pbs.twimg.com/media/CSDHq5CUkAAd0oL.jpg",
"url": "https://t.co/95GaLC4XTo",
"display_url": "pic.twitter.com/95GaLC4XTo",
"expanded_url": "http://twitter.com/M23projects/status/657745100739506176/photo/1",
"type": "photo",
"sizes": {
"small": {
"w": 340,
"h": 255,
"resize": "fit"
},
"medium": {
"w": 600,
"h": 450,
"resize": "fit"
},
"thumb": {
"w": 150,
"h": 150,
"resize": "crop"
},
"large": {
"w": 1024,
"h": 768,
"resize": "fit"
}
},
"source_status_id": 657745100739506176,
"source_status_id_str": "657745100739506176",
"source_user_id": 999716342,
"source_user_id_str": "999716342"
}
]
},
"favorited": false,
"retweeted": false,
"possibly_sensitive": false,
"filter_level": "low",
"lang": "en",
"timestamp_ms": "1445669074321"
}
**UPDATE:** I guess I should stick to play-json, even more so for performance reasons - http://derekwyatt.org/2014/01/15/benchmarking-spray-json-argonaut-play-json/
You can depend on Play's JSON library by itself:
// build.sbt
libraryDependencies += "com.typesafe.play" % "play-json_2.11" % "X.X.X"
// Tweet.scala
import play.api.libs.json._
// `...` stands for the remaining User fields, which are elided in this example.
case class User(id: String, name: String, ...)
// Json.format is a play-json macro that auto-generates the JSON formatter for User.
implicit val userFormat = Json.format[User]
// Tweet embeds a User as its `user` field.
case class Tweet(id: String, content: String, user: User)
// Macro-generated JSON formatter for Tweet.
implicit val tweetFormat = Json.format[Tweet]
This will use play-json's macros to auto-generate the formatters you need to parse JSON into instances of Tweet and User.
Regardless of which library you choose, you won't find an elegant solution for handling more than 22 fields, since that's a limitation of the case class implementation (up until Scala 2.11) rather than any specific design choice by a library.