I'm working with twitter data which fetched in jsonl form. I've converted it to json and am trying to convert it to a csv (to import into a program which accepts either csv or MySQL). However, some people put forced new lines into their tweets or bios. This is causing the csv file to have multiple lines for entries, often breaking up in the middle of a tweet. I've tried a few of the python json to csv codes floating on github.
The latest attempt I tried:
jq -s "." tiny00subset.jsonl > tiny00subset.json
json2csv -i tiny00subset.json -o tiny00subset.csv
Partial example tweet (json format):
{
"created_at": "Mon Aug 13 10:40:34 +0000 2018",
"id": 1028954459110555600,
"id_str": "1028954459110555649",
"full_text": "Oh well, they deal with it quite well. Like they add numbers and facts and such crazy stuff.\nhttps://REPLACED/DuBGmHCnG8\n#climatechange https://REPLACED/d5IBchM3Uk",
"truncated": false,
"display_text_range": [
0,
131
],
"entities": {
"hashtags": [
{
"text": "climatechange",
"indices": [
117,
131
]
}
],
"symbols": [],
"user_mentions": [],
"urls": [
{
"url": "https://REPLACED/DuBGmHCnG8",
"expanded_url": "https://tamino.wordpress.com/2018/08/08/usa-temperature-can-i-sucker-you/",
"display_url": "tamino.wordpress.com/2018/08/08/usa…",
"indices": [
93,
116
]
},
{
"url": "https://REPLACED/d5IBchM3Uk",
"expanded_url": "https://twitter.com/Tony__Heller/status/1028672939753758720",
"display_url": "twitter.com/Tony__Heller/s…",
"indices": [
132,
155
]
}
]
},
}
CSV Output:
"Mon Aug 13 10:40:34 +0000 2018",1028954459110555600,"1028954459110555649","Oh well, they deal with it quite well. Like they add numbers and facts and such crazy stuff.
https://REPLACED/DuBGmHCnG8
#climatechange https://REPLACED/d5IBchM3Uk",false,"[0,131]","{""hashtags"":[{""text"":""climatechange"",""indices"":[117,131]}],""symbols"":[],""user_mentions"":[],""urls"":[{""url"":""https://REPLACED/DuBGmHCnG8"",""expanded_url"":""https://tamino.wordpress.com/2018/08/08/usa-temperature-can-i-sucker-you/"",""display_url"":""tamino.wordpress.com/2018/08/08/usa…"",""indices"":[93,116]},{""url"":""https://REPLACED/d5IBchM3Uk"",""expanded_url"":""https://twitter.com/Tony__Heller/status/1028672939753758720"",""display_url"":""twitter.com/Tony__Heller/s…"",""indices"":[132,155]}]}","TweetDeck",,,,,,"{""id"":59806323,""id_str"":""59806323"",""name"":""Daniel"",""screen_name"":""sleeksorrow"",""location"":""Karlsruhe, Germany"",""description"":""Politik, IT, Blödsinn und deren Schnittmenge. Ebenfalls: Hochmittelalter Darstellung, Falknerei, Greifvogelschutz - profile picture by #herrkausk"",""url"":""https://REPLACED/E8aNHIhCtg"",""entities"":{""url"":{""urls"":[{""url"":""https://REPLACED/E8aNHIhCtg"",""expanded_url"":""http://sleeksorrow.blogspot.com/"",""display_url"":""sleeksorrow.blogspot.com"",""indices"":[0,23]}]},""description"":{""urls"":[]}},""protected"":false,""followers_count"":572,""friends_count"":392,""listed_count"":47,""created_at"":""Fri Jul 24 15:15:25 +0000 2009"",""favourites_count"":13259,""utc_offset"":null,""time_zone"":null,""geo_enabled"":false,""verified"":false,""statuses_count"":48861,""lang"":null,""contributors_enabled"":false,""is_translator"":false,""is_translation_enabled"":false,""profile_background_color"":""1A1B1F"",""profile_background_image_url"":""http://abs.twimg.com/images/themes/theme9/bg.gif"",""profile_background_image_url_https"":""https://abs.twimg.com/images/themes/theme9/bg.gif"",""profile_background_tile"":false,""profile_image_url"":""http://pbs.twimg.com/profile_images/877219681513480192/1rj4xqpK_normal.jpg"",""profile_image_url_https"":""https://pbs.twimg.com/profile_images/877219681513480192/1rj4xqpK_normal.jpg"",""profile_banner_url"":""https://pbs.twimg.com/profile_banners/59806323/1397029131"",""profile_image_extensions_alt_text"":null,""profile_banner_extensions_alt_text"":null,""profile_link_color"":""2FC2EF"",""profile_sidebar_border_color"":""181A1E"",""profile_sidebar_fill_color"":""252429"",""profile_text_color"":""666666"",""profile_use_background_image"":true,""has_extended_profile"":false,""default_profile"":false,""default_profile_image"":false,""can_media_tag"":true,""followed_by"":false,""following"":false,""follow_request_sent"":false,""notifications"":false,""translator_type"":""none""}",,,,,true,1028672939753758700,"1028672939753758720","{""url"":""https://REPLACED/d5IBchM3Uk"",""expanded"":""https://twitter.com/Tony__Heller/status/1028672939753758720"",""display"":""twitter.com/Tony__Heller/s…""}","{""created_at"":""Sun Aug 12 16:01:55 +0000 2018"",""id"":1028672939753758700,""id_str"":""1028672939753758720"",""full_text"":""#DeanFieldingF1 It is very difficult or impossible for climate alarmists to deal with reality. https://REPLACED/wOJTptxIqH"",""truncated"":false,""display_text_range"":[16,94],""entities"":{""hashtags"":[],""symbols"":[],""user_mentions"":[{""screen_name"":""DeanFieldingF1"",""name"":""Dean Fielding"",""id"":797295219825897500,""id_str"":""797295219825897472"",""indices"":[0,15]}],""urls"":[],""media"":[{""id"":1028672868849090600,""id_str"":""1028672868849090560"",""indices"":[95,118],""media_url"":""http://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""media_url_https"":""https://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""url"":""https://REPLACED/wOJTptxIqH"",""display_url"":""pic.twitter.com/wOJTptxIqH"",""expanded_url"":""https://twitter.com/SteveSGoddard/status/1028672939753758720/photo/1"",""type"":""photo"",""sizes"":{""thumb"":{""w"":150,""h"":150,""resize"":""crop""},""medium"":{""w"":1070,""h"":983,""resize"":""fit""},""large"":{""w"":1070,""h"":983,""resize"":""fit""},""small"":{""w"":680,""h"":625,""resize"":""fit""}},""features"":{""orig"":{""faces"":[]},""medium"":{""faces"":[]},""large"":{""faces"":[]},""small"":{""faces"":[]}}}]},""extended_entities"":{""media"":[{""id"":1028672868849090600,""id_str"":""1028672868849090560"",""indices"":[95,118],""media_url"":""http://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""media_url_https"":""https://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""url"":""https://REPLACED/wOJTptxIqH"",""display_url"":""pic.twitter.com/wOJTptxIqH"",""expanded_url"":""https://twitter.com/SteveSGoddard/status/1028672939753758720/photo/1"",""type"":""photo"",""sizes"":{""thumb"":{""w"":150,""h"":150,""resize"":""crop""},""medium"":{""w"":1070,""h"":983,""resize"":""fit""},""large"":{""w"":1070,""h"":983,""resize"":""fit""},""small"":{""w"":680,""h"":625,""resize"":""fit""}},""features"":{""orig"":{""faces"":[]},""medium"":{""faces"":[]},""large"":{""faces"":[]},""small"":{""faces"":[]}},""ext_alt_text"":null},{""id"":1028672883986333700,""id_str"":""1028672883986333697"",""indices"":[95,118],""media_url"":""http://pbs.twimg.com/media/DkaUibAVAAEaQt0.jpg"",""media_url_https"":""https://pbs.twimg.com/media/DkaUibAVAAEaQt0.jpg"",""url"":""https://REPLACED/wOJTptxIqH"",""display_url"":""pic.twitter.com/wOJTptxIqH"",""expanded_url"":""https://twitter.com/SteveSGoddard/status/1028672939753758720/photo/1"",""type"":""photo"",""sizes"":{""thumb"":{""w"":150,""h"":150,""resize"":""crop""},""medium"":{""w"":1070,""h"":983,""resize"":""fit""},""large"":{""w"":1070,""h"":983,""resize"":""fit""},""small"":{""w"":680,""h"":625,""resize"":""fit""}},""features"":{""orig"":{""faces"":[]},""medium"":{""faces"":[]},""large"":{""faces"":[]},""small"":{""faces"":[]}},""ext_alt_text"":null}]},""source"":""Twitter Web Client"",""in_reply_to_status_id"":1028671170802081800,""in_reply_to_status_id_str"":""1028671170802081793"",""in_reply_to_user_id"":797295219825897500,""in_reply_to_user_id_str"":""797295219825897472"",""in_reply_to_screen_name"":""DeanFieldingF1"",""user"":{""id"":435704007,""id_str"":""435704007"",""name"":""Tony Heller"",""screen_name"":""Tony__Heller"",""location"":""Colorado"",""description"":""https://REPLACED/j5CaDNyIqE"",""url"":""https://REPLACED/Pyn117xXna"",""entities"":{""url"":{""urls"":[{""url"":""https://REPLACED/Pyn117xXna"",""expanded_url"":""http://realclimatescience.com"",""display_url"":""realclimatescience.com"",""indices"":[0,23]}]},""description"":{""urls"":[{""url"":""https://REPLACED/j5CaDNyIqE"",""expanded_url"":""https://realclimatescience.com/who-is-tony-heller/"",""display_url"":""realclimatescience.com/who-is-tony-he…"",""indices"":[0,23]}]}},""protected"":false,""followers_count"":44955,""friends_count"":374,""listed_count"":886,""created_at"":""Tue Dec 13 10:44:34 +0000 2011"",""favourites_count"":3740,""utc_offset"":null,""time_zone"":null,""geo_enabled"":true,""verified"":false,""statuses_count"":165165,""lang"":null,""contributors_enabled"":false,""is_translator"":false,""is_translation_enabled"":false,""profile_background_color"":""185370"",""profile_background_image_url"":""http://abs.twimg.com/images/themes/theme1/bg.png"",""profile_background_image_url_https"":""https://abs.twimg.com/images/themes/theme1/bg.png"",""profile_background_tile"":false,""profile_image_url"":""http://pbs.twimg.com/profile_images/1175541923508916225/0qEi4yIj_normal.jpg"",""profile_image_url_https"":""https://pbs.twimg.com/profile_images/1175541923508916225/0qEi4yIj_normal.jpg"",""profile_banner_url"":""https://pbs.twimg.com/profile_banners/435704007/1469798959"",""profile_image_extensions_alt_text"":null,""profile_banner_extensions_alt_text"":null,""profile_link_color"":""0084B4"",""profile_sidebar_border_color"":""FFFFFF"",""profile_sidebar_fill_color"":""DDEEF6"",""profile_text_color"":""333333"",""profile_use_background_image"":true,""has_extended_profile"":false,""default_profile"":false,""default_profile_image"":false,""can_media_tag"":false,""followed_by"":false,""following"":false,""follow_request_sent"":false,""notifications"":false,""translator_type"":""none""},""geo"":null,""coordinates"":null,""place"":null,""contributors"":null,""is_quote_status"":false,""retweet_count"":16,""favorite_count"":27,""favorited"":false,""retweeted"":false,""possibly_sensitive"":false,""lang"":""en""}",0,0,false,false,false,"en"
starting from
{
"created_at": "Mon Aug 13 10:40:34 +0000 2018",
"id": 1028954459110555600,
"id_str": "1028954459110555649",
"full_text": "Oh well, they deal with it quite well. Like they add numbers and facts and such crazy stuff.\nhttps://REPLACED/DuBGmHCnG8\n#climatechange https://REPLACED/d5IBchM3Uk",
"truncated": false,
"display_text_range": [
0,
131
],
"entities": {
"hashtags": [
{
"text": "climatechange",
"indices": [
117,
131
]
}
],
"symbols": [],
"user_mentions": [],
"urls": [
{
"url": "https://REPLACED/DuBGmHCnG8",
"expanded_url": "https://tamino.wordpress.com/2018/08/08/usa-temperature-can-i-sucker-you/",
"display_url": "tamino.wordpress.com/2018/08/08/usa…",
"indices": [
93,
116
]
},
{
"url": "https://REPLACED/d5IBchM3Uk",
"expanded_url": "https://twitter.com/Tony__Heller/status/1028672939753758720",
"display_url": "twitter.com/Tony__Heller/s…",
"indices": [
132,
155
]
}
]
}
}
and running (it's https://github.com/johnkerl/miller)
mlr --j2c unsparsify input.json >input.csv
you have this kind of output https://gist.github.com/aborruso/6e0361923a3c45b9fe55ebf7590953de#file-output-csv
If you open it as raw you have the carriage return. And a spreasheet read it properly.
Then, using properly the import process you need to use, the \n is not a problem.