I'm working with Twitter data which was fetched in JSONL form. I've converted it to JSON and am trying to convert it to a CSV (to import into a program which accepts either CSV or MySQL). However, some people put forced new lines into their tweets or bios. This is causing the CSV file to have multiple lines for entries, often breaking up in the middle of a tweet. I've tried a few of the Python JSON-to-CSV scripts floating around on GitHub.
The latest attempt I tried:
jq -s "." tiny00subset.jsonl > tiny00subset.json
json2csv -i tiny00subset.json -o tiny00subset.csv
Partial example tweet (json format):
{
"created_at": "Mon Aug 13 10:40:34 +0000 2018",
"id": 1028954459110555600,
"id_str": "1028954459110555649",
"full_text": "Oh well, they deal with it quite well. Like they add numbers and facts and such crazy stuff.\nhttps://REPLACED/DuBGmHCnG8\n#climatechange https://REPLACED/d5IBchM3Uk",
"truncated": false,
"display_text_range": [
0,
131
],
"entities": {
"hashtags": [
{
"text": "climatechange",
"indices": [
117,
131
]
}
],
"symbols": [],
"user_mentions": [],
"urls": [
{
"url": "https://REPLACED/DuBGmHCnG8",
"expanded_url": "https://tamino.wordpress.com/2018/08/08/usa-temperature-can-i-sucker-you/",
"display_url": "tamino.wordpress.com/2018/08/08/usa…",
"indices": [
93,
116
]
},
{
"url": "https://REPLACED/d5IBchM3Uk",
"expanded_url": "https://twitter.com/Tony__Heller/status/1028672939753758720",
"display_url": "twitter.com/Tony__Heller/s…",
"indices": [
132,
155
]
}
]
},
}
CSV Output:
"Mon Aug 13 10:40:34 +0000 2018",1028954459110555600,"1028954459110555649","Oh well, they deal with it quite well. Like they add numbers and facts and such crazy stuff.
https://REPLACED/DuBGmHCnG8
#climatechange https://REPLACED/d5IBchM3Uk",false,"[0,131]","{""hashtags"":[{""text"":""climatechange"",""indices"":[117,131]}],""symbols"":[],""user_mentions"":[],""urls"":[{""url"":""https://REPLACED/DuBGmHCnG8"",""expanded_url"":""https://tamino.wordpress.com/2018/08/08/usa-temperature-can-i-sucker-you/"",""display_url"":""tamino.wordpress.com/2018/08/08/usa…"",""indices"":[93,116]},{""url"":""https://REPLACED/d5IBchM3Uk"",""expanded_url"":""https://twitter.com/Tony__Heller/status/1028672939753758720"",""display_url"":""twitter.com/Tony__Heller/s…"",""indices"":[132,155]}]}","TweetDeck",,,,,,"{""id"":59806323,""id_str"":""59806323"",""name"":""Daniel"",""screen_name"":""sleeksorrow"",""location"":""Karlsruhe, Germany"",""description"":""Politik, IT, Blödsinn und deren Schnittmenge. Ebenfalls: Hochmittelalter Darstellung, Falknerei, Greifvogelschutz - profile picture by #herrkausk"",""url"":""https://REPLACED/E8aNHIhCtg"",""entities"":{""url"":{""urls"":[{""url"":""https://REPLACED/E8aNHIhCtg"",""expanded_url"":""http://sleeksorrow.blogspot.com/"",""display_url"":""sleeksorrow.blogspot.com"",""indices"":[0,23]}]},""description"":{""urls"":[]}},""protected"":false,""followers_count"":572,""friends_count"":392,""listed_count"":47,""created_at"":""Fri Jul 24 15:15:25 +0000 
2009"",""favourites_count"":13259,""utc_offset"":null,""time_zone"":null,""geo_enabled"":false,""verified"":false,""statuses_count"":48861,""lang"":null,""contributors_enabled"":false,""is_translator"":false,""is_translation_enabled"":false,""profile_background_color"":""1A1B1F"",""profile_background_image_url"":""http://abs.twimg.com/images/themes/theme9/bg.gif"",""profile_background_image_url_https"":""https://abs.twimg.com/images/themes/theme9/bg.gif"",""profile_background_tile"":false,""profile_image_url"":""http://pbs.twimg.com/profile_images/877219681513480192/1rj4xqpK_normal.jpg"",""profile_image_url_https"":""https://pbs.twimg.com/profile_images/877219681513480192/1rj4xqpK_normal.jpg"",""profile_banner_url"":""https://pbs.twimg.com/profile_banners/59806323/1397029131"",""profile_image_extensions_alt_text"":null,""profile_banner_extensions_alt_text"":null,""profile_link_color"":""2FC2EF"",""profile_sidebar_border_color"":""181A1E"",""profile_sidebar_fill_color"":""252429"",""profile_text_color"":""666666"",""profile_use_background_image"":true,""has_extended_profile"":false,""default_profile"":false,""default_profile_image"":false,""can_media_tag"":true,""followed_by"":false,""following"":false,""follow_request_sent"":false,""notifications"":false,""translator_type"":""none""}",,,,,true,1028672939753758700,"1028672939753758720","{""url"":""https://REPLACED/d5IBchM3Uk"",""expanded"":""https://twitter.com/Tony__Heller/status/1028672939753758720"",""display"":""twitter.com/Tony__Heller/s…""}","{""created_at"":""Sun Aug 12 16:01:55 +0000 2018"",""id"":1028672939753758700,""id_str"":""1028672939753758720"",""full_text"":""#DeanFieldingF1 It is very difficult or impossible for climate alarmists to deal with reality. 
https://REPLACED/wOJTptxIqH"",""truncated"":false,""display_text_range"":[16,94],""entities"":{""hashtags"":[],""symbols"":[],""user_mentions"":[{""screen_name"":""DeanFieldingF1"",""name"":""Dean Fielding"",""id"":797295219825897500,""id_str"":""797295219825897472"",""indices"":[0,15]}],""urls"":[],""media"":[{""id"":1028672868849090600,""id_str"":""1028672868849090560"",""indices"":[95,118],""media_url"":""http://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""media_url_https"":""https://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""url"":""https://REPLACED/wOJTptxIqH"",""display_url"":""pic.twitter.com/wOJTptxIqH"",""expanded_url"":""https://twitter.com/SteveSGoddard/status/1028672939753758720/photo/1"",""type"":""photo"",""sizes"":{""thumb"":{""w"":150,""h"":150,""resize"":""crop""},""medium"":{""w"":1070,""h"":983,""resize"":""fit""},""large"":{""w"":1070,""h"":983,""resize"":""fit""},""small"":{""w"":680,""h"":625,""resize"":""fit""}},""features"":{""orig"":{""faces"":[]},""medium"":{""faces"":[]},""large"":{""faces"":[]},""small"":{""faces"":[]}}}]},""extended_entities"":{""media"":[{""id"":1028672868849090600,""id_str"":""1028672868849090560"",""indices"":[95,118],""media_url"":""http://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""media_url_https"":""https://pbs.twimg.com/media/DkaUhinVAAARrIY.jpg"",""url"":""https://REPLACED/wOJTptxIqH"",""display_url"":""pic.twitter.com/wOJTptxIqH"",""expanded_url"":""https://twitter.com/SteveSGoddard/status/1028672939753758720/photo/1"",""type"":""photo"",""sizes"":{""thumb"":{""w"":150,""h"":150,""resize"":""crop""},""medium"":{""w"":1070,""h"":983,""resize"":""fit""},""large"":{""w"":1070,""h"":983,""resize"":""fit""},""small"":{""w"":680,""h"":625,""resize"":""fit""}},""features"":{""orig"":{""faces"":[]},""medium"":{""faces"":[]},""large"":{""faces"":[]},""small"":{""faces"":[]}},""ext_alt_text"":null},{""id"":1028672883986333700,""id_str"":""1028672883986333697"",""indices"":[95,118],""media_url"":""http://pbs.twimg.com/med
ia/DkaUibAVAAEaQt0.jpg"",""media_url_https"":""https://pbs.twimg.com/media/DkaUibAVAAEaQt0.jpg"",""url"":""https://REPLACED/wOJTptxIqH"",""display_url"":""pic.twitter.com/wOJTptxIqH"",""expanded_url"":""https://twitter.com/SteveSGoddard/status/1028672939753758720/photo/1"",""type"":""photo"",""sizes"":{""thumb"":{""w"":150,""h"":150,""resize"":""crop""},""medium"":{""w"":1070,""h"":983,""resize"":""fit""},""large"":{""w"":1070,""h"":983,""resize"":""fit""},""small"":{""w"":680,""h"":625,""resize"":""fit""}},""features"":{""orig"":{""faces"":[]},""medium"":{""faces"":[]},""large"":{""faces"":[]},""small"":{""faces"":[]}},""ext_alt_text"":null}]},""source"":""Twitter Web Client"",""in_reply_to_status_id"":1028671170802081800,""in_reply_to_status_id_str"":""1028671170802081793"",""in_reply_to_user_id"":797295219825897500,""in_reply_to_user_id_str"":""797295219825897472"",""in_reply_to_screen_name"":""DeanFieldingF1"",""user"":{""id"":435704007,""id_str"":""435704007"",""name"":""Tony Heller"",""screen_name"":""Tony__Heller"",""location"":""Colorado"",""description"":""https://REPLACED/j5CaDNyIqE"",""url"":""https://REPLACED/Pyn117xXna"",""entities"":{""url"":{""urls"":[{""url"":""https://REPLACED/Pyn117xXna"",""expanded_url"":""http://realclimatescience.com"",""display_url"":""realclimatescience.com"",""indices"":[0,23]}]},""description"":{""urls"":[{""url"":""https://REPLACED/j5CaDNyIqE"",""expanded_url"":""https://realclimatescience.com/who-is-tony-heller/"",""display_url"":""realclimatescience.com/who-is-tony-he…"",""indices"":[0,23]}]}},""protected"":false,""followers_count"":44955,""friends_count"":374,""listed_count"":886,""created_at"":""Tue Dec 13 10:44:34 +0000 
2011"",""favourites_count"":3740,""utc_offset"":null,""time_zone"":null,""geo_enabled"":true,""verified"":false,""statuses_count"":165165,""lang"":null,""contributors_enabled"":false,""is_translator"":false,""is_translation_enabled"":false,""profile_background_color"":""185370"",""profile_background_image_url"":""http://abs.twimg.com/images/themes/theme1/bg.png"",""profile_background_image_url_https"":""https://abs.twimg.com/images/themes/theme1/bg.png"",""profile_background_tile"":false,""profile_image_url"":""http://pbs.twimg.com/profile_images/1175541923508916225/0qEi4yIj_normal.jpg"",""profile_image_url_https"":""https://pbs.twimg.com/profile_images/1175541923508916225/0qEi4yIj_normal.jpg"",""profile_banner_url"":""https://pbs.twimg.com/profile_banners/435704007/1469798959"",""profile_image_extensions_alt_text"":null,""profile_banner_extensions_alt_text"":null,""profile_link_color"":""0084B4"",""profile_sidebar_border_color"":""FFFFFF"",""profile_sidebar_fill_color"":""DDEEF6"",""profile_text_color"":""333333"",""profile_use_background_image"":true,""has_extended_profile"":false,""default_profile"":false,""default_profile_image"":false,""can_media_tag"":false,""followed_by"":false,""following"":false,""follow_request_sent"":false,""notifications"":false,""translator_type"":""none""},""geo"":null,""coordinates"":null,""place"":null,""contributors"":null,""is_quote_status"":false,""retweet_count"":16,""favorite_count"":27,""favorited"":false,""retweeted"":false,""possibly_sensitive"":false,""lang"":""en""}",0,0,false,false,false,"en"
starting from
{
"created_at": "Mon Aug 13 10:40:34 +0000 2018",
"id": 1028954459110555600,
"id_str": "1028954459110555649",
"full_text": "Oh well, they deal with it quite well. Like they add numbers and facts and such crazy stuff.\nhttps://REPLACED/DuBGmHCnG8\n#climatechange https://REPLACED/d5IBchM3Uk",
"truncated": false,
"display_text_range": [
0,
131
],
"entities": {
"hashtags": [
{
"text": "climatechange",
"indices": [
117,
131
]
}
],
"symbols": [],
"user_mentions": [],
"urls": [
{
"url": "https://REPLACED/DuBGmHCnG8",
"expanded_url": "https://tamino.wordpress.com/2018/08/08/usa-temperature-can-i-sucker-you/",
"display_url": "tamino.wordpress.com/2018/08/08/usa…",
"indices": [
93,
116
]
},
{
"url": "https://REPLACED/d5IBchM3Uk",
"expanded_url": "https://twitter.com/Tony__Heller/status/1028672939753758720",
"display_url": "twitter.com/Tony__Heller/s…",
"indices": [
132,
155
]
}
]
}
}
and running Miller (https://github.com/johnkerl/miller):
mlr --j2c unsparsify input.json >input.csv
you have this kind of output https://gist.github.com/aborruso/6e0361923a3c45b9fe55ebf7590953de#file-output-csv
If you open it as raw text you will see the line break inside the quoted field, and a spreadsheet reads it properly.
So, as long as the import process you need to use is applied properly, the \n is not a problem.
I have Kilo RC1 installed on CentOS 7 with the following RPM packages.
# rpm -qa | grep -Ei 'nova|urllib3|request|six'
requests-2.6.0-1.noarch
python-novaclient-2.23.0.post13-1.noarch
nova-2015.1.0rc1-1.noarch
six-1.9.0-1.noarch
nova-docker-0.0.0.post183-1.noarch
urllib3-1.10.1-1.noarch
Keystone, Glance and Neutron work as expected, as do nova GET methods such as nova service-list. All passwords and configuration files have been verified. When issuing a nova boot command, I get the HTTP 500 error detailed below.
--- Get the token with neutron/password ----
# openstack token issue
+------------+----------------------------------+
| Field | Value |
+------------+----------------------------------+
| expires | 2015-04-22T21:12:14Z |
| id | 24e6a5e2546c41c98865c946f10f7ddb |
| project_id | 0ece3f1cc56a4a0bba1906b43d1faceb |
| user_id | ef18eea137ed4dabad1f92f4a393fd70 |
+------------+----------------------------------+
-------------- Used the issued token to call nova API ---------------
# curl -g -i -X GET http://10.0.0.244:8774/v2/0ece3f1cc56a4a0bba1906b43d1faceb/os-services -H "User-Agent: python-novaclient" -H "Accept: application/json" -H "X-Auth-Token: 24e6a5e2546c41c98865c946f10f7ddb"
HTTP/1.1 200 OK
Content-Type: application/json
Content-Length: 1285
Date: Wed, 22 Apr 2015 20:14:57 GMT
{"services": [{"status": "enabled", "binary": "nova-scheduler", "zone": "internal", "state": "up", "updated_at": "2015-04-22T20:14:53.000000", "host": "ctrail72", "disabled_reason": null, "id": 1}, {"status": "enabled", "binary": "nova-conductor", "zone": "internal", "state": "up", "updated_at": "2015-04-22T20:14:55.000000", "host": "ctrail72", "disabled_reason": null, "id": 2}, {"status": "enabled", "binary": "nova-console", "zone": "internal", "state": "up", "updated_at": "2015-04-22T20:14:47.000000", "host": "ctrail72", "disabled_reason": null, "id": 3}, {"status": "enabled", "binary": "nova-consoleauth", "zone": "internal", "state": "up", "updated_at": "2015-04-22T20:14:48.000000", "host": "ctrail72", "disabled_reason": null, "id": 4}, {"status": "enabled", "binary": "nova-cert", "zone": "internal", "state": "up", "updated_at": "2015-04-22T20:14:50.000000", "host": "ctrail72", "disabled_reason": null, "id": 5}, {"status": "enabled", "binary": "nova-compute", "zone": "nova", "state": "up", "updated_at": "2015-04-22T20:14:51.000000", "host": "ctrail72", "disabled_reason": null, "id": 6}, {"status": "enabled", "binary": "nova-compute", "zone": "nova", "state": "up", "updated_at": "2015-04-22T20:14:54.000000", "host": "comp70-1", "disabled_reason": null, "id": 7}]}[
---------- Same token works for neutron API ------
# curl -g -i -X GET http://10.0.0.244:9696/v2.0/subnets.json -H "X-Auth- Token: 24e6a5e2546c41c98865c946f10f7ddb"
HTTP/1.1 200 OK
Content-Type: application/json; charset=UTF-8
Content-Length: 898
X-Openstack-Request-Id: req-4f547520-4cb2-4f59-835a-711c933664d1
Date: Wed, 22 Apr 2015 20:16:15 GMT
{"subnets": [{"name": "sub101", "enable_dhcp": true, "network_id": "40945ae1-344c-4ebd-a25b-2776feb0f409", "tenant_id": "959d7f7e020b48509aea18dcec819491", "dns_nameservers": [], "gateway_ip": "10.0.0.1", "ipv6_ra_mode": ...
-------- nova boot failed with 500 error code ------
nova --debug boot --flavor 1 --image dockerc7 --nic net-id=40945ae1-344c-4ebd-a25b-2776feb0f409 d01
.......
Traceback (most recent call last):
File "/usr/lib/python2.7/site-packages/novaclient/shell.py", line 911, in main
OpenStackComputeShell().main(argv)
File "/usr/lib/python2.7/site-packages/novaclient/shell.py", line 838, in main
args.func(self.cs, args)
File "/usr/lib/python2.7/site-packages/novaclient/v2/shell.py", line 500, in do_boot
server = cs.servers.create(*boot_args, **boot_kwargs)
File "/usr/lib/python2.7/site-packages/novaclient/v2/servers.py", line 929, in create
**boot_kwargs)
File "/usr/lib/python2.7/site-packages/novaclient/v2/servers.py", line 557, in _boot
return_raw=return_raw, **kwargs)
File "/usr/lib/python2.7/site-packages/novaclient/base.py", line 152, in _create
_resp, body = self.api.client.post(url, body=body)
File "/usr/lib/python2.7/site-packages/keystoneclient/adapter.py", line 171, in post
return self.request(url, 'POST', **kwargs)
File "/usr/lib/python2.7/site-packages/novaclient/client.py", line 97, in request
raise exceptions.from_response(resp, body, url, method)
ClientException: The server has either erred or is incapable of performing
the requested operation. (HTTP 500)
ERROR (ClientException): The server has either erred or is incapable of performing the requested operation. (HTTP 500)
Tracing further back, I found that self._sock.recv in /usr/lib64/python2.7/socket.py returns the 500 error response.
> /usr/lib64/python2.7/socket.py(481)readline()
-> if not data:
(Pdb) l
476 data = self._sock.recv(self._rbufsize)
477 except error, e:
478 if e.args[0] == EINTR:
479 continue
480 raise
481 -> if not data:
482 break
483 left = size - buf_len
484 # did we just receive a newline?
485 nl = data.find('\n', 0, left)
486 if nl >= 0:
(Pdb) p data
'HTTP/1.1 500 Internal Server Error\r\nContent-Length: 128\r\nContent-Type: application/json; charset=UTF-8\r\nDate: Wed, 22 Apr 2015 18:21:28 GMT\r\nConnection: keep-alive\r\n\r\n{"computeFault": {"message": "The server has either erred or is incapable of performing the requested operation.", "code": 500}}'
Any suggestions and tips are appreciated!