I am trying to load a JSON file with more than 100 columns into BigQuery. Some of the columns have special characters in their names, i.e. a dollar sign ($) and a period (.). The row/record content also varies, meaning not every column is present in each row/record, which is perfectly valid JSON.
I have searched similar posts:
How to manage/handle schema changes while loading JSON file into BigQuery table
BigQuery: Create column of JSON datatype
which suggest loading the data into a single "STRING" column as CSV first and then parsing the columns out into the target table with the JSON_EXTRACT() function. Hence I created a BigQuery table with the following schema definition:
[
  {
    "name": "data",
    "type": "STRING"
  }
]
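(For reference, a table with this one-column schema can be created with something along these lines — a sketch, assuming the schema above is saved as ./bq_load_test_schema.json:

bq mk --table test.bq_load_test ./bq_load_test_schema.json

)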
Then I ran the following CLI command:
bq load --source_format=CSV test.bq_load_test ./data_file.json ./bq_load_test_schema.json
which resulted in the following error:
Error Message:
BigQuery error in load operation: Error processing job : Error while reading data, error message: CSV table encountered too many errors, giving up. Rows: 1; errors: 1. Please look into the errors[] collection for more details.
Failure details:
- Error while reading data, error message: Too many values in row starting at position: 0.
Here's the data file layout:
root
|-- $insert_id: string (nullable = true)
|-- $schema: long (nullable = true)
|-- adid: string (nullable = true)
|-- vendor_attribution_ids: array (nullable = true)
| |-- element: string (containsNull = true)
|-- vendor_event_type: string (nullable = true)
|-- vendor_id: long (nullable = true)
|-- app: long (nullable = true)
|-- city: string (nullable = true)
|-- client_event_time: string (nullable = true)
|-- client_upload_time: string (nullable = true)
|-- country: string (nullable = true)
|-- data: struct (nullable = true)
| |-- first_event: boolean (nullable = true)
|-- device_brand: string (nullable = true)
|-- device_carrier: string (nullable = true)
|-- device_family: string (nullable = true)
|-- device_id: string (nullable = true)
|-- device_manufacturer: string (nullable = true)
|-- device_model: string (nullable = true)
|-- device_type: string (nullable = true)
|-- dma: string (nullable = true)
|-- event_id: long (nullable = true)
|-- event_properties: struct (nullable = true)
| |-- U.vf: string (nullable = true)
| |-- app.name: string (nullable = true)
| |-- app.pillar: string (nullable = true)
| |-- app.version: string (nullable = true)
| |-- categories: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- cmfAppId: string (nullable = true)
| |-- content.area: string (nullable = true)
| |-- content.authenticated: boolean (nullable = true)
| |-- content.authors: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- content.cms: string (nullable = true)
| |-- content.id: string (nullable = true)
| |-- content.keywords.collections: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- content.keywords.company: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- content.keywords.location: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- content.keywords.organization: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- content.keywords.person: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- content.keywords.subject: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- content.keywords.tag: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- content.media.audiovideo: string (nullable = true)
| |-- content.media.contentarea: string (nullable = true)
| |-- content.media.duration: long (nullable = true)
| |-- content.media.episodenumber: string (nullable = true)
| |-- content.media.genre: string (nullable = true)
| |-- content.media.length: double (nullable = true)
| |-- content.media.liveondemand: string (nullable = true)
| |-- content.media.region: string (nullable = true)
| |-- content.media.seasonnumber: string (nullable = true)
| |-- content.media.show: string (nullable = true)
| |-- content.media.sport: string (nullable = true)
| |-- content.media.streamtitle: string (nullable = true)
| |-- content.media.type: string (nullable = true)
| |-- content.originaltitle: string (nullable = true)
| |-- content.pubdate: long (nullable = true)
| |-- content.publishedtime: string (nullable = true)
| |-- content.subsection1: string (nullable = true)
| |-- content.subsection2: string (nullable = true)
| |-- content.subsection3: string (nullable = true)
| |-- content.subsection4: string (nullable = true)
| |-- content.tier: string (nullable = true)
| |-- content.title: string (nullable = true)
| |-- content.type: string (nullable = true)
| |-- content.updatedtime: string (nullable = true)
| |-- content.url: string (nullable = true)
| |-- custom.DNT: boolean (nullable = true)
| |-- custom.cookiesenabled: boolean (nullable = true)
| |-- custom.engine: string (nullable = true)
| |-- feature.name: string (nullable = true)
| |-- feature.position: string (nullable = true)
| |-- lastupdate: string (nullable = true)
| |-- pubdate: string (nullable = true)
| |-- referrer.campaign: string (nullable = true)
| |-- referrer.url: string (nullable = true)
| |-- syndicate: string (nullable = true)
| |-- user.interests.explicit.no: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- user.interests.explicit.yes: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- user.tier: string (nullable = true)
| |-- userTier: string (nullable = true)
|-- event_time: string (nullable = true)
|-- event_type: string (nullable = true)
|-- idfa: string (nullable = true)
|-- ip_address: string (nullable = true)
|-- is_attribution_event: boolean (nullable = true)
|-- language: string (nullable = true)
|-- library: string (nullable = true)
|-- location_lat: double (nullable = true)
|-- location_lng: double (nullable = true)
|-- os_name: string (nullable = true)
|-- os_version: string (nullable = true)
|-- paying: string (nullable = true)
|-- platform: string (nullable = true)
|-- processed_time: string (nullable = true)
|-- region: string (nullable = true)
|-- sample_rate: string (nullable = true)
|-- server_received_time: string (nullable = true)
|-- server_upload_time: string (nullable = true)
|-- session_id: long (nullable = true)
|-- start_version: string (nullable = true)
|-- user_creation_time: string (nullable = true)
|-- user_id: string (nullable = true)
|-- user_properties: struct (nullable = true)
| |-- internal.userID: string (nullable = true)
| |-- internal.userTier: string (nullable = true)
| |-- experiment.id: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- experiment.variant: array (nullable = true)
| | |-- element: string (containsNull = true)
| |-- location.news: string (nullable = true)
| |-- location.radio: string (nullable = true)
| |-- location.region: string (nullable = true)
| |-- location.tv: string (nullable = true)
| |-- location.weather: string (nullable = true)
| |-- referrer.campaign: string (nullable = true)
| |-- user.id: string (nullable = true)
| |-- user.id.internalvisitor: string (nullable = true)
| |-- user.tier: string (nullable = true)
|-- uuid: string (nullable = true)
|-- version_name: string (nullable = true)
|-- feature_origin: string (nullable = true)
Here's a snippet of the data file:
{"server_received_time":"2019-01-17 15:00:00.482000","app":161,"device_carrier":null,"$schema":12,"city":"Caro","user_id":null,"uuid":"9018","event_time":"2019-01-17 15:00:00.045000","platform":"Web","os_version":"49","vendor_id":711,"processed_time":"2019-01-17 15:00:00.817195","user_creation_time":"2018-11-01 19:16:34.971000","version_name":null,"ip_address":null,"paying":null,"dma":null,"group_properties":{},"user_properties":{"location.radio":"ca","vendor.userTier":"free","vendor.userID":"a989","user.id":"a989","user.tier":"free","location.region":"ca"},"client_upload_time":"2019-01-17 15:00:00.424000","$insert_id":"e8410","event_type":"LOADED","library":"amp\/4.5.2","vendor_attribution_ids":null,"device_type":"Mac","device_manufacturer":null,"start_version":null,"location_lng":null,"server_upload_time":"2019-01-17 15:00:00.493000","event_id":64,"location_lat":null,"os_name":"Chrome","vendor_event_type":null,"device_brand":null,"groups":{},"event_properties":{"content.authenticated":false,"content.subsection1":"regions","custom.DNT":true,"content.subsection2":"ca","referrer.url":"","content.url":"","content.type":"index","content.title":"","custom.cookiesenabled":true,"app.pillar":"feed","content.area":"news","app.name":"oc"},"data":{},"device_id":"","language":"English","device_model":"Mac","country":"","region":"","is_attribution_event":false,"adid":null,"session_id":15,"device_family":"Mac","sample_rate":null,"idfa":null,"client_event_time":"2019-01-17 14:59:59.987000"}
{"server_received_time":"2019-01-17 15:00:00.913000","app":161,"device_carrier":null,"$schema":12,"city":"Fo","user_id":null,"uuid":"9052","event_time":"2019-01-17 15:00:00.566000","platform":"Web","os_version":"71","vendor_id":797,"processed_time":"2019-01-17 15:00:01.301936","user_creation_time":"2019-01-17 15:00:00.566000","version_name":null,"ip_address":null,"paying":null,"dma":"CO","group_properties":{},"user_properties":{"user.tier":"free"},"client_upload_time":"2019-01-17 15:00:00.157000","$insert_id":"69ae","event_type":"START WEB SESSION","library":"amp\/4.5.2","vendor_attribution_ids":null,"device_type":"Android","device_manufacturer":null,"start_version":null,"location_lng":null,"server_upload_time":"2019-01-17 15:00:00.925000","event_id":1,"location_lat":null,"os_name":"Chrome Mobile","vendor_event_type":null,"device_brand":null,"groups":{},"event_properties":{"content.subsection3":"home","content.subsection2":"archives","content.title":"","content.keywords.subject":["Lifestyle\/Recreation and leisure\/Outdoor recreation\/Boating","Lifestyle\/Relationships\/Couples","General news\/Weather","Oddities"],"content.publishedtime":154687,"app.name":"oc","referrer.url":"","content.subsection1":"archives","content.url":"","content.authenticated":false,"content.keywords.location":["Ot"],"content.originaltitle":"","content.type":"story","content.authors":["Archives"],"app.pillar":"feed","content.area":"news","content.id":"1.49","content.updatedtime":1546878600538,"content.keywords.tag":["24 1","boat house","Ot","Rockcliffe","River","m"],"content.keywords.person":["Ber","Shi","Jea","Jean\u00e9tien"]},"data":{"first_event":true},"device_id":"","language":"English","device_model":"Android","country":"","region":"","is_attribution_event":false,"adid":null,"session_id":15477,"device_family":"Android","sample_rate":null,"idfa":null,"client_event_time":"2019-01-17 14:59:59.810000"}
{"server_received_time":"2019-01-17 15:00:00.913000","app":16,"device_carrier":null,"$schema":12,"city":"","user_id":null,"uuid":"905","event_time":"2019-01-17 15:00:00.574000","platform":"Web","os_version":"71","vendor_id":7973,"processed_time":"2019-01-17 15:00:01.301957","user_creation_time":"2019-01-17 15:00:00.566000","version_name":null,"ip_address":null,"paying":null,"dma":"DCO","group_properties":{},"user_properties":{"user.tier":"free"},"client_upload_time":"2019-01-17 15:00:00.157000","$insert_id":"d045","event_type":"LOADED","library":"am-js\/4.5.2","vendor_attribution_ids":null,"device_type":"Android","device_manufacturer":null,"start_version":null,"location_lng":null,"server_upload_time":"2019-01-17 15:00:00.925000","event_id":2,"location_lat":null,"os_name":"Chrome Mobile","vendor_event_type":null,"device_brand":null,"groups":{},"event_properties":{"content.subsection3":"home","content.subsection2":"archives","content.subsection1":"archives","content.keywords.subject":["Lifestyle\/Recreation and leisure\/Outdoor recreation\/Boating","Lifestyle\/Relationships\/Couples","General news\/Weather","Oddities"],"content.type":"story","content.keywords.location":["Ot"],"app.pillar":"feed","app.name":"oc","content.authenticated":false,"custom.DNT":false,"content.id":"1.4","content.keywords.person":["Ber","Shi","Jea","Je\u00e9tien"],"content.title":"","content.url":"","content.originaltitle":"","custom.cookiesenabled":true,"content.authors":["Archives"],"content.publishedtime":1546878600538,"referrer.url":"","content.area":"news","content.updatedtime":1546878600538,"content.keywords.tag":["24 1","boat house","O","Rockcliffe","River","pr"]},"data":{},"device_id":"","language":"English","device_model":"Android","country":"","region":"","is_attribution_event":false,"adid":null,"session_id":1547737199081,"device_family":"Android","sample_rate":null,"idfa":null,"client_event_time":"2019-01-17 14:59:59.818000"}
Any input? What am I missing here?
You have to specify the right delimiter for the CSV file. The default value for this flag (-F / --field_delimiter) is ',', and your data contains commas, so every row is interpreted as having multiple fields. I tested with your data and this worked for me:
bq load --source_format=CSV -F ';' test.bq_load_test ./data_file.json
Note that ';' worked only because the snippet data does not contain any ';' characters. Pick a delimiter that is guaranteed not to appear anywhere in your data.
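If your real data might contain any printable character, a safer variant of the same trick (a sketch; check the flags against your bq version) is to use a non-printable delimiter and disable CSV quoting so the embedded double quotes pass through untouched:

bq load --source_format=CSV -F $'\x01' --quote='' test.bq_load_test ./data_file.json

Once each raw row sits in the single data column, individual fields can be parsed out with JSON_EXTRACT_SCALAR(); keys containing $ or . need the bracket-quoted JSONPath form. The paths below come from the layout in the question, but verify the escaping in your environment:

SELECT
  JSON_EXTRACT_SCALAR(data, '$.uuid') AS uuid,
  -- bracket quoting for a key that starts with '$'
  JSON_EXTRACT_SCALAR(data, "$['$insert_id']") AS insert_id,
  -- and for nested keys containing '.'
  JSON_EXTRACT_SCALAR(data, "$['event_properties']['content.title']") AS content_title
FROM test.bq_load_test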
I have a dataframe that I am trying to write out to an S3 location as JSON.
df.printSchema
root
|-- userId: string (nullable = false)
|-- firstName: string (nullable = false)
|-- address: string (nullable = true)
|-- Email: array (nullable = true)
| |-- element: string (containsNull = true)
|-- UserFoodFavourites: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- foodName: string (nullable = true)
| | |-- isFavFood: boolean (nullable = false)
|-- UserGameFavourites: array (nullable = true)
| |-- element: array (containsNull = true)
| | |-- element: struct (containsNull = true)
| | | |-- Department: string (nullable = false)
| | | |-- gameName: string (nullable = false)
Writing out the dataframe to JSON:
df.repartition(1).write.mode("append").json("s3Location")
JSON output I get:
{"userId":111,"firstName":"first123","address":"xyz",
"Email":["def#gmail.com","abc#gmail.com"],
"UserFoodFavourites":[{"foodName":"food1","isFavFood":true},{"foodName":"food2","isFavFood":false}],
"UserGameFavourites":[[{"Department":"Outdoor","gameName":"O1"}],[{"Department":"Indoor","gameName":"I1"},{"Department":"Indoor","gameName":"I2"}]]}
{"userId":123,"firstName":"first123","address":"xyz",
"Email":["def#gmail.com","abc#gmail.com"],
"UserFoodFavourites":[{"foodName":"food1","isFavFood":true},{"foodName":"food2","isFavFood":false}],
"UserGameFavourites":[[{"Department":"Outdoor","gameName":"O1"}],[{"Department":"Indoor","gameName":"I1"},{"Department":"Indoor","gameName":"I2"}]]}
alias prettyjson='python -m json.tool'
However, pretty-printing this file with prettyjson does not work, because the output contains multiple JSON objects (one per userId) rather than a single JSON document.
I want to write this out as one JSON document, with newline separation between records, so that I can run prettyjson on the file.
Any help is appreciated.
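For what it's worth, Spark's json writer always produces JSON Lines (one object per line), which python -m json.tool cannot parse as a single document. One workaround, sketched below under the assumption that the output is small enough to collect to the driver, is to join the rows into one JSON array and write it out yourself:

import java.nio.charset.StandardCharsets
import java.nio.file.{Files, Paths}

// Render every row as a JSON string and wrap the lot in a single array.
// collect() brings all rows to the driver, so this only suits small outputs.
val jsonArray = "[\n" + df.toJSON.collect().mkString(",\n") + "\n]"

// Write to a local file (output.json is a placeholder name); pushing the
// result to S3 would be a separate upload step.
Files.write(Paths.get("output.json"), jsonArray.getBytes(StandardCharsets.UTF_8))

Alternatively, keep the JSON Lines output as-is and pretty-print it line by line, e.g. with jq '.' file.json, which handles a stream of JSON objects natively.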
I have this kind of JSON data:
{
  "data": [
    {
      "id": "4619623",
      "team": "452144",
      "created_on": "2018-10-09 02:55:51",
      "links": {
        "edit": "https://some_page",
        "publish": "https://some_publish",
        "default": "https://some_default"
      }
    },
    {
      "id": "4619600",
      "team": "452144",
      "created_on": "2018-10-09 02:42:25",
      "links": {
        "edit": "https://some_page",
        "publish": "https://some_publish",
        "default": "https://some_default"
      }
    }
  ]
}
I read this data using Apache Spark and I want to write it out partitioned by the id column. When I use this:
df.write.partitionBy("data.id").json(<path_to_folder>)
I get the following error:
Exception in thread "main" org.apache.spark.sql.AnalysisException: Partition column data.id not found in schema
I also tried to use the explode function, like this:
import org.apache.spark.sql.functions.{col, explode}
val renamedDf = df.withColumn("id", explode(col("data.id")))
renamedDf.write.partitionBy("id").json(<path_to_folder>)
That actually helped, but each id partition folder contained the same original JSON file.
EDIT: schema of df DataFrame:
|-- data: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- created_on: string (nullable = true)
| | |-- id: string (nullable = true)
| | |-- links: struct (nullable = true)
| | | |-- default: string (nullable = true)
| | | |-- edit: string (nullable = true)
| | | |-- publish: string (nullable = true)
Schema of renamedDf DataFrame:
|-- data: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- created_on: string (nullable = true)
| | |-- id: string (nullable = true)
| | |-- links: struct (nullable = true)
| | | |-- default: string (nullable = true)
| | | |-- edit: string (nullable = true)
| | | |-- publish: string (nullable = true)
|-- id: string (nullable = true)
I am using Spark 2.1.0.
I found this solution: DataFrame partitionBy on nested columns
And this example: http://bigdatums.net/2016/02/12/how-to-extract-nested-json-data-in-spark/
But neither of these helped me solve my problem.
Thanks in advance for any help.
Try the following code:
import org.apache.spark.sql.functions.{col, explode}
import spark.implicits._ // for the $"..." column syntax

val renamedDf = df
  .select(explode(col("data")) as "x") // one row per element of the data array
  .select($"x.*")                      // flatten the struct fields to top level
renamedDf.write.partitionBy("id").json(<path_to_folder>)
You were just missing a select statement after the initial explode.
val df = spark.read.option("multiLine", true).option("mode", "PERMISSIVE").json("/FileStore/tables/test.json")
df.printSchema
root
|-- data: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- created_on: string (nullable = true)
| | |-- id: string (nullable = true)
| | |-- links: struct (nullable = true)
| | | |-- default: string (nullable = true)
| | | |-- edit: string (nullable = true)
| | | |-- publish: string (nullable = true)
| | |-- team: string (nullable = true)
import org.apache.spark.sql.functions.{col, explode}
val df1 = df.withColumn("data", explode(col("data")))
df1.printSchema
root
|-- data: struct (nullable = true)
| |-- created_on: string (nullable = true)
| |-- id: string (nullable = true)
| |-- links: struct (nullable = true)
| | |-- default: string (nullable = true)
| | |-- edit: string (nullable = true)
| | |-- publish: string (nullable = true)
| |-- team: string (nullable = true)
val df2 = df1.select("data.created_on","data.id","data.team","data.links")
df2.show
+-------------------+-------+------+--------------------+
| created_on| id| team| links|
+-------------------+-------+------+--------------------+
|2018-10-09 02:55:51|4619623|452144|[https://some_def...|
|2018-10-09 02:42:25|4619600|452144|[https://some_def...|
+-------------------+-------+------+--------------------+
df2.write.partitionBy("id").json("/FileStore/tables/test_part.json")
Note that partitionBy drops the id column from the JSON files themselves and encodes it in the directory names (id=4619623, id=4619600); Spark restores the column when it reads back the full partitioned path:
val f = spark.read.json("/FileStore/tables/test_part.json/id=4619600")
f.show
+-------------------+--------------------+------+
| created_on| links| team|
+-------------------+--------------------+------+
|2018-10-09 02:42:25|[https://some_def...|452144|
+-------------------+--------------------+------+
val full = spark.read.json("/FileStore/tables/test_part.json")
full.show
+-------------------+--------------------+------+-------+
| created_on| links| team| id|
+-------------------+--------------------+------+-------+
|2018-10-09 02:55:51|[https://some_def...|452144|4619623|
|2018-10-09 02:42:25|[https://some_def...|452144|4619600|
+-------------------+--------------------+------+-------+