How to import nested JSON into Google BigQuery

I'm inserting JSON into Google BigQuery.
The schema of the JSON is at the bottom of the question.
Below is an example JSON record:
{
  "_index": "data",
  "_type": "collection_v1",
  "_id": "548d035f23r8987b768a5e60",
  "_score": 1,
  "_source": {
    "fullName": "Mike Smith",
    "networks": [
      {
        "id": [
          "12923449"
        ],
        "network": "facebook",
        "link": "https://www.facebook.com/127654449"
      }
    ],
    "sex": {
      "network": "facebook",
      "value": "male"
    },
    "interests": {},
    "score": 1.045,
    "merged_by": "548f899444v5t4v45te9a4cc"
  }
}
As you can see, there is a "_source.fullName" field with the value "Mike Smith".
When I try to create a table with it, it errors out:
Array specified for non-repeated field: _source.fullName.
I believe this is a single (non-repeated) field inside _source. How do I overcome this error?
Here's the schema:
[
{
"name": "_index",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "_id",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "_type",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "score",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "header",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "fullName",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "src",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "avatar",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "merged_by",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "cover",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "sex",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "_source",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "fullName",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "links",
"type": "STRING",
"mode": "REPEATED"
},
{
"name": "birthday",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "phones",
"type": "STRING",
"mode": "REPEATED"
},
{
"name": "pictures",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "url",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tab",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "contacts",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "id",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "fullName",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tag",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "groups",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "id",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "Name",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "skills",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "relations",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "about",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "emails",
"type": "STRING",
"mode": "REPEATED"
},
{
"name": "languages",
"type": "STRING",
"mode": "REPEATED"
},
{
"name": "places",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "education",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "school",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "experience",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "start",
"type": "NUMERIC",
"mode": "NULLABLE"
},
{
"name": "company",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "title",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "networks",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "link",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "id",
"type": "STRING",
"mode": "REPEATED"
}
]
},
{
"name": "network",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "others",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tag",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "books",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tag",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "music",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tag",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "games",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tag",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "spotify",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tag",
"type": "STRING",
"mode": "NULLABLE"
}
]
}
]
}
]
}
]

You could import the full JSON row as if it were CSV - basically a one-column BigQuery table of JSON objects. Then you can parse the JSON at will inside BigQuery, with queries like this:
WITH j AS (
  SELECT """{"_index":"data","_type":"collection_v1","_id":"548d035f23r8987b768a5e60","_score":1,"_source":{"fullName":"Mike Smith","networks":[{"id":["12923449"],"network":"facebook","link":"https://www.facebook.com/127654449"}],"sex":{"network":"facebook","value":"male"},"interests":{},"score":1.045,"merged_by":"548f899444v5t4v45te9a4cc"}}""" j
)
SELECT index
  , STRUCT(
      JSON_EXTRACT_SCALAR(source, '$.fullName') AS fullName
      , [
        STRUCT(
          JSON_EXTRACT_SCALAR(source, '$.networks[0].id[0]') AS id
          , JSON_EXTRACT_SCALAR(source, '$.networks[0].network') AS network
          , JSON_EXTRACT_SCALAR(source, '$.networks[0].link') AS link)
      ] AS networks
    ) source
FROM (
  SELECT JSON_EXTRACT_SCALAR(j.j, '$._index') index
    , JSON_EXTRACT(j.j, '$._source') source
  FROM j
)
See:
https://medium.com/google-cloud/bigquery-lazy-data-loading-ddl-dml-partitions-and-half-a-trillion-wikipedia-pageviews-cd3eacd657b6
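For completeness, the "import as a one-column CSV" step might look like this with the google-cloud-bigquery Python client. This is only a sketch: the bucket path and table ID are placeholders, and the unusual delimiter/quote settings simply ensure that no character inside the JSON is treated as a CSV separator.

from google.cloud import bigquery

client = bigquery.Client()

# Load each JSON line as a single STRING column by pretending the file
# is CSV with a delimiter that never appears in the data.
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    field_delimiter=u"\u00ff",  # an ISO-8859-1 byte that should not occur in the JSON
    quote_character="",         # disable CSV quoting so embedded quotes pass through
    schema=[bigquery.SchemaField("json_row", "STRING")],
)

job = client.load_table_from_uri(
    "gs://my-bucket/raw/*.json",        # placeholder source path
    "my-project.my_dataset.raw_json",   # placeholder destination table
    job_config=job_config,
)
job.result()  # wait for the load to finish

# The json_row column can then be parsed with JSON_EXTRACT /
# JSON_EXTRACT_SCALAR queries like the one above.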

Related

Debezium MySQL source connector - can't see data in topic

I have defined a Debezium MySQL source connector with the following configuration:
{
"name": "quickstart-debezium-source1",
"config": {
"connector.class": "io.debezium.connector.mysql.MySqlConnector",
"tasks.max": 1,
"database.hostname": "host.docker.internal",
"database.server.name": "connect_test",
"database.server.id": "5555",
"database.port": "3306",
"database.user": "root",
"database.password": "Fintech1!",
"database.history.kafka.topic": "debezium-source",
"database.history.kafka.bootstrap.servers": "broker:29092",
"include.schema.changes": "true",
"key.converter" : "io.confluent.connect.avro.AvroConverter",
"key.converter.schema.registry.url" : "http://host.docker.internal:8081",
"value.converter":"io.confluent.connect.avro.AvroConverter",
"value.converter.schema.registry.url": "http://host.docker.internal:8081",
"topic.creation.default.replication.factor": -1,
"topic.creation.default.partitions": -1,
"topic.creation.default.cleanup.policy": "compact",
"topic.creation.default.compression.type": "lz4"
I also have a table in MySQL. I can see in Kafka that a new topic was created and that new data is being added/updated to the table, but I can't see the column values of that data in the messages on the topic (I am using Confluent Control Center).
I am also trying to create a ksqlDB table from this topic, and no results come back.
I hope someone can help me with this.
I also have a Schema Registry, and the schema registered for the topic is:
{
"type": "record",
"name": "Envelope",
"namespace": "connect_test.connect_test.test",
"fields": [
{
"name": "before",
"type": [
"null",
{
"type": "record",
"name": "Value",
"fields": [
{
"name": "id",
"type": "long"
},
{
"name": "name",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "email",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "department",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "modified",
"type": {
"type": "string",
"connect.version": 1,
"connect.default": "1970-01-01T00:00:00Z",
"connect.name": "io.debezium.time.ZonedTimestamp"
},
"default": "1970-01-01T00:00:00Z"
}
],
"connect.name": "connect_test.connect_test.test.Value"
}
],
"default": null
},
{
"name": "after",
"type": [
"null",
"Value"
],
"default": null
},
{
"name": "source",
"type": {
"type": "record",
"name": "Source",
"namespace": "io.debezium.connector.mysql",
"fields": [
{
"name": "version",
"type": "string"
},
{
"name": "connector",
"type": "string"
},
{
"name": "name",
"type": "string"
},
{
"name": "ts_ms",
"type": "long"
},
{
"name": "snapshot",
"type": [
{
"type": "string",
"connect.version": 1,
"connect.parameters": {
"allowed": "true,last,false,incremental"
},
"connect.default": "false",
"connect.name": "io.debezium.data.Enum"
},
"null"
],
"default": "false"
},
{
"name": "db",
"type": "string"
},
{
"name": "sequence",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "table",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "server_id",
"type": "long"
},
{
"name": "gtid",
"type": [
"null",
"string"
],
"default": null
},
{
"name": "file",
"type": "string"
},
{
"name": "pos",
"type": "long"
},
{
"name": "row",
"type": "int"
},
{
"name": "thread",
"type": [
"null",
"long"
],
"default": null
},
{
"name": "query",
"type": [
"null",
"string"
],
"default": null
}
],
"connect.name": "io.debezium.connector.mysql.Source"
}
},
{
"name": "op",
"type": "string"
},
{
"name": "ts_ms",
"type": [
"null",
"long"
],
"default": null
},
{
"name": "transaction",
"type": [
"null",
{
"type": "record",
"name": "ConnectDefault",
"namespace": "io.confluent.connect.avro",
"fields": [
{
"name": "id",
"type": "string"
},
{
"name": "total_order",
"type": "long"
},
{
"name": "data_collection_order",
"type": "long"
}
]
}
],
"default": null
}
],
"connect.name": "connect_test.connect_test.test.Envelope"
}

Hello, I'm getting this error when I try to load a dataset into BigQuery: "Unexpected token g in JSON at position 25"

[
{
"description": glide_no"",
"mode": "NULLABLE",
"name": "glide_no",
"type": "INTEGER"
},
{
"description": "vehicle registration number",
"mode": "NULLABLE",
"name": "vehicle",
"type": "STRING"
},
{
"description": "PSP CODE",
"mode": "REQUIRED",
"name": "client",
"type": "STRING"
},
{
"description": "Company name",
"mode": "REQUIRED",
"name": "haulier",
"type": "STRING" /*this is position 25/*
},
{
"description": "waste type",
"mode": "NULLABLE",
"name": "waste_type",
"type": "STRING"
},
{
"description": "source of waste",
"mode": "NULLABLE",
"name": "source",
"type": "STRING"
},
{
"description": "gross weight in kg",
"mode": "REQUIRED",
"name": "gross_weight",
"type": "INTEGER"
},
{
"description": "tare weight in kg",
"mode": "REQUIRED",
"name": "tare_weight",
"type": "INTEGER"
},
{
"description": "net weight in kg",
"mode": "REQUIRED",
"name": "net_weight",
"type": "INTEGER"
},
{
"description": "gross date",
"mode": "REQUIRED",
"name": "gross_date",
"type": "DATETIME"
},
{
"description": "tare date",
"mode": "REQUIRED",
"name": "tare_date",
"type": "DATETIME"
},
{
"description": "day of the week",
"mode": "REQUIRED",
"name": "day",
"type": "STRING"
},
{
"description": "month of the year",
"mode": "REQUIRED",
"name": "month",
"type": "STRING"
},
{
"description": "vehicle condition rating",
"mode": "NULLABLE",
"name": "memo",
"type": "STRING"
},
{
"description": "vehicle type",
"mode": "REQUIRED",
"name": "vehicle_type",
"type": "STRING"
}
]
The first field contains invalid JSON:
"description": glide_no"",
Perhaps:
"description": "glide_no",
NOTE: position 25 refers to the 25th character, not the 25th line.
There are many good online tools that help with JSON, e.g. The JSON Validator.
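As a quick check before handing the schema file to BigQuery, you can also parse it locally with Python's json module; a minimal sketch (the schema.json file name is just an example):

import json

# json.load() reports the exact line and column of the first syntax error,
# which is easier to act on than a character offset.
try:
    with open("schema.json") as f:
        schema = json.load(f)
    print("Schema is valid JSON with", len(schema), "fields")
except json.JSONDecodeError as err:
    print("Invalid JSON at line", err.lineno, "column", err.colno, ":", err.msg)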

Data Transfers to Big Query error "Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1"

Hello, I'm currently trying to establish daily data transfers from Google Cloud Storage to BigQuery tables. These tables are just meant to store raw data (JSON files), which I unnest later with scheduled queries. I have no issue when I create the table manually, but I get the error "Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1" when I launch a test transfer (even if I previously deleted the data in the table).
Here is an example of the raw data table/JSON schema:
{
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "relationships",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "createdBy",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "validationWorkflow",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "projects",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "expensesReport",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "agency",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "files",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "STRING",
"mode": "REPEATED"
}]
}
}, {
"name": "resource",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "validations",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "orders",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}]
}
}, {
"name": "attributes",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "plannedTimes",
"type": "STRING",
"mode": "REPEATED"
}, {
"name": "state",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "exceptionalTimes",
"type": "STRING",
"mode": "REPEATED"
}, {
"name": "closed",
"type": "BOOLEAN",
"mode": "NULLABLE"
}, {
"name": "informationComments",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "regularTimes",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "batch",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "title",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "workUnitType",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "name",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "activityType",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "reference",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "project",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "reference",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "row",
"type": "INTEGER",
"mode": "NULLABLE"
}, {
"name": "delivery",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "endDate",
"type": "DATE",
"mode": "NULLABLE"
}, {
"name": "startDate",
"type": "DATE",
"mode": "NULLABLE"
}, {
"name": "title",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "startDate",
"type": "DATE",
"mode": "NULLABLE"
}, {
"name": "duration",
"type": "FLOAT",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "updateDate",
"type": "TIMESTAMP",
"mode": "NULLABLE"
}, {
"name": "creationDate",
"type": "TIMESTAMP",
"mode": "NULLABLE"
}, {
"name": "absencesTimes",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "workUnitType",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "name",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "activityType",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "reference",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "duration",
"type": "FLOAT",
"mode": "NULLABLE"
}, {
"name": "startDate",
"type": "DATE",
"mode": "NULLABLE"
}]
}
}, {
"name": "term",
"type": "STRING",
"mode": "NULLABLE"
}]
}
}, {
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "date",
"type": "DATE",
"mode": "NULLABLE"
}]
}
I know that BQ works better with newline-delimited JSON, but this table must contain the raw data even if it is just one row in the end. The weirdest thing is that the transfer works for some files with similar schemas.
What should I do to make these transfers work?
Thank you for your help.
Looking at the GCP documentation, I would say the requirement to use newline-delimited JSON is a known limitation:
"JSON data must be newline delimited. Each JSON object must be on a separate line in the file."
Here is the solution: some of the fields were auto-detected as TIMESTAMP when uploading manually. However, the Data Transfer didn't recognize these fields as TIMESTAMP during the daily updates of these tables.
To solve this, I edited the schema and declared these fields as STRING.
Thank you for your help ;)
I also faced the same issue while working with Cloud Dataflow (Apache Beam) to migrate .csv data from Cloud Storage to Google BigQuery.
The error occurred during the conversion of the .csv file to .json. I resolved it by adding a few lines of code to convert the .csv rows to JSON, as shown below.
CSV file:
Gender,Math,Physics,Chemistry
male,57,50,53
male,63,66,59
male,65,56,54
CSV to JSON conversion:
Your output data must be in the format below; only then will it load into Google BigQuery:
[{"Gender":"male","Math":"57","Physics":"50","Chemistry":"53"},
{"Gender":"male","Math":"63","Physics":"66","Chemistry":"59"},
{"Gender":"male","Math":"65","Physics":"56","Chemistry":"54"}]
Python code to convert the .csv rows to JSON dicts:
import apache_beam as beam

class Csv2Json(beam.DoFn):
    """Turns already-split CSV rows into dicts keyed by the header row.

    Like the original snippet, this assumes the header row is the first
    element the DoFn sees (i.e. a single-worker / DirectRunner setup).
    """
    def __init__(self):
        self.header = None

    def process(self, csv_row):
        # csv_row is a list of column values, e.g. ['male', '57', '50', '53'].
        if self.header is None:
            # First row: remember the column names and emit nothing.
            self.header = csv_row
            return
        # Subsequent rows: pair each value with its column name, producing
        # e.g. {"Gender": "male", "Math": "57", "Physics": "50", "Chemistry": "53"}.
        yield dict(zip(self.header, csv_row))
I hope my solution works for you as well.
For reference, have a look here:
https://beam.apache.org/documentation/io/built-in/google-bigquery/#writing-to-a-table
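For context, here is roughly how a DoFn like the one above could be wired into a pipeline that writes to BigQuery. This is only a sketch: the bucket path, table, and schema are placeholders, and Csv2Json refers to the DoFn shown above.

import apache_beam as beam

with beam.Pipeline() as pipeline:
    (
        pipeline
        | "Read CSV" >> beam.io.ReadFromText("gs://my-bucket/scores.csv")  # placeholder path
        | "Split columns" >> beam.Map(lambda line: line.split(","))
        | "Rows to dicts" >> beam.ParDo(Csv2Json())
        | "Write to BigQuery" >> beam.io.WriteToBigQuery(
            "my-project:my_dataset.scores",  # placeholder table
            schema="Gender:STRING,Math:STRING,Physics:STRING,Chemistry:STRING",
            create_disposition=beam.io.BigQueryDisposition.CREATE_IF_NEEDED,
            write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND,
        )
    )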

JSON Schema Draft-07 if-then-else required field validation does not seem correct

Using Draft-07.
What I got was: valid JSON.
What I expected from the audit object was an error: directory: String length must be greater than or equal to 2.
I tried two different validators with the same results:
https://www.jsonschemavalidator.net/
GoLang: https://github.com/xeipuuv/gojsonschema
This is my schema:
{
"$schema": "http://json-schema.org/draft-07/schema#",
"title": "ISAM-Wrapper",
"description": "Validate isam wrapper json",
"type": "object",
"properties": {
"directory": {
"description": "path to location of isam file",
"type": "string",
"minLength": 2
},
"isamFile": {
"description": "isam database file",
"type": "string",
"minLength": 4
},
"isamIndex": {
"description": "isam index file",
"type": "string",
"minLength": 4
},
"port": {
"description": "port number for REST listener",
"type": "integer",
"minimum": 60410,
"maximum": 69999
},
"actions": {
"description": "Which operations are supported",
"type": "object",
"items": {
"properties": {
"create": {
"type": "boolean"
},
"read": {
"type": "boolean"
},
"update": {
"type": "boolean"
},
"delete": {
"type": "boolean"
}
}
},
"required": [
"create",
"read",
"update",
"delete"
]
},
"fields": {
"description": "each object describes one field of the isam file",
"type": "array",
"minItems": 1,
"items": {
"title": "field",
"description": "field schema",
"type": "object",
"properties": {
"name": {
"type": "string",
"minLength": 1
},
"ordinal": {
"type": "integer",
"minimum": 0
},
"offset": {
"type": "integer",
"minimum": 0
},
"length": {
"type": "integer",
"minimum": 1
},
"dataType": {
"enum": [
"uchar",
"ulong",
"long",
"uint",
"int",
"ushort",
"short"
]
}
},
"required": [
"name",
"ordinal",
"offset",
"length",
"dataType"
]
}
},
"audit": {
"description": "input needed to enable and configure isam auditing",
"type": "object",
"items": {
"properties": {
"enable": {
"enum": [
true,
false
]
},
"directory": {
"type": "string",
"minLength": 2
},
"fileName": {
"type": "string",
"minLength": 4
},
"workDirectory": {
"type": "string",
"minLength": 2
},
"archiveDirectory": {
"type": "string",
"minLength": 2
},
"interval": {
"type": "integer",
"minimum": 1
},
"byteThreshold": {
"type": "integer",
"minimum": 1048576,
"maximum": 1073741824
}
}
},
"required": [
"enable"
],
"if": {
"not": {
"properties": {
"enable": {
"enum": [
false
]
}
}
}
},
"then": {
"required": [
"directory",
"fileName",
"workDirectory",
"archiveDirectory",
"interval",
"byteThreshold"
]
}
}
},
"required": [
"directory",
"isamFile",
"isamIndex",
"port",
"actions",
"fields",
"audit"
]
}
This is my JSON
{
"directory": "./",
"isamFile": "isam.dat",
"isamIndex": "isam.idx",
"port": 60410,
"actions": {
"create": true,
"read": true,
"update": true,
"delete": true
},
"fields": [
{
"name": "F1",
"ordinal": 0,
"offset": 0,
"length": 4,
"dataType": "ulong"
},
{
"name": "F2",
"ordinal": 1,
"offset": 4,
"length": 4,
"dataType": "ulong"
}
],
"audit": {
"enable": true,
"directory": "",
"fileName": "file",
"workDirectory": "./work",
"archiveDirectory": "./archive",
"interval": 5,
"byteThreshold": 1500000
}
}
The issue is that your schema is invalid. For both actions and audit you declare objects but you don't provide any properties. What you do provide, however, is an items key (which does nothing here - items is a keyword for arrays) that contains the properties.
Once you move those properties into a properties key, the schema behaves as you intend; see https://repl.it/repls/BlankWellmadeFrontpage
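To see the intended behaviour, here is a minimal sketch using the Python jsonschema package, with the audit constraints moved out of items and into properties (only a trimmed-down subset of the fields is shown):

from jsonschema import Draft7Validator

# Trimmed-down, corrected version of the "audit" subschema: the field
# constraints live under "properties" (not "items", which only applies
# to arrays), so the validator actually checks them.
audit_schema = {
    "type": "object",
    "properties": {
        "enable": {"enum": [True, False]},
        "directory": {"type": "string", "minLength": 2},
        "fileName": {"type": "string", "minLength": 4},
    },
    "required": ["enable"],
    "if": {"not": {"properties": {"enable": {"enum": [False]}}}},
    "then": {"required": ["directory", "fileName"]},
}

sample = {"enable": True, "directory": "", "fileName": "file"}

for error in Draft7Validator(audit_schema).iter_errors(sample):
    print(error.message)
# Prints something like: '' is too short  (for the "directory" property)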

Importing nested JSON as a table with multiple levels of nesting

In our data we have JSON fields that include repeated sections, as well as arbitrarily deep nesting (the samples I have so far are quite simplistic). After seeing BQ's repeated fields and records, I decided to try restructuring the data into repeated record fields, since our use case is analytics, and then wanted to test different use cases for the data to see which approach is more efficient (time/cost/difficulty) for the analysis we intend to do. I have created a sample JSON record that I want to upload to BQ that uses all the features I think we would need (I have validated it using http://jsonlint.com/):
{
"aid": "6dQcrgMVS0",
"hour": "2016042723",
"unixTimestamp": "1461814784",
"browserId": "BdHOHp2aL9REz9dXVeKDaxdvefE3Bgn6NHZcDQKeuC67vuQ7PBIXXJda3SOu",
"experienceId": "EXJYULQOXQ05",
"experienceVersion": "1.0",
"pageRule": "V1XJW61TPI99UWR",
"userSegmentRule": "67S3YVMB7EMQ6LP",
"branch": [{
"branchId": "1",
"branchType": "userSegments",
"itemId": "userSegment67S3YVMB7EMQ6LP",
"headerId": "null",
"itemMethod": "null"
}, {
"branchId": "1",
"branchType": "userSegments",
"itemId": "userSegment67S3YVMB7EMQ6LP",
"headerId": "null",
"itemMethod": "null"
}],
"event": [{
"eventId": "546",
"eventName": "testEvent",
"eventDetails": [{
"key": "a",
"value": "1"
}, {
"key": "b",
"value": "2"
}, {
"key": "c",
"value": "3"
}]
}, {
"eventId": "547",
"eventName": "testEvent2",
"eventDetails": [{
"key": "d",
"value": "4"
}, {
"key": "e",
"value": "5"
}, {
"key": "f",
"value": "6"
}]
}]
}
I am using the BQ interface to upload this JSON into a table with the following structure:
[
{
"name": "aid",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "hour",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "unixTimestamp",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "browserId",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "experienceId",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "experienceVersion",
"type": "FLOAT",
"mode": "NULLABLE"
},
{
"name": "pageRule",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "userSegmentRule",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "branch",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "branchId",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "branchType",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "itemId",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "headerId",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "itemMethod",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "event",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "evenId",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "eventName",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "eventDetails",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "key",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
}
]
}
]
}
]
My jobs fail with:
JSON parsing error in row starting at position 0 in <file_id>. Expected key (error code: invalid)
It is possible that I can't have multiple levels of nesting in a table, but the error looks more like a problem parsing the JSON itself. I was able to generate and successfully import JSON with a simple repeated record (see example below):
{
"eventId": "546",
"eventName": "testEvent",
"eventDetails": [{
"key": "a",
"value": "1"
}, {
"key": "b",
"value": "2"
}, {
"key": "c",
"value": "3"
}]
}
Any advice is appreciated.
There doesn't seem to be anything problematic with your schema, so BigQuery should be able to load your data with your schema.
First, make sure you are uploading newline-delimited JSON to BigQuery. Your example row has many newline characters in the middle of your JSON row, and the parser is trying to interpret each line as a separate JSON row.
Second, it looks like your schema has the key "evenId" in the "event" record, but your example row has the key "eventId".
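To illustrate the first point, here is a rough sketch (using the google-cloud-bigquery Python client; the file names and table ID are placeholders) of collapsing the pretty-printed record onto a single line and loading it as newline-delimited JSON with the schema from the question:

import json
from google.cloud import bigquery

# Collapse the pretty-printed record onto one line: the JSON loader
# expects exactly one complete object per line.
with open("record.json") as src:                    # placeholder input file
    record = json.load(src)
with open("record.ndjson", "w") as dst:
    dst.write(json.dumps(record) + "\n")

client = bigquery.Client()
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON,
    schema=client.schema_from_json("schema.json"),  # the table schema saved as a file
)
with open("record.ndjson", "rb") as ndjson:
    job = client.load_table_from_file(
        ndjson,
        "my-project.my_dataset.events",             # placeholder destination table
        job_config=job_config,
    )
job.result()  # raises if the load job fails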