Importing a nested JSON as a table with multiple nesting - json

In our data we have JSON fields that include repeated sections, as well as effectively unlimited nesting (the samples I have so far are quite simple). After reading about BQ repeated fields and records, I decided to try restructuring the data into repeated record fields, since our use case is analytics, and then to test different use cases for the data to see which approach is more efficient (time/cost/difficulty) for the analysis we intend to do on it. I have created a sample JSON record that I want to upload to BQ and that uses all the features I think we will need (I validated it using http://jsonlint.com/):
{
"aid": "6dQcrgMVS0",
"hour": "2016042723",
"unixTimestamp": "1461814784",
"browserId": "BdHOHp2aL9REz9dXVeKDaxdvefE3Bgn6NHZcDQKeuC67vuQ7PBIXXJda3SOu",
"experienceId": "EXJYULQOXQ05",
"experienceVersion": "1.0",
"pageRule": "V1XJW61TPI99UWR",
"userSegmentRule": "67S3YVMB7EMQ6LP",
"branch": [{
"branchId": "1",
"branchType": "userSegments",
"itemId": "userSegment67S3YVMB7EMQ6LP",
"headerId": "null",
"itemMethod": "null"
}, {
"branchId": "1",
"branchType": "userSegments",
"itemId": "userSegment67S3YVMB7EMQ6LP",
"headerId": "null",
"itemMethod": "null"
}],
"event": [{
"eventId": "546",
"eventName": "testEvent",
"eventDetails": [{
"key": "a",
"value": "1"
}, {
"key": "b",
"value": "2"
}, {
"key": "c",
"value": "3"
}]
}, {
"eventId": "547",
"eventName": "testEvent2",
"eventDetails": [{
"key": "d",
"value": "4"
}, {
"key": "e",
"value": "5"
}, {
"key": "f",
"value": "6"
}]
}]
}
I am using the BQ web interface to upload this JSON into a table with the following structure:
[
{
"name": "aid",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "hour",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "unixTimestamp",
"type": "INTEGER",
"mode": "NULLABLE"
},
{
"name": "browserId",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "experienceId",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "experienceVersion",
"type": "FLOAT",
"mode": "NULLABLE"
},
{
"name": "pageRule",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "userSegmentRule",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "branch",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "branchId",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "branchType",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "itemId",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "headerId",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "itemMethod",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "event",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "evenId",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "eventName",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "eventDetails",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "key",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
}
]
}
]
}
]
My jobs fail with a
JSON parsing error in row starting at position 0 in <file_id>. Expected key (error code: invalid)
It is possible that I can't have multiple levels of nesting in a table, but the error reads more like a problem parsing the JSON itself. I was able to generate and successfully import a JSON file with a simple repeated record (see example below):
{
"eventId": "546",
"eventName": "testEvent",
"eventDetails": [{
"key": "a",
"value": "1"
}, {
"key": "b",
"value": "2"
}, {
"key": "c",
"value": "3"
}]
}
Any advice is appreciated.

There doesn't seem to be anything problematic with your schema, so BigQuery should be able to load your data with it.
First, make sure you are uploading newline-delimited JSON to BigQuery. Your example row has many newline characters in the middle of your JSON row, and the parser is trying to interpret each line as a separate JSON row.
Second, it looks like your schema has the key "evenId" in the "event" record, but your example row has the key "eventId".
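As for the first point, a minimal sketch (file names are placeholders) of rewriting the pretty-printed sample record as newline-delimited JSON before loading it:
import json

# Read the pretty-printed record and rewrite it as a single compact line,
# since the BigQuery JSON loader expects one JSON object per line.
with open("sample_record.json") as src:          # placeholder input path
    record = json.load(src)

with open("sample_record.ndjson", "w") as dst:   # placeholder output path
    dst.write(json.dumps(record) + "\n")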

Related

how to create nested and custom json format for the dataframe

I want to create sub-categories from the existing data frame.
The data frame columns (sample table below) need changes at the column level only, not in the data: there is a set of shared columns plus column names carrying three different prefixes (some with similar names, some not).
An example looks like
|payer_id|payer_name|halo_payer_name|delta_payer_name|halo_desc|delta_desc|halo_operations|delta_notes|halo_processed_data|delta_processed_data|extra|insurance_company|
I want the halo group to contain halo_payer_name|halo_desc|halo_operations|halo_processed_data,
the delta group to contain delta_payer_name|delta_desc|delta_notes|delta_processed_data,
and the remaining columns to form one group,
so when converted to JSON it would come out in this layout:
{
"schema": {
"fields": [{
"payer_details": [{
"name": "payer_id",
"type": "string"
},
{
"name": "payer_name",
"type": "string"
},
{
"name": "extra",
"type": "string"
},
{
"name": "insurance_company",
"type": "string"
}
]
},
{
"halo": [{
"name": "halo_payer_name",
"type": "string"
},
{
"name": "halo_desc",
"type": "string"
},
{
"name": "halo_operstions",
"type": "string"
},
{
"name": "halo_processed_data",
"type": "string"
}
]
}, {
"delta": [{
"name": "delta_payer_name",
"type": "string"
},
{
"name": "delta_desc",
"type": "string"
},
{
"name": "delta_notes",
"type": "string"
},
{
"name": "delta_processed_data",
"type": "string"
}
]
}
],
"pandas_version": "1.4.0"
},
"masterdata": [{
"payer_details": [{
"payer_id": "",
"payer_name": "",
"extra": "",
"insurance_company": ""
}],
"halo": [{
"halo_payer_name": "",
"halo_desc": "",
"halo_operations": "",
"halo_processed_data": "",
}],
"delta":[{
"delta_payer_name": "",
"delta_desc": "",
"delta_notes": "",
"delta_processed_data": "",
}]
}]
}
For this type of situation I couldn't find an existing solution, since it is column-based grouping rather than data-based grouping.
I then came across a post that helped with my situation (take data from the data frame, build the nested records in a loop, insert them into a dict, and then convert the whole thing into a JSON file).
The reference that was helpful to me is this link.
So the solution for this question goes like this:
schema={
"schema": {
"fields": [{
"payer_details": [{
"name": "payer_id",
"type": "string"
},
{
"name": "payer_name",
"type": "string"
},
{
"name": "extra",
"type": "string"
},
{
"name": "insurance_company",
"type": "string"
}
]
},
{
"halo": [{
"name": "halo_payer_name",
"type": "string"
},
{
"name": "halo_desc",
"type": "string"
},
{
"name": "halo_operstions",
"type": "string"
},
{
"name": "halo_processed_data",
"type": "string"
}
]
}, {
"delta": [{
"name": "delta_payer_name",
"type": "string"
},
{
"name": "delta_desc",
"type": "string"
},
{
"name": "delta_notes",
"type": "string"
},
{
"name": "delta_processed_data",
"type": "string"
}
]
}
],
"pandas_version": "1.4.0"
},
"masterdata": []
}
I derived the schema above as desired.
payer_list = []
for i in df.index:
    case = {
        "payer_details": [{
            "payer_id": "{}".format(df['payer_id'][i]),
            "payer_name": "{}".format(df['payer_name'][i]),
            "extra": "{}".format(df['extra'][i]),
            "insurance_company": "{}".format(df['insurance_company'][i])
        }],
        "halo": [{
            "halo_payer_name": "{}".format(df['halo_payer_name'][i]),
            "halo_desc": "{}".format(df['halo_desc'][i]),
            "halo_operations": "{}".format(df['halo_operations'][i]),
            "halo_processed_data": "{}".format(df['halo_processed_data'][i])
        }],
        "delta": [{
            "delta_payer_name": "{}".format(df['delta_payer_name'][i]),
            "delta_desc": "{}".format(df['delta_desc'][i]),
            "delta_notes": "{}".format(df['delta_notes'][i]),
            "delta_processed_data": "{}".format(df['delta_processed_data'][i])
        }]
    }
    payer_list.append(case)
schema["masterdata"] = payer_list
I created an empty list, ran the loop to append each record to it, and then attached the resulting list to the schema.
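As a small follow-up sketch, the assembled dict can then be dumped to a JSON file (the output file name is a placeholder):
import json

# Write the combined structure (schema plus masterdata) out as one JSON file.
with open("masterdata.json", "w") as out:   # placeholder output path
    json.dump(schema, out, indent=4)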

Convert Nested JSON Schema to PySpark Schema

I have a schema which has nested fields. When I try to convert it with:
import json
from pyspark.sql.types import StructType

jtopy = json.dumps(schema_message['SchemaDefinition'])  # json.dumps takes a dictionary as input and returns a string as output.
print(jtopy)
dict_json = json.loads(jtopy)  # json.loads takes a string as input and returns a dictionary as output.
print(dict_json)
new_schema = StructType.fromJson(dict_json)
print(new_schema)
It returns this error:
return StructType([StructField.fromJson(f) for f in json["fields"]])
TypeError: string indices must be integers
The schema definition described below is what I'm passing:
{
"type": "record",
"name": "tags",
"namespace": "com.tigertext.data.events.tags",
"doc": "Schema for tags association to accounts (role,etc..)",
"fields": [
{
"name": "header",
"type": {
"type": "record",
"name": "eventHeader",
"namespace": "com.tigertext.data.events",
"doc": "Metadata about the event record.",
"fields": [
{
"name": "topic",
"type": "string",
"doc": "The topic this record belongs to. e.g. messages"
},
{
"name": "server",
"type": "string",
"doc": "The server that generated this event. e.g. xmpp-07"
},
{
"name": "service",
"type": "string",
"doc": "The service that generated this event. e.g. erlang-producer"
},
{
"name": "environment",
"type": "string",
"doc": "The environment this record belongs to. e.g. dev, prod"
},
{
"name": "time",
"type": "long",
"doc": "The time in epoch this record was produced."
}
]
}
},
{
"name": "eventType",
"type": {
"type": "enum",
"name": "eventType",
"symbols": [
"CREATE",
"UPDATE",
"DELETE",
"INIT"
]
},
"doc": "event type"
},
{
"name": "tagId",
"type": "string",
"doc": "Tag ID for the tag"
},
{
"name": "orgToken",
"type": "string",
"doc": "org ID"
},
{
"name": "tagName",
"type": "string",
"doc": "name of the tag"
},
{
"name": "colorId",
"type": "string",
"doc": "color id"
},
{
"name": "colorName",
"type": "string",
"doc": "color name"
},
{
"name": "colorValue",
"type": "string",
"doc": "color value e.g. #C8C8C8"
},
{
"name": "entities",
"type": [
"null",
{
"type": "array",
"items": {
"type": "record",
"name": "entity",
"fields": [
{
"name": "entityToken",
"type": "string"
},
{
"name": "entityType",
"type": "string"
}
]
}
}
],
"default": null
}
]
}
Above is the schema of the Kafka topic that I want to parse into a PySpark schema.
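For reference, StructType.fromJson expects Spark's own JSON schema layout (the shape produced by StructType.jsonValue()) rather than an Avro record definition; a minimal sketch of that layout, reusing a couple of the field names above:
from pyspark.sql.types import StructType

# Minimal example of the dictionary shape StructType.fromJson understands.
spark_schema_dict = {
    "type": "struct",
    "fields": [
        {"name": "tagId", "type": "string", "nullable": True, "metadata": {}},
        {"name": "orgToken", "type": "string", "nullable": True, "metadata": {}},
    ],
}
new_schema = StructType.fromJson(spark_schema_dict)
print(new_schema)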

Data Transfers to Big Query error "Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1"

Hello, I'm currently trying to establish daily data transfers from Google Cloud Storage to BigQuery tables. These tables are just meant to store raw data (JSON files), and I unnest them later with scheduled queries. I have no issue when I create the table manually, but I get the error Error while reading data, error message: JSON table encountered too many errors, giving up. Rows: 1; errors: 1 when I launch a test transfer (even if I previously deleted the data in the table).
Here is an example of the raw data table/JSON schema:
{
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "relationships",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "createdBy",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "validationWorkflow",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "projects",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "expensesReport",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "agency",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "files",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "STRING",
"mode": "REPEATED"
}]
}
}, {
"name": "resource",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "validations",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}, {
"name": "orders",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "data",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}]
}
}]
}
}, {
"name": "attributes",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "plannedTimes",
"type": "STRING",
"mode": "REPEATED"
}, {
"name": "state",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "exceptionalTimes",
"type": "STRING",
"mode": "REPEATED"
}, {
"name": "closed",
"type": "BOOLEAN",
"mode": "NULLABLE"
}, {
"name": "informationComments",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "regularTimes",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "batch",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "title",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "workUnitType",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "name",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "activityType",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "reference",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "project",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "reference",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "row",
"type": "INTEGER",
"mode": "NULLABLE"
}, {
"name": "delivery",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "endDate",
"type": "DATE",
"mode": "NULLABLE"
}, {
"name": "startDate",
"type": "DATE",
"mode": "NULLABLE"
}, {
"name": "title",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "startDate",
"type": "DATE",
"mode": "NULLABLE"
}, {
"name": "duration",
"type": "FLOAT",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "updateDate",
"type": "TIMESTAMP",
"mode": "NULLABLE"
}, {
"name": "creationDate",
"type": "TIMESTAMP",
"mode": "NULLABLE"
}, {
"name": "absencesTimes",
"type": "RECORD",
"mode": "REPEATED",
"schema": {
"fields": [{
"name": "workUnitType",
"type": "RECORD",
"mode": "NULLABLE",
"schema": {
"fields": [{
"name": "name",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "activityType",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "reference",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "duration",
"type": "FLOAT",
"mode": "NULLABLE"
}, {
"name": "startDate",
"type": "DATE",
"mode": "NULLABLE"
}]
}
}, {
"name": "term",
"type": "STRING",
"mode": "NULLABLE"
}]
}
}, {
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "id",
"type": "INTEGER",
"mode": "NULLABLE"
}]
}
}, {
"name": "date",
"type": "DATE",
"mode": "NULLABLE"
}]
}
I know that BQ works better with newline-delimited JSON, but this table must contain the raw data even if it's just one row in the end. The weirdest thing is that the transfer works for some files with similar schemas.
What should I do to make these transfers work?
Thank you for your help.
Regarding the GCP documentation, I would say that the newline-delimited JSON requirement is a known limitation:
"JSON data must be newline delimited. Each JSON object must be on a separate line in the file."
Here is the solution:
Some of the fields were auto-detected as "TIMESTAMP" when uploading manually. However, Data Transfer didn't recognize these fields as "TIMESTAMP" during the daily updates of these tables.
To solve this I edited the schema and declared these fields as "STRING".
Thank you for your help ;)
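For illustration, assuming updateDate and creationDate were among the fields auto-detected as TIMESTAMP, the edited schema entries simply become:
{
"name": "updateDate",
"type": "STRING",
"mode": "NULLABLE"
}, {
"name": "creationDate",
"type": "STRING",
"mode": "NULLABLE"
}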
I also faced the same issue while working with Cloud Dataflow [Apache Beam] to migrate .csv data from Cloud Storage to Google BigQuery.
The error occurred during the conversion of the .csv file to .json. I resolved it by adding a few lines of code to convert the .csv file to a .json file.
CSV file:
Gender,Math,Physics,Chemistry
male,57,50,53
male,63,66,59
male,65,56,54
CSV to JSON conversion:
Your output data must be in the format below; only then will it load into Google BigQuery.
[ {"Gender":"male","Math":"57","Physics":"50","Chemistry":"53"},
{"Gender":"male","Math":"63","Physics":"66","Chemistry":"59"},
{"Gender":"male","Math":"65","Physics":"56","Chemistry":"54"}, ]
Python code to convert .csv to .json file
count = "start"
lst =[]
final = ""
class csv2json(beam.DoFn):
def process(self,csvFile):
global count
global header
global lst,final
min_dict={}
lst=[]
print("**************")
print(csvFile)
print(type(csvFile))
print("**************")
#Consider first row as a header
if "start" == count:
for item in range(len(csvFile)):
min_dict[csvFile[item]]=""
count = "end"
header = csvFile
#Consider rest are the value for headers.
else:
for item in range(len(csvFile)):
min_dict[header[item]]=csvFile[item]
#converting dict into list of dict
lst.append(min_dict)
final = lst
print("**************")
print(final)
print(type(final))
print("**************")
return final
Hope my solution works for you as well.
For your reference have a glance here:
https://beam.apache.org/documentation/io/built-in/google-bigquery/#writing-to-a-table
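For context, a minimal sketch of wiring such a DoFn into a pipeline that writes to BigQuery (the bucket, project, dataset, and table names are placeholders, and the schema string is only an example):
import apache_beam as beam

with beam.Pipeline() as p:
    (p
     | "Read CSV" >> beam.io.ReadFromText("gs://my-bucket/scores.csv")   # placeholder path
     | "Split rows" >> beam.Map(lambda line: line.split(","))
     | "Rows to dicts" >> beam.ParDo(csv2json())
     | "Write to BQ" >> beam.io.WriteToBigQuery(
         "my-project:my_dataset.scores",                                 # placeholder table
         schema="Gender:STRING,Math:STRING,Physics:STRING,Chemistry:STRING",
         write_disposition=beam.io.BigQueryDisposition.WRITE_APPEND))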

How to parse a dynamic JSON - Power Automate

I'm getting an HTTP response from Azure Log Analytics; the response is JSON like this:
{
"tables": [
{
"name": "PrimaryResult",
"columns": [
{
"name": "TimeGenerated",
"type": "datetime"
},
{
"name": "DestinationIP",
"type": "string"
},
{
"name": "DestinationUserName",
"type": "string"
},
{
"name": "country_name",
"type": "string"
},
{
"name": "country_iso_code",
"type": "string"
},
{
"name": "AccountCustomEntity",
"type": "string"
}
],
"rows": [
[
"2021-05-17T14:07:01.878Z",
"158.000.000.33",
"luis",
"United States",
"US",
"luis"
]
]
}
]
}
I will never get the same columns, and sometimes I will get more rows, with data like this:
{
"tables": [
{
"name": "PrimaryResult",
"columns": [
{
"name": "Account",
"type": "string"
},
{
"name": "Computer",
"type": "string"
},
{
"name": "IpAddress",
"type": "string"
},
{
"name": "AccountType",
"type": "string"
},
{
"name": "Activity",
"type": "string"
},
{
"name": "LogonTypeName",
"type": "string"
},
{
"name": "ProcessName",
"type": "string"
},
{
"name": "StartTimeUtc",
"type": "datetime"
},
{
"name": "EndTimeUtc",
"type": "datetime"
},
{
"name": "ConnectinCount",
"type": "long"
},
{
"name": "timestamp",
"type": "datetime"
},
{
"name": "AccountCustomEntity",
"type": "string"
},
{
"name": "HostCustomEntity",
"type": "string"
},
{
"name": "IPCustomEntity",
"type": "string"
}
],
"rows": [
[
"abc\\abc",
"EQ-DC02.abc.LOCAL",
"0.0.0.0",
"User",
"4624 - An account was successfully logged on.",
"10 - RemoteInteractive",
"C:\\Windows\\System32\\svchost.exe",
"2021-05-17T15:02:25.457Z",
"2021-05-17T15:02:25.457Z",
2,
"2021-05-17T15:02:25.457Z",
"abc\\abc",
"EQ-DC02.abc.LOCAL",
"0.0.0.0"
],
[
"abc\\eona",
"EQPD-SW01.abc.LOCAL",
"0.0.0.0",
"User",
"4624 - An account was successfully logged on.",
"10 - RemoteInteractive",
"C:\\Windows\\System32\\svchost.exe",
"2021-05-17T15:21:45.993Z",
"2021-05-17T15:21:45.993Z",
1,
"2021-05-17T15:21:45.993Z",
"abc\\abc",
"EQPD-SW01.abc.LOCAL",
"0.0.0.0"
]
]
}
]
}
I'm using Power Automate to parse this kind of JSON into an object or to build a response.
The question is: how can I parse these "columns" and "rows" into an object?
A similar discussion happened in the community forum, and the solution identified was:
parse the JSON, transform it to XML, and then search for keys with XPath in Flow.
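For illustration of the transformation itself (independent of Power Automate), pairing each row with the column names is enough to produce one object per row; a minimal Python sketch, where raw_body is a placeholder for the HTTP response text shown above:
import json

raw_body = '...'  # placeholder: the Log Analytics response body shown above

response = json.loads(raw_body)
table = response["tables"][0]
column_names = [col["name"] for col in table["columns"]]

# Zip the column names with each row's values to build one object per row.
records = [dict(zip(column_names, row)) for row in table["rows"]]
print(records)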

How to import nested JSON into Google BigQuery

I'm inserting JSON into Google BigQuery.
At the bottom of the question is the schema of the JSON.
Below is an example JSON:
{
"_index":"data",
"_type":"collection_v1",
"_id":"548d035f23r8987b768a5e60",
"_score":1,
"_source":{
"fullName":"Mike Smith",
"networks":[
{
"id":[
"12923449"
],
"network":"facebook",
"link":"https://www.facebook.com/127654449"
}
],
"sex":{
"network":"facebook",
"value":"male"
},
"interests":{
},
"score":1.045,
"merged_by":"548f899444v5t4v45te9a4cc"
}
}
As you can see, there's a "_source.fullName" field containing "Mike Smith".
When I try to create a table with it, it errors out with:
Array specified for non-repeated field: _source.fullName.
I believe this field occurs only once per _source. How do I overcome this error?
Here's the schema:
[
{
"name": "_index",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "_id",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "_type",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "score",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "header",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "fullName",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "src",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "avatar",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "merged_by",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "cover",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "sex",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "_source",
"type": "RECORD",
"mode": "NULLABLE",
"fields": [
{
"name": "fullName",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "links",
"type": "STRING",
"mode": "REPEATED"
},
{
"name": "birthday",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "phones",
"type": "STRING",
"mode": "REPEATED"
},
{
"name": "pictures",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "url",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tab",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "contacts",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "id",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "fullName",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tag",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "groups",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "id",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "Name",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "skills",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "relations",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "about",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "emails",
"type": "STRING",
"mode": "REPEATED"
},
{
"name": "languages",
"type": "STRING",
"mode": "REPEATED"
},
{
"name": "places",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "type",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "education",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "school",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "experience",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "start",
"type": "NUMERIC",
"mode": "NULLABLE"
},
{
"name": "company",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "title",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "networks",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "link",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "id",
"type": "STRING",
"mode": "REPEATED"
}
]
},
{
"name": "network",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "others",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tag",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "books",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tag",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "music",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tag",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "games",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tag",
"type": "STRING",
"mode": "NULLABLE"
}
]
},
{
"name": "spotify",
"type": "RECORD",
"mode": "REPEATED",
"fields": [
{
"name": "network",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "value",
"type": "STRING",
"mode": "NULLABLE"
},
{
"name": "tag",
"type": "STRING",
"mode": "NULLABLE"
}
]
}
]
}
]
}
]
You could import the full JSON row as if it were a CSV - basically a one-column BigQuery table of JSON objects. Then you can parse the JSON at will inside BigQuery, with queries like this:
WITH j AS (
SELECT """{"_index":"data","_type":"collection_v1","_id":"548d035f23r8987b768a5e60","_score":1,"_source":{"fullName":"Mike Smith","networks":[{"id":["12923449"],"network":"facebook","link":"https://www.facebook.com/127654449"}],"sex":{"network":"facebook","value":"male"},"interests":{},"score":1.045,"merged_by":"548f899444v5t4v45te9a4cc"}}""" j
)
SELECT index
, STRUCT(
JSON_EXTRACT_SCALAR(source, '$.fullName') AS fullName
, [
STRUCT(
JSON_EXTRACT_SCALAR(source, '$.networks[0].id[0]') AS id
, JSON_EXTRACT_SCALAR(source, '$.networks[0].network') AS network
, JSON_EXTRACT_SCALAR(source, '$.networks[0].link') AS link)
] AS networks
) source
FROM (
SELECT JSON_EXTRACT_SCALAR(j.j, '$._index') index
, JSON_EXTRACT(j.j, '$._source') source
FROM j
)
See:
https://medium.com/google-cloud/bigquery-lazy-data-loading-ddl-dml-partitions-and-half-a-trillion-wikipedia-pageviews-cd3eacd657b6
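As for the load step itself, a minimal sketch (table name and file path are placeholders) of loading the file as a one-column CSV with the Python client, assuming each JSON object sits on its own line and contains no tab characters:
from google.cloud import bigquery

client = bigquery.Client()
job_config = bigquery.LoadJobConfig(
    source_format=bigquery.SourceFormat.CSV,
    field_delimiter="\t",   # a delimiter assumed not to appear inside the JSON
    quote_character="",     # keep quotes literal so the JSON text isn't mangled
    schema=[bigquery.SchemaField("raw", "STRING")],
)

with open("rows.json", "rb") as f:  # placeholder: one JSON object per line
    job = client.load_table_from_file(f, "my_dataset.raw_json", job_config=job_config)
job.result()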