Migrate only specific columns of a MongoDB collection to MySQL using AWS DMS

I have a schema named reports in Mongo and a collection named totals.
The keys in it look like:
{ "_id" : { "dt" : "2018-12-02", "dt2" : "2018-04-08", "num" : 1312312312 }, "str" : 1 }
I would like to use DMS to migrate this collection into a MySQL instance on AWS. The table should look like:
create table tab(
dt date,
dt2 date,
num bigint)
Currently, I'm using DMS with a simple rule:
{
  "rules": [
    {
      "rule-type": "transformation",
      "rule-id": "1",
      "rule-name": "1",
      "rule-target": "table",
      "object-locator": {
        "schema-name": "reports",
        "table-name": "totals"
      },
      "rule-action": "rename",
      "value": "tab",
      "old-value": null
    },
    {
      "rule-type": "selection",
      "rule-id": "2",
      "rule-name": "2",
      "object-locator": {
        "schema-name": "reports",
        "table-name": "totals"
      },
      "rule-action": "include",
      "filters": []
    }
  ]
}
The result is not what I wanted:
MySQL [stats]> desc tab;
+-------+----------+------+-----+---------+-------+
| Field | Type     | Null | Key | Default | Extra |
+-------+----------+------+-----+---------+-------+
| _doc  | longtext | YES  |     | NULL    |       |
+-------+----------+------+-----+---------+-------+
MySQL [(none)]> select * from tab limit 1;
+------------------------------------------------------------------------------------------+
| _doc                                                                                      |
+------------------------------------------------------------------------------------------+
| { "_id" : { "dt" : "2018-12-02", "dt2" : "2018-04-08", "num" : 1312312312 }, "str" : 1 } |
+------------------------------------------------------------------------------------------+
1 row in set (0.00 sec)

The source endpoint needed nestingLevel=ONE; instead of nestingLevel=NONE; in its connection attributes.
Basically, this tells DMS to look at the data as a table instead of as a document.
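For reference, a minimal sketch of making that change with the AWS CLI; the endpoint ARN is a placeholder, and depending on your setup you may need to re-send the other MongoDB connection settings along with it:

# Sketch only: switch an existing MongoDB source endpoint to table mode
# (in the console this corresponds to "Table mode" rather than "Document mode").
aws dms modify-endpoint \
    --endpoint-arn arn:aws:dms:us-east-1:123456789012:endpoint:EXAMPLE \
    --mongo-db-settings '{"NestingLevel":"one"}'

After modifying the endpoint you'll generally need to restart (or reload) the migration task so it re-reads the metadata in table mode.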

Format jq output into a table

I want to fetch some data from the JSON shown below.
I'm able to get the output using the command below, but now I want to format it so that it matches the expected output.
Command:
cat dump | jq -r '["name","IP","NAT","location","method"],
  (.objects[] | [.name, ."ipv4-address", ."nat-settings"."ipv4-address", ."nat-settings"."install-on", ."nat-settings".method])
  | @csv' | sed -e 's/"//g'
After using @csv I got the output below:
name,IP,NAT,location,method
H_103.109.135.25,103.109.135.25,1.1.1.1,All,static
H_103.109.135.250,103.109.135.250,,,
and whenever I use @tsv I get "jq: error: tsv is not a valid format"
Can anyone suggest how I can achieve the expected output shown further below?
Raw JSON:
{
  "from" : 1,
  "to" : 2,
  "total" : 2,
  "objects" : [ {
    "uid" : "73b7036d-e8ec-47b7-99b5-19ca89eb5fd0",
    "name" : "H_103.109.135.25",
    "type" : "host",
    "domain" : {
      "uid" : "41e821a0-3720-11e3-aa6e-0800200c9fde",
      "name" : "SMC User",
      "domain-type" : "domain"
    },
    "ipv4-address" : "103.109.135.25",
    "interfaces" : [ ],
    "nat-settings" : {
      "auto-rule" : true,
      "ipv4-address" : "1.1.1.1",
      "ipv6-address" : "",
      "install-on" : "All",
      "method" : "static"
    },
    "comments" : "",
    "color" : "black",
    "icon" : "Objects/host",
    "tags" : [ ],
    "meta-info" : {
      "lock" : "unlocked",
      "validation-state" : "ok",
      "last-modify-time" : {
        "posix" : 1674820459413,
        "iso-8601" : "2023-01-27T17:24+0530"
      },
      "last-modifier" : "admin",
      "creation-time" : {
        "posix" : 1674818326777,
        "iso-8601" : "2023-01-27T16:48+0530"
      },
      "creator" : "admin"
    },
    "read-only" : false,
    "available-actions" : {
      "edit" : "true",
      "delete" : "true",
      "clone" : "true"
    }
  }, {
    "uid" : "7300c38a-a496-497a-b9e3-5701fa081393",
    "name" : "H_103.109.135.250",
    "type" : "host",
    "domain" : {
      "uid" : "41e821a0-3720-11e3-aa6e-0800200c9fde",
      "name" : "SMC User",
      "domain-type" : "domain"
    },
    "ipv4-address" : "103.109.135.250",
    "interfaces" : [ ],
    "nat-settings" : {
      "auto-rule" : false
    },
    "comments" : "",
    "color" : "black",
    "icon" : "Objects/host",
    "tags" : [ ],
    "meta-info" : {
      "lock" : "unlocked",
      "validation-state" : "ok",
      "last-modify-time" : {
        "posix" : 1674818341888,
        "iso-8601" : "2023-01-27T16:49+0530"
      },
      "last-modifier" : "admin",
      "creation-time" : {
        "posix" : 1674818341888,
        "iso-8601" : "2023-01-27T16:49+0530"
      },
      "creator" : "admin"
    },
    "read-only" : false,
    "available-actions" : {
      "edit" : "true",
      "delete" : "true",
      "clone" : "true"
    }
  } ]
}
Note:
It's not mandatory that the output be printed as a table using jq only; awk or sed is also fine.
I have extracted the data I need from the raw JSON above.
Extracted data:
{
  "name": "H_103.109.135.25",
  "IP": "103.109.135.25",
  "NAT": "1.1.1.1",
  "location": "All",
  "method": "static"
},
{
  "name": "H_103.109.135.250",
  "IP": "103.109.135.250",
  "NAT": "NA",
  "location": "NA",
  "method": "NA"
}
I now just need to format this data into a table like the one below, or something similar:
| name              | IP              | NAT     | location | method |
|-------------------|-----------------|---------|----------|--------|
| H_103.109.135.25  | 103.109.135.25  | 1.1.1.1 | All      | static |
| H_103.109.135.250 | 103.109.135.250 | NA      | NA       | NA     |
There is jtbl which may produce what you're looking for. If you have this in output.jq, for example:
.objects
| map(
    { name, IP: ."ipv4-address" } +
    (."nat-settings" | {
      NAT: (."ipv4-address" // "NA"),
      location: (."install-on" // "NA"),
      method: (.method // "NA")
    })
  )
then passing the data through this filter and piping it into jtbl with the -m (markdown) option, like this:
cat dump | jq -f output.jq | jtbl -m
gives this:
| name              | IP              | NAT     | location | method |
|-------------------|-----------------|---------|----------|--------|
| H_103.109.135.25  | 103.109.135.25  | 1.1.1.1 | All      | static |
| H_103.109.135.250 | 103.109.135.250 | NA      | NA       | NA     |
miller is handy for pretty-printing output.
echo 'name,IP,NAT,location,method
H_103.109.135.25,103.109.135.25,1.1.1.1,All,static
H_103.109.135.250,103.109.135.250,,,' \
| mlr --c2p --barred put 'for (i,v in $*) {if (v == "") {$[i] = "NA"}}'
--c2p is a shortcut for --icsv --opprint which reads CSV input and outputs pretty-printed tabular form.
+-------------------+-----------------+---------+----------+--------+
| name              | IP              | NAT     | location | method |
+-------------------+-----------------+---------+----------+--------+
| H_103.109.135.25  | 103.109.135.25  | 1.1.1.1 | All      | static |
| H_103.109.135.250 | 103.109.135.250 | NA      | NA       | NA     |
+-------------------+-----------------+---------+----------+--------+
The miller put verb takes an awk-like script.
See https://miller.readthedocs.io/en/latest/
A bit more functional style:
mlr --c2p --barred put '$* = apply($*, func(k,v) {return {k: v == "" ? "NA" : v}})'
I'd suggest removing the quotes and adding "NA" inside jq, and then piping the output to column:
jq -r '
  [
    ["name","IP","NAT","location","method"],
    ( .objects[]
      | {"nat-settings": {"ipv4-address": "NA", "install-on": "NA", method: "NA"}} * .
      | [.name, ."ipv4-address"] + (."nat-settings" | [."ipv4-address", ."install-on", .method])
    )
  ][] | join(",")
' dump | column -s, -t
That assumes that the "nat-settings" object is missing the "ipv4-address", etc, keys.
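For reference, run against the sample dump, that pipeline should print something like:

name               IP               NAT      location  method
H_103.109.135.25   103.109.135.25   1.1.1.1  All       static
H_103.109.135.250  103.109.135.250  NA       NA        NA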
I would recommend using jq's @tsv and the very standard tool column, e.g. as follows:
< dump jq -r '
  ["name","IP","NAT","location","method"],
  (.objects[] | [.name, ."ipv4-address", ."nat-settings"."ipv4-address", ."nat-settings"."install-on", ."nat-settings".method])
  | @tsv' | column -t

Parse nested JSON with string keys in a Splunk query

I have multiple results for a macAddress, each containing the device details.
This is the sample data:
"data": {
"a1:b2:c3:d4:11:22": {
"deviceIcons": {
"type": "Phone",
"icons": {
"3x": null,
"2x": "image.png"
}
},
"advancedDeviceId": {
"agentId": 113,
"partnerAgentId": "131",
"dhcpHostname": "Galaxy-J7",
"mac": "a1:b2:c3:d4:11:22",
"lastSeen": 12,
"model": "Android Phoe",
"id": 1
}
},
"a0:b2:c3:d4:11:22": {
"deviceIcons": {
"type": "Phone",
"icons": {
"3x": null,
"2x": "image.png"
}
},
"advancedDeviceId": {
"agentId": 113,
"partnerAgentId": "131",
"dhcpHostname": "Galaxy",
"mac": "a0:b2:c3:d4:11:22",
"lastSeen": 12,
"model": "Android Phoe",
"id": 1
}
}
}
}
How can I query in Splunk, across all results like the sample above, to get advancedDeviceId.model and advancedDeviceId.id in tabular format?
I think this will do what you want
| spath
| untable _time column value
| rex field=column "data.(?<address>[^.]+)\.advancedDeviceId\.(?<item>[^.]+)"
| table _time address item value
| eval {item}=value
| stats list(model) as model
        list(id) as id
        list(dhcpHostname) as dhcpHostname
        list(mac) as mac
        by address
Here is a "run anywhere" example that has two events each with two addresses:
| makeresults
| eval _raw="{\"data\":{\"a1:b2:c3:d4:11:21\":{\"deviceIcons\":{\"type\":\"Phone\",\"icons\":{\"3x\":null,\"2x\":\"image.png\"}},\"advancedDeviceId\":{\"agentId\":113,\"partnerAgentId\":\"131\",\"dhcpHostname\":\"Galaxy-J7\",\"mac\":\"a1:b2:c3:d4:11:21\",\"lastSeen\":12,\"model\":\"Android Phoe\",\"id\":1}},\"a0:b2:c3:d4:11:22\":{\"deviceIcons\":{\"type\":\"Phone\",\"icons\":{\"3x\":null,\"2x\":\"image.png\"}},\"advancedDeviceId\":{\"agentId\":113,\"partnerAgentId\":\"131\",\"dhcpHostname\":\"iPhone 6\",\"mac\":\"a0:b2:c3:d4:11:22\",\"lastSeen\":12,\"model\":\"Apple Phoe\",\"id\":2}}}}"
| append [
| makeresults
| eval _raw="{\"data\":{\"b1:b2:c3:d4:11:23\":{\"deviceIcons\":{\"type\":\"Phone\",\"icons\":{\"3x\":null,\"2x\":\"image.png\"}},\"advancedDeviceId\":{\"agentId\":113,\"partnerAgentId\":\"131\",\"dhcpHostname\":\"Nokia\",\"mac\":\"b1:b2:c3:d4:11:23\",\"lastSeen\":12,\"model\":\"Symbian Phoe\",\"id\":3}},\"b0:b2:c3:d4:11:24\":{\"deviceIcons\":{\"type\":\"Phone\",\"icons\":{\"3x\":null,\"2x\":\"image.png\"}},\"advancedDeviceId\":{\"agentId\":113,\"partnerAgentId\":\"131\",\"dhcpHostname\":\"Windows\",\"mac\":\"b0:b2:c3:d4:11:24\",\"lastSeen\":12,\"model\":\"Windows Phoe\",\"id\":4}}}}"
]
| spath
| untable _time column value
| rex field=column "data.(?<address>[^.]+)\.advancedDeviceId\.(?<item>[^.]+)"
| table _time address item value
| eval {item}=value
| stats list(model) as model
        list(id) as id
        list(dhcpHostname) as dhcpHostname
        list(mac) as mac
        by address
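For what it's worth, the run-anywhere example should end up with one row per address, roughly like this (exact ordering may differ):

address            model         id  dhcpHostname  mac
a0:b2:c3:d4:11:22  Apple Phoe    2   iPhone 6      a0:b2:c3:d4:11:22
a1:b2:c3:d4:11:21  Android Phoe  1   Galaxy-J7     a1:b2:c3:d4:11:21
b0:b2:c3:d4:11:24  Windows Phoe  4   Windows       b0:b2:c3:d4:11:24
b1:b2:c3:d4:11:23  Symbian Phoe  3   Nokia         b1:b2:c3:d4:11:23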

TSQL JSON_QUERY: can you use a filter in the JSON path?

I have a table with a column that holds valid JSON; here's an example of the JSON structure:
{
  "Requirements": {
    "$values": [
      {
        "$type": "List",
        "ListId": "956cf9c5-24ab-47d9-8082-940118f2f1a3",
        "DefaultValue": "",
        "MultiSelect": true,
        "Selected": null,
        "MultiSelected": {
          "$type": "ListItem",
          "$values": [
            "Value1",
            "Value2",
            "Value3"
          ]
        }
      },
      {
        "$type": "List",
        "ListId": "D11149DD-A682-4BC7-A87D-567954779234",
        "DefaultValue": "",
        "MultiSelect": true,
        "Selected": null,
        "MultiSelected": {
          "$type": "ListItem",
          "$values": [
            "Value4",
            "Value5",
            "Value6",
            "Value7"
          ]
        }
      }
    ]
  }
}
I need to return the values from the MultiSelected collection depending on the value of ListId.
I'm using the following JSON path to return the values:
$.Requirements."$values"[?(@.ListId=='956cf9c5-24ab-47d9-8082-940118f2f1a3')].MultiSelected."$values"
This worked fine in a JSON Expression tester.
But when I try to use it to query the table I get the following error:
JSON path is not properly formatted. Unexpected character '?' is found at position 25.
The query I'm using is as follows:
SELECT ID AS PayloadID,
       Items.Item AS ItemsValues
FROM dbo.Payload
CROSS APPLY ( SELECT *
              FROM OPENJSON( JSON_QUERY( Payload, '$.Requirements."$values"[?(@.ListId==''956cf9c5-24ab-47d9-8082-940118f2f1a3'')].MultiSelected."$values"' ) )
              WITH ( Item nvarchar(200) '$' ) ) AS Items
WHERE ID = 3
I've tried replacing
?(@.ListId==''956cf9c5-24ab-47d9-8082-940118f2f1a3'')
with 0 and it works fine on SQL Server.
My question is: is the filter syntax ?(...) supported in JSON_QUERY, or is there something else I should be doing?
The database is running on Azure, where the database compatibility level is set to SQL Server 2017 (140).
Thanks for your help in advance.
Andy
SQL Server's JSON path syntax doesn't support filter expressions like ?(...) (only property and array-index steps are accepted, which is why the parser stops at the '?'), so I would use OPENJSON twice instead:
drop table if exists #payload
create table #payload(ID int, Payload nvarchar(max))
insert into #payload VALUES
(3, N'
{
  "Requirements": {
    "$values": [
      {
        "$type": "List",
        "ListId": "956cf9c5-24ab-47d9-8082-940118f2f1a3",
        "DefaultValue": "",
        "MultiSelect": true,
        "Selected": null,
        "MultiSelected": {
          "$type": "ListItem",
          "$values": [
            "Value1",
            "Value2",
            "Value3"
          ]
        }
      },
      {
        "$type": "List",
        "ListId": "D11149DD-A682-4BC7-A87D-567954779234",
        "DefaultValue": "",
        "MultiSelect": true,
        "Selected": null,
        "MultiSelected": {
          "$type": "ListItem",
          "$values": [
            "Value4",
            "Value5",
            "Value6",
            "Value7"
          ]
        }
      }
    ]
  }
}'
)
SELECT ID AS PayloadID,
       Items.[value]
FROM #payload a
CROSS APPLY OPENJSON( Payload, '$.Requirements."$values"' ) WITH ( ListId varchar(50), MultiSelected nvarchar(max) AS json ) b
CROSS APPLY OPENJSON( MultiSelected, '$."$values"' ) Items
WHERE a.ID = 3
  AND b.ListId = '956cf9c5-24ab-47d9-8082-940118f2f1a3'
Result:
+-----------+--------+
| PayloadID | value  |
+-----------+--------+
| 3         | Value1 |
| 3         | Value2 |
| 3         | Value3 |
+-----------+--------+

Representing a DB schema in JSON

Let's say I have two tables in my database, employee and car defined thusly.
employee:
+--------------+------------+
| col_name     | data_type  |
+--------------+------------+
| eid          | int        |
| name         | string     |
| salary       | int        |
| destination  | string     |
+--------------+------------+
car:
+------------+----------------+
| col_name   | data_type      |
+------------+----------------+
| cid        | int            |
| name       | string         |
| model      | string         |
| cylinders  | int            |
| price      | int            |
+------------+----------------+
I would like to export this schema to a JSON object so that I can populate an HTML dropdown menu based on the table - for instance, the table menu would have employee and car. Selecting employee would populate another dropdown with the column names and types corresponding to that table.
Given this use case, would the optimal JSON representation of the database be this?
{
  "employee": {
    "salary": "int",
    "destination": "string",
    "eid": "int",
    "name": "string"
  },
  "car": {
    "price": "int",
    "model": "string",
    "cylinders": "int",
    "name": "string",
    "cid": "int"
  }
}
EDIT:
Or would this be more appropriate?
{
  "employee": [
    {
      "type": "int",
      "colname": "eid"
    },
    {
      "type": "string",
      "colname": "name"
    },
    {
      "type": "int",
      "colname": "salary"
    },
    {
      "type": "string",
      "colname": "destination"
    }
  ],
  "car": [
    {
      "type": "int",
      "colname": "cid"
    },
    {
      "type": "string",
      "colname": "name"
    },
    {
      "type": "string",
      "colname": "model"
    },
    {
      "type": "int",
      "colname": "cylinders"
    },
    {
      "type": "int",
      "colname": "price"
    }
  ]
}
In the first example, all your data is stored in objects. Assuming the structure is stored in a var mytables, you can get the table names with Object.keys(mytables), which returns ['employee', 'car']. The equivalent for the columns inside, Object.keys(mytables['employee']), returns ['salary','destination','eid','name'].
In the second example I would suggest storing the tables in an array as well, like the columns:
[ {
  name: 'employee',
  cols: [ {
    "type": "int",
    "colname": "eid"
  }, ... ]
}, ... ]
Then you can easily iterate over the arrays and get the names by accessing mytables[i].name:
for (t in mytables) {
  console.log(mytables[t].name);
  for (c in mytables[t].cols)
    console.log(" - ", mytables[t].cols[c].colname, ": ", mytables[t].cols[c].type);
}
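To close the loop on the dropdown use case, here is a minimal sketch assuming the array-based structure above and two <select> elements with hypothetical ids tableSelect and columnSelect:

// Sketch only: the element ids and the mytables variable are assumptions.
const tableSelect = document.getElementById('tableSelect');
const columnSelect = document.getElementById('columnSelect');

// Fill the first dropdown with the table names.
for (const t of mytables) {
  tableSelect.add(new Option(t.name));
}

// When a table is picked, rebuild the column dropdown from its cols array.
tableSelect.addEventListener('change', () => {
  columnSelect.length = 0; // clear the previous options
  const table = mytables.find(t => t.name === tableSelect.value);
  for (const c of table.cols) {
    columnSelect.add(new Option(c.colname + ' (' + c.type + ')', c.colname));
  }
});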

Unnesting nested JSON structures in Apache Drill

I have the following JSON (roughly) and I'd like to extract the information from the header and defects fields separately:
{
  "file": {
    "header": {
      "timeStamp": "2016-03-14T00:20:15.005+04:00",
      "serialNo": "3456",
      "sensorId": "1234567890"
    },
    "defects": [
      {
        "info": {
          "systemId": "DEFCHK123",
          "numDefects": "3",
          "defectParts": [
            "003", "006", "008"
          ]
        }
      }
    ]
  }
}
I have tried to access the individual elements with file.header.timeStamp etc but that returns null. I have tried using flatten(file) but that gives me
Cannot cast org.apache.drill.exec.vector.complex.MapVector to org.apache.drill.exec.vector.complex.RepeatedValueVector
I've looked into kvgen() but don't see how that fits in my case. I tried kvgen(file.header) but that gets me
kvgen function only supports Simple maps as input
which is what I had expected anyway.
Does anyone know how I can get header and defects so I can process the information contained in them? Ideally, I'd just select the information from header because it contains no arrays or maps, so I can take individual records as they are. For defects I'd simply use FLATTEN(defectParts) to obtain a table of the defective parts.
Any help would be appreciated.
What version of Drill are you using? I tried querying the following file on latest master (1.7.0-SNAPSHOT):
{
  "file": {
    "header": {
      "timeStamp": "2016-03-14T00:20:15.005+04:00",
      "serialNo": "3456",
      "sensorId": "1234567890"
    },
    "defects": [
      {
        "info": {
          "systemId": "DEFCHK123",
          "numDefects": "3",
          "defectParts": [
            "003", "006", "008"
          ]
        }
      }
    ]
  }
}
{
  "file": {
    "header": {
      "timeStamp": "2016-03-14T00:20:15.005+04:00",
      "serialNo": "3456",
      "sensorId": "1234567890"
    },
    "defects": [
      {
        "info": {
          "systemId": "DEFCHK123",
          "numDefects": "3",
          "defectParts": [
            "003", "006", "008"
          ]
        }
      }
    ]
  }
}
And the following queries are working fine:
1.
select t.file.header.serialno as serialno from `parts.json` t;
+-----------+
| serialno  |
+-----------+
| 3456      |
| 3456      |
+-----------+
2 rows selected (0.098 seconds)
2.
select flatten(t.file.defects) defects from `parts.json` t;
+--------------------------------------------------------------------------------------+
| defects                                                                              |
+--------------------------------------------------------------------------------------+
| {"info":{"systemId":"DEFCHK123","numDefects":"3","defectParts":["003","006","008"]}} |
| {"info":{"systemId":"DEFCHK123","numDefects":"3","defectParts":["003","006","008"]}} |
+--------------------------------------------------------------------------------------+
3.
select q.h.serialno as serialno, q.d.info.defectParts as defectParts
from (select t.file.header h, flatten(t.file.defects) d from `parts.json` t) q;
+-----------+----------------------+
| serialno  | defectParts          |
+-----------+----------------------+
| 3456      | ["003","006","008"]  |
| 3456      | ["003","006","008"]  |
+-----------+----------------------+
2 rows selected (0.126 seconds)
PS: This should've been a comment but I don't have enough rep yet!
I don't have experience with Apache Drill, but I checked the manual. Isn't this what you're looking for?
https://drill.apache.org/docs/selecting-multiple-columns-within-nested-data/
https://drill.apache.org/docs/selecting-nested-data-for-a-column/
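Going by those pages (and the working queries in the answer above, which show that dotted access into the file map works), a sketch for pulling just the header fields would be something like:

select t.file.header.timeStamp as ts,
       t.file.header.serialNo as serialno,
       t.file.header.sensorId as sensorid
from `parts.json` t;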