How to create collect_list without brackets in PySpark - JSON

I have a dataframe that I export to S3 as JSON; however, I need "_source" to be generated with only the keys, without the surrounding square brackets.
from pyspark.sql import functions as F
from pyspark.sql.functions import col

def export_data(example):
    example = example \
        .groupby(col("_id"), col("_index")) \
        .agg(F.collect_list(F.struct(
            col("colA1"),
            col("colA2"),
            col("colA3"),
            col("colA4"),
            col("colA5"),
            col("colA6"))).alias("_source"))
    return example.select(F.collect_list(F.struct(col("_id"), col("_source"))).alias("hits"))

def main():
    example_export = export_data(example)
    example_export.coalesce(1) \
        .write.format('json') \
        .mode("overwrite") \
        .save("s3://aws-example/output/EXPORTER")
What I Have:
{
"hits": [
{
"_id": "22b9d653-c07a-46da-a929-42337f0f020d",
"_source": [
{
"colA1": "202211",
"colA2": "A",
"colA3": "020160",
"colA4": "MANAUS",
"colA5": "18",
"colA6": ""
}
]
},
{
"_id": "6fe3b950-3b09-4f11-b4d7-5e8c9aab8f3e",
"_source": [
{
"colA1": "202211",
"colA2": "A",
"colA3": "21345",
"colA4": "RORAIMA",
"colA5": "10",
"colA6": ""
}
]
}
]
}
What I need:
{
"hits": [
{
"_id": "22b9d653-c07a-46da-a929-42337f0f020d",
"_source": {
"colA1": "202211",
"colA2": "A",
"colA3": "020160",
"colA4": "MANAUS",
"colA5": "18",
"colA6": ""
}
},
{
"_id": "6fe3b950-3b09-4f11-b4d7-5e8c9aab8f3e",
"_source": {
"colA1": "202211",
"colA2": "A",
"colA3": "21345",
"colA4": "RORAIMA",
"colA5": "10",
"colA6": ""
}
}
]
}
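One possible approach (just a sketch, assuming each (_id, _index) group contains exactly one row) is to aggregate with F.first instead of F.collect_list, so that _source becomes a single struct instead of an array of structs:

from pyspark.sql import functions as F
from pyspark.sql.functions import col

def export_data(example):
    example = example \
        .groupby(col("_id"), col("_index")) \
        .agg(F.first(F.struct(  # one struct per group, no array wrapper
            col("colA1"),
            col("colA2"),
            col("colA3"),
            col("colA4"),
            col("colA5"),
            col("colA6"))).alias("_source"))
    # "hits" stays a collect_list, since the desired output keeps it as an array
    return example.select(F.collect_list(F.struct(col("_id"), col("_source"))).alias("hits"))

If a group can contain several rows, F.first silently keeps only one of them, so this only matches the desired output under that one-row-per-group assumption.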

Related

jq command to add onto a map

Is there a way to use jq to add onto this type of map, i.e. append a set of map entries using keys (vm1, vm2, vm3, ...)?
Note: I have an existing vm_map {} in a JSON file and I want to add to the vm_map.
This is my new_json.json file:
{
"gcs_config": [
{
"bucket_name": "somebucket",
"bucket_readers": [],
"bucket_writers": []
}
],
"label_application": "someapp",
"label_environment": "dev",
"lits_vm_zone": "somezone",
"project_id": "someproject",
"region": "someregion",
"storage_bucket_required": true,
"vm_map" : {}
}
Expected: use jq to add onto the vm_map. I start with an empty vm_map, and each time the script runs it should add some number of new entries:
{
"gcs_config": [
{
"bucket_name": "somebucket",
"bucket_readers": [],
"bucket_writers": []
}
],
"label_application": "someapp",
"label_environment": "dev",
"lits_vm_zone": "zone-a",
"project_id": "someproject",
"region": "someregion",
"storage_bucket_required": true,
"vm_map": {
"vm1": {
"host": "vm1",
"network": "10.1.1.1",
"name": "vm1"
},
"vm2": {
"host": "123",
"network": "10.1.12",
"name": "vm2"
}
}
}
The file you describe is not valid JSON. I'm assuming you mean
{
"vm_map": {
"vm1": {
"host": "vm1",
"network": "xxxxx",
"name": "xxxxxxx"
},
"vm2": {
"host": "vm2",
"network": "xxxxx",
"name": "xxxxxxx"
}
}
}
You can use this:
jq \
--arg VMHOST "$VMHOST" \
--arg NETWORK_IP "$NETWORK_IP" \
--arg VM_NAME "$VM_NAME" \
'
.vm_map[ $VMHOST ] = {
host: $VMHOST,
network: $NETWORK_IP,
name: $VM_NAME
}
'
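For example (a sketch; the variable values and the tmp.json name are just placeholders), assuming the shell variables are set and the input above lives in new_json.json, each run could add one entry and write the file back:

VMHOST=vm1 NETWORK_IP=10.1.1.1 VM_NAME=vm1
jq \
  --arg VMHOST "$VMHOST" \
  --arg NETWORK_IP "$NETWORK_IP" \
  --arg VM_NAME "$VM_NAME" \
  '.vm_map[$VMHOST] = {host: $VMHOST, network: $NETWORK_IP, name: $VM_NAME}' \
  new_json.json > tmp.json && mv tmp.json new_json.json

Because jq does not edit files in place, writing to a temporary file and moving it back is the usual pattern; re-running with the same $VMHOST simply overwrites that entry.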

How to search a JSON hash using a Regex?

I'm using an API that returns all macros to me. I am trying to return all the "macros" where "actions" contains a "value" matching my regexp pattern, which I will link below.
I've tried the code below and other methods, but it returns nil for values that are present. Any tips appreciated.
macros["value"].select { |m| m['key'] == 'value' }.first['/^DE([0-9a-zA-Z]\s?){20}$/gm']
API result snippet:
jsObj
=> {"macros"=>
[{"url"=>"https://s/1900002708354.json",
"id"=>1900002708354,
"title"=>"Append Signature",
"active"=>true,
"updated_at"=>"2021-10-22T14:11:15Z",
"created_at"=>"2021-10-22T14:11:15Z",
"position"=>10001,
"description"=>"This macro appends a signature to the message ",
"actions"=>[{"field"=>"comment_value_html", "value"=>"<p>Mit besten Grüßen,</p><p>{{current_user.name}} [{{ticket.account}}] <br></p><p><br></p><p>{{dc.signature_email}}<br></p><p><br></p>"}],
"restriction"=>nil},
{"url"=>"949.json",
"id"=>59071949,
"title"=>"information",
"description"=>nil,
"actions"=>[{"field"=>"priority", "value"=>"low"}, {"field"=>"comment_value", "value"=>"DE89370400440532013000" "DE89 3704
0044 0532 0130 00"
"}],
"restriction"=>nil},
Desired Result:
{
"macros": [
{
"url": "x.json",
"id": 1900002708354,
"actions": [
{
"field": "comment_value_html",
"value": "DE89 3704 0044 0532 0130 00"
}
],
"restriction": null
},
{
"url": "x.json",
"id": 59071949,
"actions": [
{
"field": "priority",
"value": "low"
},
{
"field": "comment_value",
"value": "DE89 3704 0044 0532 0130 00
"
}
],
"restriction": null
},
Given that macros is the array containing the macros data (jsObj['macros'] below), you can use
macros.select { |m| m["actions"].any? { |w| /\ADE(?:[0-9a-zA-Z]\s?){20}\z/.match?(w["value"]) } }
Here is a Ruby demo:
require 'json'
j = <<-DATA
{
"macros": [
{
"url": "x.json",
"id": 1900002708354,
"actions": [
{
"field": "comment_value_html",
"value": "DE11111111111222222220"
}
],
"restriction": null
},
{
"url": "x.json",
"id": 59071949,
"actions": [
{
"field": "priority",
"value": "low"
},
{
"field": "comment_value",
"value": "DE12345678901234567890"
}
],
"restriction": null
}
]}
DATA
jsObj = JSON.parse(j)
macros = jsObj['macros']
puts jsObj['macros'].select { |m| m["actions"].any? { |w| /\ADE(?:[0-9a-zA-Z]\s?){20}\z/.match?(w["value"]) } }
Output:
{"url"=>"x.json", "id"=>1900002708354, "actions"=>[{"field"=>"comment_value_html", "value"=>"DE11111111111222222220"}], "restriction"=>nil}
{"url"=>"x.json", "id"=>59071949, "actions"=>[{"field"=>"priority", "value"=>"low"}, {"field"=>"comment_value", "value"=>"DE12345678901234567890"}], "restriction"=>nil}
The main part, .select { |m| m["actions"].any? { |w| /\ADE(?:[0-9a-zA-Z]\s?){20}\z/.match?(w["value"]) } }, keeps every macro whose actions array contains at least one entry whose value matches the given regex.
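If you need the selection back as JSON text rather than Ruby hashes (as in the desired result), a small follow-up to the demo above is to serialize it, e.g.:

puts JSON.pretty_generate(
  macros.select { |m| m["actions"].any? { |w| /\ADE(?:[0-9a-zA-Z]\s?){20}\z/.match?(w["value"]) } }
)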

How to get the required JSON output from a complex nested JSON format?

My original file is in CSV format, which I have converted to a Python JSON array and then to a JSON string.
jsonfile
<class 'list'>
<class 'dict'>
[
{
"key": "timestamp",
"source": "eia007",
"turnover": "65million",
"url": "abc.com",
"record": "",
"loc.reg": "nord000",
"loc.count": "abs39i5",
"loc.town": "cold54",
"co.gdp": "nscrt77",
"co.pop.min": "min50",
"co.pop.max": "max75",
"co.rev": "",
"chain.system": "5t5t5",
"chain.type": "765ef",
"chain.strat": "",
}
]
I would like to get the output as below:
{
"timestamp001": {
"key": "timestamp001",
"phNo": "ner007",
"turnover": "65million",
"url": "abc.com",
"record": "",
"loc": {
"reg": "nord000",
"count": "abs39i5",
"town": "cold54"
},
"co": {
"form": "nscrt77",
"pop": {
"min": "min50",
"max": "max75"
},
"rev: ""
},
"chain":{
"system": "5t5t5",
"type": "765ef",
"strat": ""
}
...
}
...
}
]
I have tried different options, including enumerating, but cannot get the required output. Please help me with this. Thanks in advance.
You can use something like this to create the nested dict:
import json

def unflatten(somedict):
    unflattened = {}
    for key, value in somedict.items():
        splitkey = key.split(".")
        print(f"doing {key} {value} {splitkey}")
        # subdict is the dict that goes deeper in the nested structure
        subdict = unflattened
        for subkey in splitkey[:-1]:
            # if this is the first time we see this key, add it
            if subkey not in subdict:
                subdict[subkey] = {}
            # shift the subdict a level deeper
            subdict = subdict[subkey]
        # add the value
        subdict[splitkey[-1]] = value
    return unflattened

data = {
    "key": "timestamp",
    "source": "eia007",
    "turnover": "65million",
    "url": "abc.com",
    "record": "",
    "loc.reg": "nord000",
    "loc.count": "abs39i5",
    "loc.town": "cold54",
    "co.gdp": "nscrt77",
    "co.pop.min": "min50",
    "co.pop.max": "max75",
    "co.rev": "",
    "chain.system": "5t5t5",
    "chain.type": "765ef",
    "chain.strat": "",
}

unflattened = unflatten(data)
print(json.dumps(unflattened, indent=4))
Which produces:
{
"key": "timestamp",
"source": "eia007",
"turnover": "65million",
"url": "abc.com",
"record": "",
"loc": {
"reg": "nord000",
"count": "abs39i5",
"town": "cold54"
},
"co": {
"gdp": "nscrt77",
"pop": {
"min": "min50",
"max": "max75"
},
"rev": ""
},
"chain": {
"system": "5t5t5",
"type": "765ef",
"strat": ""
}
}
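If you then want the whole result keyed by each record's "key" value (as in the desired output), a minimal sketch, assuming your converted records sit in a list (the jsonfile list shown above), would be:

records = [data]  # in practice, the full list parsed from your converted CSV
nested = {rec["key"]: unflatten(rec) for rec in records}
print(json.dumps(nested, indent=4))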
Cheers!

Reconstructing JSON with jq

I have a JSON like this (sample.json):
{
"sheet1": [
{
"hostname": "sv001",
"role": "web",
"ip1": "172.17.0.3"
},
{
"hostname": "sv002",
"role": "web",
"ip1": "172.17.0.4"
},
{
"hostname": "sv003",
"role": "db",
"ip1": "172.17.0.5",
"ip2": "172.18.0.5"
}
],
"sheet2": [
{
"hostname": "sv004",
"role": "web",
"ip1": "172.17.0.6"
},
{
"hostname": "sv005",
"role": "db",
"ip1": "172.17.0.7"
},
{
"hostname": "vsv006",
"role": "db",
"ip1": "172.17.0.8"
}
],
"sheet3": []
}
I want to extract data like this:
sheet1
jq '(something command)' sample.json
{
"web": {
"hosts": [
"172.17.0.3",
"172.17.0.4"
]
},
"db": {
"hosts": [
"172.17.0.5"
]
}
}
Is it possible to perform the reconstruction with jq map?
(I will reuse the result for ansible inventory.)
Here's a short, straightforward and efficient solution -- efficient in part because it avoids group_by, courtesy of the following generic helper function:
def add_by(f;g): reduce .[] as $x ({}; .[$x|f] += [$x|g]);
.sheet1
| add_by(.role; .ip1)
| map_values( {hosts: .} )
Output
This produces the required output:
{
"web": {
"hosts": [
"172.17.0.3",
"172.17.0.4"
]
},
"db": {
"hosts": [
"172.17.0.5"
]
}
}
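To run this directly against sample.json, the helper and the filter go into one invocation, e.g.:

jq 'def add_by(f;g): reduce .[] as $x ({}; .[$x|f] += [$x|g]);
    .sheet1
    | add_by(.role; .ip1)
    | map_values( {hosts: .} )' sample.json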
If the goal is to regroup the IPs by their roles within each sheet, you could do this:
map_values(
reduce group_by(.role)[] as $g ({};
.[$g[0].role].hosts = [$g[] | del(.hostname, .role)[]]
)
)
Which produces something like this:
{
"sheet1": {
"db": {
"hosts": [
"172.17.0.5",
"172.18.0.5"
]
},
"web": {
"hosts": [
"172.17.0.3",
"172.17.0.4"
]
}
},
"sheet2": {
"db": {
"hosts": [
"172.17.0.7",
"172.17.0.8"
]
},
"web": {
"hosts": [
"172.17.0.6"
]
}
},
"sheet3": {}
}
https://jqplay.org/s/3VpRc5l4_m
If you want to flatten everything into a single object, keeping only unique IPs, you can keep everything mostly the same; you just need to flatten the inputs prior to grouping and remove the map_values/1 call.
$ jq -n '
reduce ([inputs[][]] | group_by(.role)[]) as $g ({};
.[$g[0].role].hosts = ([$g[] | del(.hostname, .role)[]] | unique)
)
' sample.json
{
"db": {
"hosts": [
"172.17.0.5",
"172.17.0.7",
"172.17.0.8",
"172.18.0.5"
]
},
"web": {
"hosts": [
"172.17.0.3",
"172.17.0.4",
"172.17.0.6"
]
}
}
https://jqplay.org/s/ZGj1wC8hU3

How to access DynamoDB's original JSON elements?

I am trying to test my Lambda manually with the following DynamoDB event input configured in tests.
Let's call this Json-1:
{
"Records": [
{
"eventID": "1",
"eventVersion": "1.0",
"dynamodb": {
"Keys": {
"Id": {
"N": "101"
}
},
"NewImage": {
"Message": {
"S": "New item!"
},
"Id": {
"N": "101"
}
},
"StreamViewType": "NEW_AND_OLD_IMAGES",
"SequenceNumber": "111",
"SizeBytes": 26
},
"awsRegion": "us-west-2",
"eventName": "INSERT",
"eventSourceARN": eventsourcearn,
"eventSource": "aws:dynamodb"
},
{
"eventID": "2",
"eventVersion": "1.0",
"dynamodb": {
"OldImage": {
"Message": {
"S": "New item!"
},
"Id": {
"N": "101"
}
},
"SequenceNumber": "222",
"Keys": {
"Id": {
"N": "101"
}
},
"SizeBytes": 59,
"NewImage": {
"Message": {
"S": "This item has changed"
},
"Id": {
"N": "101"
}
},
"StreamViewType": "NEW_AND_OLD_IMAGES"
},
"awsRegion": "us-west-2",
"eventName": "MODIFY",
"eventSourceARN": sourcearn,
"eventSource": "aws:dynamodb"
},
{
"eventID": "3",
"eventVersion": "1.0",
"dynamodb": {
"Keys": {
"Id": {
"N": "101"
}
},
"SizeBytes": 38,
"SequenceNumber": "333",
"OldImage": {
"Message": {
"S": "This item has changed"
},
"Id": {
"N": "101"
}
},
"StreamViewType": "NEW_AND_OLD_IMAGES"
},
"awsRegion": "us-west-2",
"eventName": "REMOVE",
"eventSourceARN": sourcearn,
"eventSource": "aws:dynamodb"
}
]
}
However, the JSON of the DynamoDB items looks like this.
Let's call this Json-2:
{
"id": {
"S": "RIGHT-aa465568-f4c8-4822-9c38-7563ae0cd37b-1131286033464633.jpg"
},
"lines": {
"L": [
{
"M": {
"points": {
"L": [
{
"L": [
{
"N": "0"
},
{
"N": "874.5625"
}
]
},
{
"L": [
{
"N": "1765.320601851852"
},
{
"N": "809.7800925925926"
}
]
},
{
"L": [
{
"N": "3264"
},
{
"N": "740.3703703703704"
}
]
}
]
},
"type": {
"S": "guard"
}
}
}
]
},
"modified": {
"N": "1483483932472"
},
"qastatus": {
"S": "reviewed"
}
}
Using the Lambda function below, I can connect to my table. My goal is to create a JSON document which Elasticsearch will accept.
@Override
public Object handleRequest(DynamodbEvent dynamodbEvent, Context context) {
    List<DynamodbEvent.DynamodbStreamRecord> dynamodbStreamRecordlist = dynamodbEvent.getRecords();
    DynamoDB dynamoDB = new DynamoDB(new AmazonDynamoDBClient());
    log.info("Whole event - " + dynamodbEvent.toString());
    dynamodbStreamRecordlist.stream().forEach(dynamodbStreamRecord -> {
        if (dynamodbStreamRecord.getEventSource().equalsIgnoreCase("aws:dynamodb")) {
            log.info("one record - " + dynamodbStreamRecord.getDynamodb().toString());
            log.info(" getting N from new image " + dynamodbStreamRecord.getDynamodb().getNewImage().toString());
            String tableName = getTableNameFromARN(dynamodbStreamRecord.getEventSourceARN());
            log.info("Table name :" + tableName);
            Map<String, AttributeValue> keys = dynamodbStreamRecord.getDynamodb().getKeys();
            log.info(keys.toString());
            AttributeValue attributeValue = keys.get("Id");
            log.info("Value of N: " + attributeValue.getN());
            Table table = dynamoDB.getTable(tableName);
        }
    });
    return dynamodbEvent;
}
The format of a JSON item that Elasticsearch expects is shown below, and this is what I want to map the test input JSON to.
Let's call this Json-3:
{
_index: "bar-guard",
_type: "bar-guard_type",
_id: "LEFT-b1939610-442f-4d8d-9991-3ca54685b206-1147042497459511.jpg",
_score: 1,
_source: {
#SequenceNumber: "4901800000000019495704485",
#timestamp: "2017-01-04T02:24:20.560358",
lines: [{
points: [[0,
1222.7129629629628],
[2242.8252314814818,
1254.702546296296],
[4000.0000000000005,
1276.028935185185]],
type: "barr"
}],
modified: 1483483934697,
qastatus: "reviewed",
id: "LEFT-b1939610-442f-4d8d-9991-3ca54685b206-1147042497459511.jpg"
}
},
So what I need is to read Json-1 and map it to Json-3.
However, Json-1 does not seem to be complete, i.e. it does not have the information that a DynamoDB JSON has - like points and lines in Json-2.
And so, I was trying to get a connection to the original table and then read this additional information about lines and points by using the Id.
I am not sure if this is the right approach. Basically, I want to figure out a way to get the actual JSON that DynamoDB has, and not the one that carries attribute types.
How can I get lines and points from Json-2 using Java? I know we have DocumentClient in JavaScript, but I am looking for something in Java.
Also, I came across a converter here, but it doesn't help me - https://github.com/aws/aws-sdk-js/blob/master/lib/dynamodb/converter.js
Is this something that I should use DynamoDBMapper or the Scan Java Document API for?
http://docs.aws.amazon.com/AWSJavaSDK/latest/javadoc/com/amazonaws/services/dynamodbv2/datamodeling/DynamoDBMapper.html#marshallIntoObjects-java.lang.Class-java.util.List-com.amazonaws.services.dynamodbv2.datamodeling.DynamoDBMapperConfig-
If yes, I am a little lost on how to do that in the code below -
ScanRequest scanRequest = new ScanRequest().withTableName(tableName);
ScanResult result = dynamoDBClient.scan(scanRequest);
for (Map<String, AttributeValue> item : result.getItems()) {
    AttributeValue value = item.get("lines");
    if (value != null) {
        List<AttributeValue> values = value.getL();
        for (AttributeValue value2 : values) {
            // what next?
        }
    }
}
Ok, this seems to work for me.
ScanRequest scanRequest = new ScanRequest().withTableName(tableName);
ScanResult result = dynamoDBClient.scan(scanRequest);
for (Map<String, AttributeValue> item : result.getItems()) {
    AttributeValue value = item.get("lines");
    if (value != null) {
        List<AttributeValue> values = value.getL();
        for (AttributeValue value2 : values) {
            if (value2.getM() != null) {
                Map<String, AttributeValue> map = value2.getM();
                AttributeValue points = map.get("points");
                List<AttributeValue> pointsvalues = points.getL();
                if (!pointsvalues.isEmpty()) {
                    for (AttributeValue valueOfPoint : pointsvalues) {
                        List<AttributeValue> pointList = valueOfPoint.getL();
                        for (AttributeValue valueOfPoint2 : pointList) {
                            // valueOfPoint2.getN() holds each coordinate as a string
                        }
                    }
                }
            }
        }
    }
}
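A shorter alternative (just a sketch, assuming your project uses a version of the AWS SDK for Java v1 that ships com.amazonaws.services.dynamodbv2.document.ItemUtils) would be to convert each attribute-value map into a document-style Item and serialize it straight to plain JSON, instead of unwrapping every attribute by hand:

import com.amazonaws.services.dynamodbv2.document.Item;
import com.amazonaws.services.dynamodbv2.document.ItemUtils;

ScanRequest scanRequest = new ScanRequest().withTableName(tableName);
ScanResult result = dynamoDBClient.scan(scanRequest);
for (Map<String, AttributeValue> item : result.getItems()) {
    // ItemUtils.toItem strips the {"S": ...}/{"N": ...} type wrappers
    Item plain = ItemUtils.toItem(item);
    // plain.toJSON() yields ordinary JSON, which can then be reshaped and sent to Elasticsearch
    String json = plain.toJSON();
    log.info(json);
}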