I have a table "MY_TABLE" with one column "VALUE", and the first row of that column contains JSON that looks like:
{
"VALUE": {
"c1": "name",
"c10": "age",
"c100": "gender",
"c101": "address",
"c102": "status"
}
}
I would like to add a new key-value pair, "c125": "job", to this JSON in the first row, so that the result looks like:
{
"VALUE": {
"c1": "name",
"c10": "age",
"c100": "gender",
"c101": "address",
"c102": "status",
"c125": "job"
}
}
I tried:
SELECT object_insert(OBJECT_CONSTRUCT(*),'c125', 'job') FROM MY_TABLE;
But it inserted the new key-value pair in the wrong spot, so the result looks like:
{
"VALUE": {
"c1": "name",
"c10": "age",
"c100": "gender",
"c101": "address",
"c102": "status"
},
"c125": "job"
}
Is there another way to do this? Thanks!
Another, similar approach, using nested OBJECT_INSERT calls -
For the original table (assuming the column data type is VARIANT; otherwise apply the PARSE_JSON function first) -
select * from temp_1;
+------------------------+
| COL1 |
|------------------------|
| { |
| "VALUE": { |
| "c1": "name", |
| "c10": "age", |
| "c100": "gender", |
| "c101": "address", |
| "c102": "status" |
| } |
| } |
+------------------------+
Query with the added key ("c31": 101) as output - the inner OBJECT_INSERT adds the key inside the nested object, and the outer OBJECT_INSERT (with its update flag set to TRUE) overwrites the existing "VALUE" key with the rebuilt object:
select
object_insert(col1,'VALUE',object_insert(col1:VALUE,'c31',101),TRUE)
as output_col from temp_1;
+------------------------+
| OUTPUT_COL |
|------------------------|
| { |
| "VALUE": { |
| "c1": "name", |
| "c10": "age", |
| "c100": "gender", |
| "c101": "address", |
| "c102": "status", |
| "c31": 101 |
| } |
| } |
+------------------------+
The same expression used in an UPDATE (it can also be predicated on another column acting as a key; see the sketch after the output below) -
update temp_1 set col1 = object_insert(col1,'VALUE',object_insert(col1:VALUE,'c31',101),TRUE);
After update -
select * from temp_1;
+------------------------+
| COL1 |
|------------------------|
| { |
| "VALUE": { |
| "c1": "name", |
| "c10": "age", |
| "c100": "gender", |
| "c101": "address", |
| "c102": "status", |
| "c31": 101 |
| } |
| } |
+------------------------+
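As mentioned above, the UPDATE can be predicated so that only specific rows are touched - a minimal sketch, assuming a hypothetical ID column on temp_1:
update temp_1
set col1 = object_insert(col1,'VALUE',object_insert(col1:VALUE,'c31',101),TRUE)
where id = 1;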
One approach could be to flatten the object first and then construct it again:
CREATE TABLE MY_TABLE
AS
SELECT PARSE_JSON('{
"VALUE": {
"c1": "name",
"c10": "age",
"c100": "gender",
"c101": "address",
"c102": "status"
}
}') AS VALUE;
SELECT * FROM MY_TABLE;
Before: the single row contains the original JSON shown at the top of the question.
Query:
WITH cte(key, value) AS (
SELECT 'c125', 'job'::VARIANT
UNION ALL
SELECT s.key, s.value
FROM MY_TABLE
,TABLE(FLATTEN (input => VALUE, path => 'VALUE')) s
)
SELECT OBJECT_CONSTRUCT('VALUE', OBJECT_AGG(key, value))
FROM cte;
Output:
{
"VALUE": {
"c1": "name",
"c10": "age",
"c100": "gender",
"c101": "address",
"c102": "status",
"c125": "job"
}
}
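As a minimal, self-contained illustration of the FLATTEN/OBJECT_AGG round trip used above (the literal object here is made up):
SELECT OBJECT_AGG(f.key, f.value) AS rebuilt
FROM TABLE(FLATTEN(input => PARSE_JSON('{"a": 1, "b": 2}'))) f;
-- rebuilt: {"a": 1, "b": 2}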
I have JSON data that is a multi-line JSON record. I have created a Hive table to load that data into. I also have another JSON file that is a single-line JSON record. When I load the single-line JSON record into its Hive table and query it, it works fine. But when I load the multi-line JSON into its Hive table, it gives the exception below:
Failed with exception java.io.IOException:org.apache.hadoop.hive.serde2.SerDeException: org.codehaus.jackson.JsonParseException: Unexpected end-of-input: expected close marker for OBJECT (from [Source: java.io.ByteArrayInputStream@8b89b3a; line: 1, column: 0]) at [Source: java.io.ByteArrayInputStream@8b89b3a; line: 1, column: 3]
Below is my JSON data:
{
"uploadTimeStamp" : "1486631318873",
"PDID" : "123",
"data" : [ {
"Data" : {
"unit" : "rpm",
"value" : "0"
},
"EventID" : "E1",
"PDID" : "123",
"Timestamp" : 1486631318873,
"Timezone" : 330,
"Version" : "1.0",
"pii" : { }
}, {
"Data" : {
"heading" : "N",
"loc3" : "false",
"loc" : "14.022425",
"loc1" : "78.760587",
"loc4" : "false",
"speed" : "10"
},
"EventID" : "E2",
"PDID" : "123",
"Timestamp" : 1486631318873,
"Timezone" : 330,
"Version" : "1.1",
"pii" : { }
}, {
"Data" : {
"x" : "1.1",
"y" : "1.2",
"z" : "2.2"
},
"EventID" : "E3",
"PDID" : "123",
"Timestamp" : 1486631318873,
"Timezone" : 330,
"Version" : "1.0",
"pii" : { }
}, {
"EventID" : "E4",
"Data" : {
"value" : "50",
"unit" : "percentage"
},
"Version" : "1.0",
"Timestamp" : 1486631318873,
"PDID" : "123",
"Timezone" : 330
}, {
"Data" : {
"unit" : "kmph",
"value" : "70"
},
"EventID" : "E5",
"PDID" : "123",
"Timestamp" : 1486631318873,
"Timezone" : 330,
"Version" : "1.0",
"pii" : { }
} ]
}
I am using /hive/lib/hive-hcatalog-core-0.13.0.jar
Below is my create table statement:
create table test7(
uploadtime bigint,
pdid string,
data array<
struct<Data:struct<
unit:string,
value:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
heading:string,
Location:string,
latitude:bigint,
longitude:bigint,
Location2:string,
speed:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
unit:string,
value:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
x:int,
y:int,
z:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
heading:string,
loc3:string,
latitude:bigint,
longitude:bigint,
loc4:string,
speed:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>
>
)
ROW FORMAT SERDE
'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
LOCATION
'/xyz/abc/';
Edit:
Adding the single-line JSON and the new CREATE TABLE statement with its error:
{"uploadTimeStamp":"1487183800905","PDID":"123","data":[{"Data":{"unit":"rpm","value":"0"},"EventID":"event1","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.0","pii":{}},{"Data":{"heading":"N","loc1":"false","latitude":"16.032425","longitude":"80.770587","loc2":"false","speed":"10"},"EventID":"event2","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.1","pii":{}},{"Data":{"x":"1.1","y":"1.2","z":"2.2"},"event3":"AccelerometerInfo","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.0","pii":{}},{"EventID":"event4","Data":{"value":"50","unit":"percentage"},"Version":"1.0","Timestamp":1487183800905,"PDID":"123","Timezone":330},{"Data":{"unit":"kmph","value":"70"},"EventID":"event5","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.0","pii":{}}]}
create table test1(
uploadTimeStamp string,
PDID string,
data array<struct<
Data:struct<unit:string,value:int>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>,
struct<
Data:struct<heading:string,loc1:string,latitude:double,longitude:double,loc2:string,speed:int>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>,
struct<
Data:struct<x:float,y:float,z:float>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>,
struct<
EventID:string,
Data:struct<value:int,unit:percentage>,
Version:float,
TimeS:bigint,
PDID:string,
Timezone:int>,
struct<
Data:struct<unit:string,value:int>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>
>
ROW FORMAT SERDE
'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
LOCATION
'/ABC/XYZ/';
MismatchedTokenException(320!=313)
...
...
...
FAILED: ParseException line 11:10 mismatched input '<>' expecting < near 'struct' in struct type
Sample data
{"uploadTimeStamp":"1486631318873","PDID":"123","data":[{"Data":{"unit":"rpm","value":"0"},"EventID":"E1","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.0","pii":{}},{"Data":{"heading":"N","loc3":"false","loc":"14.022425","loc1":"78.760587","loc4":"false","speed":"10"},"EventID":"E2","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.1","pii":{}},{"Data":{"x":"1.1","y":"1.2","z":"2.2"},"EventID":"E3","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.0","pii":{}},{"EventID":"E4","Data":{"value":"50","unit":"percentage"},"Version":"1.0","Timestamp":1486631318873,"PDID":"123","Timezone":330},{"Data":{"unit":"kmph","value":"70"},"EventID":"E5","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.0","pii":{}}]}
add jar /usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar
create external table myjson
(
uploadTimeStamp string
,PDID string
,data array
<
struct
<
Data:struct
<
unit:string
,value:string
,heading:string
,loc3:string
,loc:string
,loc1:string
,loc4:string
,speed:string
,x:string
,y:string
,z:string
>
,EventID:string
,PDID:string
,`Timestamp`:bigint
,Timezone:smallint
,Version:string
,pii:struct<dummy:string>
>
>
)
row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
stored as textfile
location '/tmp/myjson'
;
select * from myjson
;
+------------------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| myjson.uploadtimestamp | myjson.pdid | myjson.data |
+------------------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 1486631318873 | 123 | [{"data":{"unit":"rpm","value":"0","heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":null,"y":null,"z":null},"eventid":"E1","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":{"dummy":null}},{"data":{"unit":null,"value":null,"heading":"N","loc3":"false","loc":"14.022425","loc1":"78.760587","loc4":"false","speed":"10","x":null,"y":null,"z":null},"eventid":"E2","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.1","pii":{"dummy":null}},{"data":{"unit":null,"value":null,"heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":"1.1","y":"1.2","z":"2.2"},"eventid":"E3","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":{"dummy":null}},{"data":{"unit":"percentage","value":"50","heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":null,"y":null,"z":null},"eventid":"E4","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":null},{"data":{"unit":"kmph","value":"70","heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":null,"y":null,"z":null},"eventid":"E5","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":{"dummy":null}}] |
+------------------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
select j.uploadTimeStamp
,j.PDID
,d.val.EventID
,d.val.PDID
,d.val.`Timestamp`
,d.val.Timezone
,d.val.Version
,d.val.Data.unit
,d.val.Data.value
,d.val.Data.heading
,d.val.Data.loc3
,d.val.Data.loc
,d.val.Data.loc1
,d.val.Data.loc4
,d.val.Data.speed
,d.val.Data.x
,d.val.Data.y
,d.val.Data.z
from myjson j
lateral view explode (data) d as val
;
+-------------------+--------+---------+------+---------------+----------+---------+------------+-------+---------+-------+-----------+-----------+-------+-------+------+------+------+
| j.uploadtimestamp | j.pdid | eventid | pdid | timestamp | timezone | version | unit | value | heading | loc3 | loc | loc1 | loc4 | speed | x | y | z |
+-------------------+--------+---------+------+---------------+----------+---------+------------+-------+---------+-------+-----------+-----------+-------+-------+------+------+------+
| 1486631318873 | 123 | E1 | 123 | 1486631318873 | 330 | 1.0 | rpm | 0 | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL |
| 1486631318873 | 123 | E2 | 123 | 1486631318873 | 330 | 1.1 | NULL | NULL | N | false | 14.022425 | 78.760587 | false | 10 | NULL | NULL | NULL |
| 1486631318873 | 123 | E3 | 123 | 1486631318873 | 330 | 1.0 | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | 1.1 | 1.2 | 2.2 |
| 1486631318873 | 123 | E4 | 123 | 1486631318873 | 330 | 1.0 | percentage | 50 | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL |
| 1486631318873 | 123 | E5 | 123 | 1486631318873 | 330 | 1.0 | kmph | 70 | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL |
+-------------------+--------+---------+------+---------------+----------+---------+------------+-------+---------+-------+-----------+-----------+-------+-------+------+------+------+
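If only one event type is needed, the exploded rows can be filtered directly - a sketch restricted to E1 (column list shortened for brevity):
select j.uploadTimeStamp
,d.val.Data.unit
,d.val.Data.value
from myjson j
lateral view explode (data) d as val
where d.val.EventID = 'E1'
;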
I was having the same issue, so I decided to create a custom input format that can extract multi-line (pretty-printed) JSON records.
This JsonRecordReader can read a multi-line JSON record in Hive. It extracts each record by balancing curly braces, { and }, so the content from the first '{' to the matching closing '}' is treated as one complete record. Below is the code snippet:
// required imports at the top of the enclosing file: java.io.IOException,
// java.util.concurrent.atomic.AtomicInteger, org.apache.hadoop.fs.FSDataInputStream,
// org.apache.hadoop.fs.FileSystem, org.apache.hadoop.fs.Path,
// org.apache.hadoop.io.DataOutputBuffer, org.apache.hadoop.io.LongWritable,
// org.apache.hadoop.io.Text, org.apache.hadoop.mapred.FileSplit,
// org.apache.hadoop.mapred.JobConf, org.apache.hadoop.mapred.RecordReader
public static class JsonRecordReader implements RecordReader<LongWritable, Text> {
public static final String START_TAG_KEY = "jsoninput.start";
public static final String END_TAG_KEY = "jsoninput.end";
private byte[] startTag = "{".getBytes();
private byte[] endTag = "}".getBytes();
private long start;
private long end;
private FSDataInputStream fsin;
private final DataOutputBuffer buffer = new DataOutputBuffer();
public JsonRecordReader(FileSplit split, JobConf jobConf) throws IOException {
// uncomment the below lines if you need to get the configuration
// from JobConf:
// startTag = jobConf.get(START_TAG_KEY).getBytes("utf-8");
// endTag = jobConf.get(END_TAG_KEY).getBytes("utf-8");
// open the file and seek to the start of the split:
start = split.getStart();
end = start + split.getLength();
Path file = split.getPath();
FileSystem fs = file.getFileSystem(jobConf);
fsin = fs.open(split.getPath());
fsin.seek(start);
}
@Override
public boolean next(LongWritable key, Text value) throws IOException {
if (fsin.getPos() < end) {
AtomicInteger count = new AtomicInteger(0);
if (readUntilMatch(false, count)) {
try {
buffer.write(startTag);
if (readUntilMatch(true, count)) {
key.set(fsin.getPos());
// create json record from buffer:
String jsonRecord = new String(buffer.getData(), 0, buffer.getLength());
value.set(jsonRecord);
return true;
}
} finally {
buffer.reset();
}
}
}
return false;
}
@Override
public LongWritable createKey() {
return new LongWritable();
}
@Override
public Text createValue() {
return new Text();
}
@Override
public long getPos() throws IOException {
return fsin.getPos();
}
@Override
public void close() throws IOException {
fsin.close();
}
@Override
public float getProgress() throws IOException {
return ((fsin.getPos() - start) / (float) (end - start));
}
private boolean readUntilMatch(boolean withinBlock, AtomicInteger count) throws IOException {
while (true) {
int b = fsin.read();
// end of file:
if (b == -1)
return false;
// save to buffer:
if (withinBlock)
buffer.write(b);
// check if we're matching start/end tag:
if (b == startTag[0]) {
count.incrementAndGet();
if (!withinBlock) {
return true;
}
} else if (b == endTag[0]) {
count.getAndDecrement();
if (count.get() == 0) {
return true;
}
}
// see if we've passed the stop point:
if (!withinBlock && count.get() == 0 && fsin.getPos() >= end)
return false;
}
}
}
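The RecordReader above is a static inner class; to use it from Hive it has to be wrapped in an InputFormat for the old mapred API. A minimal sketch of the enclosing class (the complete version is in the repo linked below; the class name JsonInputFormat matches the one used in the CREATE TABLE statement, and FileInputFormat, InputSplit, and Reporter come from org.apache.hadoop.mapred):
public class JsonInputFormat extends FileInputFormat<LongWritable, Text> {
    @Override
    public RecordReader<LongWritable, Text> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
            throws IOException {
        // hand each file split to the brace-balancing JsonRecordReader defined above
        return new JsonRecordReader((FileSplit) split, job);
    }
}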
This input format can be used along with the JSON SerDe supplied by Hive to read multi-line JSON files.
CREATE TABLE books (
id string,
bookname string,
properties struct<subscription:string, unit:string>
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS
INPUTFORMAT 'JsonInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
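Note that the jar containing the compiled input format has to be added to the Hive session first, just like the hive-hcatalog jar earlier (the path here is an assumption):
add jar /path/to/custom-json-inputformat.jar;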
The working code with samples is here: https://github.com/unayakdev/hive-json