I have these tables:
CREATE TABLE Progress_Category (
categoryId int(4) AUTO_INCREMENT NOT NULL,
catname varchar(150) NOT NULL,
PRIMARY KEY (categoryId)
);
CREATE TABLE Progress_Skill (
skillId int(4) AUTO_INCREMENT NOT NULL,
skillname varchar(150) NOT NULL,
currentProgress int NOT NULL,
`25` varchar(300) NOT NULL,
`50` varchar(300) NOT NULL,
`75` varchar(300) NOT NULL,
`100` varchar(300) NOT NULL,
catParentId int(4) NOT NULL,
PRIMARY KEY (skillId),
CONSTRAINT Constr_Progress_Skill_Skill_fk FOREIGN KEY Skill_fk (catParentId) REFERENCES Progress_Category(categoryId) ON DELETE CASCADE ON UPDATE CASCADE
);
CREATE TABLE Progress_Message (
messageId int(4) AUTO_INCREMENT NOT NULL,
message varchar(500) NOT NULL,
messageDate DATE NOT NULL,
skillParentId int(4) NOT NULL,
PRIMARY KEY (messageId),
CONSTRAINT Constr_Progress_Message_Message_fk FOREIGN KEY Message_fk (skillParentId) REFERENCES Progress_Skill(skillId) ON DELETE CASCADE ON UPDATE CASCADE
);
I have this query to retrieve all the data from those tables:
SELECT *
FROM Progress_Category AS pcat
LEFT JOIN Progress_Skill AS ps
ON pcat.categoryId = ps.catParentId
LEFT JOIN Progress_Message AS pm
ON ps.skillId = pm.skillParentId
For each skill in a category, the join produces an extra row repeating the category alongside the respective skill. For each message of a skill, it produces another row repeating both the category and the skill alongside the respective message.
Query result:
+------------+-----------+---------+-----------+-----------------+------+-------+--------+-------+-------------+-----------+-------------------------+-------------+---------------+
| categoryId | catname | skillId | skillname | currentProgress | 25 | 50 | 75 | 100 | catParentId | messageId | message | messageDate | skillParentId |
+------------+-----------+---------+-----------+-----------------+------+-------+--------+-------+-------------+-----------+-------------------------+-------------+---------------+
| 1 | Languages | 1 | Spanish | 100 | Read | Write | Listen | Speak | 1 | 1 | Native language | 2022-08-27 | 1 |
| 1 | Languages | 2 | English | 85 | Read | Write | Listen | Speak | 1 | 2 | Learning since 2016 | 2022-08-27 | 2 |
| 1 | Languages | 2 | English | 85 | Read | Write | Listen | Speak | 1 | 3 | Can speak almost fluent | 2022-08-27 | 2 |
| 2 | Projects | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL |
| 3 | Ideas | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL |
+------------+-----------+---------+-----------+-----------------+------+-------+--------+-------+-------------+-----------+-------------------------+-------------+---------------+
5 rows in set (0.001 sec)
In Node.js I use that query with the following code:
connection.query(myquery, function(err, results, fields) {
if (err) {
console.log('----> Error with MySQL query in /api/showProgress: ' + err.message);
}
else{
console.log('Query successful, results are being displayed.');
var categories = [];
for (let category in results) {
if(categories.length > 0){
for(let key in categories){
if(results[category].categoryId !== categories[key].Category.Id){
console.log("Category Id: " + results[category].categoryId + " Id already in the array: " + categories[key].Category.Id);
categories.push({
"Category" : [{
"Id" : results[category].categoryId,
"Name" : results[category].catname
}]
});
}
}
}
else{
categories.push({
"Category" : [{
"Id" : results[category].categoryId,
"Name" : results[category].catname
}]
})
}
}
response.send({"My progress" : categories});
});
The result I get:
Query successful, results are being displayed.
Category Id: 1 Id already in the array: undefined
Category Id: 1 Id already in the array: undefined
Category Id: 1 Id already in the array: undefined
Category Id: 2 Id already in the array: undefined
Category Id: 2 Id already in the array: undefined
Category Id: 2 Id already in the array: undefined
Category Id: 2 Id already in the array: undefined
Category Id: 3 Id already in the array: undefined
Category Id: 3 Id already in the array: undefined
Category Id: 3 Id already in the array: undefined
Category Id: 3 Id already in the array: undefined
Category Id: 3 Id already in the array: undefined
Category Id: 3 Id already in the array: undefined
Category Id: 3 Id already in the array: undefined
Category Id: 3 Id already in the array: undefined
So the problem is categories[key].Category.Id: I don't know how to access the Id property that belongs to the Category object inside the array.
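Since Category is an array of objects, the element has to be indexed before reading Id, e.g.:
// Category is an array, so read Id from its first element:
console.log(categories[key].Category[0].Id);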
The final idea is to use that check so each category appears only once, with an array of its skills, instead of repeating the category for every skill:
Current:
{"My progress":[
{
"Category":[{
"Id":1,
"Name":"Languages",
"Skill":"asd"
}]
},
{
"Category":[{
"Id":1,
"Name":"Languages",
"Skill":"fgh"
}]
},
{
"Category":[{
"Id":1,
"Name":"Languages",
"Skill":"ijk"
}]
}
]}
Expected:
{"My progress":[
{
"Category":[
{
"Id":1,
"Name":"Languages",
"Skills":[{
"Name":"asd",
"Name":"fgh",
"Name":"ijk"
}]
},
{
"Id":2,
"Name":"Projects",
"Skills":[{
"Name":"123",
"Name":"456",
"Name":"789"
}]
}
]
}
]}
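A minimal sketch of that grouping, assuming the flat rows from the query result above (one row per category/skill/message combination), could look like this:
// Sketch: build one entry per category from the flat join rows
// (assumes the columns categoryId, catname and skillname shown above).
var byCategory = {};
for (let row of results) {
    if (!byCategory[row.categoryId]) {
        byCategory[row.categoryId] = { "Id": row.categoryId, "Name": row.catname, "Skills": [] };
    }
    // messages repeat skill rows, so only push each skill name once:
    if (row.skillname !== null && !byCategory[row.categoryId].Skills.some(s => s.Name === row.skillname)) {
        byCategory[row.categoryId].Skills.push({ "Name": row.skillname });
    }
}
var categories = [{ "Category": Object.values(byCategory) }];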
In the end I got the expected result, changing almost everything:
{"My skills":[
{
"categoryId":1,
"CategoryName":"Web development",
"Subcategories":[
{
"parentId":1,
"subcategoryId":1,
"SubcategoryName":"Frontend",
"Skills":[
"Sass",
"Css",
"Bootstrap",
"Figma"
]
},
{
"parentId":1,
"subcategoryId":2,
"SubcategoryName":"Backend",
"Skills":[
"Nodejs",
"Express",
"MySQL",
"PHP"
]
}
]
},
{
"categoryId":2,
"CategoryName":"Cybersecurity",
"Subcategories":[
{
"parentId":2,
"subcategoryId":3,
"SubcategoryName":"Red team",
"Skills":[
"curl",
"Sherlock",
"Wappalyzer",
"Burpsuite"
]
},
{
"parentId":2,
"subcategoryId":4,
"SubcategoryName":"Blue team",
"Skills":[
"Cloudfare"
]
}
]
}
]}
Nodejs code:
connection.query(myquery, function(err, results, fields) {
if (err) {
console.log('----> Error with MySQL query in /api/showSkills: ' + err.message);
}
else{
console.log('Query successful, results are being displayed.');
var mylist = [];
var subcat = [];
var lastPushedId = 0;
for (let key in results){
if(lastPushedId !== results[key].categoryId){
for (let otherkey in results){
if(results[otherkey].subcatParentId === results[key].categoryId){
subcat.push({
'parentId': results[otherkey].subcatParentId,
'subcategoryId': results[otherkey].subcategoryId,
'SubcategoryName': results[otherkey].subcatname,
'Skills': results[otherkey].skills.split(',')
});
}
}
mylist.push({
'categoryId': results[key].categoryId,
'CategoryName': results[key].catname,
'Subcategories': subcat
});
subcat = [];
lastPushedId = results[key].categoryId;
}
}
response.send({"My skills" : mylist});
}
});
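For context: the code above assumes the query no longer returns one row per skill, but one row per subcategory with the skills collapsed into a comma-separated column (hence the .split(',')). A sketch of such a query, where the Progress_Subcategory table and the column names are assumptions based on the fields used above:
-- Sketch only: Progress_Subcategory and these column aliases are assumptions.
SELECT pcat.categoryId,
       pcat.catname,
       psub.catParentId AS subcatParentId,
       psub.subcategoryId,
       psub.subcatname,
       GROUP_CONCAT(ps.skillname) AS skills
FROM Progress_Category AS pcat
LEFT JOIN Progress_Subcategory AS psub
ON pcat.categoryId = psub.catParentId
LEFT JOIN Progress_Skill AS ps
ON psub.subcategoryId = ps.subcatParentId
GROUP BY pcat.categoryId, psub.subcategoryId
ORDER BY pcat.categoryId;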
I have JSON data that spans multiple lines, and I have created a Hive table to load it into. I have another JSON file whose whole record sits on a single line. When I load the single-line JSON record into its Hive table and query it, it works fine. But when I load the multi-line JSON into its Hive table, it gives the exception below:
Failed with exception java.io.IOException:org.apache.hadoop.hive.serde2.SerDeException: org.codehaus.jackson.JsonParseException: Unexpected end-of-input: expected close marker for OBJECT (from [Source: java.io.ByteArrayInputStream@8b89b3a; line: 1, column: 0]) at [Source: java.io.ByteArrayInputStream@8b89b3a; line: 1, column: 3]
Below is my JSON data:
{
"uploadTimeStamp" : "1486631318873",
"PDID" : "123",
"data" : [ {
"Data" : {
"unit" : "rpm",
"value" : "0"
},
"EventID" : "E1",
"PDID" : "123",
"Timestamp" : 1486631318873,
"Timezone" : 330,
"Version" : "1.0",
"pii" : { }
}, {
"Data" : {
"heading" : "N",
"loc3" : "false",
"loc" : "14.022425",
"loc1" : "78.760587",
"loc4" : "false",
"speed" : "10"
},
"EventID" : "E2",
"PDID" : "123",
"Timestamp" : 1486631318873,
"Timezone" : 330,
"Version" : "1.1",
"pii" : { }
}, {
"Data" : {
"x" : "1.1",
"y" : "1.2",
"z" : "2.2"
},
"EventID" : "E3",
"PDID" : "123",
"Timestamp" : 1486631318873,
"Timezone" : 330,
"Version" : "1.0",
"pii" : { }
}, {
"EventID" : "E4",
"Data" : {
"value" : "50",
"unit" : "percentage"
},
"Version" : "1.0",
"Timestamp" : 1486631318873,
"PDID" : "123",
"Timezone" : 330
}, {
"Data" : {
"unit" : "kmph",
"value" : "70"
},
"EventID" : "E5",
"PDID" : "123",
"Timestamp" : 1486631318873,
"Timezone" : 330,
"Version" : "1.0",
"pii" : { }
} ]
}
I am using /hive/lib/hive-hcatalog-core-0.13.0.jar
Below is my create table statement:
create table test7(
uploadtime bigint,
pdid string,
data array<
struct<Data:struct<
unit:string,
value:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
heading:string,
Location:string,
latitude:bigint,
longitude:bigint,
Location2:string,
speed:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
unit:string,
value:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
x:int,
y:int,
z:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>,
struct<Data:struct<
heading:string,
loc3:string,
latitude:bigint,
longitude:bigint,
loc4:string,
speed:int>,
eventid:string,
pdid:bigint,
time:bigint,
timezone:int,
version:int,
pii:struct<pii:string>>
>
)
ROW FORMAT SERDE
'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
LOCATION
'/xyz/abc/';
Edit:
Adding the single-line JSON and the new create table statement with its error:
{"uploadTimeStamp":"1487183800905","PDID":"123","data":[{"Data":{"unit":"rpm","value":"0"},"EventID":"event1","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.0","pii":{}},{"Data":{"heading":"N","loc1":"false","latitude":"16.032425","longitude":"80.770587","loc2":"false","speed":"10"},"EventID":"event2","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.1","pii":{}},{"Data":{"x":"1.1","y":"1.2","z":"2.2"},"event3":"AccelerometerInfo","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.0","pii":{}},{"EventID":"event4","Data":{"value":"50","unit":"percentage"},"Version":"1.0","Timestamp":1487183800905,"PDID":"123","Timezone":330},{"Data":{"unit":"kmph","value":"70"},"EventID":"event5","PDID":"123","Timestamp":1487183800905,"Timezone":330,"Version":"1.0","pii":{}}]}
create table test1(
uploadTimeStamp string,
PDID string,
data array<struct<
Data:struct<unit:string,value:int>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>,
struct<
Data:struct<heading:string,loc1:string,latitude:double,longitude:double,loc2:string,speed:int>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>,
struct<
Data:struct<x:float,y:float,z:float>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>,
struct<
EventID:string,
Data:struct<value:int,unit:percentage>,
Version:float,
TimeS:bigint,
PDID:string,
Timezone:int>,
struct<
Data:struct<unit:string,value:int>,
EventID:string,
PDID:string,
TimeS:bigint,
Timezone:int,
Version:float,
pii:struct<>>
>
ROW FORMAT SERDE
'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS TEXTFILE
LOCATION
'/ABC/XYZ/';
MismatchedTokenException(320!=313)
...
...
...
FAILED: ParseException line 11:10 mismatched input '<>' expecting < near 'struct' in struct type
Sample data (the same JSON collapsed to a single line, one complete record per line):
{"uploadTimeStamp":"1486631318873","PDID":"123","data":[{"Data":{"unit":"rpm","value":"0"},"EventID":"E1","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.0","pii":{}},{"Data":{"heading":"N","loc3":"false","loc":"14.022425","loc1":"78.760587","loc4":"false","speed":"10"},"EventID":"E2","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.1","pii":{}},{"Data":{"x":"1.1","y":"1.2","z":"2.2"},"EventID":"E3","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.0","pii":{}},{"EventID":"E4","Data":{"value":"50","unit":"percentage"},"Version":"1.0","Timestamp":1486631318873,"PDID":"123","Timezone":330},{"Data":{"unit":"kmph","value":"70"},"EventID":"E5","PDID":"123","Timestamp":1486631318873,"Timezone":330,"Version":"1.0","pii":{}}]}
add jar /usr/lib/hive-hcatalog/share/hcatalog/hive-hcatalog-core.jar
create external table myjson
(
uploadTimeStamp string
,PDID string
,data array
<
struct
<
Data:struct
<
unit:string
,value:string
,heading:string
,loc3:string
,loc:string
,loc1:string
,loc4:string
,speed:string
,x:string
,y:string
,z:string
>
,EventID:string
,PDID:string
,`Timestamp`:bigint
,Timezone:smallint
,Version:string
,pii:struct<dummy:string>
>
>
)
row format serde 'org.apache.hive.hcatalog.data.JsonSerDe'
stored as textfile
location '/tmp/myjson'
;
select * from myjson
;
+------------------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| myjson.uploadtimestamp | myjson.pdid | myjson.data |
+------------------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
| 1486631318873 | 123 | [{"data":{"unit":"rpm","value":"0","heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":null,"y":null,"z":null},"eventid":"E1","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":{"dummy":null}},{"data":{"unit":null,"value":null,"heading":"N","loc3":"false","loc":"14.022425","loc1":"78.760587","loc4":"false","speed":"10","x":null,"y":null,"z":null},"eventid":"E2","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.1","pii":{"dummy":null}},{"data":{"unit":null,"value":null,"heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":"1.1","y":"1.2","z":"2.2"},"eventid":"E3","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":{"dummy":null}},{"data":{"unit":"percentage","value":"50","heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":null,"y":null,"z":null},"eventid":"E4","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":null},{"data":{"unit":"kmph","value":"70","heading":null,"loc3":null,"loc":null,"loc1":null,"loc4":null,"speed":null,"x":null,"y":null,"z":null},"eventid":"E5","pdid":"123","timestamp":1486631318873,"timezone":330,"version":"1.0","pii":{"dummy":null}}] |
+------------------------+-------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
select j.uploadTimeStamp
,j.PDID
,d.val.EventID
,d.val.PDID
,d.val.`Timestamp`
,d.val.Timezone
,d.val.Version
,d.val.Data.unit
,d.val.Data.value
,d.val.Data.heading
,d.val.Data.loc3
,d.val.Data.loc
,d.val.Data.loc1
,d.val.Data.loc4
,d.val.Data.speed
,d.val.Data.x
,d.val.Data.y
,d.val.Data.z
from myjson j
lateral view explode (data) d as val
;
+-------------------+--------+---------+------+---------------+----------+---------+------------+-------+---------+-------+-----------+-----------+-------+-------+------+------+------+
| j.uploadtimestamp | j.pdid | eventid | pdid | timestamp | timezone | version | unit | value | heading | loc3 | loc | loc1 | loc4 | speed | x | y | z |
+-------------------+--------+---------+------+---------------+----------+---------+------------+-------+---------+-------+-----------+-----------+-------+-------+------+------+------+
| 1486631318873 | 123 | E1 | 123 | 1486631318873 | 330 | 1.0 | rpm | 0 | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL |
| 1486631318873 | 123 | E2 | 123 | 1486631318873 | 330 | 1.1 | NULL | NULL | N | false | 14.022425 | 78.760587 | false | 10 | NULL | NULL | NULL |
| 1486631318873 | 123 | E3 | 123 | 1486631318873 | 330 | 1.0 | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | 1.1 | 1.2 | 2.2 |
| 1486631318873 | 123 | E4 | 123 | 1486631318873 | 330 | 1.0 | percentage | 50 | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL |
| 1486631318873 | 123 | E5 | 123 | 1486631318873 | 330 | 1.0 | kmph | 70 | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL | NULL |
+-------------------+--------+---------+------+---------------+----------+---------+------------+-------+---------+-------+-----------+-----------+-------+-------+------+------+------+
I was having the same issue, then decided to create a custom input format which can extract multiline (pretty-printed) JSON records.
This JsonRecordReader can read a multiline JSON record in Hive. It extracts the record by balancing the curly braces, { and }, so the content from the first '{' to the matching last '}' is considered one complete record. Below is the code snippet:
// Imports needed by the enclosing file (old mapred API):
// import java.io.IOException;
// import java.util.concurrent.atomic.AtomicInteger;
// import org.apache.hadoop.fs.FSDataInputStream;
// import org.apache.hadoop.fs.FileSystem;
// import org.apache.hadoop.fs.Path;
// import org.apache.hadoop.io.DataOutputBuffer;
// import org.apache.hadoop.io.LongWritable;
// import org.apache.hadoop.io.Text;
// import org.apache.hadoop.mapred.FileSplit;
// import org.apache.hadoop.mapred.JobConf;
// import org.apache.hadoop.mapred.RecordReader;
public static class JsonRecordReader implements RecordReader<LongWritable, Text> {
public static final String START_TAG_KEY = "jsoninput.start";
public static final String END_TAG_KEY = "jsoninput.end";
private byte[] startTag = "{".getBytes();
private byte[] endTag = "}".getBytes();
private long start;
private long end;
private FSDataInputStream fsin;
private final DataOutputBuffer buffer = new DataOutputBuffer();
public JsonRecordReader(FileSplit split, JobConf jobConf) throws IOException {
// uncomment the below lines if you need to get the configuration
// from JobConf:
// startTag = jobConf.get(START_TAG_KEY).getBytes("utf-8");
// endTag = jobConf.get(END_TAG_KEY).getBytes("utf-8");
// open the file and seek to the start of the split:
start = split.getStart();
end = start + split.getLength();
Path file = split.getPath();
FileSystem fs = file.getFileSystem(jobConf);
fsin = fs.open(split.getPath());
fsin.seek(start);
}
@Override
public boolean next(LongWritable key, Text value) throws IOException {
if (fsin.getPos() < end) {
AtomicInteger count = new AtomicInteger(0);
if (readUntilMatch(false, count)) {
try {
buffer.write(startTag);
if (readUntilMatch(true, count)) {
key.set(fsin.getPos());
// create json record from buffer:
String jsonRecord = new String(buffer.getData(), 0, buffer.getLength());
value.set(jsonRecord);
return true;
}
} finally {
buffer.reset();
}
}
}
return false;
}
@Override
public LongWritable createKey() {
return new LongWritable();
}
@Override
public Text createValue() {
return new Text();
}
@Override
public long getPos() throws IOException {
return fsin.getPos();
}
@Override
public void close() throws IOException {
fsin.close();
}
@Override
public float getProgress() throws IOException {
return ((fsin.getPos() - start) / (float) (end - start));
}
private boolean readUntilMatch(boolean withinBlock, AtomicInteger count) throws IOException {
while (true) {
int b = fsin.read();
// end of file:
if (b == -1)
return false;
// save to buffer:
if (withinBlock)
buffer.write(b);
// check if we're matching start/end tag:
if (b == startTag[0]) {
count.incrementAndGet();
if (!withinBlock) {
return true;
}
} else if (b == endTag[0]) {
count.getAndDecrement();
if (count.get() == 0) {
return true;
}
}
// see if we've passed the stop point:
if (!withinBlock && count.get() == 0 && fsin.getPos() >= end)
return false;
}
}
}
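To use the reader from Hive, it still needs an InputFormat wrapper that hands each file split to it. A minimal sketch, assuming the class name referenced in the CREATE TABLE below (the complete implementation is in the repository linked at the end):
// Minimal sketch of the wrapping InputFormat (old mapred API:
// org.apache.hadoop.mapred.FileInputFormat, InputSplit, Reporter).
public class JsonInputFormat extends FileInputFormat<LongWritable, Text> {
    // Hand each file split to the brace-balancing record reader above.
    @Override
    public RecordReader<LongWritable, Text> getRecordReader(InputSplit split, JobConf job, Reporter reporter)
            throws IOException {
        return new JsonRecordReader((FileSplit) split, job);
    }
}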
This input format can be used along with the JSON SerDe supplied by Hive to read multiline JSON files.
CREATE TABLE books (
id string,
bookname string,
properties struct<subscription:string, unit:string>
)
ROW FORMAT SERDE 'org.apache.hive.hcatalog.data.JsonSerDe'
STORED AS INPUTFORMAT 'JsonInputFormat'
OUTPUTFORMAT 'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat';
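As with the HCatalog core jar earlier, the jar containing the custom input format has to be on Hive's classpath before the table is queried, along these lines (the path is hypothetical):
add jar /path/to/hive-json.jar; -- hypothetical path to the jar built from the repo below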
The working code with samples is here: https://github.com/unayakdev/hive-json