orientdb load graph csv of nodes and edges - csv

I'm a newbie in Orientdb .
I have a csv file which has both the nodes and the edge and I need to create a graph out of that csv file .
csv file
"p1","p2","score"
"LGG_00001","LGG_01682",282
"LGG_00001",".LGG_01831",183
"LGG_00001","LGG_01491",238
The edge is IsActingWith which had the score attribute
{
"source": {
"file": {
"path": "C:/Users/sams/Desktop/OrientDB2/lac2.csv"
}
},
"extractor": {
"csv": {}
},
"transformers": [
{
"vertex": {
"class": "lac2"
}
},
{
"vertex": {
"class": "lac2"
}
},
{
"edge":
{
"class": "IsActingWith",
"joinFieldName": "score_p",
"lookup": "acore",
"direction": "out"
}
}
],
"loader": {
"orientdb": {
"dbURL": "plocal:C:/Users/sams/Desktop/OrientDB2/database/proj",
"dbType": "graph",
"dbAutoCreate": true,
"classes": [
{
"name": "lac2",
"extends": "V"
},
{
"name": "lac2",
"extends": "V"
},
{
"name": "IsActingWith",
"extends": "E"
},
]
}
}
}
That is what I tried but it does not seem logic to me.
The final result I'm looking for is to have a graaph made of p1->ACTINGWITH-> p2 and ACTINGWITH has score of the score attribute

maybe there's a better solution but this works.
My plan is to use 3 different etl scripts: first and second for inserting the vertices and the third for the edges. Of course you'll need to execute them in order.
vertex_import_p1.json
{
"source": { "file": { "path": "/home/ivan/Cose/OrientDB/issues/stack/44641116/file.csv" } },
"extractor": { "csv": {
"separator": ",",
"columns": ["p1:String","p2:String","s:Integer"] } },
"transformers": [
{ "command": { "command": "UPDATE lac2 set p='${input.p1}' UPSERT WHERE p='${input.p1}'"} }
],
"loader": {
"orientdb": {
"dbURL": "plocal:/home/ivan/Cose/OrientDB/issues/stack/44641116/db",
"dbUser": "admin",
"dbPassword": "admin",
"dbType": "graph",
"classes": [
{"name": "lac2", "extends": "V"},
{"name": "isActingWith", "extends": "E"}
]
}
}
}
vertex_import_p2.json
{
"source": { "file": { "path": "/home/ivan/Cose/OrientDB/issues/stack/44641116/file.csv" } },
"extractor": { "csv": {
"separator": ",",
"columns": ["p1:String","p2:String","s:Integer"] } },
"transformers": [
{ "command": { "command": "UPDATE lac2 set p='${input.p2}' UPSERT WHERE p='${input.p2}'"} }
],
"loader": {
"orientdb": {
"dbURL": "plocal:/home/ivan/Cose/OrientDB/issues/stack/44641116/db",
"dbUser": "admin",
"dbPassword": "admin",
"dbType": "graph",
"classes": [
{"name": "lac2", "extends": "V"},
{"name": "isActingWith", "extends": "E"}
]
}
}
}
edge_import_s.json
{
"source": { "file": { "path": "/home/ivan/Cose/OrientDB/issues/stack/44641116/file.csv" } },
"extractor": { "csv": {
"separator": ",",
"columns": ["p1:String","p2:String","s:Integer"] } },
"transformers": [
{ "command": { "command": "CREATE EDGE isActingWith FROM (SELECT FROM lac2 WHERE p='${input.p1}') TO (SELECT FROM lac2 WHERE p='${input.p2}') set score=${input.s}"} }
],
"loader": {
"orientdb": {
"dbURL": "plocal:/home/ivan/Cose/OrientDB/issues/stack/44641116/db",
"dbUser": "admin",
"dbPassword": "admin",
"dbType": "graph",
"classes": [
{"name": "lac2", "extends": "V"},
{"name": "isActingWith", "extends": "E"}
]
}
}
}
And here are the situation after the executions:
orientdb {db=db}> select from lac2
+----+-----+------+---------+-------------------+---------------+
|# |#RID |#CLASS|p |out_isActingWith |in_isActingWith|
+----+-----+------+---------+-------------------+---------------+
|0 |#21:6|lac2 |LGG_00001|[#25:5,#26:1,#27:1]| |
|1 |#21:7|lac2 |LGG_01682| |[#25:5] |
|2 |#22:3|lac2 |LGG_01831| |[#26:1] |
|3 |#23:1|lac2 |LGG_01491| |[#27:1] |
+----+-----+------+---------+-------------------+---------------+
4 item(s) found. Query executed in 0.003 sec(s).
orientdb {db=db}> select from isActingWith
+----+-----+------------+-----+-----+-----+
|# |#RID |#CLASS |score|out |in |
+----+-----+------------+-----+-----+-----+
|0 |#25:5|isActingWith|282 |#21:6|#21:7|
|1 |#26:1|isActingWith|183 |#21:6|#22:3|
|2 |#27:1|isActingWith|238 |#21:6|#23:1|
+----+-----+------------+-----+-----+-----+
3 item(s) found. Query executed in 0.004 sec(s).

Related

How Should the Json Output Look in Nlog?

I am using nlog json layout and took this from the example
{
"Logging": {
"NLog": {
"IncludeScopes": false,
"ParseMessageTemplates": true,
"CaptureMessageProperties": true
}
},
"NLog": {
"autoreload": true,
"internalLogLevel": "Info",
"internalLogFile": "c:/temp/console-example-internal2.log",
"throwConfigExceptions": true,
"targets": {
"console": {
"type": "Console",
"layout": "${date}|${level:uppercase=true}|${message} ${exception:format=tostring}|${logger}|${all-event-properties}"
},
"file": {
"type": "AsyncWrapper",
"target": {
"wrappedFile": {
"type": "File",
"fileName": "c:/temp/console-example2.log",
"layout": {
"type": "JsonLayout",
"Attributes": [
{ "name": "timestamp", "layout": "${date:format=o}" },
{ "name": "level", "layout": "${level}" },
{ "name": "logger", "layout": "${logger}" },
{ "name": "message", "layout": "${message:raw=true}" },
{ "name": "properties", "encode": false, "layout": { "type": "JsonLayout", "includeallproperties": "true" } }
]
}
}
}
}
},
"rules": [
{
"logger": "*",
"minLevel": "Trace",
"writeTo": "File,Console"
}
]
}
}
https://github.com/NLog/NLog.Extensions.Logging/blob/master/examples/NetCore2/ConsoleExampleJsonConfig/appsettings.json
on this line I saw this { "name": "message", "layout": "${message:raw=true}" } I changed it to false.
when I do this
var test = "Something";
logger.Info("This is what is stored in the variable: {var}", test);
I get
{
"message": This is what is stored in the variable: \"Something\""
}
Why is it in quotes?
When I change raw to "true" I get
{
"message": This is what is stored in the variable: {test}"
}
how do I just get "This is what is stored in the variable: Something"

Sort complex JSON object by specific property

How can I sort the given JSON object with property count. I want to sort the entire sub-object. The higher the count value should come on the top an so on.
{
"Resource": [
{
"details": [
{
"value": "3.70"
},
{
"value": "3.09"
}
],
"work": {
"count": 1
}
},
{
"details": [
{
"value": "4"
},
{
"value": "5"
}
],
"work": {
"count": 2
},
{
"details": [
{
"value": "5"
},
{
"value": "5"
}
],
"work": "null"
}
]
}
You can try this example to sort your data:
data = {
"data": {
"Resource": [
{
"details": [{"value": "3.70"}, {"value": "3.09"}],
"work": {"count": 1},
},
{"details": [{"value": "4"}, {"value": "5"}], "work": {"count": 2}},
]
}
}
# sort by 'work'/'count'
data["data"]["Resource"] = sorted(
data["data"]["Resource"], key=lambda r: r["work"]["count"]
)
# sort by 'details'/'value'
for r in data["data"]["Resource"]:
r["details"] = sorted(r["details"], key=lambda k: float(k["value"]))
# pretty print:
import json
print(json.dumps(data, indent=4))
Prints:
{
"data": {
"Resource": [
{
"details": [
{
"value": "3.09"
},
{
"value": "3.70"
}
],
"work": {
"count": 1
}
},
{
"details": [
{
"value": "4"
},
{
"value": "5"
}
],
"work": {
"count": 2
}
}
]
}
}

Reading hierarchy of a JSON using value

I want to get the hierarchy of a JSON using a value. For example: In below JSON for value "Medical record number" , the desired information is "resource->identifier->type->coding->display" . IS their any inbuilt function to do so.
The one way to look for opening and ending braces to locate nodes. Any other efficient way ?
{
"resourceType": "Bundle",
"type": "transaction",
"entry": [
{
"fullUrl": "Patient/996-756-495-101",
"resource": {
"resourceType": "Patient",
"id": "996-756-495-101",
"identifier": [
{
"type": {
"coding": [
{
"system": "http://hl7.org/fhir/v2/0203",
"code": "MR",
"display": "Medical record number"
}
]
},
"system": "https://www.lumc.nl",
"value": "996-756-495-101"
}
],
"name": [
{
"use": "usual",
"family": [
"Levin_4"
],
"given": [
"Henry_4"
]
}
],
"gender": "male",
"birthDate": "1932-09-24",
"maritalStatus": {
"coding": [
{}
]
},
"managingOrganization": {
"reference": "Organization/12"
}
},
"request": {
"method": "POST",
"url": "Patient",
"ifNoneExist": "identifier=https://www.lumc.nl|996-756-495-101"
}
},
{
"fullUrl": "FamilyMemberHistory/d42ebf70-5c89-11db-b0de-0800200c9a66",
"resource": {
"resourceType": "FamilyMemberHistory",
"id": "d42ebf70-5c89-11db-b0de-0800200c9a66",
"patient": {
"reference": "Patient/996-756-495-101"
},
"status": "Partial",
"relationship": {
"coding": [
{
"system": "http://hl7.org/fhir/ValueSet/v3-FamilyMember",
"code": "FTH",
"display": "Father"
}
],
"text": "Father"
},
"gender": "male",
"bornDate": "1912",
"deceasedBoolean": true
},
"request": {
"method": "POST",
"url": "FamilyMemberHistory"
}
},
{
"fullUrl": "FamilyMemberHistory/a13c6160-5c8b-11db-b0de-0800200c9a66",
"resource": {
"resourceType": "FamilyMemberHistory",
"id": "a13c6160-5c8b-11db-b0de-0800200c9a66",
"patient": {
"reference": "Patient/996-756-495-101"
},
"status": "Partial",
"relationship": {
"coding": [
{
"system": "http://hl7.org/fhir/ValueSet/v3-FamilyMember",
"code": "MTH",
"display": "Mother"
}
],
"text": "Mother"
},
"gender": "female",
"bornDate": "1912",
"deceasedBoolean": false
},
"request": {
"method": "POST",
"url": "FamilyMemberHistory"
}
}
]
}

Orientdb - CSV Import - Performance CSV Import Edge

I want to import two csv files to a Orientdb database. The first is the apex, with 1 million records. The second are the edges with 59 million records
I have two json file to import:
vértex
{
"source": { "file": { "path": "../csvs/metodo01/pesquisador.csv" } },
"extractor": { "row": {} },
"transformers": [
{ "csv": {} },
{ "vertex": { "class": "Pesquisador" } }
],
"loader": {
"orientdb": {
"dbURL": "remote:localhost/dbCemMilM01",
"dbType": "graph",
"batchCommit": 1000,
"classes": [
{"name": "Pesquisador", "extends": "V"}
], "indexes": [
{"class":"Pesquisador", "fields":["psq_id:integer"], "type":"UNIQUE" }
]
}
}
}
edge
{
"config": {
"log": "info",
"parallel": false
},
"source": {
"file": {
"path": "../csvs/metodo01/a10.csv"
}
},
"extractor": {
"row": {
}
},
"transformers": [{
"csv": {
"separator": ",",
"columnsOnFirstLine": true,
"columns": ["psq_id_from:integer",
"pub_id_to:integer",
"ordem:integer"]
}
},
{
"command": {
"command": "create edge PUBLICOU from (select from Pesquisador where psq_id = ${input.psq_id_from}) to (select from Publicacao where pub_id = ${input.pub_id_to}) set ordem = ${input.ordem} ",
"output": "edge"
}
}],
"loader": {
"orientdb": {
"dbURL": "remote:localhost/dbUmMilhaoM01",
"dbType": "graph",
"standardElementConstraints": false,
"batchCommit": 1000,
"classes": [{
"name": "PUBLICOU",
"extends": "E"
}]
}
}
}
In the process the Orientdb suggests using index to accelerate the process.
How do I do that?
Just the command is create edge PUBLICOU from (select from Pesquisador where psq_id = ${input.psq_id_from}) to (select from Publicacao where pub_id = ${input.pub_id_to}) set ordem = ${input.ordem}
To speed up the create edge process you may need indexes on both properties Pesquisador.psq_id , that you already have, and on Publicacao.pub_id.
Ivan
You can declare indexes directly in the ETL configuration. Example taken from DBPedia importer:
"orientdb": {
"dbURL": "plocal:/temp/databases/dbpedia",
"dbUser": "importer",
"dbPassword": "IMP",
"dbAutoCreate": true,
"tx": false,
"batchCommit": 1000,
"wal" : false,
"dbType": "graph",
"classes": [
{"name":"Person", "extends": "V" },
{"name":"Customer", "extends": "Person", "clusters":8 }
],
"indexes": [
{"class":"V", "fields":["URI:string"], "type":"UNIQUE" },
{"class":"Person", "fields":["town:string"], "type":"NOTUNIQUE" ,
metadata : { "ignoreNullValues" : false }
}
]
}
For more information look at: http://orientdb.com/docs/2.2/Loader.html
To speedup the load process my suggestion is to work in plocal mode and then mode the created db to a standalone OrientDB server.

OrientDB ETL JSON Error

I am new in OrientDB and facing problem when importing .csv (MovieLens) dataset to OrientDB.
This is the format of .csv file
movieId , title , genres
1 , Toy Story (1995) , Adventure|Animation|Children|Comedy|Fantasy
2 , Jumanji (1995) , Adventure|Children|Fantasy
3 , Grumpier Old Men (1995) , Comedy|Romance
4 , Waiting to Exhale (1995) , Comedy|Drama|Romance
5 , Father of the Bride Part II (1995), Comedy
6 , Heat (1995) , Action|Crime|Thriller
I am facing problem while splitting the "Genres" field with '|'. I created split function in orientdb studio and call it in JSON ETL configuration script and it still does not recognize it and throw exception.
Split Function in OrientDB Studio
Exception
JSON Script:
{
"config": {
"log": "info",
parallel: false
},
"source": { "file": { "path": "E:\Orient DB\OrientDB Project\Project DataSet MovieLens\ml-latest\movies.csv"} },
"extractor": { "row": {} },
"transformers": [
{ "csv": {"separator": ",",
"columnsOnFirstLine":true,
"columns":["id","title:string","genres"]}},
{ "field": { "fieldName": "genresArray", "expression": " split(genres,'|') "} },
{ "field": { "fieldName": "genresArray_0", "expression": "genresArray[0]"} },
{ "field": { "fieldName": "genresArray_1", "expression": "genresArray[1]"} },
{ "field": { "fieldName": "genresArray_2", "expression": "genresArray[2]"} },
{ "field": { "fieldName": "genresArray_3", "expression": "genresArray[3]"} },
{ "field": { "fieldName": "genresArray_4", "expression": "genresArray[4]"} },
{ "field": { "fieldName": "genresArray_5", "expression": "genresArray[5]"} },
{ "vertex": { "class": "Movies" } },
{ "edge": {
"class": "HasGenera",
"joinFieldName": "genresArray_0",
"lookup": "Genres.description",
"unresolvedLinkAction": "CREATE"
} },
{ "edge": {
"class": "HasGenera",
"joinFieldName": "genresArray_1",
"lookup": "Genres.description",
"unresolvedLinkAction": "CREATE"
} },
{ "edge": {
"class": "HasGenera",
"joinFieldName": "genresArray_2",
"lookup": "Genres.description",
"unresolvedLinkAction": "CREATE"
} },
{ "edge": {
"class": "HasGenera",
"joinFieldName": "genresArray_3",
"lookup": "Genres.description",
"unresolvedLinkAction": "CREATE"
} },
{ "edge": {
"class": "HasGenera",
"joinFieldName": "genresArray_4",
"lookup": "Genres.description",
"unresolvedLinkAction": "CREATE"
} },
{ "edge": {
"class": "HasGenera",
"joinFieldName": "genresArray_5",
"lookup": "Genres.description",
"unresolvedLinkAction": "CREATE"
} }
],
"loader": {
"orientdb": {
"dbURL": "remote:localhost/MovieRating",
"dbType": "graph",
"standardElementConstraints": false,
"classes": [
{"name": "Movies", "extends": "V"},
{"name": "Genres", "extends": "V"},
{"name": "HasGenera", "extends": "E"}
], "indexes": [
{"class":"Movies", "fields":["id:integer"], "type":"UNIQUE" },
{"class":"Genres", "fields":["description:string"], "type":"UNIQUE_HASH_INDEX" }
]
}
}
}