{
"config1":{
"url":"xxxx",
"database":"xxxx",
"dbTable":"xxxx"
},
"config2":{
"url":"xxxx",
"database":"xxxxx",
"dbTable":"xxxxx"
},
"snippets":{
"optionA":{
"months_back":"2",
"list":{
"code1":{
"id":"11111",
"country":"11111"
},
"code2":{
"id":"2222",
"country":"2222"
},
"code3":{
"id":"3333",
"country":"3333"
}
}
}
}
}
Let's say I have a config.json that looks like the above. I have some code with a query, and I need to substitute the id and country values from that JSON into the query's parameters.
So far my code is something like this
import spark.implicits._
val df = sqlContext.read.option("multiline","true").json("path_to_json")
val range_df = df.select("snippets.optionA.months_back").collect()
val range_str = range_df.map(x => x.get(0))
val range = range_str(0)
val list = df.select("snippets.optionA.list.*")).collect()
I need something like
For(x <- json_list){
val results = spark.sql("""
select * from table
where date >= add_months(current_date(), -"""+range+""")
and country = """+json_list(country)+"""
and id = """+json_lis(id)+""")
After collect(), the result has type Array[org.apache.spark.sql.Row], and I have no idea how to iterate over it.
Any help is welcome, thank you
Convert snippets.optionA.list.* inner struct into array(snippets.optionA.list.*) & iterate each value from this array.
Check below code.
val queriesResult = df
.withColumn(
"query",
explode(
expr(
"""
|transform(
| array(snippets.optionA.list.*),
| v -> concat(
| 'SELECT * FROM TABLE WHERE DATE >= add_months(current_date(), -',
| snippets.optionA.months_back,
| ') AND country=\"',
| v.country,
| '\" AND id =',
| v.id
| )
|)
|""".stripMargin
)
)
)
.select("query")
.as[String]
.collect
.map { query =>
spark.sql(query)
}
The collect function returns an array of query strings like the one below; map then passes each query to the spark.sql function to execute it.
Array(
"SELECT * FROM TABLE WHERE DATE >= add_months(current_date(), -2) AND country="11111" AND id =11111",
"SELECT * FROM TABLE WHERE DATE >= add_months(current_date(), -2) AND country="2222" AND id =2222",
"SELECT * FROM TABLE WHERE DATE >= add_months(current_date(), -2) AND country="3333" AND id =3333"
)
Spark Version >= 2.4 +
Related
I made a JavaFX program in Kotlin that reads CSV and TXT files delimited by ";". It works, but building the SQL query is very slow, and I don't know how to improve its efficiency.
fun generatedDelimited(filePath: String, table: String = "") {
val sourceFile = File(filePath)
var line: String?
var header: Array<String>? = null
val lines: MutableList<List<String>> = ArrayList()
try {
BufferedReader(FileReader(sourceFile)).use { br ->
header = br.readLine().split(";").toTypedArray();
while (br.readLine().also { line = it } != null) {
val values : Array<String> = line!!.split(";").toTypedArray();
lines.add(Arrays.asList(*values))
}
}
} catch (e: IOException) {
e.printStackTrace()
}
val joined = "INSERT INTO $table (${header!!.joinToString(separator = ",")})\n"
var textSelect = "${joined}SELECT * FROM ( \n"
var selectUnion = ""
var lineNo = 1
for (line in lines) {
var columnNo = 0
var comma = ", "
var select = "SELECT "
var union = "UNION ALL\n"
if (lines.size.equals(lineNo)) {
union = ""
}
for (value in line) {
if (columnNo == 1) {
select = ""
}
if (line.size.equals(columnNo+1)) {
comma = " FROM DUAL \n$union"
}
selectUnion += "$select'$value' as ${header!![columnNo]}$comma"
columnNo++
}
lineNo++
}
textSelect += "$selectUnion);"
querySQL.text = textSelect
}
Result:
INSERT INTO werwsf (DATA1,DATA2,DATA3,DATA4,DATA5)
SELECT * FROM (
SELECT 'HOLA1' as DATA1, 'HAKA2' as DATA2, 'HAD3' as DATA3, '' as DATA4, 'ASDAD5' as DATA5 FROM DUAL
UNION ALL
SELECT 'HOLA6' as DATA1, 'HAKA7' as DATA2, 'HAD8' as DATA3, 'FA9' as DATA4, 'ASDAD10' as DATA5 FROM DUAL
);
Is there a way to improve efficiency? With 1600 rows it takes 5 minutes
Thank you.
this should be an optimised version of your code:
I used Kotlin's standard joinToString function, which uses a StringBuilder under the hood, as #0009laH advised. I also removed the redundant list <-> array conversions, and replaced splitting-then-rejoining the header line with a single replace call, which has the same effect as the original code but is faster. Together these changes make the code faster, more readable, and more concise.
fun generatedDelimited(filePath: String, table: String = "") {
val sourceFile = File(filePath)
val fileLines: List<String> = sourceFile.readLines()
val header: String = fileLines.first().replace(';', ',')
val lines: List<List<String>> = fileLines.drop(1).map { line ->
line.split(";")
}
val selectUnion = lines.joinToString(separator = "UNION ALL\n") { line ->
line.withIndex().joinToString(separator = ", ", prefix = "SELECT", postfix = " FROM DUAL\n") { (columnNo, value) ->
"'$value' as ${header[columnNo]}"
}
}
querySQL.text = "INSERT INTO $table ($header)\nSELECT * FROM ( \n$selectUnion);"
}
I have a Data Frame which has 3 columns like this:
---------------------------------------------
| x(string) | date(date) | value(int) |
---------------------------------------------
I want to SELECT all the the rows [i] that satisfy all 4 conditions:
1) row [i] and row [i - 1] have the same value in column 'x'
AND
2) 'date' at row [i] == 'date' at row [i - 1] + 1 (two consecutive days)
AND
3) 'value' at row [i] > 5
AND
4) 'value' at row [i - 1] <= 5
I think maybe I need a For loop, but don't know how exactly! Please help me!
Every help is much appreciated!
It can be very easily done with Window functions, look at lag function:
import org.apache.spark.sql.types._
import org.apache.spark.sql._
import sqlContext.implicits._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.expressions._
// test data
val list = Seq(
("x", "2016-12-13", 1),
("x", "2016-12-14", 7)
);
val df = sc.parallelize(list).toDF("x", "date", "value");
// add lags - so read previous value from dataset
val withPrevs = df
.withColumn ("prevX", lag('x, 1).over(Window.orderBy($"date")))
.withColumn ("prevDate", lag('date, 1).over(Window.orderBy($"date")))
.withColumn ("prevValue", lag('value, 1).over(Window.orderBy($"date")))
// filter values and select only needed fields
withPrevs
.where('x === 'prevX)
.where('value > lit(5))
.where('prevValue < lit(5))
.where('date === date_add('prevDate, 1))
.select('x, 'date, 'value)
.show()
Note that this cannot be done without an ordering (here, by date): a Dataset has no meaningful intrinsic order, so you must specify the order explicitly.
If you have a DataFrame created, then all you need to do is call the filter function on the DataFrame with all your conditions.
For example:
df1.filter($"Column1" === 2 || $"Column2" === 3)
You can pass as many conditions as you want. It will return you a new DataFrame with filtered data.
In mysql cli, i get following result:
mysql> select * from words limit 1;
+----+------+--------------------+---------------------+---------------------+
| id | name | full | created_at | updated_at |
+----+------+--------------------+---------------------+---------------------+
| 30 | prpr | a full explanation | 2016-09-20 12:59:07 | 2016-09-20 12:59:07 |
+----+------+--------------------+---------------------+---------------------+
the "created_at" is 2016-09-20 12:59:07
but when i
static void main(String[] args) {
def c = Sql.newInstance("jdbc:mysql://127.0.0.1:3306/ro_test", "root", "root")
println c.rows("select * from words")[0]['created_at']
}
the output is
2016-09-21 05:30:58.0
I would like the Groovy output to match the MySQL CLI. How can I do that?
These two dates probably refer to (roughly) the same instant in time. Given that the dates are 5.5 hours apart, my guess is that the MySQL CLI is showing the date in the UTC timezone, whereas the Groovy code is showing the date in the UTC+05:30 (Indian) time zone.
In other words
2016-09-20 12:59:07 + 5.5 hours ≈ 2016-09-21 05:30:58.0
When I force a specific time zone, it works:
static void main(String[] args) {
def c = Sql.newInstance("jdbc:mysql://127.0.0.1:3306/ro_test", "root", "root")
def tz = TimeZone.default
def cal = Calendar.getInstance(TimeZone.getTimeZone("Asia/Shanghai"))
c.query("select * from words") { ResultSetImpl rs ->
while (rs.next()) {
println rs.getTimestamp(4, cal)
}
}
}
I think the best way is rewrite Groovy.sql.Sql#rows with above code, the full implementation is here:
List<LinkedHashMap> e2(String stmt) {
def cal = Calendar.getInstance(Time.timezone)
List<GroovyRowResult> rs = []
c.query(stmt) { ResultSetImpl rs2 ->
def md = rs2.metaData
int cc = md.columnCount
while (rs2.next()) {
def attrs = [:]
for (int i = 1; i <= cc; i++) {
def key = md.getColumnLabel(i)
def t = md.getColumnType(i)
def v
if (t == Types.TIMESTAMP) {
v = rs2.getTimestamp(i, cal)
} else {
v = rs2.getObject(i)
}
attrs[key] = v
}
rs.add(attrs)
}
}
rs
}
I'm sure this is a simple SQLContext question, but I can't find any answer in the Spark docs or Stackoverflow
I want to create a Spark Dataframe from a SQL Query on MySQL
For example, I have a complicated MySQL query like
SELECT a.X,b.Y,c.Z FROM FOO as a JOIN BAR as b ON ... JOIN ZOT as c ON ... WHERE ...
and I want a Dataframe with Columns X,Y and Z
I figured out how to load entire tables into Spark, and I could load them all, and then do the joining and selection there. However, that is very inefficient. I just want to load the table generated by my SQL query.
Here is my current approximation of the code, that doesn't work. Mysql-connector has an option "dbtable" that can be used to load a whole table. I am hoping there is some way to specify a query
val df = sqlContext.format("jdbc").
option("url", "jdbc:mysql://localhost:3306/local_content").
option("driver", "com.mysql.jdbc.Driver").
option("useUnicode", "true").
option("continueBatchOnError","true").
option("useSSL", "false").
option("user", "root").
option("password", "").
sql(
"""
select dl.DialogLineID, dlwim.Sequence, wi.WordRootID from Dialog as d
join DialogLine as dl on dl.DialogID=d.DialogID
join DialogLineWordInstanceMatch as dlwim o n dlwim.DialogLineID=dl.DialogLineID
join WordInstance as wi on wi.WordInstanceID=dlwim.WordInstanceID
join WordRoot as wr on wr.WordRootID=wi.WordRootID
where d.InSite=1 and dl.Active=1
limit 100
"""
).load()
I found this here Bulk data migration through Spark SQL
The dbname parameter can be any query wrapped in parenthesis with an alias. So in my case, I need to do this:
val query = """
(select dl.DialogLineID, dlwim.Sequence, wi.WordRootID from Dialog as d
join DialogLine as dl on dl.DialogID=d.DialogID
join DialogLineWordInstanceMatch as dlwim on dlwim.DialogLineID=dl.DialogLineID
join WordInstance as wi on wi.WordInstanceID=dlwim.WordInstanceID
join WordRoot as wr on wr.WordRootID=wi.WordRootID
where d.InSite=1 and dl.Active=1
limit 100) foo
"""
val df = sqlContext.format("jdbc").
option("url", "jdbc:mysql://localhost:3306/local_content").
option("driver", "com.mysql.jdbc.Driver").
option("useUnicode", "true").
option("continueBatchOnError","true").
option("useSSL", "false").
option("user", "root").
option("password", "").
option("dbtable",query).
load()
As expected, loading each table as its own Dataframe and joining them in Spark was very inefficient.
If you have your table already registered in your SQLContext, you could simply use sql method.
val resultDF = sqlContext.sql("SELECT a.X,b.Y,c.Z FROM FOO as a JOIN BAR as b ON ... JOIN ZOT as c ON ... WHERE ...")
To save the output of a query to a new dataframe, simply set the result equal to a variable:
val newDataFrame = spark.sql("SELECT a.X,b.Y,c.Z FROM FOO as a JOIN BAR as b ON ... JOIN ZOT as c ON ... WHERE ...")
and now newDataFrame is a dataframe with all the dataframe functionalities available to it.
TL;DR: just create a view in your database.
Detail:
I have a table t_city in my postgres database, on which I create a view:
create view v_city_3500 as
select asciiname, country, population, elevation
from t_city
where elevation>3500
and population>100000
select * from v_city_3500;
asciiname | country | population | elevation
-----------+---------+------------+-----------
Potosi | BO | 141251 | 3967
Oruro | BO | 208684 | 3936
La Paz | BO | 812799 | 3782
Lhasa | CN | 118721 | 3651
Puno | PE | 116552 | 3825
Juliaca | PE | 245675 | 3834
In the spark-shell:
val sx= new org.apache.spark.sql.SQLContext(sc)
var props=new java.util.Properties()
props.setProperty("driver", "org.postgresql.Driver" )
val url="jdbc:postgresql://buya/dmn?user=dmn&password=dmn"
val city_df=sx.read.jdbc(url=url,table="t_city",props)
val city_3500_df=sx.read.jdbc(url=url,table="v_city_3500",props)
Result:
city_df.count()
Long = 145725
city_3500_df.count()
Long = 6
With MySQL, read/load data something like this:
val conf = new SparkConf().setAppName("SparkMe Application").setMaster("local[2]")
val sc = new SparkContext(conf)
sc.setLogLevel("ERROR")
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
val jdbcDF = sqlContext.read.format("jdbc").options(
Map("url" -> "jdbc:mysql://<host>:3306/corbonJDBC?user=user&password=password",
"dbtable" -> "TABLE_NAME")).load()
write data to table as below
import java.util.Properties
val prop = new Properties()
prop.put("user", "<>")
prop.put("password", "simple$123")
val dfWriter = jdbcDF.write.mode("append")
dfWriter.jdbc("jdbc:mysql://<host>:3306/corbonJDBC?user=user&password=password", "tableName", prop)
to create dataframe from query do something like below
val finalModelDataDF = {
val query = "select * from table_name"
sqlContext.sql(query)
};
finalModelDataDF.show()
I would like to create a JSON from a Spark v.1.6 (using scala) dataframe. I know that there is the simple solution of doing df.toJSON.
However, my problem looks a bit different. Consider for instance a dataframe with the following columns:
| A | B | C1 | C2 | C3 |
-------------------------------------------
| 1 | test | ab | 22 | TRUE |
| 2 | mytest | gh | 17 | FALSE |
I would like to have at the end a dataframe with
| A | B | C |
----------------------------------------------------------------
| 1 | test | { "c1" : "ab", "c2" : 22, "c3" : TRUE } |
| 2 | mytest | { "c1" : "gh", "c2" : 17, "c3" : FALSE } |
where C is a JSON containing C1, C2, C3. Unfortunately, at compile time I do not know what the dataframe looks like (except for the columns A and B, which are always "fixed").
As for the reason why I need this: I am using Protobuf for sending around the results. Unfortunately, my dataframe sometimes has more columns than expected and I would still send those via Protobuf, but I do not want to specify all columns in the definition.
How can I achieve this?
Spark 2.1 should have native support for this use case (see #15354).
import org.apache.spark.sql.functions.to_json
df.select(to_json(struct($"c1", $"c2", $"c3")))
I use this command to solve the to_json problem:
output_df = (df.select(to_json(struct(col("*"))).alias("content")))
Here, no JSON parser, and it adapts to your schema:
import org.apache.spark.sql.functions.{col, concat, concat_ws, lit}
df.select(
col(df.columns(0)),
col(df.columns(1)),
concat(
lit("{"),
concat_ws(",",df.dtypes.slice(2, df.dtypes.length).map(dt => {
val c = dt._1;
val t = dt._2;
concat(
lit("\"" + c + "\":" + (if (t == "StringType") "\""; else "") ),
col(c),
lit(if(t=="StringType") "\""; else "")
)
}):_*),
lit("}")
) as "C"
).collect()
First lets convert C's to a struct:
val dfStruct = df.select($"A", $"B", struct($"C1", $"C2", $"C3").alias("C"))
This structure can be converted to JSONL using toJSON as before:
dfStruct.toJSON.collect
// Array[String] = Array(
// {"A":1,"B":"test","C":{"C1":"ab","C2":22,"C3":true}},
// {"A":2,"B":"mytest","C":{"C1":"gh","C2":17,"C3":false}})
I am not aware of any built-in method that can convert a single column but you can either convert it individually and join or use your favorite JSON parser in an UDF.
case class C(C1: String, C2: Int, C3: Boolean)
object CJsonizer {
import org.json4s._
import org.json4s.JsonDSL._
import org.json4s.jackson.Serialization
import org.json4s.jackson.Serialization.write
implicit val formats = Serialization.formats(org.json4s.NoTypeHints)
def toJSON(c1: String, c2: Int, c3: Boolean) = write(C(c1, c2, c3))
}
val cToJSON = udf((c1: String, c2: Int, c3: Boolean) =>
CJsonizer.toJSON(c1, c2, c3))
df.withColumn("c_json", cToJSON($"C1", $"C2", $"C3"))