my_data=[
{'stationCode': 'NB001',
'summaries': [{'period': {'year': 2017}, 'rainfall': 449},
{'period': {'year': 2018}, 'rainfall': 352.4},
{'period': {'year': 2019}, 'rainfall': 253.2},
{'period': {'year': 2020}, 'rainfall': 283},
{'period': {'year': 2021}, 'rainfall': 104.2}]},
{'stationCode': 'NA003',
'summaries': [{'period': {'year': 2019}, 'rainfall': 58.2},
{'period': {'year': 2020}, 'rainfall': 628.2},
{'period': {'year': 2021}, 'rainfall': 120}]}]
In Pandas I can:
import pandas as pd
from pandas import json_normalize
pd.concat([json_normalize(entry, 'summaries', 'stationCode')
for entry in my_data])
That will give me the following table:
rainfall period.year stationCode
0 449.0 2017 NB001
1 352.4 2018 NB001
2 253.2 2019 NB001
3 283.0 2020 NB001
4 104.2 2021 NB001
0 58.2 2019 NA003
1 628.2 2020 NA003
2 120.0 2021 NA003
Can this be achieved in one line of code in pyspark?
I have tried the code below and it gives me the same result. However, it is too long; is there a way to shorten it?
import pyspark.sql.functions as F

df = sc.parallelize(my_data)
df1 = spark.read.json(df)
# Flatten the nested fields into two array columns, "year" and "rainfall"
df1 = df1.select("stationCode", "summaries.period.year", "summaries.rainfall")
df1 = (df1.withColumn("year_rainfall", F.arrays_zip("year", "rainfall"))
          .withColumn("year_rainfall", F.explode("year_rainfall"))
          .select("stationCode",
                  F.col("year_rainfall.rainfall").alias("Rainfall"),
                  F.col("year_rainfall.year").alias("Year")))
df1.show(20, False)
I am introducing myself to PySpark, so some explanation or pointers to good information sources would be highly appreciated.
What you have looks fine to me and is readable. However, you can also zip and explode directly:
out = (df1.select("stationCode",
F.explode(F.arrays_zip(*["summaries.period.year","summaries.rainfall"])))
.select("stationCode",F.col("col")['0'].alias("year"),F.col("col")['1'].alias("rainfall")))
out.show()
+-----------+----+--------+
|stationCode|year|rainfall|
+-----------+----+--------+
| NB001|2017| 449.0|
| NB001|2018| 352.4|
| NB001|2019| 253.2|
| NB001|2020| 283.0|
| NB001|2021| 104.2|
| NA003|2019| 58.2|
| NA003|2020| 628.2|
| NA003|2021| 120.0|
+-----------+----+--------+
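Since summaries is already an array of structs after spark.read.json, a slightly different minimal sketch (same df1 as in the question) is to explode the array itself and pull the nested fields out afterwards, which avoids arrays_zip entirely:
import pyspark.sql.functions as F

# Explode the summaries array of structs, then reach into each struct element.
out = (df1.select("stationCode", F.explode("summaries").alias("s"))
          .select("stationCode",
                  F.col("s.period.year").alias("year"),
                  F.col("s.rainfall").alias("rainfall")))
out.show()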
Consider a sample json file with the following data.
{
"Name": "TestName",
"Date": "2021-04-09",
"Readings": [
{
"Id": 1,
"Reading": 5.678,
"datetime": "2021-04-09 00:00:00"
},
{
"Id": 2,
"Reading": 3.692,
"datetime": "2020-04-09 00:00:00"
}
]
}
Define a schema that we can enforce to read our data.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, ArrayType
data_schema = StructType(fields=[
StructField('Name', StringType(), False),
StructField('Date', StringType(), True),
StructField(
'Readings', ArrayType(
StructType([
StructField('Id', IntegerType(), False),
StructField('Reading', DoubleType(), True),
StructField('datetime', StringType(), True)
])
)
)
])
Now we can use our schema to read the JSON files in our directory
data_df = spark.read.json('/mnt/data/' + '*.json', schema=data_schema)
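One caveat: spark.read.json expects one JSON document per line (JSON Lines) by default, while the sample above is pretty-printed across many lines. If your files actually look like the sample, you would likely also need the multiLine option; a sketch assuming that layout:
# Assumption: each file contains a pretty-printed JSON document spanning
# multiple lines, so the reader needs multiLine=True.
data_df = spark.read.json('/mnt/data/' + '*.json', schema=data_schema, multiLine=True)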
We want the data that’s nested in "Readings" so we can use explode to get these sub-columns.
from pyspark.sql.functions import explode
data_df = data_df.select(
"Name",
explode("Readings").alias("ReadingsExplode")
).select("Name", "ReadingsExplode.*")
data_df.show()
This should provide the required output as a flattened dataframe.
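Note that the select above keeps only Name and the exploded readings and drops the top-level Date column; if you need Date as well, a small variant of the same idea (a sketch, starting again from the file read with the schema and the multiLine caveat above) carries it through both selects:
from pyspark.sql.functions import explode

# Hypothetical variant: keep Date alongside Name while flattening Readings.
flat_df = (spark.read.json('/mnt/data/' + '*.json', schema=data_schema, multiLine=True)
           .select("Name", "Date", explode("Readings").alias("r"))
           .select("Name", "Date", "r.*"))
flat_df.show()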
I need to write data into a JSON file in the below format using PySpark.
{
"list-item": [
{"author":"author1","title":"title1","pages":1,"email":"author1#gmail.com"},
{"author":"author2","title":"title2","pages":2,"email":"author2#gmail.com"},
{"author":"author3","title":"title3","pages":3,"email":"author3#gmail.com"},
{"author":"author4","title":"title4","pages":4,"email":"author4#gmail.com"},
],
"version": 1
}
I have written the PySpark code below, but it wraps each item in double quotes and adds backslashes inside them. How do I remove the backslashes and double quotes?
import sys
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, to_json, struct, collect_list, lit
from datetime import datetime
from time import time

if __name__ == '__main__':
    spark = SparkSession.builder.appName("Test").enableHiveSupport().getOrCreate()
    schema = StructType([
        StructField("author", StringType(), False),
        StructField("title", StringType(), False),
        StructField("pages", IntegerType(), False),
        StructField("email", StringType(), False)
    ])
    data = [
        ["author1", "title1", 1, "author1#gmail.com"],
        ["author2", "title2", 2, "author2#gmail.com"],
        ["author3", "title3", 3, "author3#gmail.com"],
        ["author4", "title4", 4, "author4#gmail.com"]
    ]
    df = spark.createDataFrame(data, schema)
    df = df.select(to_json(struct("author", "title", "pages", "email")).alias("json-data")) \
           .agg(collect_list("json-data").alias("list-item"))
    df = df.withColumn("version", lit("1.0").cast(IntegerType()))
    df.printSchema()
    df.show(2, False)
    curDT = datetime.now()
    targetPath = curDT.strftime("%m-%d-%Y-%H-%M-%S")
    df.write.format("json").mode("overwrite").option("escape", "").save(targetPath)
My code writes the JSON with a backslash and double quotes enclosing each item, like below. How do I remove those? Please help.
{"list-item":["{\"author\":\"author1\",\"title\":\"title1\",\"pages\":1,\"email\":\"author1#gmail.com\"}","{\"author\":\"author2\",\"title\":\"title2\",\"pages\":2,\"email\":\"author2#gmail.com\"}","{\"author\":\"author3\",\"title\":\"title3\",\"pages\\":3,\"email\":\"author3#gmail.com\\"}","{\"author\":\"author4\",\"title\":\"title4\",\"pages\":4,\"email\":\"author4#gmail.com\"}"],"version":1}
The reason is that the elements of the list-item array are strings, and the backslashes are escape characters that reflect this: each element is a JSON-encoded string rather than a nested object.
To avoid that you can try:
import pyspark.sql.functions as f
from pyspark.sql.types import *
schema = StructType([
StructField("author", StringType(), False),
StructField("title", StringType(), False),
StructField("pages", IntegerType(), False),
StructField("email", StringType(), False)
])
data = [
["author1", "title1", 1, "author1#gmail.com"],
["author2", "title2", 2, "author2#gmail.com"],
["author3", "title3", 3, "author3#gmail.com"],
["author4", "title4", 4, "author4#gmail.com"]
]
df = spark.createDataFrame(data, schema)
df=df.groupby().agg(f.collect_list(f.struct(f.col('author'), f.col('title'), f.col('pages'), f.col('email'))).alias("list-item"))
df=df.withColumn("version",f.lit("1.0").cast(IntegerType()))
df.printSchema()
df.show(2, False)
df.write.format("json").mode("overwrite").option("escape", "").save('./TestJson')
and the output JSON file will look like:
{"list-item":[{"author":"author1","title":"title1","pages":1,"email":"author1#gmail.com"},{"author":"author2","title":"title2","pages":2,"email":"author2#gmail.com"},{"author":"author3","title":"title3","pages":3,"email":"author3#gmail.com"},{"author":"author4","title":"title4","pages":4,"email":"author4#gmail.com"}],"version":1}
I have the following code which grabs some data from the Marketo system
from marketorestpython.client import MarketoClient
munchkin_id = "xxx-xxx-xxx"
client_id = "00000000-0000-0000-0000-00000000000"
client_secret= "secret"
mc = MarketoClient(munchkin_id, client_id, client_secret)
mc.execute(method='get_multiple_leads_by_filter_type', filterType='email', filterValues=['email#domain.com'],
fields=['BG__c','email','company','createdAt'], batchSize=None)
This returns the following data:
[{'BG__c': 'ABC',
'company': 'MCS',
'createdAt': '2016-10-25T14:04:15Z',
'id': 4,
'email': 'email#domain.com'},
{'BG__c': 'CDE',
'company': 'MSC',
'createdAt': '2018-03-28T16:41:06Z',
'id': 10850879,
'email': 'email#domain.com'}]
What I want to do is save the returned data to a Parquet file. But when I try this with the following code, I receive an error message.
from marketorestpython.client import MarketoClient
munchkin_id = "xxx-xxx-xxx"
client_id = "00000000-0000-0000-0000-00000000000"
client_secret= "secret"
mc = MarketoClient(munchkin_id, client_id, client_secret)
data = mc.execute(method='get_multiple_leads_by_filter_type', filterType='email', filterValues=['email#domain.com'],
fields=['BG__c','email','company','createdAt'], batchSize=None)
sqlContext.read.json(data)
data.write.parquet("adl://subscription.azuredatalakestore.net/folder1/Marketo/marketo_data")
java.lang.ClassCastException: java.util.HashMap cannot be cast to java.lang.String
---------------------------------------------------------------------------
Py4JJavaError Traceback (most recent call last)
<command-1431708582476650> in <module>()
7 fields=['BG__c','email','company','createdAt'], batchSize=None)
8
----> 9 sqlContext.read.json(data)
10 data.write.parquet("adl://subscription.azuredatalakestore.net/folder1/Marketo/marketo_data")
/databricks/spark/python/pyspark/sql/readwriter.py in json(self, path, schema, primitivesAsString, prefersDecimal, allowComments, allowUnquotedFieldNames, allowSingleQuotes, allowNumericLeadingZero, allowBackslashEscapingAnyCharacter, mode, columnNameOfCorruptRecord, dateFormat, timestampFormat, multiLine, allowUnquotedControlChars, charset)
261 path = [path]
262 if type(path) == list:
--> 263 return self._df(self._jreader.json(self._spark._sc._jvm.PythonUtils.toSeq(path)))
264 elif isinstance(path, RDD):
265 def func(iterator):
/databricks/spark/python/lib/py4j-0.10.6-src.zip/py4j/java_gateway.py in __call__(self, *args)
1158 answer = self.gateway_client.send_command(command)
1159 return_value = get_return_value(
-> 1160 answer, self.gateway_client, self.target_id, self.name)
1161
What am I doing wrong?
You have the following data
data = [{'BG__c': 'ABC',
'company': 'MCS',
'createdAt': '2016-10-25T14:04:15Z',
'id': 4,
'email': 'email#domain.com'},
{'BG__c': 'CDE',
'company': 'MSC',
'createdAt': '2018-03-28T16:41:06Z',
'id': 10850879,
'email': 'email#domain.com'}]
In order to save it to a Parquet file, I would suggest creating a DataFrame and then saving it as Parquet.
from pyspark.sql.types import *
df = spark.createDataFrame(data,
    schema = StructType([
        StructField("BG__c", StringType(), True),
        StructField("company", StringType(), True),
        StructField("createdAt", StringType(), True),
        StructField("email", StringType(), True),
        StructField("id", IntegerType(), True)]))
This would give the following types:
df.dtypes
[('BG__c', 'string'),
 ('company', 'string'),
 ('createdAt', 'string'),
 ('email', 'string'),
 ('id', 'int')]
You can then save the dataframe as a parquet file
df.show()
+-----+-------+--------------------+----------------+--------+
|BG__c|company| createdAt| email| id|
+-----+-------+--------------------+----------------+--------+
| ABC| MCS|2016-10-25T14:04:15Z|email#domain.com| 4|
| CDE| MSC|2018-03-28T16:41:06Z|email#domain.com|10850879|
+-----+-------+--------------------+----------------+--------+
df.write.format('parquet').save(parquet_path_in_hdfs)
Where parquet_path_in_hdfs is the path and name of the desired parquet file
As per the statement below in your code, you are writing the raw data directly. You have to create a DataFrame first. You can convert the JSON to a DataFrame using val df = sqlContext.read.json("path/to/json/file") and then call df.write:
data.write.parquet("adl://subscription.azuredatalakestore.net/folder1/Marketo/marketo_data")
I'm trying to create a table from a JSON data source.
The problem is that there is a field in the JSON data that is not always present for every entry; the data looks like this:
[ { "k1" : "someValue",
"optK" : { "nestedK" : true } },
{ "k1" : "someOtherValue" }
]
When I try to specify the optional field in the schema, all the entries without that field end up with null in every column of the table:
columns: k1 | optK
row1: "someValue" [true]
row2: null null
Is it possible to write a schema such that I would have null only in the column where the value is missing?
Like this:
columns: k1 | optK
row1: "someValue" "optV"
row2: "someOtherValue" null
My current code:
import org.apache.spark.sql.expressions.scalalang._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
val session = SparkSession.builder().enableHiveSupport().getOrCreate()
val schema = StructType(Seq(
StructField("k1", StringType, false),
StructField("optK", StructType(Seq(StructField("nestedK", BooleanType, false))), false)
))
val df = session.read.schema(schema).json("data.json")
df.registerTempTable("Mr_Table")
There are several issues in your code/input data:
Input data - the JSON keys aren't in quotes.
You can avoid this problem with one of the following options:
Updating the input data by adding quotes to the JSON keys
Using .option("allowUnquotedFieldNames",true) in the following way:
val df = session.read.option("allowUnquotedFieldNames",true).schema(schema).json("data.json")
A string field in the input data was defined as boolean in the schema. The schema should be updated to:
val schema = StructType(Seq(
StructField("k1", StringType, false),
StructField("optK", StructType(Seq(StructField("nestedK", StringType, false))), false)
))
JSON data format - I've updated the sample JSON input to be in JSON Lines format:
{ k1 : "someValue", optK : { nestedK : "optV" } }
{ k1 : "someOtherValue" }
Running the modified code shows the following:
Spark context available as 'sc' (master = yarn, app id = application_xxx).
Spark session available as 'spark'.
Welcome to Spark version 2.2.0
Using Scala version 2.11.8 (OpenJDK 64-Bit Server VM, Java 1.8.0_141)
scala> :paste
// Entering paste mode (ctrl-D to finish)
import org.apache.spark.sql.expressions.scalalang._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
val schema = StructType(Seq(
StructField("k1", StringType, false),
StructField("optK", StructType(Seq(StructField("nestedK", StringType, false))), false)
))
val df = spark.read.option("allowUnquotedFieldNames",true).schema(schema).json("s3 location of data.json")
// Exiting paste mode, now interpreting.
import org.apache.spark.sql.expressions.scalalang._
import org.apache.spark.sql.types._
import org.apache.spark.sql.{DataFrame, Dataset, SparkSession}
schema: org.apache.spark.sql.types.StructType = StructType(StructField(k1,StringType,false), StructField(optK,StructType(StructField(nestedK,StringType,false)),false))
df: org.apache.spark.sql.DataFrame = [k1: string, optK: struct<nestedK: string>]
scala> df.show
+--------------+------+
| k1| optK|
+--------------+------+
| someValue|[optV]|
|someOtherValue| null|
+--------------+------+
So I was trying to load a CSV file with a custom schema, but every time I end up with the following error:
Project_Bank.csv is not a Parquet file. expected magic number at tail [80, 65, 82, 49] but found [110, 111, 13, 10]
This is what my program looks like, along with my CSV file entries:
age;job;marital;education;default;balance;housing;loan;contact;day;month;duration;campaign;pdays;previous;poutcome;y
58;management;married;tertiary;no;2143;yes;no;unknown;5;may;261;1;-1;0;unknown;no
44;technician;single;secondary;no;29;yes;no;unknown;5;may;151;1;-1;0;unknown;no
33;entrepreneur;married;secondary;no;2;yes;yes;unknown;5;may;76;1;-1;0;unknown;no
My Code :
$spark-shell --packages com.databricks:spark-csv_2.10:1.5.0
val sqlContext = new org.apache.spark.sql.SQLContext(sc)
import org.apache.spark.sql.types._
import org.apache.spark.sql.SQLContext
import sqlContext.implicits._
import org.apache.spark.sql.types.{StructType, StructField, StringType, IntegerType}
val bankSchema = StructType(Array(
StructField("age", IntegerType, true),
StructField("job", StringType, true),
StructField("marital", StringType, true),
StructField("education", StringType, true),
StructField("default", StringType, true),
StructField("balance", IntegerType, true),
StructField("housing", StringType, true),
StructField("loan", StringType, true),
StructField("contact", StringType, true),
StructField("day", IntegerType, true),
StructField("month", StringType, true),
StructField("duration", IntegerType, true),
StructField("campaign", IntegerType, true),
StructField("pdays", IntegerType, true),
StructField("previous", IntegerType, true),
StructField("poutcome", StringType, true),
StructField("y", StringType, true)))
val df = sqlContext.
read.
schema(bankSchema).
option("header", "true").
option("delimiter", ";").
load("/user/amit.kudnaver_gmail/hadoop/project_bank/Project_Bank.csv").toDF()
df.registerTempTable("people")
df.printSchema()
val distinctage = sqlContext.sql("select distinct age from people")
Any suggestion as to why I am not able to work with the CSV file here after applying the correct schema? Thanks in advance for your advice.
Thanks
Amit K
The problem here is that the DataFrame reader expects a Parquet file by default while processing it. To handle data in CSV, here is what you can do.
First of all, remove the header row from the data.
58;management;married;tertiary;no;2143;yes;no;unknown;5;may;261;1;-1;0;unknown;no
44;technician;single;secondary;no;29;yes;no;unknown;5;may;151;1;-1;0;unknown;no
33;entrepreneur;married;secondary;no;2;yes;yes;unknown;5;may;76;1;-1;0;unknown;no
Next, we write the following code to read the data.
Create a case class:
case class BankSchema(age: Int, job: String, marital:String, education:String, default:String, balance:Int, housing:String, loan:String, contact:String, day:Int, month:String, duration:Int, campaign:Int, pdays:Int, previous:Int, poutcome:String, y:String)
Read data from HDFS and parse it
val bankData = sc.textFile("/user/myuser/Project_Bank.csv").map(_.split(";")).map(p => BankSchema(p(0).toInt, p(1), p(2),p(3),p(4), p(5).toInt, p(6), p(7), p(8), p(9).toInt, p(10), p(11).toInt, p(12).toInt, p(13).toInt, p(14).toInt, p(15), p(16))).toDF()
Then register the table and execute queries:
bankData.registerTempTable("bankData")
val distinctage = sqlContext.sql("select distinct age from bankData")
Here is what the output would look like
+---+
|age|
+---+
| 33|
| 44|
| 58|
+---+
Here the expected file format is CSV, but as per the error it is looking for the Parquet file format.
This can be overcome by explicitly specifying the file format as below (which was missing in the code shared in the question), because if we don't specify the file format Spark expects Parquet by default.
Here is a Java version as a sample:
Dataset<Row> resultData = session.read().format("csv")
.option("sep", ",")
.option("header", true)
.option("mode", "DROPMALFORMED")
.schema(definedSchema)
.load(inputPath);
Here, the schema can be defined either by using a Java class (i.e. a POJO class) or by using StructType as already mentioned.
And inputPath is the path of input csv file.
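For comparison, a rough PySpark sketch of the same fix, assuming Spark 2.0+ where the CSV source is built in (with the databricks package the format string would be "com.databricks.spark.csv" instead); note the data in the question is ';'-delimited, and bank_schema stands in for a StructType like the bankSchema defined in the question:
# Explicitly select the CSV source so Spark does not fall back to its
# default data source (Parquet).
df = (spark.read.format("csv")
      .option("header", True)
      .option("sep", ";")
      .schema(bank_schema)
      .load("/user/amit.kudnaver_gmail/hadoop/project_bank/Project_Bank.csv"))
df.printSchema()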
I am trying to load a CSV file into a Spark data frame with spark-csv [1] using an Apache Zeppelin notebook. When a numeric field doesn't have a value, the parser fails for that line and the line gets skipped.
I would have expected the line to be loaded with the missing value set to NULL in the data frame, so that aggregations simply ignore it.
%dep
z.reset()
z.addRepo("my-nexus").url("<my_local_nexus_repo_that_is_a_proxy_of_public_repos>")
z.load("com.databricks:spark-csv_2.10:1.1.0")
%spark
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types._
import com.databricks.spark.csv._
import org.apache.spark.sql.functions._
val schema = StructType(
StructField("identifier", StringType, true) ::
StructField("name", StringType, true) ::
StructField("height", DoubleType, true) ::
Nil)
val sqlContext = new SQLContext(sc)
val df = sqlContext.read.format("com.databricks.spark.csv")
.schema(schema)
.option("header", "true")
.load("file:///home/spark_user/data.csv")
df.describe("height").show()
Here is the content of the data file: /home/spark_user/data.csv
identifier,name,height
1,sam,184
2,cath,180
3,santa, <-- note that there is no height recorded for Santa!
Here is the output:
+-------+------+
|summary|height|
+-------+------+
| count| 2| <- 2 of 3 lines loaded, ie. sam and cath
| mean| 182.0|
| stddev| 2.0|
| min| 180.0|
| max| 184.0|
+-------+------+
In the Zeppelin logs I can see the following error when parsing Santa's line:
ERROR [2015-07-21 16:42:09,940] ({Executor task launch worker-45} CsvRelation.scala[apply]:209) - Exception while parsing line: 3,santa,.
java.lang.NumberFormatException: empty String
at sun.misc.FloatingDecimal.readJavaFormatString(FloatingDecimal.java:1842)
at sun.misc.FloatingDecimal.parseDouble(FloatingDecimal.java:110)
at java.lang.Double.parseDouble(Double.java:538)
at scala.collection.immutable.StringLike$class.toDouble(StringLike.scala:232)
at scala.collection.immutable.StringOps.toDouble(StringOps.scala:31)
at com.databricks.spark.csv.util.TypeCast$.castTo(TypeCast.scala:42)
at com.databricks.spark.csv.CsvRelation$$anonfun$com$databricks$spark$csv$CsvRelation$$parseCSV$1.apply(CsvRelation.scala:198)
at com.databricks.spark.csv.CsvRelation$$anonfun$com$databricks$spark$csv$CsvRelation$$parseCSV$1.apply(CsvRelation.scala:180)
at scala.collection.Iterator$$anon$13.hasNext(Iterator.scala:371)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
at scala.collection.Iterator$$anon$11.hasNext(Iterator.scala:327)
at org.apache.spark.sql.execution.Aggregate$$anonfun$doExecute$1$$anonfun$6.apply(Aggregate.scala:129)
at org.apache.spark.sql.execution.Aggregate$$anonfun$doExecute$1$$anonfun$6.apply(Aggregate.scala:126)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686)
at org.apache.spark.rdd.RDD$$anonfun$mapPartitions$1$$anonfun$apply$17.apply(RDD.scala:686)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:35)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:277)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:244)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:70)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:70)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:213)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1142)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:617)
at java.lang.Thread.run(Thread.java:745)
So you might tell me so far so good ... and you'd be right ;)
Now I want to add an extra column, say age, and I always have data in that field.
identifier,name,height,age
1,sam,184,30
2,cath,180,32
3,santa,,70
Now I ask politely for some stats about age:
%spark
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types._
import com.databricks.spark.csv._
import org.apache.spark.sql.functions._
val schema = StructType(
StructField("identifier", StringType, true) ::
StructField("name", StringType, true) ::
StructField("height", DoubleType, true) ::
StructField("age", DoubleType, true) ::
Nil)
val sqlContext = new SQLContext(sc)
val df = sqlContext.read.format("com.databricks.spark.csv")
.schema(schema)
.option("header", "true")
.load("file:///home/spark_user/data2.csv")
df.describe("age").show()
Results
+-------+----+
|summary| age|
+-------+----+
| count| 2|
| mean|31.0|
| stddev| 1.0|
| min|30.0|
| max|32.0|
+-------+----+
ALL WRONG! Since Santa's height is not known, the whole line is lost and the calculation of age is based only on Sam and Cath, while Santa has a perfectly valid age.
My question is: what value do I need to plug in for Santa's height so that the line can be loaded? I have tried setting the schema to be all StringType, which leads to my next question.
I have found in the API that one can handle N/A values using Spark. So I thought maybe I could load my data with all columns set to StringType, do some cleanup, and only then set the schema properly, as written below:
%spark
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.types._
import com.databricks.spark.csv._
import org.apache.spark.sql.functions._
val schema = StructType(
StructField("identifier", StringType, true) ::
StructField("name", StringType, true) ::
StructField("height", StringType, true) ::
StructField("age", StringType, true) ::
Nil)
val sqlContext = new SQLContext(sc)
val df = sqlContext.read.format("com.databricks.spark.csv").schema(schema).option("header", "true").load("file:///home/spark_user/data.csv")
// eg. for each column of my dataframe, replace empty string by null
df.na.replace( "*", Map("" -> null) )
val toDouble = udf[Double, String]( _.toDouble)
val df2 = df.withColumn("age", toDouble(df("age")))
df2.describe("age").show()
But df.na.replace() throws an exception and stops:
java.lang.IllegalArgumentException: Unsupported value type java.lang.String ().
at org.apache.spark.sql.DataFrameNaFunctions.org$apache$spark$sql$DataFrameNaFunctions$$convertToDouble(DataFrameNaFunctions.scala:417)
at org.apache.spark.sql.DataFrameNaFunctions$$anonfun$4.apply(DataFrameNaFunctions.scala:337)
at org.apache.spark.sql.DataFrameNaFunctions$$anonfun$4.apply(DataFrameNaFunctions.scala:337)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.TraversableLike$$anonfun$map$1.apply(TraversableLike.scala:244)
at scala.collection.immutable.Map$Map1.foreach(Map.scala:109)
at scala.collection.TraversableLike$class.map(TraversableLike.scala:244)
at scala.collection.AbstractTraversable.map(Traversable.scala:105)
at org.apache.spark.sql.DataFrameNaFunctions.replace0(DataFrameNaFunctions.scala:337)
at org.apache.spark.sql.DataFrameNaFunctions.replace(DataFrameNaFunctions.scala:304)
Any help, & tips much appreciated !!
[1] https://github.com/databricks/spark-csv
spark-csv lacks this option. It has been fixed in the master branch. I guess you should build from master or wait for the next stable version.
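For reference, the CSV reader that was later built into Spark handles this case directly: with the default PERMISSIVE mode, an empty field in a numeric column is loaded as null instead of the whole line being dropped. A PySpark sketch of that behaviour, assuming Spark 2.3+ (for the DDL-string schema) and the data2.csv file from the question:
# Santa's missing height becomes null, so describe("age") sees all three
# rows instead of silently skipping the line.
df = (spark.read.format("csv")
      .option("header", True)
      .schema("identifier STRING, name STRING, height DOUBLE, age DOUBLE")
      .load("file:///home/spark_user/data2.csv"))
df.describe("age").show()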