I am trying to write a simple PySpark program to read a text file, do a word count, and save the output to a CSV file. I need help resolving this issue.
from pyspark.sql import SparkSession

spark = SparkSession.builder.appName("WordCount").getOrCreate()
sc = spark.sparkContext

# Read the file, split each line into words, and count occurrences per word
textFile = sc.textFile("words.txt")
words = textFile.flatMap(lambda line: line.split(" "))
wordsCounts = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)

# Convert the (word, count) pairs to a DataFrame and write a single CSV file
df = spark.createDataFrame(wordsCounts, ['word', 'count'])
df.coalesce(1).write.csv("word_count.csv", header=True)
When I run this code, it gives me the error below:
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 1 in stage 26.0 failed 1 times, most recent failure: Lost task 1.0 in stage 26.0 (TID 27) (host.docker.internal executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:189)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:115)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:136)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:135)
at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:199)
at java.net.ServerSocket.implAccept(ServerSocket.java:545)
at java.net.ServerSocket.accept(ServerSocket.java:513)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:176)
... 19 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:189)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:115)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:136)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:135)
at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:199)
at java.net.ServerSocket.implAccept(ServerSocket.java:545)
at java.net.ServerSocket.accept(ServerSocket.java:513)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:176)
... 19 more
I checked the type of the wordsCounts variable and it is a PipelinedRDD:
wordsCounts = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
type(wordsCounts)
pyspark.rdd.PipelinedRDD
I tried to view the contents of wordsCounts and it again gives me the same error:
wordsCounts = words.map(lambda word: (word, 1)).reduceByKey(lambda a, b: a + b)
print(wordsCounts.take(4))
Py4JJavaError: An error occurred while calling z:org.apache.spark.api.python.PythonRDD.runJob.
: org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 32.0 failed 1 times, most recent failure: Lost task 0.0 in stage 32.0 (TID 32) (host.docker.internal executor driver): org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:189)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:115)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:136)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:135)
at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:199)
at java.net.ServerSocket.implAccept(ServerSocket.java:545)
at java.net.ServerSocket.accept(ServerSocket.java:513)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:176)
... 19 more
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.failJobAndIndependentStages(DAGScheduler.scala:2672)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2(DAGScheduler.scala:2608)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$abortStage$2$adapted(DAGScheduler.scala:2607)
at scala.collection.mutable.ResizableArray.foreach(ResizableArray.scala:62)
at scala.collection.mutable.ResizableArray.foreach$(ResizableArray.scala:55)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:49)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:2607)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1(DAGScheduler.scala:1182)
at org.apache.spark.scheduler.DAGScheduler.$anonfun$handleTaskSetFailed$1$adapted(DAGScheduler.scala:1182)
at scala.Option.foreach(Option.scala:407)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:1182)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:2860)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2802)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:2791)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:49)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:952)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2228)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2249)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:2268)
at org.apache.spark.api.python.PythonRDD$.runJob(PythonRDD.scala:166)
at org.apache.spark.api.python.PythonRDD.runJob(PythonRDD.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
at py4j.Gateway.invoke(Gateway.java:282)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.ClientServerConnection.waitForCommands(ClientServerConnection.java:182)
at py4j.ClientServerConnection.run(ClientServerConnection.java:106)
at java.lang.Thread.run(Thread.java:748)
Caused by: org.apache.spark.SparkException: Python worker failed to connect back.
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:189)
at org.apache.spark.api.python.PythonWorkerFactory.create(PythonWorkerFactory.scala:109)
at org.apache.spark.SparkEnv.createPythonWorker(SparkEnv.scala:124)
at org.apache.spark.api.python.BasePythonRunner.compute(PythonRunner.scala:164)
at org.apache.spark.api.python.PythonRDD.compute(PythonRDD.scala:65)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
at org.apache.spark.api.python.PairwiseRDD.compute(PythonRDD.scala:115)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:365)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:329)
at org.apache.spark.shuffle.ShuffleWriteProcessor.write(ShuffleWriteProcessor.scala:59)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:99)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:52)
at org.apache.spark.scheduler.Task.run(Task.scala:136)
at org.apache.spark.executor.Executor$TaskRunner.$anonfun$run$3(Executor.scala:548)
at org.apache.spark.util.Utils$.tryWithSafeFinally(Utils.scala:1504)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:551)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
... 1 more
Caused by: java.net.SocketTimeoutException: Accept timed out
at java.net.DualStackPlainSocketImpl.waitForNewConnection(Native Method)
at java.net.DualStackPlainSocketImpl.socketAccept(DualStackPlainSocketImpl.java:135)
at java.net.AbstractPlainSocketImpl.accept(AbstractPlainSocketImpl.java:409)
at java.net.PlainSocketImpl.accept(PlainSocketImpl.java:199)
at java.net.ServerSocket.implAccept(ServerSocket.java:545)
at java.net.ServerSocket.accept(ServerSocket.java:513)
at org.apache.spark.api.python.PythonWorkerFactory.createSimpleWorker(PythonWorkerFactory.scala:176)
... 19 more
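The "Python worker failed to connect back" / "Accept timed out" pair usually means the JVM could not launch or reach the Python worker process at all, rather than anything being wrong with the word-count logic itself; on Windows or Docker-based setups this is commonly a mismatch between the Python interpreter Spark tries to start and the one running the script. A minimal sketch of the usual workaround, assuming the interpreter running this script should also serve the workers (the variables must be set before the SparkSession is created):

import os
import sys

# Assumption: one local Python installation should serve both the driver and the workers
os.environ["PYSPARK_PYTHON"] = sys.executable
os.environ["PYSPARK_DRIVER_PYTHON"] = sys.executable

from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("WordCount").getOrCreate()

Alternatively, the same word count can be expressed with DataFrame functions only, which keeps all the work inside the JVM and sidesteps the Python workers entirely; a sketch using the same file names as in the question:

from pyspark.sql import SparkSession
from pyspark.sql.functions import explode, split, col

spark = SparkSession.builder.appName("WordCount").getOrCreate()

lines = spark.read.text("words.txt")                                  # one column named "value"
words = lines.select(explode(split(col("value"), " ")).alias("word"))
counts = words.groupBy("word").count()                                # columns: word, count
counts.coalesce(1).write.csv("word_count.csv", header=True)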
I was trying to solve the 2009 Data Expo problem from the Statistical Computing and Statistical Graphics sections. The first step was to load the datasets, merge them, and save them as a Parquet file. I am having trouble with this last step.
a.spark<-spark_read_csv(sc,"1987.csv")
b.spark<-spark_read_csv(sc,"1988.csv")
c.spark<-spark_read_csv(sc,"1989.csv")
d.spark<-spark_read_csv(sc,"1990.csv")
e.spark<-spark_read_csv(sc,"1991.csv")
f.spark<-spark_read_csv(sc,"1992.csv")
g.spark<-spark_read_csv(sc,"1993.csv")
h.spark<-spark_read_csv(sc,"1994.csv")
i.spark<-spark_read_csv(sc,"1995.csv")
j.spark<-spark_read_csv(sc,"1996.csv")
k.spark<-spark_read_csv(sc,"1997.csv")
l.spark<-spark_read_csv(sc,"1998.csv")
m.spark<-spark_read_csv(sc,"1999.csv")
n.spark<-spark_read_csv(sc,"2000.csv")
o.spark<-spark_read_csv(sc,"2001.csv")
p.spark<-spark_read_csv(sc,"2002.csv")
q.spark<-spark_read_csv(sc,"2003.csv")
r.spark<-spark_read_csv(sc,"2004.csv")
s.spark<-spark_read_csv(sc,"2005.csv")
t.spark<-spark_read_csv(sc,"2006.csv")
u.spark<-spark_read_csv(sc,"2007.csv")
v.spark<-spark_read_csv(sc,"2008.csv")
ab<-full_join(a.spark,b.spark,suffix=c("a.spark","b.spark"))
cd<-full_join(c.spark,d.spark,suffix=c("c.spark","d.spark"))
ef<-full_join(e.spark,f.spark,suffix=c("e.spark","f.spark"))
gh<-full_join(g.spark,h.spark)
ij<-full_join(i.spark,j.spark)
kl<-full_join(k.spark,l.spark)
mn<-full_join(m.spark,n.spark)
op<-full_join(o.spark,p.spark)
qr<-full_join(q.spark,r.spark)
st<-full_join(s.spark,t.spark)
uv<-full_join(u.spark,v.spark)
abcd<-full_join(ab,cd,suffix=c("ab","cd"))
efgh<-full_join(ef,gh)
ijkl<-full_join(ij,kl)
mnop<-full_join(mn,op)
qrst<-full_join(qr,st)
ah<-full_join(abcd,efgh)
ip<-full_join(ijkl,mnop)
qv<-full_join(qrst,uv)
ap<-full_join(ah,ip)
av<-full_join(ap,qv)
spark_write_parquet(av,path=("the path on the pc"),mode="overwrite")
and I get this error:
Error: org.apache.spark.SparkException: Job aborted.
at org.apache.spark.sql.execution.datasources.FileFormatWriter$.write(FileFormatWriter.scala:224)
at org.apache.spark.sql.execution.datasources.InsertIntoHadoopFsRelationCommand.run(InsertIntoHadoopFsRelationCommand.scala:154)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult$lzycompute(commands.scala:104)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.sideEffectResult(commands.scala:102)
at org.apache.spark.sql.execution.command.DataWritingCommandExec.doExecute(commands.scala:122)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:131)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$execute$1.apply(SparkPlan.scala:127)
at org.apache.spark.sql.execution.SparkPlan$$anonfun$executeQuery$1.apply(SparkPlan.scala:155)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:151)
at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:152)
at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:127)
at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:80)
at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:80)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
at org.apache.spark.sql.DataFrameWriter$$anonfun$runCommand$1.apply(DataFrameWriter.scala:654)
at org.apache.spark.sql.execution.SQLExecution$.withNewExecutionId(SQLExecution.scala:77)
at org.apache.spark.sql.DataFrameWriter.runCommand(DataFrameWriter.scala:654)
at org.apache.spark.sql.DataFrameWriter.saveToV1Source(DataFrameWriter.scala:273)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:267)
at org.apache.spark.sql.DataFrameWriter.save(DataFrameWriter.scala:225)
at org.apache.spark.sql.DataFrameWriter.parquet(DataFrameWriter.scala:547)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(Unknown Source)
at java.lang.reflect.Method.invoke(Unknown Source)
at sparklyr.Invoke.invoke(invoke.scala:161)
at sparklyr.StreamHandler.handleMethodCall(stream.scala:141)
at sparklyr.StreamHandler.read(stream.scala:62)
at sparklyr.BackendHandler$$anonfun$channelRead0$1.apply$mcV$sp(handler.scala:60)
at scala.util.control.Breaks.breakable(Breaks.scala:38)
at sparklyr.BackendHandler.channelRead0(handler.scala:40)
at sparklyr.BackendHandler.channelRead0(handler.scala:14)
at io.netty.channel.SimpleChannelInboundHandler.channelRead(SimpleChannelInboundHandler.java:105)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340)
at io.netty.handler.codec.MessageToMessageDecoder.channelRead(MessageToMessageDecoder.java:102)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340)
at io.netty.handler.codec.ByteToMessageDecoder.fireChannelRead(ByteToMessageDecoder.java:310)
at io.netty.handler.codec.ByteToMessageDecoder.channelRead(ByteToMessageDecoder.java:284)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348)
at io.netty.channel.AbstractChannelHandlerContext.fireChannelRead(AbstractChannelHandlerContext.java:340)
at io.netty.channel.DefaultChannelPipeline$HeadContext.channelRead(DefaultChannelPipeline.java:1359)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:362)
at io.netty.channel.AbstractChannelHandlerContext.invokeChannelRead(AbstractChannelHandlerContext.java:348)
at io.netty.channel.DefaultChannelPipeline.fireChannelRead(DefaultChannelPipeline.java:935)
at io.netty.channel.nio.AbstractNioByteChannel$NioByteUnsafe.read(AbstractNioByteChannel.java:138)
at io.netty.channel.nio.NioEventLoop.processSelectedKey(NioEventLoop.java:645)
at io.netty.channel.nio.NioEventLoop.processSelectedKeysOptimized(NioEventLoop.java:580)
at io.netty.channel.nio.NioEventLoop.processSelectedKeys(NioEventLoop.java:497)
at io.netty.channel.nio.NioEventLoop.run(NioEventLoop.java:459)
at io.netty.util.concurrent.SingleThreadEventExecutor$5.run(SingleThreadEventExecutor.java:858)
at io.netty.util.concurrent.DefaultThreadFactory$DefaultRunnableDecorator.run(DefaultThreadFactory.java:138)
at java.lang.Thread.run(Unknown Source)
Caused by: org.apache.spark.SparkException: Job aborted due to stage failure: Task 7 in stage 159.0 failed 1 times, most recent failure: Lost task 7.0 in stage 159.0 (TID 663, localhost, executor driver): org.apache.spark.memory.SparkOutOfMemoryError: error while calling spill() on org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter#1337ff33 : Spazio su disco insufficiente
at org.apache.spark.memory.TaskMemoryManager.acquireExecutionMemory(TaskMemoryManager.java:217)
at org.apache.spark.memory.TaskMemoryManager.allocatePage(TaskMemoryManager.java:283)
at org.apache.spark.memory.MemoryConsumer.allocatePage(MemoryConsumer.java:117)
at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.acquireNewPageIfNecessary(UnsafeExternalSorter.java:383)
at org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter.insertRecord(UnsafeExternalSorter.java:407)
at org.apache.spark.sql.execution.UnsafeExternalRowSorter.insertRow(UnsafeExternalRowSorter.java:135)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage11.sort_addToSorter$(Unknown Source)
at org.apache.spark.sql.catalyst.expressions.GeneratedClass$GeneratedIteratorForCodegenStage11.processNext(Unknown Source)
at org.apache.spark.sql.execution.BufferedRowIterator.hasNext(BufferedRowIterator.java:43)
at org.apache.spark.sql.execution.WholeStageCodegenExec$$anonfun$10$$anon$1.hasNext(WholeStageCodegenExec.scala:614)
at org.apache.spark.sql.execution.RowIteratorFromScala.advanceNext(RowIterator.scala:83)
at org.apache.spark.sql.execution.joins.SortMergeFullOuterJoinScanner.advancedLeft(SortMergeJoinExec.scala:992)
at org.apache.spark.sql.execution.joins.SortMergeFullOuterJoinScanner.<init>(SortMergeJoinExec.scala:982)
at org.apache.spark.sql.execution.joins.SortMergeJoinExec$$anonfun$doExecute$1.apply(SortMergeJoinExec.scala:243)
at org.apache.spark.sql.execution.joins.SortMergeJoinExec$$anonfun$doExecute$1.apply(SortMergeJoinExec.scala:150)
at org.apache.spark.rdd.ZippedPartitionsRDD2.compute(ZippedPartitionsRDD.scala:89)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:324)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:288)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:96)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:53)
at org.apache.spark.scheduler.Task.run(Task.scala:109)
at org.apache.spark.executor.Executor$TaskRunner.r
I don't know how to solve the problem. I've tried with options(java.parameters = "-Xmx3072m"), but with no results.
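The failure in the trace is a SparkOutOfMemoryError raised while spilling to disk ("Spazio su disco insufficiente" is Italian for "insufficient disk space"), so the chain of full_join calls is exhausting Spark's scratch space; options(java.parameters = "-Xmx3072m") is an rJava setting and does not reach the Spark processes that sparklyr launches. A hedged sketch of an alternative, assuming the 22 yearly files share the same schema, the goal is to stack them rather than key-join them, and a drive with free space is available for spilling (memory size, paths, and folder names below are placeholders):

library(sparklyr)
library(dplyr)

conf <- spark_config()
conf$`sparklyr.shell.driver-memory` <- "4G"  # placeholder: give the local driver more heap
conf$spark.local.dir <- "D:/spark-tmp"       # placeholder: spill directory on a disk with free space
sc <- spark_connect(master = "local", config = conf)

# Spark's CSV reader accepts a glob, so all yearly files can be read as one table,
# assuming they sit together in a "data" folder
flights <- spark_read_csv(sc, name = "flights", path = "data/*.csv")

spark_write_parquet(flights, path = "output/airline_parquet", mode = "overwrite")

Reading the files as a single table avoids the repeated shuffles that the nested full_join calls trigger, which is where the spill files (and the disk usage) came from.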
I want to load simple JSON data into a Hive table without using a JSON SerDe.
So I have tried to load the JSON file in Pig and store the result into a Hive table.
A = LOAD '/data/pig_files/test.json' Using JsonLoader('merchant_id: chararray, platform_id: chararray, last_cdd_review_date: chararray, risk_factors:{(area:chararray,risk:chararray)}, expected_annual_transaction_number: chararray');
STORE A INTO 'Test_tbl' USING org.apache.hive.hcatalog.pig.HCatStorer();
It worked fine, but after server patching I am getting the error below:
Pig Stack Trace
---------------
ERROR 1115: org.apache.hive.hcatalog.common.HCatException : 2001 :
Error setting output information. Cause :
com.google.common.util.concurrent.UncheckedExecutionException: java.lang.RuntimeException:
Unable to instantiate org.apache.hive.hcatalog.common.HiveClientCache$CacheableHiveMetaStoreClient
org.apache.pig.impl.logicalLayer.FrontendException: ERROR 1002: Unable to store alias A
at org.apache.pig.PigServer$Graph.registerQuery(PigServer.java:1779)
at org.apache.pig.PigServer.registerQuery(PigServer.java:708)
at org.apache.pig.tools.grunt.GruntParser.processPig(GruntParser.java:1110)
at org.apache.pig.tools.pigscript.parser.PigScriptParser.parse(PigScriptParser.java:512)
at org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:230)
at org.apache.pig.tools.grunt.GruntParser.parseStopOnError(GruntParser.java:205)
at org.apache.pig.tools.grunt.Grunt.run(Grunt.java:66)
at org.apache.pig.Main.run(Main.java:564)
at org.apache.pig.Main.main(Main.java:175)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.util.RunJar.run(RunJar.java:313)
at org.apache.hadoop.util.RunJar.main(RunJar.java:227)
Caused by: org.apache.pig.impl.plan.VisitorException: ERROR 1115: <line 3, column 0> Output Location Validation Failed for: 'test_tbl More info to follow:
org.apache.hive.hcatalog.common.HCatException : 2001 : Error setting output information.
Cause : com.google.common.util.concurrent.UncheckedExecutionException: java.lang.RuntimeException:
Unable to instantiate org.apache.hive.hcatalog.common.HiveClientCache$CacheableHiveMetaStoreClient
at org.apache.pig.newplan.logical.visitor.InputOutputFileValidatorVisitor.visit(InputOutputFileValidatorVisitor.java:64)
at org.apache.pig.newplan.logical.relational.LOStore.accept(LOStore.java:99)
at org.apache.pig.newplan.DepthFirstWalker.depthFirst(DepthFirstWalker.java:64)
at org.apache.pig.newplan.DepthFirstWalker.depthFirst(DepthFirstWalker.java:66)
at org.apache.pig.newplan.DepthFirstWalker.depthFirst(DepthFirstWalker.java:66)
at org.apache.pig.newplan.DepthFirstWalker.walk(DepthFirstWalker.java:53)
at org.apache.pig.newplan.PlanVisitor.visit(PlanVisitor.java:52)
at org.apache.pig.newplan.logical.relational.LogicalPlan.validate(LogicalPlan.java:234)
at org.apache.pig.PigServer$Graph.compile(PigServer.java:1852)
at org.apache.pig.PigServer$Graph.access$300(PigServer.java:1528)
at org.apache.pig.PigServer.execute(PigServer.java:1441)
at org.apache.pig.PigServer.access$500(PigServer.java:119)
at org.apache.pig.PigServer$Graph.registerQuery(PigServer.java:1774)
... 14 more
Caused by: org.apache.pig.PigException: ERROR 1115: org.apache.hive.hcatalog.common.HCatException : 2001 :
Error setting output information. Cause : com.google.common.util.concurrent.UncheckedExecutionException: java.lang.RuntimeException:
Unable to instantiate org.apache.hive.hcatalog.common.HiveClientCache$CacheableHiveMetaStoreClient
at org.apache.hive.hcatalog.pig.HCatStorer.setStoreLocation(HCatStorer.java:196)
at org.apache.pig.newplan.logical.visitor.InputOutputFileValidatorVisitor.visit(InputOutputFileValidatorVisitor.java:57)
... 26 more
Caused by: org.apache.hive.hcatalog.common.HCatException : 2001 : Error setting output information. Cause :
com.google.common.util.concurrent.UncheckedExecutionException: java.lang.RuntimeException:
Unable to instantiate org.apache.hive.hcatalog.common.HiveClientCache$CacheableHiveMetaStoreClient
at org.apache.hive.hcatalog.mapreduce.HCatOutputFormat.setOutput(HCatOutputFormat.java:220)
at org.apache.hive.hcatalog.mapreduce.HCatOutputFormat.setOutput(HCatOutputFormat.java:70)
at org.apache.hive.hcatalog.pig.HCatStorer.setStoreLocation(HCatStorer.java:191)
... 27 more
Caused by: com.google.common.util.concurrent.UncheckedExecutionException: java.lang.RuntimeException: Unable to instantiate org.apache.hive.hcatalog.common.HiveClientCache$CacheableHiveMetaStoreClient
at com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2263)
at com.google.common.cache.LocalCache.get(LocalCache.java:4000)
at com.google.common.cache.LocalCache$LocalManualCache.get(LocalCache.java:4789)
at org.apache.hive.hcatalog.common.HiveClientCache.getOrCreate(HiveClientCache.java:229)
at org.apache.hive.hcatalog.common.HiveClientCache.get(HiveClientCache.java:204)
at org.apache.hive.hcatalog.common.HCatUtil.getHiveMetastoreClient(HCatUtil.java:563)
at org.apache.hive.hcatalog.mapreduce.HCatOutputFormat.setOutput(HCatOutputFormat.java:90)
... 29 more
Caused by: java.lang.RuntimeException: Unable to instantiate org.apache.hive.hcatalog.common.HiveClientCache$CacheableHiveMetaStoreClient
at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1775)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.<init>(RetryingMetaStoreClient.java:80)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:130)
at org.apache.hadoop.hive.metastore.RetryingMetaStoreClient.getProxy(RetryingMetaStoreClient.java:115)
at org.apache.hive.hcatalog.common.HiveClientCache$5.call(HiveClientCache.java:234)
at org.apache.hive.hcatalog.common.HiveClientCache$5.call(HiveClientCache.java:229)
at com.google.common.cache.LocalCache$LocalManualCache$1.load(LocalCache.java:4792)
at com.google.common.cache.LocalCache$LoadingValueReference.loadFuture(LocalCache.java:3599)
at com.google.common.cache.LocalCache$Segment.loadSync(LocalCache.java:2379)
at com.google.common.cache.LocalCache$Segment.lockedGetOrLoad(LocalCache.java:2342)
at com.google.common.cache.LocalCache$Segment.get(LocalCache.java:2257)
... 35 more
Caused by: java.lang.reflect.InvocationTargetException
at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method)
at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62)
at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45)
at java.lang.reflect.Constructor.newInstance(Constructor.java:423)
at org.apache.hadoop.hive.metastore.MetaStoreUtils.newInstance(MetaStoreUtils.java:1773)
... 45 more
Caused by: java.lang.NoClassDefFoundError: org/datanucleus/NucleusContext
at java.lang.Class.forName0(Native Method)
at java.lang.Class.forName(Class.java:348)
at org.apache.hadoop.hive.metastore.MetaStoreUtils.getClass(MetaStoreUtils.java:1741)
at org.apache.hadoop.hive.metastore.RawStoreProxy.getProxy(RawStoreProxy.java:64)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.newRawStoreForConf(HiveMetaStore.java:688)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.getMSForConf(HiveMetaStore.java:654)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.getMS(HiveMetaStore.java:648)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.createDefaultDB(HiveMetaStore.java:717)
at org.apache.hadoop.hive.metastore.HiveMetaStore$HMSHandler.init(HiveMetaStore.java:420)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.<init>(RetryingHMSHandler.java:78)
at org.apache.hadoop.hive.metastore.RetryingHMSHandler.getProxy(RetryingHMSHandler.java:84)
at org.apache.hadoop.hive.metastore.HiveMetaStore.newRetryingHMSHandler(HiveMetaStore.java:7036)
at org.apache.hadoop.hive.metastore.HiveMetaStoreClient.<init>(HiveMetaStoreClient.java:254)
at org.apache.hive.hcatalog.common.HiveClientCache$CacheableHiveMetaStoreClient.<init>(HiveClientCache.java:334)
... 50 more
Caused by: java.lang.ClassNotFoundException: org.datanucleus.NucleusContext
at java.net.URLClassLoader.findClass(URLClassLoader.java:381)
at java.lang.ClassLoader.loadClass(ClassLoader.java:424)
at sun.misc.Launcher$AppClassLoader.loadClass(Launcher.java:335)
at java.lang.ClassLoader.loadClass(ClassLoader.java:357)
... 64 more
================================================================================
Please help me resolve this, or suggest other ways to load JSON data into a Hive table without a SerDe.
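The chain of causes ends in java.lang.ClassNotFoundException: org.datanucleus.NucleusContext while HCatStorer tries to stand up a metastore client, which suggests the patching removed the Hive/HCatalog dependencies (or hive-site.xml) from Pig's classpath rather than anything being wrong with the script itself. A hedged sketch of how the classpath is usually wired back up; -useHCatalog is the documented way to pull the Hive and HCatalog jars onto Pig's classpath, while the jar locations in the REGISTER lines are placeholders that depend on the distribution:

-- Launch the script so that the Hive/HCatalog jars and hive-site.xml are picked up:
--   pig -useHCatalog load_json.pig
-- If the DataNucleus classes are still missing, register them explicitly
-- (placeholder paths; adjust to wherever the Hive client libraries live):
REGISTER '/usr/lib/hive/lib/datanucleus-core-*.jar';
REGISTER '/usr/lib/hive/lib/datanucleus-api-jdo-*.jar';
REGISTER '/usr/lib/hive/lib/datanucleus-rdbms-*.jar';

A = LOAD '/data/pig_files/test.json' USING JsonLoader('merchant_id: chararray, platform_id: chararray, last_cdd_review_date: chararray, risk_factors:{(area:chararray,risk:chararray)}, expected_annual_transaction_number: chararray');
STORE A INTO 'Test_tbl' USING org.apache.hive.hcatalog.pig.HCatStorer();

It is also worth confirming that hive.metastore.uris still points at the remote metastore after the patching: the trace shows an embedded HiveMetaStore$HMSHandler being created, and that embedded path is what needs the DataNucleus jars in the first place.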
This is the error when running PrimeFaces 8.0. Please note that it ran fine with PrimeFaces 6.1.
07:24:10,717 INFO [javax.enterprise.resource.webcontainer.jsf.config] (ServerService Thread Pool -- 224) Initializing Mojarra 2.3.5.SP2-redhat-00001 for context '/ais'
07:24:36,289 SEVERE [javax.enterprise.resource.webcontainer.jsf.config] (ServerService Thread Pool -- 224) Critical error during deployment: : com.sun.faces.config.ConfigurationException: java.lang.Exception: No Function Found on type: org.primefaces.util.ComponentUtils with signature: java.lang.String findComponentClientId(java.lang.String)
at com.sun.faces.config.processor.FaceletTaglibConfigProcessor.processFunctions(FaceletTaglibConfigProcessor.java:680)
at com.sun.faces.config.processor.FaceletTaglibConfigProcessor.processTagLibrary(FaceletTaglibConfigProcessor.java:331)
at com.sun.faces.config.processor.FaceletTaglibConfigProcessor.process(FaceletTaglibConfigProcessor.java:271)
at com.sun.faces.config.ConfigManager.initialize(ConfigManager.java:445)
at com.sun.faces.config.ConfigureListener.contextInitialized(ConfigureListener.java:237)
at io.undertow.servlet.core.ApplicationListeners.contextInitialized(ApplicationListeners.java:187)
at io.undertow.servlet.core.DeploymentManagerImpl$1.call(DeploymentManagerImpl.java:216)
at io.undertow.servlet.core.DeploymentManagerImpl$1.call(DeploymentManagerImpl.java:185)
at io.undertow.servlet.core.ServletRequestContextThreadSetupAction$1.call(ServletRequestContextThreadSetupAction.java:42)
at io.undertow.servlet.core.ContextClassLoaderSetupAction$1.call(ContextClassLoaderSetupAction.java:43)
at org.wildfly.extension.undertow.security.SecurityContextThreadSetupAction.lambda$create$0(SecurityContextThreadSetupAction.java:105)
at org.wildfly.extension.undertow.security.SecurityContextThreadSetupAction$$Lambda$843/1466016034.call(Unknown Source)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentInfoService$UndertowThreadSetupAction.lambda$create$0(UndertowDeploymentInfoService.java:1502)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentInfoService$UndertowThreadSetupAction$$Lambda$844/113487826.call(Unknown Source)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentInfoService$UndertowThreadSetupAction.lambda$create$0(UndertowDeploymentInfoService.java:1502)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentInfoService$UndertowThreadSetupAction$$Lambda$844/113487826.call(Unknown Source)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentInfoService$UndertowThreadSetupAction.lambda$create$0(UndertowDeploymentInfoService.java:1502)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentInfoService$UndertowThreadSetupAction$$Lambda$844/113487826.call(Unknown Source)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentInfoService$UndertowThreadSetupAction.lambda$create$0(UndertowDeploymentInfoService.java:1502)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentInfoService$UndertowThreadSetupAction$$Lambda$844/113487826.call(Unknown Source)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentInfoService$UndertowThreadSetupAction.lambda$create$0(UndertowDeploymentInfoService.java:1502)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentInfoService$UndertowThreadSetupAction$$Lambda$844/113487826.call(Unknown Source)
at io.undertow.servlet.core.DeploymentManagerImpl.deploy(DeploymentManagerImpl.java:250)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentService.startContext(UndertowDeploymentService.java:96)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentService$1.run(UndertowDeploymentService.java:78)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at org.jboss.threads.ContextClassLoaderSavingRunnable.run(ContextClassLoaderSavingRunnable.java:35)
at org.jboss.threads.EnhancedQueueExecutor.safeRun(EnhancedQueueExecutor.java:1985)
at org.jboss.threads.EnhancedQueueExecutor$ThreadBody.doRunTask(EnhancedQueueExecutor.java:1487)
at org.jboss.threads.EnhancedQueueExecutor$ThreadBody.run(EnhancedQueueExecutor.java:1378)
at java.lang.Thread.run(Thread.java:745)
at org.jboss.threads.JBossThread.run(JBossThread.java:485)
Caused by: java.lang.Exception: No Function Found on type: org.primefaces.util.ComponentUtils with signature: java.lang.String findComponentClientId(java.lang.String)
at com.sun.faces.config.processor.FaceletTaglibConfigProcessor.createMethod(FaceletTaglibConfigProcessor.java:721)
at com.sun.faces.config.processor.FaceletTaglibConfigProcessor.processFunctions(FaceletTaglibConfigProcessor.java:677)
... 32 more
07:24:36,294 ERROR [org.jboss.msc.service.fail] (ServerService Thread Pool -- 224) MSC000001: Failed to start service jboss.deployment.subunit."ais-ear-1.0.0.ear"."ais-web-1.0.0.war".undertow-deployment: org.jboss.msc.service.StartException in service jboss.deployment.subunit."ais-ear-1.0.0.ear"."ais-web-1.0.0.war".undertow-deployment: java.lang.RuntimeException: java.lang.RuntimeException: com.sun.faces.config.ConfigurationException: java.lang.Exception: No Function Found on type: org.primefaces.util.ComponentUtils with signature: java.lang.String findComponentClientId(java.lang.String)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentService$1.run(UndertowDeploymentService.java:81)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at org.jboss.threads.ContextClassLoaderSavingRunnable.run(ContextClassLoaderSavingRunnable.java:35)
at org.jboss.threads.EnhancedQueueExecutor.safeRun(EnhancedQueueExecutor.java:1985)
at org.jboss.threads.EnhancedQueueExecutor$ThreadBody.doRunTask(EnhancedQueueExecutor.java:1487)
at org.jboss.threads.EnhancedQueueExecutor$ThreadBody.run(EnhancedQueueExecutor.java:1378)
at java.lang.Thread.run(Thread.java:745)
at org.jboss.threads.JBossThread.run(JBossThread.java:485)
Caused by: java.lang.RuntimeException: java.lang.RuntimeException: com.sun.faces.config.ConfigurationException: java.lang.Exception: No Function Found on type: org.primefaces.util.ComponentUtils with signature: java.lang.String findComponentClientId(java.lang.String)
at io.undertow.servlet.core.DeploymentManagerImpl.deploy(DeploymentManagerImpl.java:252)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentService.startContext(UndertowDeploymentService.java:96)
at org.wildfly.extension.undertow.deployment.UndertowDeploymentService$1.run(UndertowDeploymentService.java:78)
... 8 more
I am trying to connect to a MySQL instance from an AWS EMR Zeppelin notebook. I loaded the MySQL connector to this location: /usr/lib/spark/jars/mysql-connector-java-5.0.4-bin.jar, and added it as an artifact in the Zeppelin interpreter.
Initializing the driver:
Class.forName("com.mysql.jdbc.Driver")
res77: Class[_] = class com.mysql.jdbc.Driver
I am using Scala code, as below.
Trial 1,
val jdbcDF = spark.read.format("jdbc").options(
Map("url" -> "jdbc:mysql://our-host.readm.co.nz:3306/testdb?user=testuser&password=****",
"dbtable" -> "testdb.tablename",
"driver" -> "com.mysql.jdbc.Driver",
"partitionColumn" -> "id", "lowerBound" -> "1", "upperBound" -> "41514638", "numPartitions" -> "20"
)).load()
Got this error,
com.mysql.jdbc.CommunicationsException: Communications link failure due to underlying exception:
** BEGIN NESTED EXCEPTION **
java.net.SocketException
MESSAGE: java.net.ConnectException: Connection timed out (Connection timed out)
STACKTRACE:
java.net.SocketException: java.net.ConnectException: Connection timed out (Connection timed out)
at com.mysql.jdbc.StandardSocketFactory.connect(StandardSocketFactory.java:156)
at com.mysql.jdbc.MysqlIO.<init>(MysqlIO.java:276)
at com.mysql.jdbc.Connection.createNewIO(Connection.java:2666)
at com.mysql.jdbc.Connection.<init>(Connection.java:1531)
at com.mysql.jdbc.NonRegisteringDriver.connect(NonRegisteringDriver.java:266)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:61)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:52)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:58)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:114)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:52)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:307)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
at <init>(<console>:50)
at <init>(<console>:55)
at <init>(<console>:57)
at <init>(<console>:59)
at <init>(<console>:61)
at <init>(<console>:63)
at <init>(<console>:65)
at <init>(<console>:67)
at <init>(<console>:69)
at <init>(<console>:71)
at <init>(<console>:73)
at <init>(<console>:75)
at <init>(<console>:77)
at <init>(<console>:79)
at <init>(<console>:81)
at <init>(<console>:83)
at <init>(<console>:85)
at <init>(<console>:87)
at <init>(<console>:89)
at <init>(<console>:91)
at <init>(<console>:93)
at .<init>(<console>:97)
at .<clinit>(<console>)
at .$print$lzycompute(<console>:7)
at .$print(<console>:6)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:786)
at scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1047)
at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:638)
at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:637)
at scala.reflect.internal.util.ScalaClassLoader$class.asContext(ScalaClassLoader.scala:31)
at scala.reflect.internal.util.AbstractFileClassLoader.asContext(AbstractFileClassLoader.scala:19)
at scala.tools.nsc.interpreter.IMain$WrappedRequest.loadAndRunReq(IMain.scala:637)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:569)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:565)
at sun.reflect.GeneratedMethodAccessor369.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.zeppelin.spark.Utils.invokeMethod(Utils.java:38)
at org.apache.zeppelin.spark.SparkInterpreter.interpret(SparkInterpreter.java:1000)
at org.apache.zeppelin.spark.SparkInterpreter.interpretInput(SparkInterpreter.java:1205)
at org.apache.zeppelin.spark.SparkInterpreter.interpret(SparkInterpreter.java:1172)
at org.apache.zeppelin.spark.SparkInterpreter.interpret(SparkInterpreter.java:1165)
at org.apache.zeppelin.interpreter.LazyOpenInterpreter.interpret(LazyOpenInterpreter.java:97)
at org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:498)
at org.apache.zeppelin.scheduler.Job.run(Job.java:175)
at org.apache.zeppelin.scheduler.FIFOScheduler$1.run(FIFOScheduler.java:139)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
** END NESTED EXCEPTION **
Last packet sent to the server was 1 ms ago.
at com.mysql.jdbc.Connection.createNewIO(Connection.java:2741)
at com.mysql.jdbc.Connection.<init>(Connection.java:1531)
at com.mysql.jdbc.NonRegisteringDriver.connect(NonRegisteringDriver.java:266)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:61)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:52)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:58)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:114)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:52)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:307)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
... 58 elided
Then trial 2,
val partitionColumn = "id"
val dbUrl = "jdbc:mysql://our-host.readm.co.nz:3306"
val lowerBound = 1
val upperBound = 41514638
val jdbcDF = spark.read.format("jdbc")
.option("url", dbUrl)
.option("databaseName", "testdb")
.option("dbtable", "tablename")
.option("user", "username")
.option("password", "*****")
.option("driver","com.mysql.jdbc.Driver")
.option("partitionColumn", partitionColumn)
.option("lowerBound", lowerBound)
.option("upperBound", upperBound)
.option("numPartitions", 20)
.load()
Got this error,
java.lang.NullPointerException
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:72)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:114)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:52)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:307)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
... 58 elided
Trial 3,
import java.util.Properties
val table = "tablename"
val url = "jdbc:mysql://our-host.readm.co.nz"
val prop = new Properties()
prop.put("driver","com.mysql.jdbc.Driver")
prop.put("user","username")
prop.put("password","****")
prop.put("databaseName","testdb")
val jdbcDF = spark.read.jdbc(url,
table,
prop)
Getting this error,
java.lang.NullPointerException
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:72)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:114)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:52)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:307)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
at org.apache.spark.sql.DataFrameReader.jdbc(DataFrameReader.scala:193)
... 58 elided
Not sure what's wrong here, can anyone help me? Thanks.
EDIT: This is what I am getting after trying the answer given by @geo
com.mysql.jdbc.CommunicationsException: Communications link failure due to underlying exception:
** BEGIN NESTED EXCEPTION **
java.net.SocketException
MESSAGE: java.net.ConnectException: Connection timed out (Connection timed out)
STACKTRACE:
java.net.SocketException: java.net.ConnectException: Connection timed out (Connection timed out)
at com.mysql.jdbc.StandardSocketFactory.connect(StandardSocketFactory.java:156)
at com.mysql.jdbc.MysqlIO.<init>(MysqlIO.java:276)
at com.mysql.jdbc.Connection.createNewIO(Connection.java:2666)
at com.mysql.jdbc.Connection.<init>(Connection.java:1531)
at com.mysql.jdbc.NonRegisteringDriver.connect(NonRegisteringDriver.java:266)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:61)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:52)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:58)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:114)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:52)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:307)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
at <init>(<console>:40)
at <init>(<console>:45)
at <init>(<console>:47)
at <init>(<console>:49)
at <init>(<console>:51)
at <init>(<console>:53)
at <init>(<console>:55)
at <init>(<console>:57)
at <init>(<console>:59)
at <init>(<console>:61)
at <init>(<console>:63)
at <init>(<console>:65)
at <init>(<console>:67)
at .<init>(<console>:71)
at .<clinit>(<console>)
at .$print$lzycompute(<console>:7)
at .$print(<console>:6)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at scala.tools.nsc.interpreter.IMain$ReadEvalPrint.call(IMain.scala:786)
at scala.tools.nsc.interpreter.IMain$Request.loadAndRun(IMain.scala:1047)
at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:638)
at scala.tools.nsc.interpreter.IMain$WrappedRequest$$anonfun$loadAndRunReq$1.apply(IMain.scala:637)
at scala.reflect.internal.util.ScalaClassLoader$class.asContext(ScalaClassLoader.scala:31)
at scala.reflect.internal.util.AbstractFileClassLoader.asContext(AbstractFileClassLoader.scala:19)
at scala.tools.nsc.interpreter.IMain$WrappedRequest.loadAndRunReq(IMain.scala:637)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:569)
at scala.tools.nsc.interpreter.IMain.interpret(IMain.scala:565)
at sun.reflect.GeneratedMethodAccessor340.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.zeppelin.spark.Utils.invokeMethod(Utils.java:38)
at org.apache.zeppelin.spark.SparkInterpreter.interpret(SparkInterpreter.java:1000)
at org.apache.zeppelin.spark.SparkInterpreter.interpretInput(SparkInterpreter.java:1205)
at org.apache.zeppelin.spark.SparkInterpreter.interpret(SparkInterpreter.java:1172)
at org.apache.zeppelin.spark.SparkInterpreter.interpret(SparkInterpreter.java:1165)
at org.apache.zeppelin.interpreter.LazyOpenInterpreter.interpret(LazyOpenInterpreter.java:97)
at org.apache.zeppelin.interpreter.remote.RemoteInterpreterServer$InterpretJob.jobRun(RemoteInterpreterServer.java:498)
at org.apache.zeppelin.scheduler.Job.run(Job.java:175)
at org.apache.zeppelin.scheduler.FIFOScheduler$1.run(FIFOScheduler.java:139)
at java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:511)
at java.util.concurrent.FutureTask.run(FutureTask.java:266)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.access$201(ScheduledThreadPoolExecutor.java:180)
at java.util.concurrent.ScheduledThreadPoolExecutor$ScheduledFutureTask.run(ScheduledThreadPoolExecutor.java:293)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1149)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:624)
at java.lang.Thread.run(Thread.java:748)
** END NESTED EXCEPTION **
Last packet sent to the server was 7 ms ago.
at com.mysql.jdbc.Connection.createNewIO(Connection.java:2741)
at com.mysql.jdbc.Connection.<init>(Connection.java:1531)
at com.mysql.jdbc.NonRegisteringDriver.connect(NonRegisteringDriver.java:266)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:61)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:52)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:58)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:114)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcRelationProvider.createRelation(JdbcRelationProvider.scala:52)
at org.apache.spark.sql.execution.datasources.DataSource.resolveRelation(DataSource.scala:307)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:178)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:146)
... 50 elided
Assuming the JDBC interpreter works without any problems in Zeppelin, this is what worked for me:
// DB connection
val options = Map(
"url" -> "jdbc:mysql://localhost:3306/dbname",
"dbtable" -> "dbtable",
"user" -> "user",
"password" -> "password"
)
val data = spark
.read
.format("jdbc")
.options(options)
.load
It goes without saying that there should be a mysql process running at localhost:3306 (or wherever you are hosting the DB) and that the firewall/iptables allows you in/out access to/from that process.
How is your cluster set up? Firewall rules, etc.? Is mysqld running? Anyway, it looks more like a connection issue than a problem with the JDBC driver. Therefore I would recommend trying to connect to MySQL from the host that Zeppelin is running on via bash.
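Since both attempts that reach the network end in "Connection timed out", the suggestion above can also be verified from inside the notebook itself. A minimal sketch using plain JDBC from the Scala interpreter, with the same hypothetical host and credentials as in the question; if this call also times out, the problem is reachability between the EMR cluster and the MySQL host (security groups / firewall), not the Spark JDBC data source:

// Raw JDBC connection attempt, bypassing Spark entirely
import java.sql.DriverManager

Class.forName("com.mysql.jdbc.Driver")
val conn = DriverManager.getConnection(
  "jdbc:mysql://our-host.readm.co.nz:3306/testdb", "testuser", "****")
println(conn.getMetaData.getDatabaseProductVersion)
conn.close()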
I am trying to extract MySQL data through Spark, but unfortunately I am getting the error below. Here is my sample code:
emp_df = sqlContext.read.format("jdbc").option("url",url).option("dbtable","employee").option("driver", 'com.mysql.jdbc.Driver').load()
Error :
Py4JJavaError: An error occurred while calling o149.load.
: java.lang.UnsupportedClassVersionError: com/mysql/jdbc/Driver : Unsupported major.minor version 52.0
at java.lang.ClassLoader.defineClass1(Native Method)
at java.lang.ClassLoader.defineClass(ClassLoader.java:803)
at java.security.SecureClassLoader.defineClass(SecureClassLoader.java:142)
at java.net.URLClassLoader.defineClass(URLClassLoader.java:449)
at java.net.URLClassLoader.access$100(URLClassLoader.java:71)
at java.net.URLClassLoader$1.run(URLClassLoader.java:361)
at java.net.URLClassLoader$1.run(URLClassLoader.java:355)
at java.security.AccessController.doPrivileged(Native Method)
at java.net.URLClassLoader.findClass(URLClassLoader.java:354)
at java.lang.ClassLoader.loadClass(ClassLoader.java:425)
at java.lang.ClassLoader.loadClass(ClassLoader.java:358)
at org.apache.spark.sql.execution.datasources.jdbc.DriverRegistry$.register(DriverRegistry.scala:38)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:45)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$$anonfun$createConnectionFactory$1.apply(JdbcUtils.scala:45)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.sql.execution.datasources.jdbc.JdbcUtils$.createConnectionFactory(JdbcUtils.scala:45)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRDD$.resolveTable(JDBCRDD.scala:120)
at org.apache.spark.sql.execution.datasources.jdbc.JDBCRelation.<init>(JDBCRelation.scala:91)
at org.apache.spark.sql.execution.datasources.jdbc.DefaultSource.createRelation(DefaultSource.scala:57)
at org.apache.spark.sql.execution.datasources.ResolvedDataSource$.apply(ResolvedDataSource.scala:158)
at org.apache.spark.sql.DataFrameReader.load(DataFrameReader.scala:119)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:381)
at py4j.Gateway.invoke(Gateway.java:259)
at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
at py4j.commands.CallCommand.execute(CallCommand.java:79)
at py4j.GatewayConnection.run(GatewayConnection.java:209)
at java.lang.Thread.run(Thread.java:745)
It is a Java version mismatch: major.minor version 52.0 corresponds to Java 8, so the MySQL connector JAR was compiled for Java 8 while Spark is running on an older JRE. The JDK and JRE should not conflict; if you are using JDK 1.8, run on JRE 8 as well.
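For reference, class file version 52.0 is the Java 8 format, so the connector JAR was built for Java 8 while the JVM running Spark is older. If it is unclear which Java version the Spark JVM actually uses, it can be read from the running PySpark session (assuming the usual sc handle from the shell); note that _jvm is an internal PySpark handle, so treat this as a throwaway diagnostic rather than a stable API:

# Diagnostic only: ask the JVM backing this Spark session for its Java version.
# A value like "1.7.0_xx" here, versus the class file version 52.0 (Java 8)
# required by the driver JAR, explains the UnsupportedClassVersionError.
print(sc._jvm.java.lang.System.getProperty("java.version"))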