Features take() error - link analysis research using Spark MLlib

I am working on a link-analysis project for the retail industry using Spark MLlib. My schema is the following:
ID - Long
Chain - Int
Dept - Int
Category - Int
Company - Long
Brand - Long
Date - Date
ProductSize - Int
ProductMeasure - Chararray
PurchaseQuantity - Int
PurchaseAmount - Double
And the code I am using is the following:
scala> import org.apache.spark._
scala> import org.apache.spark.rdd.RDD
scala> import org.apache.spark.util.IntParam
scala> import org.apache.spark.graphx._
scala> import org.apache.spark.graphx.util.GraphGenerators
scala> case class Transactions(ID:Long,Chain:Int,Dept:Int,Category:Int,Company:Long,Brand:Long,Date:String,ProductSize:Int,ProductMeasure:String,PurchaseQuantity:Int,PurchaseAmount:Double)
defined class Transactions
scala> def parseTransactions(str:String): Transactions = {
| val line = str.split(",")
| Transactions(line(0).toLong,line(1).toInt,line(2).toInt,line(3).toInt,line(4).toInt,line(5).toInt,line(6),line(7).toInt,line(8),line(9).toInt,line(10).toInt)
| }
scala> val textRDD = sc.textFile("/user/cloudera/transactions.csv")
scala> val transactionsRDD = textRDD.map(parseTransactions).cache()
scala> val products = transactionsRDD.map(Transactions => (Transactions.ID,Transactions.Chain,Transactions.Dept,Transactions.Category,Transactions.Company,Transactions.Brand,Transactions.Date)).distinct
scala> products.take(1)
But when I submit the last line, I get the following error:
[Stage 0:> (0 + 1)/7]16/08/24 04:56:13 ERROR executor.Executor: Exception in task 0.0 in stage 0.0 (TID 0)
java.lang.NumberFormatException: For input string: "id"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Long.parseLong(Long.java:441)
at java.lang.Long.parseLong(Long.java:483)
at scala.collection.immutable.StringLike$class.toLong(StringLike.scala:230)
at scala.collection.immutable.StringOps.toLong(StringOps.scala:31)
at $line65.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.parseTransactions(<console>:38)
at $line67.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:42)
at $line67.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:42)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:285)
at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:268)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
16/08/24 04:56:13 WARN scheduler.TaskSetManager: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.NumberFormatException: For input string: "id"
16/08/24 04:56:13 ERROR scheduler.TaskSetManager: Task 0 in stage 0.0 failed 1 times; aborting job
16/08/24 04:56:13 ERROR executor.Executor: Exception in task 1.0 in stage 0.0 (TID 1)
java.lang.NumberFormatException: For input string: "6.67"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Integer.parseInt(Integer.java:492)
at java.lang.Integer.parseInt(Integer.java:527)
at scala.collection.immutable.StringLike$class.toInt(StringLike.scala:229)
at scala.collection.immutable.StringOps.toInt(StringOps.scala:31)
at $line65.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.parseTransactions(<console>:38)
at $line67.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:42)
at $line67.$read$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:42)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:285)
at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:268)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
16/08/24 04:56:13 WARN scheduler.TaskSetManager: Lost task 1.0 in stage 0.0 (TID 1, localhost): java.lang.NumberFormatException: For input string: "6.67"
org.apache.spark.SparkException: Job aborted due to stage failure: Task 0 in stage 0.0 failed 1 times, most recent failure: Lost task 0.0 in stage 0.0 (TID 0, localhost): java.lang.NumberFormatException: For input string: "id"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Long.parseLong(Long.java:441)
at java.lang.Long.parseLong(Long.java:483)
at scala.collection.immutable.StringLike$class.toLong(StringLike.scala:230)
at scala.collection.immutable.StringOps.toLong(StringOps.scala:31)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.parseTransactions(<console>:38)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:42)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:42)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:285)
at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:268)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Driver stacktrace:
at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1431)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1419)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$abortStage$1.apply(DAGScheduler.scala:1418)
at scala.collection.mutable.ResizableArray$class.foreach(ResizableArray.scala:59)
at scala.collection.mutable.ArrayBuffer.foreach(ArrayBuffer.scala:47)
at org.apache.spark.scheduler.DAGScheduler.abortStage(DAGScheduler.scala:1418)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleTaskSetFailed$1.apply(DAGScheduler.scala:799)
at scala.Option.foreach(Option.scala:236)
at org.apache.spark.scheduler.DAGScheduler.handleTaskSetFailed(DAGScheduler.scala:799)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1640)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1599)
at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1588)
at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:620)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1843)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1856)
at org.apache.spark.SparkContext.runJob(SparkContext.scala:1869)
at org.apache.spark.rdd.RDD$$anonfun$take$1.apply(RDD.scala:1328)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:150)
at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:111)
at org.apache.spark.rdd.RDD.withScope(RDD.scala:316)
at org.apache.spark.rdd.RDD.take(RDD.scala:1302)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:47)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:52)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:54)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:56)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:58)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:60)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:62)
at $iwC$$iwC$$iwC$$iwC$$iwC.<init>(<console>:64)
at $iwC$$iwC$$iwC$$iwC.<init>(<console>:66)
at $iwC$$iwC$$iwC.<init>(<console>:68)
at $iwC$$iwC.<init>(<console>:70)
at $iwC.<init>(<console>:72)
at <init>(<console>:74)
at .<init>(<console>:78)
at .<clinit>(<console>)
at .<init>(<console>:7)
at .<clinit>(<console>)
at $print(<console>)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.repl.SparkIMain$ReadEvalPrint.call(SparkIMain.scala:1045)
at org.apache.spark.repl.SparkIMain$Request.loadAndRun(SparkIMain.scala:1326)
at org.apache.spark.repl.SparkIMain.loadAndRunReq$1(SparkIMain.scala:821)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:852)
at org.apache.spark.repl.SparkIMain.interpret(SparkIMain.scala:800)
at org.apache.spark.repl.SparkILoop.reallyInterpret$1(SparkILoop.scala:857)
at org.apache.spark.repl.SparkILoop.interpretStartingWith(SparkILoop.scala:902)
at org.apache.spark.repl.SparkILoop.command(SparkILoop.scala:814)
at org.apache.spark.repl.SparkILoop.processLine$1(SparkILoop.scala:657)
at org.apache.spark.repl.SparkILoop.innerLoop$1(SparkILoop.scala:665)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$loop(SparkILoop.scala:670)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply$mcZ$sp(SparkILoop.scala:997)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop$$anonfun$org$apache$spark$repl$SparkILoop$$process$1.apply(SparkILoop.scala:945)
at scala.tools.nsc.util.ScalaClassLoader$.savingContextLoader(ScalaClassLoader.scala:135)
at org.apache.spark.repl.SparkILoop.org$apache$spark$repl$SparkILoop$$process(SparkILoop.scala:945)
at org.apache.spark.repl.SparkILoop.process(SparkILoop.scala:1064)
at org.apache.spark.repl.Main$.main(Main.scala:31)
at org.apache.spark.repl.Main.main(Main.scala)
at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:57)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:606)
at org.apache.spark.deploy.SparkSubmit$.org$apache$spark$deploy$SparkSubmit$$runMain(SparkSubmit.scala:731)
at org.apache.spark.deploy.SparkSubmit$.doRunMain$1(SparkSubmit.scala:181)
at org.apache.spark.deploy.SparkSubmit$.submit(SparkSubmit.scala:206)
at org.apache.spark.deploy.SparkSubmit$.main(SparkSubmit.scala:121)
at org.apache.spark.deploy.SparkSubmit.main(SparkSubmit.scala)
Caused by: java.lang.NumberFormatException: For input string: "id"
at java.lang.NumberFormatException.forInputString(NumberFormatException.java:65)
at java.lang.Long.parseLong(Long.java:441)
at java.lang.Long.parseLong(Long.java:483)
at scala.collection.immutable.StringLike$class.toLong(StringLike.scala:230)
at scala.collection.immutable.StringOps.toLong(StringOps.scala:31)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC.parseTransactions(<console>:38)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:42)
at $iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$iwC$$anonfun$1.apply(<console>:42)
at scala.collection.Iterator$$anon$11.next(Iterator.scala:328)
at org.apache.spark.storage.MemoryStore.unrollSafely(MemoryStore.scala:285)
at org.apache.spark.CacheManager.putInBlockManager(CacheManager.scala:171)
at org.apache.spark.CacheManager.getOrCompute(CacheManager.scala:78)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:268)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.rdd.MapPartitionsRDD.compute(MapPartitionsRDD.scala:38)
at org.apache.spark.rdd.RDD.computeOrReadCheckpoint(RDD.scala:306)
at org.apache.spark.rdd.RDD.iterator(RDD.scala:270)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:73)
at org.apache.spark.scheduler.ShuffleMapTask.runTask(ShuffleMapTask.scala:41)
at org.apache.spark.scheduler.Task.run(Task.scala:89)
at org.apache.spark.executor.Executor$TaskRunner.run(Executor.scala:214)
at java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1145)
at java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:615)
at java.lang.Thread.run(Thread.java:745)
Why am I getting this error? I was expecting it to return the array I created! Many thanks...

It looks like it's your toInt calls that aren't parsing right. –

I made the Date field a String, and where I am passing line(6) I never parse it to a date. –
Could the error be from the Int conversions, I wonder? Just keep the values as Strings. I don't know Scala, so I may be off, but I do know Spark: it's a NumberFormatException because it's trying to parse a string into a number. Can you try it in Scala without the conversions? Also try taking a few rows out of transactionsRDD and running it again; remove one conversion at a time and see whether the problem goes away (maybe remove them all). –
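(Editorial aside: a minimal sketch of that suggestion, assuming the conversions should simply match the types declared in the Transactions case class - Company and Brand are Long, PurchaseAmount is Double - and assuming the first CSV line is a header row whose first field is the literal string "id". parseTransactionsFixed is a hypothetical name, not the asker's code.)

def parseTransactionsFixed(str: String): Transactions = {
  val line = str.split(",")
  Transactions(
    line(0).toLong,    // ID: Long
    line(1).toInt,     // Chain: Int
    line(2).toInt,     // Dept: Int
    line(3).toInt,     // Category: Int
    line(4).toLong,    // Company is declared Long, so toLong rather than toInt
    line(5).toLong,    // Brand is declared Long, so toLong rather than toInt
    line(6),           // Date is declared String; no parsing needed
    line(7).toInt,     // ProductSize: Int
    line(8),           // ProductMeasure: String
    line(9).toInt,     // PurchaseQuantity: Int
    line(10).toDouble  // PurchaseAmount is Double; toInt fails on "6.67"
  )
}
// Drop the header row before parsing: "id" is not a number.
val header = textRDD.first()
val transactionsRDD = textRDD.filter(_ != header).map(parseTransactionsFixed).cache()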
If I remove them all in one go, I get this error:
def parseTransactions(str:String): Transactions = {
 | val line = str.split(",")
 | Transactions(line(0).toLong,line(1),line(2),line(3),line(4),line(5),line(6),line(7),line(8),line(9),line(10))
 | }
<console>:47: error: type mismatch;
 found   : String
 required: Int
Transactions(line(0).toLong,line(1),line(2),line(3),line(4),line(5),line(6),line(7),line(8),line(9),line(10))
Do you know why? –
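(Editorial aside: that type mismatch is expected. The case-class parameters are typed, and split(",") yields Strings, so a raw String cannot be passed where an Int, Long, or Double is declared; the conversions have to stay, but each must target the declared type. A self-contained illustration - Pair is hypothetical and just mirrors the Transactions constructor:)

case class Pair(id: Long, qty: Int)
val fields = "100,3".split(",")
// Pair(fields(0), fields(1))                    // does not compile: found String, required Long
val ok = Pair(fields(0).toLong, fields(1).toInt) // convert each field first, then construct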