
Spark Streaming: Kryo registration for a custom Google Pub/Sub receiver

I am using Spark 2.0.2 with Kryo serialization.

I am trying to implement a custom receiver to ingest messages from Google Pub/Sub into Spark Streaming:

class PubSubReceiver(project: String, topic: String, subscription: String)
  extends Receiver[Array[Byte]](StorageLevel.MEMORY_AND_DISK_2) with Logging {

  val projectFullName = ProjectName.create(project)
  val topicName = TopicName.create(project, topic)
  val subscriptionName = SubscriptionName.create(project, subscription)
  val subscriber = Subscriber.defaultBuilder(subscriptionName, new receiver).build

  def onStart() {
    new Thread() {
      override def run() {
        subscriber.startAsync()
        // ensure the subscriber is running as well as the Spark receiver
        while (subscriber.isRunning && !isStopped()) {
          logger.info(s"${subscriber.getSubscriptionName} receiver running")
          // sleep 10s
          Thread.sleep(10000)
        }
        logger.info(s"${subscriber.getSubscriptionName} receiver stopping")
      }
    }.start()
  }

  def onStop(): Unit = {
    // Nothing much to do: the thread started in onStart()
    // stops by itself once isStopped() returns true
  }

  private class receiver extends MessageReceiver {
    override def receiveMessage(message: PubsubMessage, consumer: AckReplyConsumer): Unit = {
      store(ArrayBuffer(message.getData.toByteArray), message.getAttributesMap)
    }
  }
}

However, when I run a Spark job that uses this receiver, it appears that the job object itself needs to be serialized, which does not seem right (the Spark context would then be serialized along with it).

object PubSubStreamingIngestionJob extends App {
  // ... setup

  lazy val ssc = new StreamingContext(spark.sparkContext, batchInterval)

  lazy val pubsubUnionStream =
    ssc.receiverStream(new PubSubReceiver(projectName, topicName, subscriptionName))

  pubsubUnionStream.map(messageBytes => ...business logic...)

  ssc.start()
  ssc.awaitTermination()
}

The following error is thrown:

java.io.IOException: com.esotericsoftware.kryo.KryoException: java.lang.IllegalArgumentException: Class is not registered: com.c2fo.atlas.jobs.streaming.gcp.PubSubStreamingIngestionJob 
Note: To register this class use: kryo.register(com.mycompany.package.PubSubStreamingIngestionJob.class); 
Serialization trace: 
classes (sun.misc.Launcher$AppClassLoader) 
contextClassLoader (java.lang.Thread) 
threads (java.lang.ThreadGroup) 
parent (java.lang.ThreadGroup) 
group (java.util.concurrent.Executors$DefaultThreadFactory) 
val$backingThreadFactory (com.google.common.util.concurrent.ThreadFactoryBuilder$1) 
threadFactory (java.util.concurrent.ScheduledThreadPoolExecutor) 
e (java.util.concurrent.Executors$DelegatedScheduledExecutorService) 
executor (com.google.cloud.pubsub.spi.v1.Subscriber) 
subscriber (com.mycompany.package.PubSubReceiver) 
array (scala.collection.mutable.WrappedArray$ofRef) 

Is there a better way to implement this?
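For reference, the registration that the error message asks for is normally configured on the SparkConf. Below is a minimal sketch, assuming Kryo is enabled the standard way via spark.serializer and registerKryoClasses; registering the receiver class only silences this particular message and does not explain why the whole job object graph is being pulled into the serialized state in the first place.

import org.apache.spark.SparkConf

// Sketch only: enable Kryo and register classes explicitly.
val conf = new SparkConf()
  .setAppName("pubsub-streaming-ingestion")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  // Registering the custom receiver keeps Kryo happy when
  // spark.kryo.registrationRequired is enabled, but it is not the root-cause fix.
  .registerKryoClasses(Array(classOf[PubSubReceiver]))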

Answer


To keep the whole closure from being serialized, the Subscriber instance had to be made thread-local, i.e. created inside the thread started in onStart():

package org.apache.spark.streaming.gcp

import com.c2fo.atlas.util.LazyLogging
import com.google.cloud.pubsub.spi.v1._
import com.google.iam.v1.ProjectName
import com.google.pubsub.v1._
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.receiver.Receiver

import scala.collection.mutable.ArrayBuffer

class PubSubReceiver(project: String, topic: String, subscription: String)
  extends Receiver[PubsubMessage](StorageLevel.MEMORY_AND_DISK_2) with LazyLogging {

  val projectFullName = ProjectName.create(project)
  val topicName = TopicName.create(project, topic)
  val subscriptionName = SubscriptionName.create(project, subscription)

  def onStart() {
    new Thread() {
      // crucial change: the Subscriber now lives on the Thread created in onStart(),
      // not on the receiver itself, so it is never captured when the receiver is serialized
      val subscriber = Subscriber.defaultBuilder(subscriptionName, new receiver).build

      override def run() {
        subscriber.startAsync()
        // ensure the subscriber is running as well as the Spark receiver
        while (subscriber.isRunning && !isStopped()) {
          logger.info(s"${subscriber.getSubscriptionName} receiver running")
          // sleep 10s
          Thread.sleep(10000)
        }
        logger.info(s"${subscriber.getSubscriptionName} receiver stopping")
      }
    }.start()
  }

  def onStop(): Unit = {
    // Nothing much to do: the thread started in onStart()
    // stops by itself once isStopped() returns true
  }

  class receiver extends MessageReceiver {
    override def receiveMessage(message: PubsubMessage, consumer: AckReplyConsumer): Unit = {
      store(ArrayBuffer(message), message.getAttributesMap)
    }
  }
}
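
A minimal usage sketch under assumptions: the corrected receiver is wired into the job exactly as before with ssc.receiverStream, and because the stream element type is now PubsubMessage, that class would also need to be registered if spark.kryo.registrationRequired is turned on. The batch interval and the project/topic/subscription names below are placeholders.

import com.google.pubsub.v1.PubsubMessage
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

// Sketch: driving the thread-local-Subscriber receiver from a streaming job.
val conf = new SparkConf()
  .setAppName("pubsub-streaming-ingestion")
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  // Only needed when registration is enforced; PubsubMessage is the DStream element type.
  .registerKryoClasses(Array(classOf[PubsubMessage]))

val ssc = new StreamingContext(conf, Seconds(10))

val messages = ssc.receiverStream(
  new PubSubReceiver("my-project", "my-topic", "my-subscription")) // placeholder names

messages.foreachRDD { rdd =>
  // ... business logic on the PubsubMessage records ...
}

ssc.start()
ssc.awaitTermination()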