本文主要記錄使用SparkStreaming從Kafka里讀取數據，并使用Redis保存Offset，并監聽Redis中的某個Key是否存在來停止程序，將讀取到的數據轉換為json寫入到Kafka
相關文章:
1.Spark之PI本地
2.Spark之WordCount集群
3.SparkStreaming之讀取Kafka數據
4.SparkStreaming之使用redis保存Kafka的Offset
5.SparkStreaming之優雅停止
6.SparkStreaming之寫數據到Kafka
7.Spark計算《西虹市首富》短評詞云
KafkaSink
對KafkaProducer進行封裝便于廣播
import java.util.concurrent.Future
import org.apache.kafka.clients.producer.{KafkaProducer, ProducerRecord, RecordMetadata}
/**
 * Serializable wrapper around a [[KafkaProducer]].
 *
 * Only the factory function is held as constructor state; the producer itself
 * is created on first use, so the wrapper can be serialized without
 * serializing a live producer.
 *
 * @param createProducer factory invoked at most once per deserialized instance,
 *                       on the first access to `producer`
 * @tparam K record key type
 * @tparam V record value type
 */
class KafkaSink[K, V](createProducer: () => KafkaProducer[K, V]) extends Serializable {

  /* This is the key idea that allows us to work around running into
     NotSerializableExceptions: the producer is created lazily where it is used.
     It is also marked @transient so that an instance whose producer has
     already been materialised can still be serialized -- the producer field is
     skipped and lazily re-created after deserialization. */
  @transient lazy val producer: KafkaProducer[K, V] = createProducer()

  /**
   * Sends a keyed record to `topic`.
   *
   * @return the future returned by the underlying producer's `send`
   */
  def send(topic: String, key: K, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, key, value))

  /**
   * Sends a record without a key to `topic`.
   *
   * @return the future returned by the underlying producer's `send`
   */
  def send(topic: String, value: V): Future[RecordMetadata] =
    producer.send(new ProducerRecord[K, V](topic, value))
}
/** Factory methods that build a [[KafkaSink]] from producer configuration. */
object KafkaSink {
  // Explicit converters (.asJava / .asScala) instead of the deprecated implicit
  // scala.collection.JavaConversions, which no longer exists in Scala 2.13.
  import scala.collection.JavaConverters._

  /**
   * Builds a sink whose producer is created lazily from `config`.
   *
   * @param config Kafka producer configuration
   * @return a sink wrapping a producer factory
   */
  def apply[K, V](config: Map[String, Object]): KafkaSink[K, V] = {
    val createProducerFunc = () => {
      // Convert inside the closure so that the closure captures only the
      // Scala map, not the Java wrapper around it.
      val producer = new KafkaProducer[K, V](config.asJava)
      sys.addShutdownHook {
        // Ensure that, on executor JVM shutdown, the Kafka producer sends
        // any buffered messages to Kafka before shutting down.
        producer.close()
      }
      producer
    }
    new KafkaSink(createProducerFunc)
  }

  /**
   * Builds a sink from a `java.util.Properties` producer configuration.
   *
   * @param config Kafka producer configuration as string properties
   * @return a sink wrapping a producer factory
   */
  def apply[K, V](config: java.util.Properties): KafkaSink[K, V] =
    apply[K, V](config.asScala.toMap)
}
初始化KafkaSink,并廣播
// Build the KafkaSink from the producer properties and broadcast it
val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
  // Keys and values are both sent as strings, so one serializer name serves both.
  val serializerName = classOf[StringSerializer].getName
  val producerProps = new Properties()
  producerProps.setProperty("bootstrap.servers", bootstrapServers)
  producerProps.setProperty("key.serializer", serializerName)
  producerProps.setProperty("value.serializer", serializerName)
  if (LOG.isInfoEnabled) {
    LOG.info("kafka producer init done!")
  }
  val sink = KafkaSink[String, String](producerProps)
  ssc.sparkContext.broadcast(sink)
}
遍歷Partition并使用廣播變量發送到Kafka
// Send every record of the partition to Kafka through the broadcast sink
for (record <- partition) {
  val json = new Gson().toJson(record)
  kafkaProducer.value.send("Test_Json", json)
}
完整程序 Kafka2KafkaStreaming
import com.google.gson.Gson
import me.jinkun.scala.util.{InternalRedisClient, KafkaSink}
import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.{StringDeserializer, StringSerializer}
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, HasOffsetRanges, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext, TaskContext}
import org.slf4j.LoggerFactory
/**
 * Spark Streaming job that reads the "Test" Kafka topic, counts the
 * space-separated tokens of every batch, writes each (token, count) pair as
 * JSON to the "Test_Json" topic and stores the consumed offsets in Redis.
 *
 * The job stops gracefully once the Redis key named by `STOP_FLAG` exists.
 */
object Kafka2KafkaStreaming {
  private val LOG = LoggerFactory.getLogger("Kafka2KafkaStreaming")
  private val STOP_FLAG = "TEST_STOP_FLAG"

  /** Creates the Redis connection pool from the hard-coded settings below. */
  def initRedisPool() = {
    // Redis configurations
    val maxTotal = 20
    val maxIdle = 10
    val minIdle = 1
    val redisHost = "47.98.119.122"
    val redisPort = 6379
    val redisTimeout = 30000
    InternalRedisClient.makePool(redisHost, redisPort, redisTimeout, maxTotal, maxIdle, minIdle)
  }

  /**
   * Reads the last saved offset of every partition of a topic from Redis.
   *
   * Offsets are stored under the key `<topicName>_<partition>`; a partition
   * without a stored value starts at offset 0.
   *
   * @param topicName  topic whose offsets are looked up
   * @param partitions total number of partitions of the topic
   * @return starting offset for each partition `0 until partitions`
   */
  def getLastCommittedOffsets(topicName: String, partitions: Int): Map[TopicPartition, Long] = {
    if (LOG.isInfoEnabled())
      LOG.info("||--Topic:{},getLastCommittedOffsets from Redis--||", topicName)
    val jedis = InternalRedisClient.getPool.getResource
    try {
      (0 until partitions).map { partition =>
        // jedis.get yields null for a missing key; treat that as "start from 0".
        val lastOffset = Option(jedis.get(topicName + "_" + partition)).fold(0L)(_.toLong)
        new TopicPartition(topicName, partition) -> lastOffset
      }.toMap
    } finally {
      // Always hand the connection back, even if a lookup or parse fails.
      jedis.close()
    }
  }

  /**
   * Job entry point: wires up the Kafka source, the broadcast Kafka sink and
   * the per-batch processing, then runs until the stop flag is seen.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Initialise the Redis pool
    initRedisPool()

    val conf = new SparkConf()
      .setAppName("ScalaKafkaStream")
      .setMaster("local[3]")
    val sc = new SparkContext(conf)
    sc.setLogLevel("WARN")
    val ssc = new StreamingContext(sc, Seconds(3))

    val bootstrapServers = "hadoop1:9092,hadoop2:9092,hadoop3:9092"
    val groupId = "kafka-test-group"
    val topicName = "Test"
    val maxPoll = 1000
    val kafkaParams: Map[String, Object] = Map(
      ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> bootstrapServers,
      ConsumerConfig.GROUP_ID_CONFIG -> groupId,
      ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> maxPoll.toString,
      ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer],
      ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer]
    )

    // The total number of partitions of the topic is specified here
    val fromOffsets = getLastCommittedOffsets(topicName, 3)

    // Create the Kafka direct stream, resuming from the offsets read from Redis
    val kafkaTopicDS =
      KafkaUtils.createDirectStream(
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Assign[String, String](fromOffsets.keys.toList, kafkaParams, fromOffsets))

    // Build the KafkaSink and broadcast it
    val kafkaProducer: Broadcast[KafkaSink[String, String]] = {
      val kafkaProducerConfig = {
        // Fully qualified: java.util.Properties is not among the file's imports.
        val p = new java.util.Properties()
        p.setProperty("bootstrap.servers", bootstrapServers)
        p.setProperty("key.serializer", classOf[StringSerializer].getName)
        p.setProperty("value.serializer", classOf[StringSerializer].getName)
        p
      }
      if (LOG.isInfoEnabled)
        LOG.info("kafka producer init done!")
      ssc.sparkContext.broadcast(KafkaSink[String, String](kafkaProducerConfig))
    }

    kafkaTopicDS.foreachRDD(rdd => {
      // Must be taken from the RDD produced directly by the direct stream.
      val offsetRanges = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
      // Only process batches that contain data
      if (!rdd.isEmpty()) {
        rdd
          .map(_.value())
          .flatMap(_.split(" "))
          .map(x => (x, 1L))
          .reduceByKey(_ + _)
          .foreachPartition(partition => {
            // One Gson per task instead of one per record.
            val gson = new Gson()
            // Send through the broadcast sink, keeping the futures ...
            val pendingSends = partition
              .map(record => kafkaProducer.value.send("Test_Json", gson.toJson(record)))
              .toList
            // ... and wait for every send so that a failed delivery fails the
            // task (and the batch) before any offset is stored.
            pendingSends.foreach(_.get())
          })
        // After reduceByKey the RDD partitions no longer correspond to Kafka
        // partitions, so the offsets cannot be indexed by task partition id.
        // Store all of them once, here, after the output action has completed.
        saveOffsets(offsetRanges)
      }
    })

    ssc.start()
    // Graceful stop
    stopByMarkKey(ssc)
    ssc.awaitTermination()
  }

  /**
   * Stores the end offset of every range in Redis, under the key
   * `<topic>_<partition>`, inside a single MULTI/EXEC transaction.
   *
   * @param offsetRanges offset ranges of the batch that was just processed
   */
  private def saveOffsets(offsetRanges: Array[org.apache.spark.streaming.kafka010.OffsetRange]): Unit = {
    val jedis = InternalRedisClient.getPool.getResource
    try {
      val p = jedis.pipelined()
      p.multi() // begin transaction
      offsetRanges.foreach { offsetRange =>
        println("partition : " + offsetRange.partition + " fromOffset: " + offsetRange.fromOffset + " untilOffset: " + offsetRange.untilOffset)
        val topic_partition_key = offsetRange.topic + "_" + offsetRange.partition
        p.set(topic_partition_key, offsetRange.untilOffset.toString)
      }
      p.exec() // commit transaction
      p.sync() // flush the pipeline
    } finally {
      jedis.close()
    }
  }

  /**
   * Graceful stop: polls Redis until the context terminates or the stop flag
   * key appears, then stops the streaming context gracefully.
   *
   * @param ssc the running streaming context
   */
  def stopByMarkKey(ssc: StreamingContext): Unit = {
    val intervalMills = 10 * 1000 // check for the stop flag every 10 seconds
    var isStop = false
    while (!isStop) {
      isStop = ssc.awaitTerminationOrTimeout(intervalMills)
      if (!isStop && isExists(STOP_FLAG)) {
        LOG.warn("2秒后開始關(guān)閉sparstreaming程序.....")
        Thread.sleep(2000)
        ssc.stop(stopSparkContext = true, stopGracefully = true)
      }
    }
  }

  /**
   * Checks whether a key exists in Redis.
   *
   * @param key Redis key to look up
   * @return true if the key exists
   */
  def isExists(key: String): Boolean = {
    val jedis = InternalRedisClient.getPool.getResource
    try {
      jedis.exists(key)
    } finally {
      jedis.close()
    }
  }
}
創建名為Test_Json的Topic
kafka-topics.sh --create --zookeeper hadoop1:2181,hadoop2:2181,hadoop3:2181/kafka --topic Test_Json --partitions 3 --replication-factor 3
運行結果如下: