阿里云上面安裝hdp服務 hdfs服務,本地 spark stream 消費kafka數據,在hdfs上面設置保存點,但是在寫入hdfs的時候報錯。
There are 3 datanode(s) running and 3 node(s) are excluded in this operation
解決: 在hdfs-site.xml 中加入此配置參數，使得客戶端訪問hdfs時返回的datanode地址是主機名,
在本地hosts文件中配置相對應的映射后，才能訪問云主機上面的hadoop的datanode。
<property>
<name>dfs.client.use.datanode.hostname</name>
<value>true</value>
</property>
2020-08-01 11:38:03,938 ERROR --- [ Executor task launch worker for task 80] org.apache.spark.executor.Executor (line: 91) : Exception in task 3.0 in stage 36.0 (TID 80)
org.apache.hadoop.ipc.RemoteException(java.io.IOException):
File /user/atguigu/sparkstreaming/checkpoint
/b7e390a6-0a54-4b67-9401-c9c7eb2bcb6d/rdd-22/.part-00003-attempt-0 could only
be replicated to 0 nodes instead of minReplication (=1).
There are 3 datanode(s) running and 3 node(s) are excluded in this operation.
at org.apache.hadoop.hdfs.server.blockmanagement.BlockManager.chooseTarget4NewBlock(BlockManager.java:1719)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getNewBlockTargets(FSNamesystem.java:3372)
at org.apache.hadoop.hdfs.server.namenode.FSNamesystem.getAdditionalBlock(FSNamesystem.java:3296)
at org.apache.hadoop.hdfs.server.namenode.NameNodeRpcServer.addBlock(NameNodeRpcServer.java:850)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolServerSideTranslatorPB.addBlock(ClientNamenodeProtocolServerSideTranslatorPB.java:504)
at org.apache.hadoop.hdfs.protocol.proto.ClientNamenodeProtocolProtos$ClientNamenodeProtocol$2.callBlockingMethod(ClientNamenodeProtocolProtos.java)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Server$ProtoBufRpcInvoker.call(ProtobufRpcEngine.java:640)
at org.apache.hadoop.ipc.RPC$Server.call(RPC.java:982)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2351)
at org.apache.hadoop.ipc.Server$Handler$1.run(Server.java:2347)
at java.security.AccessController.doPrivileged(Native Method)
at javax.security.auth.Subject.doAs(Subject.java:422)
at org.apache.hadoop.security.UserGroupInformation.doAs(UserGroupInformation.java:1869)
at org.apache.hadoop.ipc.Server$Handler.run(Server.java:2347)
at org.apache.hadoop.ipc.Client.call(Client.java:1347)
at org.apache.hadoop.ipc.Client.call(Client.java:1300)
at org.apache.hadoop.ipc.ProtobufRpcEngine$Invoker.invoke(ProtobufRpcEngine.java:206)
at com.sun.proxy.$Proxy10.addBlock(Unknown Source)
at org.apache.hadoop.hdfs.protocolPB.ClientNamenodeProtocolTranslatorPB.addBlock(ClientNamenodeProtocolTranslatorPB.java:330)
at sun.reflect.GeneratedMethodAccessor65.invoke(Unknown Source)
at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.lang.reflect.Method.invoke(Method.java:498)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invokeMethod(RetryInvocationHandler.java:186)
at org.apache.hadoop.io.retry.RetryInvocationHandler.invoke(RetryInvocationHandler.java:102)
at com.sun.proxy.$Proxy11.addBlock(Unknown Source)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.locateFollowingBlock(DFSOutputStream.java:1226)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.nextBlockOutputStream(DFSOutputStream.java:1078)
at org.apache.hadoop.hdfs.DFSOutputStream$DataStreamer.run(DFSOutputStream.java:514)
import java.lang
import java.sql.ResultSet
import com.atguigu.qzpoint.util.{DataSourceUtil, QueryCallback, SqlProxy}
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.TopicPartition
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.kafka010._
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable
import scala.util.Random
/**
 * Spark Streaming job that reads registration records from the Kafka topic `register_topic`,
 * keeps a running count per platform label (PC / APP / Other) and writes the consumed
 * offsets to the `offset_manager` table so that a later run can resume from them.
 *
 * A record is only counted when its value splits on tab into exactly three fields; the
 * second field selects the platform label.
 */
object RegisterStreaming {
  private val groupid = "register_group11"

  /**
   * Entry point: wires up the streaming context, the Kafka source, the stateful count and
   * the offset bookkeeping, then blocks until the streaming context terminates.
   *
   * @param args command-line arguments (not used)
   */
  def main(args: Array[String]): Unit = {
    // System.setProperty("HADOOP_USER_NAME", "root")
    val sparkConf = new SparkConf()
      .setAppName(this.getClass.getSimpleName)
      .set("spark.streaming.kafka.maxRatePerPartition", "100")
      // .set("spark.streaming.backpressure.enabled", "true")
      // .set("spark.streaming.stopGracefullyOnShutdown", "true")
      .setMaster("local[*]")
    val ssc = new StreamingContext(sparkConf, Seconds(3))

    val sc: SparkContext = ssc.sparkContext
    val hadoopConf = sc.hadoopConfiguration
    hadoopConf.set("fs.defaultFS", "hdfs://hadoopha1")
    hadoopConf.set("dfs.nameservices", "hadoopha1")

    val topics = Array("register_topic")
    val kafkaParams: Map[String, Object] = Map[String, Object](
      "bootstrap.servers" -> "hadoop102:6667,hadoop103:6667,hadoop104:6667",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> groupid,
      // On the very first start of the streaming job, begin at the earliest offset so no data is lost.
      "auto.offset.reset" -> "earliest",
      // true: the consumer's offsets are committed automatically in the background, but data
      //       can be lost if Kafka goes down.
      // false: the offsets have to be maintained manually (done at the end of this method).
      "enable.auto.commit" -> lang.Boolean.FALSE
    )

    // Stateful DStream operations need a checkpoint directory; the state is saved there.
    ssc.checkpoint("/user/atguigu/sparkstreaming/checkpoint")

    // Look up previously stored offsets for this consumer group.
    val storedOffsets = loadStoredOffsets()

    // Resume from the stored offsets when there are any, otherwise subscribe without
    // explicit starting offsets.
    val kafkaStream: InputDStream[ConsumerRecord[String, String]] = {
      val strategy =
        if (storedOffsets.isEmpty) ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
        else ConsumerStrategies.Subscribe[String, String](topics, kafkaParams, storedOffsets)
      KafkaUtils.createDirectStream[String, String](ssc, LocationStrategies.PreferConsistent, strategy)
    }

    // The raw stream cannot be used or printed directly (it raises a serialization error),
    // so it is first converted into plain (label, 1) pairs.
    val platformOnes = kafkaStream
      .map(record => record.value().split("\t"))
      .filter(fields => fields.length == 3)
      .map(fields => (platformLabel(fields(1)), 1))
    platformOnes.cache()
    // e.g. (PC,1),(PC,1),(APP,1),(Other,1),(APP,1),(Other,1),(PC,1),(APP,1)

    // ================= registrations within the last minute, every 6s =================
    // platformOnes.reduceByKeyAndWindow((x: Int, y: Int) => x + y, Seconds(60), Seconds(6)).print()
    // ===================================================================================

    // +++++++++++++++++++++++ real-time registration count (stateful) +++++++++++++++++++
    // New state = sum of the current batch + the previously accumulated count (0 if none).
    val accumulate = (batchValues: Seq[Int], previous: Option[Int]) =>
      Some(previous.getOrElse(0) + batchValues.sum)
    platformOnes.updateStateByKey(accumulate).print()
    // +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++

    /* Alternative kept for reference: prefix the key with a random number, aggregate, then
       strip the prefix and aggregate again.
    val dsStream = stream.filter(item => item.value().split("\t").length == 3)
      .mapPartitions(partitions =>
        partitions.map(item => {
          val rand = new Random()
          val line = item.value()
          val arr = line.split("\t")
          val app_id = arr(1)
          (rand.nextInt(3) + "_" + app_id, 1)
        }))
    val result = dsStream.reduceByKey(_ + _)
    result.map(item => {
      val appid = item._1.split("_")(1)
      (appid, item._2)
    }).reduceByKey(_ + _).print()*/

    // After the business logic, manually record the offsets of each batch in the database.
    kafkaStream.foreachRDD { rdd =>
      val proxy = new SqlProxy()
      val connection = DataSourceUtil.getConnection
      try {
        val ranges: Array[OffsetRange] = rdd.asInstanceOf[HasOffsetRanges].offsetRanges
        ranges.foreach { range =>
          proxy.executeUpdate(connection, "replace into `offset_manager` (groupid,topic,`partition`,untilOffset) values(?,?,?,?)",
            Array(groupid, range.topic, range.partition.toString, range.untilOffset))
        }
      } catch {
        case e: Exception => e.printStackTrace()
      } finally {
        proxy.shutdown(connection)
      }
    }

    ssc.start()
    ssc.awaitTermination()
  }

  /**
   * Reads the offsets stored for this consumer group from `offset_manager`.
   *
   * Columns 2, 3 and 4 of each row are read as topic, partition and offset. Any exception
   * raised while querying is printed and whatever was collected so far is returned.
   *
   * @return the stored offset per topic partition; empty when nothing was read
   */
  private def loadStoredOffsets(): mutable.HashMap[TopicPartition, Long] = {
    val collected = new mutable.HashMap[TopicPartition, Long]()
    val proxy = new SqlProxy()
    val connection = DataSourceUtil.getConnection
    try {
      proxy.executeQuery(connection, "select * from `offset_manager` where groupid=?", Array(groupid), new QueryCallback {
        override def process(rs: ResultSet): Unit = {
          while (rs.next()) {
            collected.put(new TopicPartition(rs.getString(2), rs.getInt(3)), rs.getLong(4))
          }
          rs.close() // close the cursor
        }
      })
    } catch {
      case e: Exception => e.printStackTrace()
    } finally {
      proxy.shutdown(connection)
    }
    collected
  }

  /**
   * Maps the platform code found in a record to its display label.
   *
   * @param code second tab-separated field of a record
   * @return "PC" for "1", "APP" for "2", "Other" for anything else
   */
  private def platformLabel(code: String): String = code match {
    case "1" => "PC"
    case "2" => "APP"
    case _ => "Other"
  }
}