Hadoop2.7.4+Spark2.2.0滴滴云分布式集群搭建过程
使用IDEA+sbt构建Scala+Spark应用，统计英文词频
代码很简单
import org.apache.spark.{SparkConf, SparkContext}
object WordCount {
  /** Word-count driver: reads a text file, splits lines on whitespace and
    * punctuation, counts occurrences of each word, and writes the counts
    * out as text.
    *
    * args(0) = input path (e.g. an HDFS file)
    * args(1) = output directory
    */
  def main(args: Array[String]): Unit = {
    // Fail fast with a usage hint instead of an ArrayIndexOutOfBoundsException.
    require(args.length >= 2, "usage: WordCount <input-path> <output-dir>")

    val conf = new SparkConf().setAppName("wordcount")
    val sc = new SparkContext(conf)
    try {
      // Read the input file named by the first argument.
      val input = sc.textFile(args(0))
      // Split each line into words. NOTE: '-' is placed LAST in the character
      // class; in the original pattern `"-+` formed an accidental range
      // ('"' .. '+'), so a literal hyphen was never treated as a delimiter.
      val words = input
        .flatMap(_.split("[ ,.'?/\\|><:;\"+_=()*&^%$#@!`~-]+"))
        // Drop empty tokens produced when a line starts with a delimiter,
        // so "" is never counted as a word.
        .filter(_.nonEmpty)
      // Classic map-reduce word count: (word, 1) pairs summed per key.
      val counts = words.map((_, 1)).reduceByKey(_ + _)
      // saveAsTextFile returns Unit — do not bind it to an unused val.
      counts.saveAsTextFile(args(1))
    } finally {
      // Release cluster resources even if the job fails.
      sc.stop()
    }
  }
}
打包成wordcount.jar，上传到Master
# -r is required: spark-2.2.0-bin-hadoop2.7 is a directory, and scp without
# -r fails with "not a regular file".
# NOTE(review): the preceding text says the packaged wordcount.jar is uploaded;
# confirm whether this line should copy the jar rather than the Spark distribution.
scp -r /opt/spark-2.2.0-bin-hadoop2.7 dc2-user@116.85.9.118:
# Submit the job to the standalone Spark master. The last two arguments are
# forwarded to WordCount.main as args(0) (input file) and args(1) (output dir).
# NOTE(review): the master IP 114.55.246.88 differs from the scp target above
# (116.85.9.118) — confirm which host is actually the Master.
# NOTE(review): the output directory presumably must not already exist
# (saveAsTextFile refuses to overwrite) — verify before re-running.
spark-submit --master spark://114.55.246.88:7077 --class \
WordCount wordcount.jar \
hdfs://Master:9000/Hadoop/Input/Jane.txt \
hdfs://Master:9000/Hadoop/Output