// ObjectCount1.scala
package day08
import java.net.URL
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkConf, SparkContext}
/**
 * Counts how often each module (URL) of every subject (URL host) was visited
 * and reports the top 3 modules per subject.
 */
object ObjectCount1 {

  /**
   * Reads a tab-separated access log, counts the requests per URL, groups the URLs by
   * their host and prints, for every host, the three URLs with the highest request count.
   *
   * The URL is taken from the second tab-separated field of each line. Lines with fewer
   * than two fields and URLs that `java.net.URL` cannot parse are skipped instead of
   * failing the whole job.
   *
   * @param args optional command-line arguments; when present, `args(0)` is used as the
   *             input path, otherwise the built-in default path is read
   */
  def main(args: Array[String]): Unit = {
    // Keep the historical hard-coded location as the default so running without
    // arguments behaves as before.
    val inputPath: String =
      args.headOption.getOrElse("D:/teachingprogram/Spark學(xué)習(xí)視頻/day08/access.txt")

    val conf: SparkConf = new SparkConf().setAppName("ObjectCount1").setMaster("local[2]")
    val sc: SparkContext = new SparkContext(conf)

    try {
      // Load the raw log lines.
      val file: RDD[String] = sc.textFile(inputPath)

      // Extract the URL (second field) and pair it with 1.
      // Blank or truncated lines have no second field and are dropped here.
      val urlAndOne: RDD[(String, Int)] = file
        .map(_.split("\t"))
        .filter(_.length > 1)
        .map(fields => (fields(1), 1))

      // Sum the occurrences of identical URLs.
      val sumedUrl: RDD[(String, Int)] = urlAndOne.reduceByKey(_ + _)

      // Attach the host of each URL; unparsable URLs are skipped rather than
      // aborting the job with a MalformedURLException.
      val project: RDD[(String, String, Int)] = sumedUrl.flatMap { case (url, count) =>
        scala.util.Try(new URL(url).getHost).toOption.map(host => (host, url, count))
      }

      // Group by host and keep the three entries with the largest counts per host.
      val res: RDD[(String, List[(String, String, Int)])] =
        project.groupBy(_._1).mapValues(_.toList.sortBy(_._3).reverse.take(3))

      println(res.collect().toBuffer)
    } finally {
      // Always release the SparkContext, even when a stage above fails.
      sc.stop()
    }
  }
}