【2018-04-09】【2.1版本】spark sql 讀源碼 notes

SparkSession是spark sql的入口類：

// Entry point for Spark SQL: configure the session through SparkSession.builder().
// appName sets the application name and config sets an arbitrary key/value option;
// getOrCreate() hands back the session (per the Spark API docs it reuses an existing
// session when there is one, otherwise it creates a new one).
val spark = SparkSession
  .builder()
  .appName("Spark SQL data sources example")
  .config("spark.some.config.option", "some-value")
  .getOrCreate()

涉及到的這些類源碼需要掌握：

1.SparkSession

2.Builder

3.DataFrameReader：各數據源讀入器，csv,json,jdbc,parquet,orc,text,table

4.DataFrameWriter：數據輸出器，jdbc,json,parquet,text,table,orc,csv

注意數據輸出的四種模式（SaveMode）：追加（Append），覆蓋（Overwrite），報錯（ErrorIfExists），忽略（Ignore）

1.數據源demo

package org.apache.spark.examples.sql

import java.util.Properties

import org.apache.spark.sql.SparkSession

/**
 * Walk-through of the Spark SQL data source API: generic load/save, Parquet,
 * Parquet schema merging, JSON and JDBC.
 *
 * Every example reads from `examples/src/main/resources` and writes its output
 * relative to the current working directory.
 */
object SQLDataSourceExample {

  // NOTE(review): not referenced anywhere inside this object; kept because code
  // outside this file may still depend on it.
  case class Person(name: String, age: Long)

  /**
   * Builds the session, runs each example in order and stops the session.
   *
   * @param args command line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("Spark SQL data sources example")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()

    // Stop the session even when one of the examples throws.
    try {
      runBasicDataSourceExample(spark)
      runBasicParquetExample(spark)
      runParquetSchemaMergingExample(spark)
      runJsonDatasetExample(spark)
      runJdbcDatasetExample(spark)
    } finally {
      spark.stop()
    }
  }

  /**
   * Generic load/save functions, explicit formats, SQL directly on files, and
   * writes that are sorted, bucketed and/or partitioned.
   *
   * @param spark the active session
   */
  private def runBasicDataSourceExample(spark: SparkSession): Unit = {
    // $example on:generic_load_save_functions$
    val usersDF = spark.read.load("examples/src/main/resources/users.parquet")
    usersDF.select("name", "favorite_color").write.save("namesAndFavColors.parquet")
    // $example off:generic_load_save_functions$
    // $example on:manual_load_options$
    val peopleDF = spark.read.format("json").load("examples/src/main/resources/people.json")
    peopleDF.select("name", "age").write.format("parquet").save("namesAndAges.parquet")
    // $example off:manual_load_options$
    // $example on:direct_sql$
    val sqlDF = spark.sql("SELECT * FROM parquet.`examples/src/main/resources/users.parquet`")
    // $example off:direct_sql$
    // $example on:write_sorting_and_bucketing$
    peopleDF.write.bucketBy(42, "name").sortBy("age").saveAsTable("people_bucketed")
    // $example off:write_sorting_and_bucketing$
    // $example on:write_partitioning$
    usersDF.write.partitionBy("favorite_color").format("parquet").save("namesPartByColor.parquet")
    // $example off:write_partitioning$
    // $example on:write_partition_and_bucket$
    // Must start from usersDF: the people data only has `name` and `age`, so
    // partitioning it by `favorite_color` cannot be resolved. The table name is
    // left unchanged so the DROP TABLE below keeps matching.
    usersDF
      .write
      .partitionBy("favorite_color")
      .bucketBy(42, "name")
      .saveAsTable("people_partitioned_bucketed")
    // $example off:write_partition_and_bucket$

    // Clean up the tables created above.
    spark.sql("DROP TABLE IF EXISTS people_bucketed")
    spark.sql("DROP TABLE IF EXISTS people_partitioned_bucketed")
  }

  /**
   * Writes a DataFrame as Parquet, reads it back and queries it through a
   * temporary view.
   *
   * @param spark the active session
   */
  private def runBasicParquetExample(spark: SparkSession): Unit = {
    // $example on:basic_parquet_example$
    // Encoders for most common types are automatically provided by importing spark.implicits._
    import spark.implicits._

    val peopleDF = spark.read.json("examples/src/main/resources/people.json")

    // DataFrames can be saved as Parquet files, maintaining the schema information
    peopleDF.write.parquet("people.parquet")

    // Read in the parquet file created above
    // Parquet files are self-describing so the schema is preserved
    // The result of loading a Parquet file is also a DataFrame
    val parquetFileDF = spark.read.parquet("people.parquet")

    // Parquet files can also be used to create a temporary view and then used in SQL statements
    parquetFileDF.createOrReplaceTempView("parquetFile")
    val namesDF = spark.sql("SELECT name FROM parquetFile WHERE age BETWEEN 13 AND 19")
    namesDF.map(attributes => s"Name: ${attributes(0)}").show()
    // +------------+
    // |       value|
    // +------------+
    // |Name: Justin|
    // +------------+
    // $example off:basic_parquet_example$
  }

  /**
   * Writes two Parquet partitions with different columns and reads them back
   * with the `mergeSchema` option enabled.
   *
   * @param spark the active session
   */
  private def runParquetSchemaMergingExample(spark: SparkSession): Unit = {
    // $example on:schema_merging$
    // This is used to implicitly convert an RDD to a DataFrame.
    import spark.implicits._

    // Create a simple DataFrame, store into a partition directory
    val squaresDF = spark.sparkContext.makeRDD(1 to 5).map(i => (i, i * i)).toDF("value", "square")
    squaresDF.write.parquet("data/test_table/key=1")

    // Create another DataFrame in a new partition directory,
    // adding a new column and dropping an existing column
    val cubesDF = spark.sparkContext.makeRDD(6 to 10).map(i => (i, i * i * i)).toDF("value", "cube")
    cubesDF.write.parquet("data/test_table/key=2")

    // Read the partitioned table
    val mergedDF = spark.read.option("mergeSchema", "true").parquet("data/test_table")
    mergedDF.printSchema()

    // The final schema consists of all 3 columns in the Parquet files together
    // with the partitioning column appeared in the partition directory paths
    // root
    //  |-- value: int (nullable = true)
    //  |-- square: int (nullable = true)
    //  |-- cube: int (nullable = true)
    //  |-- key: int (nullable = true)
    // $example off:schema_merging$
  }

  /**
   * Loads JSON from a path and from an in-memory Dataset[String], then queries
   * the former through a temporary view.
   *
   * @param spark the active session
   */
  private def runJsonDatasetExample(spark: SparkSession): Unit = {
    // $example on:json_dataset$
    // Primitive types (Int, String, etc) and Product types (case classes) encoders are
    // supported by importing this when creating a Dataset.
    import spark.implicits._

    // A JSON dataset is pointed to by path.
    // The path can be either a single text file or a directory storing text files
    val path = "examples/src/main/resources/people.json"
    val peopleDF = spark.read.json(path)

    // The inferred schema can be visualized using the printSchema() method
    peopleDF.printSchema()
    // root
    //  |-- age: long (nullable = true)
    //  |-- name: string (nullable = true)

    // Creates a temporary view using the DataFrame
    peopleDF.createOrReplaceTempView("people")

    // SQL statements can be run by using the sql methods provided by spark
    val teenagerNamesDF = spark.sql("SELECT name FROM people WHERE age BETWEEN 13 AND 19")
    teenagerNamesDF.show()
    // +------+
    // |  name|
    // +------+
    // |Justin|
    // +------+

    // Alternatively, a DataFrame can be created for a JSON dataset represented by
    // a Dataset[String] storing one JSON object per string
    val otherPeopleDataset = spark.createDataset(
      """{"name":"Yin","address":{"city":"Columbus","state":"Ohio"}}""" :: Nil)
    val otherPeople = spark.read.json(otherPeopleDataset)
    otherPeople.show()
    // +---------------+----+
    // |        address|name|
    // +---------------+----+
    // |[Columbus,Ohio]| Yin|
    // +---------------+----+
    // $example off:json_dataset$
  }

  /**
   * Reads from and writes to a JDBC source, using both the generic
   * format/option API and the dedicated `jdbc` methods.
   *
   * @param spark the active session
   */
  private def runJdbcDatasetExample(spark: SparkSession): Unit = {
    // $example on:jdbc_dataset$
    // Note: JDBC loading and saving can be achieved via either the load/save or jdbc methods
    // Loading data from a JDBC source
    val jdbcDF = spark.read
      .format("jdbc")
      .option("url", "jdbc:postgresql:dbserver")
      .option("dbtable", "schema.tablename")
      .option("user", "username")
      .option("password", "password")
      .load()

    // setProperty keeps both key and value typed as String, unlike the inherited put.
    val connectionProperties = new Properties()
    connectionProperties.setProperty("user", "username")
    connectionProperties.setProperty("password", "password")
    val jdbcDF2 = spark.read
      .jdbc("jdbc:postgresql:dbserver", "schema.tablename", connectionProperties)

    // Saving data to a JDBC source
    jdbcDF.write
      .format("jdbc")
      .option("url", "jdbc:postgresql:dbserver")
      .option("dbtable", "schema.tablename")
      .option("user", "username")
      .option("password", "password")
      .save()

    jdbcDF2.write
      .jdbc("jdbc:postgresql:dbserver", "schema.tablename", connectionProperties)

    // Specifying create table column data types on write
    jdbcDF.write
      .option("createTableColumnTypes", "name CHAR(64), comments VARCHAR(1024)")
      .jdbc("jdbc:postgresql:dbserver", "schema.tablename", connectionProperties)
    // $example off:jdbc_dataset$
  }
}

2.hive操作的demo

package org.apache.spark.examples.sql.hive

// $example on:spark_hive$
import java.io.File

import org.apache.spark.sql.Row
  import org.apache.spark.sql.SparkSession
  // $example off:spark_hive$
/**
 * Demonstrates Spark SQL with Hive support: creating and loading a Hive table,
 * querying it with HiveQL, and joining it with a temporary view.
 */
object SparkHiveExample {

  // $example on:spark_hive$
  case class Record(key: Int, value: String)
  // $example off:spark_hive$

  /**
   * Creates a Hive-enabled session, runs the example queries and stops the session.
   *
   * @param args command line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // When working with Hive, one must instantiate `SparkSession` with Hive support, including
    // connectivity to a persistent Hive metastore, support for Hive serdes, and Hive user-defined
    // functions. Users who do not have an existing Hive deployment can still enable Hive support.
    // When not configured by the hive-site.xml, the context automatically creates `metastore_db`
    // in the current directory and creates a directory configured by `spark.sql.warehouse.dir`,
    // which defaults to the directory `spark-warehouse` in the current directory that the spark
    // application is started.

    // $example on:spark_hive$
    // warehouseLocation points to the default location for managed databases and tables
    val warehouseLocation = new File("spark-warehouse").getAbsolutePath

    val spark = SparkSession
      .builder()
      .appName("Spark Hive Example")
      .config("spark.sql.warehouse.dir", warehouseLocation)
      .enableHiveSupport()
      .getOrCreate()

    import spark.implicits._
    import spark.sql

    // Stop the session even when one of the statements below throws.
    try {
      sql("CREATE TABLE IF NOT EXISTS src (key INT, value STRING) USING hive")
      sql("LOAD DATA LOCAL INPATH 'examples/src/main/resources/kv1.txt' INTO TABLE src")

      // Queries are expressed in HiveQL
      sql("SELECT * FROM src").show()
      // +---+-------+
      // |key|  value|
      // +---+-------+
      // |238|val_238|
      // | 86| val_86|
      // |311|val_311|
      // ...

      // Aggregation queries are also supported.
      sql("SELECT COUNT(*) FROM src").show()
      // +--------+
      // |count(1)|
      // +--------+
      // |    500 |
      // +--------+

      // The results of SQL queries are themselves DataFrames and support all normal functions.
      val sqlDF = sql("SELECT key, value FROM src WHERE key < 10 ORDER BY key")

      // The items in DataFrames are of type Row, which allows you to access each column by ordinal.
      val stringsDS = sqlDF.map {
        case Row(key: Int, value: String) => s"Key: $key, Value: $value"
      }
      stringsDS.show()
      // +--------------------+
      // |               value|
      // +--------------------+
      // |Key: 0, Value: val_0|
      // |Key: 0, Value: val_0|
      // |Key: 0, Value: val_0|
      // ...

      // You can also use DataFrames to create temporary views within a SparkSession.
      val recordsDF = spark.createDataFrame((1 to 100).map(i => Record(i, s"val_$i")))
      recordsDF.createOrReplaceTempView("records")

      // Queries can then join DataFrame data with data stored in Hive.
      sql("SELECT * FROM records r JOIN src s ON r.key = s.key").show()
      // +---+------+---+------+
      // |key| value|key| value|
      // +---+------+---+------+
      // |  2| val_2|  2| val_2|
      // |  4| val_4|  4| val_4|
      // |  5| val_5|  5| val_5|
      // ...
      // $example off:spark_hive$
    } finally {
      spark.stop()
    }
  }
}

3. RDD與DataFrame相互轉換的demo

// scalastyle:off println
package org.apache.spark.examples.sql

import org.apache.spark.sql.SaveMode
// $example on:init_session$
import org.apache.spark.sql.SparkSession
// $example off:init_session$

/**
 * Row type used by the example below. One method for defining the schema of an RDD is to
 * make a case class with the desired column names and types.
 *
 * @param key   integer key column
 * @param value string value column
 */
final case class Record(key: Int, value: String)

/**
 * Demonstrates moving between local collections / RDDs and DataFrames: registering a
 * temporary view, querying it with SQL and the Scala DSL, and round-tripping through Parquet.
 */
object RDDRelation {

  /**
   * Builds the session, runs the example queries and stops the session.
   *
   * @param args command line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // $example on:init_session$
    val spark = SparkSession
      .builder
      .appName("Spark Examples")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()

    // Importing the SparkSession gives access to all the SQL functions and implicit conversions.
    import spark.implicits._
    // $example off:init_session$

    // Stop the session even when one of the steps below throws.
    try {
      val df = spark.createDataFrame((1 to 100).map(i => Record(i, s"val_$i")))
      // Any RDD containing case classes can be used to create a temporary view.  The schema of the
      // view is automatically inferred using scala reflection.
      df.createOrReplaceTempView("records")

      // Once tables have been registered, you can run SQL queries over them.
      println("Result of SELECT *:")
      spark.sql("SELECT * FROM records").collect().foreach(println)

      // Aggregation queries are also supported.
      val count = spark.sql("SELECT COUNT(*) FROM records").collect().head.getLong(0)
      println(s"COUNT(*): $count")

      // The results of SQL queries are themselves RDDs and support all normal RDD functions. The
      // items in the RDD are of type Row, which allows you to access each column by ordinal.
      val rddFromSql = spark.sql("SELECT key, value FROM records WHERE key < 10")

      println("Result of RDD.map:")
      rddFromSql.rdd.map(row => s"Key: ${row(0)}, Value: ${row(1)}").collect().foreach(println)

      // Queries can also be written using a LINQ-like Scala DSL.
      df.where($"key" === 1).orderBy($"value".asc).select($"key").collect().foreach(println)

      // Write out an RDD as a parquet file with overwrite mode.
      df.write.mode(SaveMode.Overwrite).parquet("pair.parquet")

      // Read in parquet file.  Parquet files are self-describing so the schema is preserved.
      val parquetFile = spark.read.parquet("pair.parquet")

      // Queries can be run using the DSL on parquet files just like the original RDD.
      parquetFile.where($"key" === 1).select($"value".as("a")).collect().foreach(println)

      // These files can also be used to create a temporary view.
      parquetFile.createOrReplaceTempView("parquetFile")
      spark.sql("SELECT * FROM parquetFile").collect().foreach(println)
    } finally {
      spark.stop()
    }
  }
}
// scalastyle:on println

©著作權歸作者所有，轉載或內容合作請聯系作者

人面猴
序言：七十年代末刷允，一起剝皮案震驚了整個濱河市，隨后出現(xiàn)的幾起案子，更是在濱河造成了極大的恐慌兔毒，老刑警劉巖，帶你破解...
沈念sama閱讀 211,348評論 6贊 491
死咒
序言：濱河連續(xù)發(fā)生了三起死亡事件甸箱，死亡現(xiàn)場離奇詭異育叁，居然都是意外死亡，警方通過查閱死者的電腦和手機(jī)芍殖，發(fā)現(xiàn)死者居然都...
沈念sama閱讀 90,122評論 2贊 385
救了他兩次的神仙讓他今天三更去死
文/潘曉璐我一進(jìn)店門豪嗽，熙熙樓的掌柜王于貴愁眉苦臉地迎上來，“玉大人豌骏，你說我怎么就攤上這事龟梦。” “怎么了窃躲？”我有些...
開封第一講書人閱讀 156,936評論 0贊 347
道士緝兇錄：失蹤的賣姜人
文/不壞的土叔我叫張陵计贰，是天一觀的道長。經(jīng)常有香客問我蒂窒，道長躁倒，這世上最難降的妖魔是什么荞怒？我笑而不...
開封第一講書人閱讀 56,427評論 1贊 283
?港島之戀（遺憾婚禮）
正文為了忘掉前任，我火速辦了婚禮秧秉，結(jié)果婚禮上褐桌，老公的妹妹穿的比我還像新娘。我一直安慰自己福贞，他們只是感情好撩嚼，可當(dāng)我...
茶點故事閱讀 65,467評論 6贊 385
惡毒庶女頂嫁案：這布局不是一般人想出來的
文/花漫我一把揭開白布。她就那樣靜靜地躺著挖帘，像睡著了一般完丽。火紅的嫁衣襯著肌膚如雪。梳的紋絲不亂的頭發(fā)上拇舀，一...
開封第一講書人閱讀 49,785評論 1贊 290
城市分裂傳說
那天逻族，我揣著相機(jī)與錄音，去河邊找鬼骄崩。笑死聘鳞，一個胖子當(dāng)著我的面吹牛，可吹牛的內(nèi)容都是我干的要拂。我是一名探鬼主播抠璃，決...
沈念sama閱讀 38,931評論 3贊 406
雙鴛鴦連環(huán)套：你想象不到人心有多黑
文/蒼蘭香墨我猛地睜開眼，長吁一口氣：“原來是場噩夢啊……” “哼脱惰！你這毒婦竟也來了搏嗡？” 一聲冷哼從身側(cè)響起，我...
開封第一講書人閱讀 37,696評論 0贊 266
萬榮殺人案實錄
序言：老撾萬榮一對情侶失蹤拉一，失蹤者是張志新（化名）和其女友劉穎采盒，沒想到半個月后，有當(dāng)?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體蔚润，經(jīng)...
沈念sama閱讀 44,141評論 1贊 303
?護(hù)林員之死
正文獨居荒郊野嶺守林人離奇死亡磅氨，尸身上長有42處帶血的膿包…… 初始之章·張勛以下內(nèi)容為張勛視角年9月15日...
茶點故事閱讀 36,483評論 2贊 327
?白月光啟示錄
正文我和宋清朗相戀三年，在試婚紗的時候發(fā)現(xiàn)自己被綠了嫡纠。大學(xué)時的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片烦租。...
茶點故事閱讀 38,625評論 1贊 340
活死人
序言：一個原本活蹦亂跳的男人離奇死亡，死狀恐怖除盏，靈堂內(nèi)的尸體忽然破棺而出叉橱，到底是詐尸還是另有隱情，我是刑警寧澤痴颊，帶...
沈念sama閱讀 34,291評論 4贊 329
?日本核電站爆炸內(nèi)幕
正文年R本政府宣布赏迟，位于F島的核電站屡贺，受9級特大地震影響蠢棱，放射性物質(zhì)發(fā)生泄漏锌杀。R本人自食惡果不足惜，卻給世界環(huán)境...
茶點故事閱讀 39,892評論 3贊 312
男人毒藥：我在死后第九天來索命
文/蒙蒙一泻仙、第九天我趴在偏房一處隱蔽的房頂上張望糕再。院中可真熱鬧，春花似錦玉转、人聲如沸突想。這莊子的主人今日做“春日...
開封第一講書人閱讀 30,741評論 0贊 21
一樁弒父案究抓，背后竟有這般陰謀
文/蒼蘭香墨我抬頭看了看天上的太陽猾担。三九已至，卻和暖如春刺下，著一層夾襖步出監(jiān)牢的瞬間绑嘹，已是汗流浹背。一陣腳步聲響...
開封第一講書人閱讀 31,977評論 1贊 265
情欲美人皮
我被黑心中介騙來泰國打工橘茉，沒想到剛下飛機(jī)就差點兒被人妖公主榨干…… 1. 我叫王不留工腋，地道東北人。一個月前我還...
沈念sama閱讀 46,324評論 2贊 360
代替公主和親
正文我出身青樓畅卓，卻偏偏與公主長得像擅腰，于是被迫代替她去往敵國和親。傳聞我的和親對象是個殘疾皇子翁潘，可洞房花燭夜當(dāng)晚...
茶點故事閱讀 43,492評論 2贊 348