該文主要展示的是spark sql 例子
(內(nèi)容是找了份oracle的例子含衔,翻譯成spark sql的)
1、需要準(zhǔn)備好四張表,既四個文本文件逗號分隔
2原献、為這四張表創(chuàng)建好schema态坦,并注冊成表
3盐数、時間處理有小部分改動
準(zhǔn)備的四張表
image
image
image
image
四張表中的數(shù)據(jù)
Teacher
884,zhagnxueyou,female,1980/2/10,Assistant professor,department of computer
856,liudehua,female,1980/12/10,professor,department of electronic engineering
825,liming,female,1980/1/10,advandced professor,department of computer
831,zhourunfa,female,1981/12/10,jonior professor,department of electronic engineering
Student
108,ceng hua,male,1977/9/1,95033
105,zhangsan,male,1975/10/22,95031
107,wangli,female,1976/1/8,95033
101,lili,male,1978/5/14,95033
109,wangfang,female,1979/5/10,95031
103,lijun,male,1974/5/21,95031
Score
103,3-245,86
105,3-245,75
109,3-245,68
103,3-105,92
105,3-105,88
109,3-105,76
101,3-105,64
107,3-105,91
108,3-105,78
101,6-166,85
107,6-166,79
108,6-166,81
Course
3-105,introduction to computer,825
3-245,the operating system,804
6-166,the digital circuit,856
9-888,higher math,831
maven
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-core_2.11</artifactId>
<version>2.3.1</version>
</dependency>
<!-- https://mvnrepository.com/artifact/org.apache.spark/spark-sql -->
<dependency>
<groupId>org.apache.spark</groupId>
<artifactId>spark-sql_2.11</artifactId>
<version>2.3.1</version>
</dependency>
例子代碼,粘貼可用伞梯,注意注釋掉不需要的地方即可
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.types._
import scala.collection.mutable
import java.text.SimpleDateFormat
/**
* 2017/05/26
* CDP
*/
object SparkSqlExample1 {
def main(args: Array[String]): Unit = {
/** ***************************************************************************************************************
* sparksession
*/
val spark = SparkSession
.builder()
.master("local")
.appName("test")
.config("spark.sql.shuffle.partitions", "5")
.getOrCreate()
/** ***************************************************************************************************************
* 表結(jié)構(gòu)
*/
val StudentSchema: StructType = StructType(mutable.ArraySeq( //學(xué)生表
StructField("Sno", StringType, nullable = false), //學(xué)號
StructField("Sname", StringType, nullable = false), //學(xué)生姓名
StructField("Ssex", StringType, nullable = false), //學(xué)生性別
StructField("Sbirthday", StringType, nullable = true), //學(xué)生出生年月
StructField("SClass", StringType, nullable = true) //學(xué)生所在班級
))
val CourseSchema: StructType = StructType(mutable.ArraySeq( //課程表
StructField("Cno", StringType, nullable = false), //課程號
StructField("Cname", StringType, nullable = false), //課程名稱
StructField("Tno", StringType, nullable = false) //教工編號
))
val ScoreSchema: StructType = StructType(mutable.ArraySeq( //成績表
StructField("Sno", StringType, nullable = false), //學(xué)號(外鍵)
StructField("Cno", StringType, nullable = false), //課程號(外鍵)
StructField("Degree", IntegerType, nullable = true) //成績
))
val TeacherSchema: StructType = StructType(mutable.ArraySeq( //教師表
StructField("Tno", StringType, nullable = false), //教工編號(主鍵)
StructField("Tname", StringType, nullable = false), //教工姓名
StructField("Tsex", StringType, nullable = false), //教工性別
StructField("Tbirthday", StringType, nullable = true), //教工出生年月
StructField("Prof", StringType, nullable = true), //職稱
StructField("Depart", StringType, nullable = false) //教工所在部門
))
/** ***************************************************************************************************************
* 獲取當(dāng)前時間函數(shù)
*/
def getDate(time: String) = {
val now: Long=System.currentTimeMillis()
var df: SimpleDateFormat = new SimpleDateFormat(time)
df.format(now)
}
/** ***************************************************************************************************************
* 讀取數(shù)據(jù)
*/
val StudentData = spark.sparkContext.textFile("input/sqltable/Student").map{
lines =>
val line = lines.split(",")
Row(line(0),line(1),line(2),line(3),line(4))
}
val CourseData = spark.sparkContext.textFile("input/sqltable/Course").map{
lines =>
val line = lines.split(",")
Row(line(0),line(1),line(2))
}
val ScoreData = spark.sparkContext.textFile("input/sqltable/Score").map{
lines =>
val line = lines.split(",")
Row(line(0),line(1),line(2).toInt)
}
val TeacherData = spark.sparkContext.textFile("input/sqltable/Teacher").map{
lines =>
val line = lines.split(",")
Row(line(0),line(1),line(2),line(3),line(4),line(5))
}
/** ***************************************************************************************************************
* 轉(zhuǎn)換成表
*/
val StudentTable = spark.createDataFrame(StudentData, StudentSchema)
StudentTable.createOrReplaceTempView("Student")
val CourseTable = spark.createDataFrame(CourseData, CourseSchema)
CourseTable.createOrReplaceTempView("Course")
val ScoreTable = spark.createDataFrame(ScoreData, ScoreSchema)
ScoreTable.createOrReplaceTempView("Score")
val TeacherTable = spark.createDataFrame(TeacherData, TeacherSchema)
TeacherTable.createOrReplaceTempView("Teacher")
/** ***************************************************************************************************************
* 走sql節(jié)奏
* 表名玫氢,字段名,區(qū)分大小寫
*/
////1谜诫、 查詢Student表中的所有記錄的Sname漾峡、Ssex和Class列。
spark.sql("SELECT sname, ssex, sclass FROM Student").show()
////2喻旷、 查詢教師所有的單位即不重復(fù)的Depart列生逸。
spark.sql("SELECT DISTINCT depart FROM Teacher").show()
////3、 查詢Student表的所有記錄
spark.sql("SELECT * FROM Student").show()
////4且预、 查詢Score表中成績在60到80之間的所有記錄槽袄。
//spark.sql("SELECT * FROM Score WHERE degree BETWEEN 60 and 80").show()
spark.sql("SELECT * FROM Score WHERE degree >= 60 and degree <= 80").show()
////5、 查詢Score表中成績?yōu)?5锋谐,86或88的記錄遍尺。
spark.sql("SELECT * FROM Score WHERE degree = '85' OR degree = '86' OR degree = '88'").show()
////6、 查詢Student表中“95031”班或性別為“女”的同學(xué)記錄涮拗。
spark.sql("SELECT * FROM Student WHERE sclass = '95031' OR ssex = 'female'").show()
////7乾戏、 以Class降序,升序查詢Student表的所有記錄迂苛。
spark.sql("SELECT * FROM Student ORDER BY sclass DESC").show()
spark.sql("SELECT * FROM Student ORDER BY sclass").show()
////8、 以Cno升序鼓择、Degree降序查詢Score表的所有記錄三幻。
spark.sql("SELECT * FROM Score t ORDER BY t.sno ASC, t.degree DESC").show()
////9、 查詢“95031”班的學(xué)生人數(shù)惯退。
spark.sql("SELECT t.sclass totalnum FROM Student t WHERE sclass = '95031'").show()
spark.sql("SELECT t.sclass AS totalnum FROM Student t WHERE sclass = '95031'").show()
////10赌髓、 查詢Score表中的最高分的學(xué)生學(xué)號和課程號。(子查詢或者排序)
//// oracle => WHERE rownum = 1
//// spark sql => LIMIT 1
spark.sql("SELECT * FROM (SELECT * FROM Score ORDER BY degree DESC LIMIT 1)").show()
spark.sql("SELECT t.sno, t.cno FROM Score t ORDER BY degree DESC").show()
spark.sql("SELECT * FROM Score WHERE degree IN(SELECT MAX(degree) FROM Score t)").show()
////11催跪、 查詢每門課的平均成績锁蠕。
spark.sql("SELECT AVG(degree) average FROM Score t WHERE cno = '3-245'").show()
spark.sql("SELECT AVG(degree) average FROM Score WHERE cno = '3-105'").show()
spark.sql("SELECT AVG(degree) average FROM Score WHERE cno = '6-166'").show()
spark.sql("SELECT cno, AVG(degree) FROM Score t GROUP BY cno").show()
////12、查詢Score表中至少有5名學(xué)生選修的并以3開頭的課程的平均分?jǐn)?shù)懊蒸。
spark.sql("SELECT cno, AVG(degree) FROM Score WHERE cno LIKE '3%' GROUP BY cno HAVING COUNT(1) >= 5").show()
////13荣倾、查詢分?jǐn)?shù)大于70,小于90的Sno列骑丸。
spark.sql("SELECT sno FROM Score WHERE degree BETWEEN 70 AND 90").show()
////14舌仍、查詢所有學(xué)生的Sname、Cno和Degree列通危。
spark.sql("SELECT s.sname, t.cno, t.degree FROM Score t, Student s WHERE t.sno = s.sno").show()
spark.sql("SELECT s.sname, t.cno, t.degree FROM Score t JOIN Student s ON t.sno = s.sno").show()
////15铸豁、查詢所有學(xué)生的Sno、Cname和Degree列菊碟。
spark.sql("SELECT s.sname, t.cno, t.degree FROM Score t JOIN Student s ON t.sno = s.sno").show()
////16节芥、查詢所有學(xué)生的Sname、Cname和Degree列逆害。
spark.sql("SELECT s.sname, t.degree, c.cname FROM Score t, Student s, Course c WHERE t.sno = s.sno AND t.cno = c.cno").show()
spark.sql("SELECT s.sname, t.degree, c.cname FROM Score t " +
"JOIN Student s on t.sno = s.sno " +
"JOIN Course c on c.cno = t.cno").show()
////17头镊、 查詢“95033”班學(xué)生的平均分。
spark.sql("SELECT AVG(degree) average FROM Score WHERE sno IN (SELECT sno FROM Student WHERE sclass = '95033')").show()
////19魄幕、 查詢選修“3-105”課程的成績高于“109”號同學(xué)成績的所有同學(xué)的記錄相艇。
spark.sql("SELECT * FROM Score WHERE cno = '3-105' AND degree > (SELECT degree FROM score WHERE sno = '109' AND cno = '3-105')").show()
////20、查詢score中選學(xué)多門課程的同學(xué)中分?jǐn)?shù)為非最高分成績的記錄纯陨。
spark.sql("SELECT * FROM Score WHERE sno IN " +
"(SELECT sno FROM Score t GROUP BY t.sno HAVING COUNT(1) > 1) AND degree != (SELECT MAX(degree) FROM Score)").show()
spark.sql("SELECT * FROM Score WHERE degree != (SELECT MAX(degree) FROM Score)").show()
////21坛芽、 查詢成績高于學(xué)號為“109”、課程號為“3-105”的成績的所有記錄队丝。
spark.sql("SELECT * FROM Score t WHERE t.degree > (SELECT degree FROM Score WHERE sno = '109' AND cno = '3-105')").show()
////22靡馁、查詢和學(xué)號為108的同學(xué)同年出生的所有學(xué)生的Sno、Sname和Sbirthday列机久。
//// oracle => to_char(t.sbirthday,'yyyy')
//// spark sql => substring(t.sbirthday, 0, 4)
spark.sql("SELECT sno, sname , sbirthday " +
"FROM Student " +
"WHERE substring(sbirthday, 0, 4) = ( " +
"SELECT substring(t.sbirthday, 0, 4) " +
"FROM Student t " +
"WHERE sno = '108')").show()
////23臭墨、查詢“張旭“教師任課的學(xué)生成績。
spark.sql("SELECT t.tno, c.cno, c.cname, s.degree FROM Teacher t " +
"JOIN Course c ON t.tno = c.tno " +
"JOIN Score s ON c.cno = s.cno WHERE t.tname = 'Zhang xu'").show()
////24膘盖、查詢選修某課程的同學(xué)人數(shù)多于5人的教師姓名胧弛。
spark.sql("SELECT tname FROM Teacher e " +
"JOIN Course c ON e.tno = c.tno " +
"JOIN(SELECT cno FROM Score GROUP BY cno HAVING COUNT(cno) > 5) t ON c.cno = t.cno").show()
////25尤误、查詢95033班和95031班全體學(xué)生的記錄。
spark.sql("SELECT * FROM Student WHERE sclass IN('95031', '95033')").show()
spark.sql("SELECT * FROM Student WHERE sclass LIKE '9503%'").show()
////26结缚、 查詢存在有85分以上成績的課程Cno.
spark.sql("SELECT cno FROM Score WHERE degree > 85 GROUP BY cno").show()
////27损晤、查詢出“計算機系“教師所教課程的成績表。
spark.sql("SELECT t.sno, t.cno, t.degree FROM Score t " +
"JOIN Course c ON t.cno = c.cno " +
"JOIN Teacher e ON c.tno = e.tno WHERE e.depart = 'department of computer'").show()
////29红竭、查詢選修編號為“3-105“課程且成績至少高于選修編號為“3-245”的同學(xué)的Cno尤勋、Sno和Degree,并按Degree從高到低次序排序。
spark.sql("SELECT t.sno, t.cno, degree " +
"FROM SCORE t " +
"WHERE degree > ( " +
"SELECT MIN(degree) " +
"FROM score " +
"WHERE cno = '3-245' " +
") " +
"AND t.cno = '3-105' " +
"ORDER BY degree DESC").show()
////30茵宪、查詢選修編號為“3-105”且成績高于選修編號為“3-245”課程的同學(xué)的Cno最冰、Sno和Degree.
// oracle方式 spark.sql("select t.sno, t.cno, t.degree from SCORE t where t.degree > select degree from score where cno='3-245' or cno='3-105'").show()
spark.sql("SELECT t.sno, t.cno, t.degree FROM Score t WHERE t.degree > (SELECT MAX(degree) FROM Score WHERE cno = '3-245' ) AND t.cno = '3-105'").show()
////31、 查詢所有教師和同學(xué)的name稀火、sex和birthday.
spark.sql("SELECT sname, ssex, sbirthday FROM Student " +
"UNION SELECT tname, tsex, tbirthday FROM Teacher").show()
//// 32暖哨、查詢所有“女”教師和“女”同學(xué)的name、sex和birthday. union
spark.sql("SELECT sname, ssex, sbirthday " +
"FROM Student " +
"WHERE ssex = 'female' " +
"UNION " +
"SELECT tname, tsex, tbirthday " +
"FROM Teacher " +
"WHERE tsex = 'female'").show()
////33凰狞、 查詢成績比該課程平均成績低的同學(xué)的成績表篇裁。
spark.sql("SELECT s.* " +
"FROM score s " +
"WHERE s.degree < ( " +
"SELECT AVG(degree) " +
"FROM score c " +
"WHERE s.cno = c.cno)").show()
////34、 查詢所有任課教師的Tname和Depart. in
spark.sql("SELECT tname, depart " +
"FROM teacher t " +
"WHERE t.tno IN ( " +
"SELECT tno " +
"FROM course c " +
"WHERE c.cno IN (" +
"SELECT cno " +
"FROM score))").show()
////35 赡若、 查詢所有未講課的教師的Tname和Depart. not in
spark.sql("SELECT tname, depart " +
"FROM teacher t " +
"WHERE t.tno NOT IN ( " +
"SELECT tno " +
"FROM course c " +
"WHERE c.cno IN ( " +
"SELECT cno " +
"FROM score))").show()
////36达布、查詢至少有2名男生的班號。 group by, having count
spark.sql("SELECT SClass " +
"FROM Student t " +
"WHERE Ssex = 'male' " +
"GROUP BY SClass " +
"HAVING COUNT(Ssex) >= 2").show()
////37逾冬、查詢Student表中不姓“王”的同學(xué)記錄往枣。 not like
spark.sql("SELECT * FROM Student t WHERE Sname NOT LIKE('Wang%')").show()
////38、查詢Student表中每個學(xué)生的姓名和年齡粉渠。
////將函數(shù)運用到spark sql中去計算,可以直接拿String的類型計算不需要再轉(zhuǎn)換成數(shù)值型 默認(rèn)是會轉(zhuǎn)換成Double類型計算
spark.sql("SELECT Sname, ("+ getDate("yyyy") +" - substring(sbirthday, 0, 4)) AS age FROM STUDENT t").show()
////浮點型轉(zhuǎn)整型
spark.sql("SELECT Sname, (CAST("+ getDate("yyyy") +" AS INT) - CAST(substring(sbirthday, 0, 4) AS INT)) AS age " +
"FROM Student t").show()
////39圾另、查詢Student表中最大和最小的Sbirthday日期值霸株。 時間格式最大值,最小值
spark.sql("SELECT MAX(t.sbirthday) AS maximum FROM Student t").show()
spark.sql("SELECT MIN(t.sbirthday) AS minimum FROM Student t").show()
////40、以班號和年齡從大到小的順序查詢Student表中的全部記錄集乔。 查詢結(jié)果排序
spark.sql("SELECT * " +
"FROM Student " +
"ORDER BY SClass DESC, CAST("+ getDate("yyyy") +" AS INT) - CAST(substring(Sbirthday, 0, 4) AS INT) DESC").show()
////41去件、查詢“男”教師及其所上的課程。 select join
spark.sql("SELECT TSex, CName " +
"FROM Teacher t " +
"JOIN course c ON t.tno = c.tno " +
"WHERE TSex = 'male'").show()
////42扰路、查詢最高分同學(xué)的Sno尤溜、Cno和Degree列。 子查詢
spark.sql("SELECT * " +
"FROM Score " +
"WHERE degree = ( " +
"SELECT MAX(degree) " +
"FROM SCORE t)").show()
////43汗唱、查詢和“李軍”同性別的所有同學(xué)的Sname.
spark.sql("SELECT sname " +
"FROM STUDENT t " +
"WHERE ssex IN ( " +
"SELECT ssex " +
"FROM student " +
"WHERE sname = 'Liu Jun')").show()
////44宫莱、查詢和“李軍”同性別并同班的同學(xué)Sname.
spark.sql("SELECT sname " +
"FROM Student t " +
"WHERE ssex IN ( " +
"SELECT ssex " +
"FROM student " +
"WHERE sname = 'Liu Jun') " +
"AND sclass IN (SELECT sclass " +
"FROM student " +
"WHERE sname = 'Liu Jun')").show()
////45、查詢所有選修“計算機導(dǎo)論”課程的“男”同學(xué)的成績表哩罪。
spark.sql("SELECT t.sno, t.cno, t.degree " +
"FROM Score t " +
"JOIN Course c ON t.cno = c.cno " +
"JOIN Student s ON s.sno = t.sno " +
"WHERE s.SSex = 'male' " +
"AND c.CName = 'Introduction to computer'").show()
}
}