1. 常用的包
from pyspark.sql.types import StringType, IntegerType, DoubleType, FloatType
from pyspark import SparkContext
from pyspark.storagelevel import StorageLevel
from pyspark.sql import HiveContext
from pyspark.sql.functions import countDistinct
2. 基本操作
# Initialize the Spark environment (comments changed from invalid `//` to `#`).
sc = SparkContext(appName="appname")
hiveContext = HiveContext(sc)

# Get a DataFrame from a SQL query.
# NOTE(review): `mysqlstring` is a placeholder for the actual SQL text — define it before use.
df = hiveContext.sql(mysqlstring)

# Derive a new column by applying a UDF.
# NOTE(review): `fun.udf` is a placeholder; a real call would pass a
# pyspark.sql.functions.udf-wrapped function applied to a column.
df = df.withColumn('new_name', fun.udf)

# Persist in memory and force materialization with count().
_ = df.persist(storageLevel=StorageLevel.MEMORY_ONLY).count()

# Filter rows with a SQL-style predicate string.
df = df.filter('day=20200912 and day is not null')

# Group by the key columns and count `pid` per group.
# BUG FIX: original was `df.filter.groupby(...)` — `filter` is a method, so
# `.groupby` on it raises AttributeError; the filter was already applied above,
# so groupby must be called on `df` directly.
df_group = df.groupby(key_list).agg({"pid": "count"})
df_group = df_group.withColumnRenamed("count(pid)", "pid_num")

# Left outer join on the given key columns.
df = df1.join(df2, [key1, key2], "left_outer")

# Union two DataFrames (matches columns by position, not by name).
df = df1.union(df2)