?If函數: if
?非空查找函數: COALESCE
?條件判斷函數:CASE
? If 函數
語法: if(boolean testCondition, T valueTrue, T valueFalseOrNull)
返回值: T
說明: 當條件testCondition為TRUE時刻蟹,返回valueTrue风皿;否則返回valueFalseOrNull
舉例:
hive> select if(1=2,100,200) from dual;
200
hive> select if(1=1,100,200) from dual;
100
? 非空查找函數 : COALESCE
COALESCE使用時和AS搭配使用而柑,對于合并數據列非常有用鳄哭。
語法: COALESCE(T v1, T v2, …)
返回值: T
說明: 返回參數中的第一個非空值江解;如果所有值都為NULL睛挚,那么返回NULL
舉例:
hive> select COALESCE(null,'100','50′) from dual;
100
條件判斷函數: CASE
語法 : CASE a WHEN b THEN c [WHEN d THEN e]* [ELSE f] END
返回值 : T
說明:如果 a 等于 b ,那么返回 c 蜂大;如果 a 等于 d ,那么返回 e 蝶怔;否則返回 f
舉例:
hive> Select case 100 when 50 then 'tom' when 100 then 'mary' else 'tim' end from dual;
mary
code例子
############################################################################
'''
# function:求取篩選字段
# 功能:求取兩個表的存儲字段奶浦,先取第一個表特有的字段,再取第二個表特有的字段踢星,最后取兩個表字段的交集(如果第一個表的該字段的值為空澳叉,則取第二個該字段的該值)
# 創(chuàng)建時間:2017/08/16
'''
#############################################################################
def colChoose(first_tb,second_tb):
"""兩個表字段的全集等于兩個表之間相互之間的差集和一個交集這三者之和"""
# 構建需要篩選的差集
col_difference_A = str(",".join([tmp for tmp in ['A.' +
i for i in list(set(hiveContext.sql("SELECT * FROM %s"%first_tb).columns). \
difference(set(hiveContext.sql("SELECT * FROM %s"%second_tb).columns)))]]))
col_difference_B = str(",".join([tmp for tmp in ['B.' +
i for i in list(set(hiveContext.sql("SELECT * FROM %s"%second_tb).columns). \
difference(set(hiveContext.sql("SELECT * FROM %s"%first_tb).columns)))]]))
# 求兩個表之間的交集
col_intersection_A = [tmp for tmp in ['A.' + i for i in list(set(hiveContext.sql("SELECT * FROM %s"%first_tb).columns).intersection(
set(hiveContext.sql("SELECT * FROM %s"%second_tb).columns)))]]
col_intersection_B = [tmp for tmp in ['B.' +i for i in list(set(hiveContext.sql("SELECT * FROM %s"%first_tb).columns).intersection(
set(hiveContext.sql("SELECT * FROM %s"%second_tb).columns)))]]
col_intersection = [i for i in list(set(hiveContext.sql("SELECT * FROM %s"%first_tb).columns).intersection(
set(hiveContext.sql("SELECT * FROM %s"%second_tb).columns)))]
# col_intersection_AB = str(",".join([tmp for tmp in ["COALESCE" + str(i) for i in map(None, col_intersection_A,col_intersection_B)]]))
# col_intersection_AB = str(",".join([tmp for tmp in list(map(lambda x,y : x + " " + "AS" + " " + y, ["COALESCE" + str(i) for i in map(None, col_intersection_A,col_intersection_B)],col_intersection))]))
col_intersection_AB = str(",".join([tmp for tmp in list(map(lambda x,y,z : "COALESCE(" + x + "," + y + ")" + " " + "AS" + " " + z, col_intersection_A,col_intersection_B,col_intersection))]))
# col_interset = str(",".join([tmp for tmp in ["IFNULL" + str(i) for i in map(None, col_interset_A,col_interset_B)]])) sql語句中含有IFNULL,hive中并沒有沐悦,hive中用COALESCE成洗。
return col_difference_A,col_difference_B,col_intersection_AB
# 使用示例
# 1. colChoose("tb_source_data.loan_applications","tb_source_data.user_profiles")
# 2. print colChoose("tb_source_data.loan_applications","tb_source_data.user_profiles")
# 3. 其他函數調用:col_difference_A,col_difference_B,col_intersection_AB=colChoose(first_tb,second_tb)
############################################################################
'''
# function:join_two_tables
# 功能:可以用來聚合loan_application和user_profiles的三張表,(包含loan_application&user_profiles,nanyue_loan_application&nanyue_user_profiles,partner_loan_application&partner_user_profiles)
# 創(chuàng)建時間:2017/08/11 修改時間:2017/08/16
'''
#############################################################################
def joinLoanApp_UserProfilesTable(first_tb,second_tb,store_tb,first_tb_on_col,second_tb_on_col,joinway):
begin = datetime.datetime.now()
hiveContext.sql("DROP TABLE IF EXISTS %s PURGE"%store_tb)
col_difference_A, col_difference_B, col_intersection_AB = colChoose(first_tb,second_tb)
sql_join = """
SELECT %s, %s, %s
FROM
(SELECT * FROM
(SELECT *, row_number() OVER (PARTITION BY %s ORDER BY created_at DESC) AS num FROM %s) t1
WHERE t1.num=1 AND %s IS NOT NULL) A
%s JOIN
(SELECT * FROM
(SELECT *, row_number() OVER (PARTITION BY %s ORDER BY created_at DESC) AS num FROM %s) t2
WHERE t2.num=1 AND %s IS NOT NULL) B
ON A.%s=B.%s """%(col_difference_A, col_difference_B, col_intersection_AB, first_tb_on_col, first_tb, first_tb_on_col,joinway, second_tb_on_col, second_tb, second_tb_on_col, first_tb_on_col, second_tb_on_col)
print "-----------建表語句-----------"
print sql_join
print "-----------開始join表-----------"
hiveContext.sql(sql_join).write.saveAsTable("%s"%store_tb)
print "-----------join表結束-----------"
end = datetime.datetime.now()
print "耗時:",end-begin
# 使用示例
# joinLoanApp_UserProfilesTable("tb_source_data.loan_applications","tb_source_data.user_profiles","stage_data.loan_application_join_user_profiles","user_profile_id","id","FULL")