在《使用Flink批處理完成數據比對(對賬)一》中,我們只是簡單地實現了F000/F113/F114的情況,如果我的需求場景需要實現F115的場景該怎么辦呢?
編寫代碼
在上一篇文章的基礎上完成代碼如下:
import org.apache.flink.api.java.DataSet;
import org.apache.flink.api.java.ExecutionEnvironment;
import org.apache.flink.api.java.operators.DataSource;
import org.apache.flink.api.java.operators.MapOperator;
import org.apache.flink.table.api.Table;
import org.apache.flink.table.api.java.BatchTableEnvironment;
import java.util.List;
/**
 * <strong>Reconciliation flow</strong>
 *
 * <p>Compares two order data sets (A and B) and classifies every order key into one of four
 * result categories, as computed by {@link #main(String[])}:
 * <ul>
 *   <li>F113 &ndash; key present in A but not in B</li>
 *   <li>F114 &ndash; key present in B but not in A</li>
 *   <li>F115 &ndash; key present on both sides, but the full record (key + compare field) of A
 *       has no identical record in B</li>
 *   <li>F000 &ndash; key present on both sides and not classified as F115</li>
 * </ul>
 *
 * <ol>
 *   <li>Each side's file is loaded twice:
 *     <ul>
 *       <li>all unique fields (e.g. OrderNO) go into one table ("unique" table)</li>
 *       <li>all unique fields plus the compare fields (e.g. OrderNO + OrderMoney) go into a
 *           second table ("compare" table)</li>
 *     </ul>
 *   </li>
 *   <li>Comparison:
 *     <ul>
 *       <li>the difference of the two unique tables yields F113 and F114</li>
 *       <li>the intersection of the two unique tables yields F000 + F115</li>
 *       <li>the difference of the two compare tables yields F113 + F115</li>
 *       <li>the compare field is stripped from F113 + F115 so only the key remains</li>
 *       <li>removing F113 from F113 + F115 yields F115</li>
 *       <li>removing F115 from F000 + F115 yields F000</li>
 *     </ul>
 *   </li>
 * </ol>
 */
public class BatchJob2 {

    /** Separator line printed before each result category. */
    private static final String SEPARATOR = "==============================";

    /**
     * Builds the sample inputs, runs the reconciliation and prints the four result lists.
     *
     * @param args command-line arguments (not used)
     * @throws Exception propagated from the Flink calls made while building and collecting
     *                   the results
     */
    public static void main(String[] args) throws Exception {
        // Batch execution environment and the table environment layered on top of it.
        final ExecutionEnvironment env = ExecutionEnvironment.getExecutionEnvironment();
        BatchTableEnvironment tableEnvironment = BatchTableEnvironment.getTableEnvironment(env);

        // Two hand-built sample data sets; in production read them from whatever source is needed.
        // Records holding only the unique (join) field.
        DataSource<String> uniqueSourceA =
                env.fromElements("orderId_1_f113", "orderId_2_f000", "orderId_3_f115");
        DataSource<String> uniqueSourceB =
                env.fromElements("orderId_2_f000", "orderId_3_f115", "orderId_4_f114");
        // Records holding the unique field and the compare field, joined by ':'.
        DataSource<String> compareSourceA = env.fromElements(
                "orderId_1_f113:payment_1", "orderId_2_f000:payment_2", "orderId_3_f115:payment_33");
        DataSource<String> compareSourceB = env.fromElements(
                "orderId_2_f000:payment_2", "orderId_3_f115:payment_333", "orderId_4_f114:payment_4");

        // Expose the data sets as tables so set operations can be applied to them.
        Table uniqueA = tableEnvironment.fromDataSet(uniqueSourceA);
        Table uniqueB = tableEnvironment.fromDataSet(uniqueSourceB);
        Table compareA = tableEnvironment.fromDataSet(compareSourceA);
        Table compareB = tableEnvironment.fromDataSet(compareSourceB);

        // Core reconciliation logic.
        Table onlyInA = uniqueA.minusAll(uniqueB);               // F113
        Table onlyInB = uniqueB.minusAll(uniqueA);               // F114
        Table onBothSides = uniqueA.intersect(uniqueB);          // F000 + F115
        Table unmatchedRecordsOfA = compareA.minusAll(compareB); // F113 + F115, still with compare field

        // Drop the compare field so the remaining set operations work on keys only.
        Table unmatchedKeysOfA = convert(tableEnvironment, unmatchedRecordsOfA);
        Table differing = unmatchedKeysOfA.minusAll(onlyInA);    // F115
        Table matching = onBothSides.minusAll(differing);        // F000

        DataSet<String> f000 = tableEnvironment.toDataSet(matching, String.class);
        DataSet<String> f113 = tableEnvironment.toDataSet(onlyInA, String.class);
        DataSet<String> f114 = tableEnvironment.toDataSet(onlyInB, String.class);
        DataSet<String> f115 = tableEnvironment.toDataSet(differing, String.class);

        // Output; in production write to whatever sink is needed.
        // All four results are collected before anything is printed.
        List<String> f000Rows = f000.collect();
        List<String> f113Rows = f113.collect();
        List<String> f114Rows = f114.collect();
        List<String> f115Rows = f115.collect();

        printCategory("f000 ->", f000Rows);
        printCategory("f113 ->", f113Rows);
        printCategory("f114 ->", f114Rows);
        printCategory("f115 ->", f115Rows);
    }

    /**
     * Prints the separator line followed by the label and the collected rows.
     *
     * @param label text printed immediately before the rows
     * @param rows  collected result rows of one category
     */
    private static void printCategory(String label, List<String> rows) {
        System.out.println(SEPARATOR);
        System.out.println(label + rows);
    }

    /**
     * Reduces a table of {@code key:compareValue} strings to a table of keys.
     *
     * <p>Each row is split on {@code ':'} and only the first segment is kept.
     *
     * @param tableEnvironment table environment used to convert between table and data set
     * @param inputTable       table whose rows are read as strings
     * @return a table containing the first {@code ':'}-separated segment of every input row
     */
    private static Table convert(BatchTableEnvironment tableEnvironment, Table inputTable) {
        DataSet<String> records = tableEnvironment.toDataSet(inputTable, String.class);
        // Keep the leading segment only, i.e. the key field.
        MapOperator<String, String> keys = records.map(record -> record.split(":")[0]);
        return tableEnvironment.fromDataSet(keys);
    }
}
中間的處理邏輯在代碼中都注釋清楚了。
源碼
總結
需要知道兩邊都有數據(訂單號相同)但存在差異的情況,需要處理的步驟多點。
如果你有更好的想法,歡迎留言,多多指教。
轉載請注明出處