Developing the flow-sorting program: take the summary file generated by the previous job and sort its records by total traffic from high to low.
Because the data in a map task's final output file is already sorted (by default it is merge-sorted by key), the data handed to the reduce task is sorted as well. So we can emit the bean as the key and the phone number as the value. Since the beans need to be ordered by total traffic, FlowBean should implement the WritableComparable interface rather than Writable, overriding the compareTo method.
import java.io.DataInput;
import java.io.DataOutput;
import java.io.IOException;

import org.apache.hadoop.io.WritableComparable;

public class FlowBean implements WritableComparable<FlowBean> {

    private long upFlow;    // upstream traffic
    private long downFlow;  // downstream traffic
    private long totalFlow; // total traffic

    // Descending order by total traffic; ties compare equal, so records with
    // the same total are grouped into a single reduce call
    @Override
    public int compareTo(FlowBean bean) {
        return Long.compare(bean.totalFlow, this.totalFlow);
    }

    // A no-arg constructor is required for deserialization
    public FlowBean() {
    }

    public FlowBean(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.totalFlow = upFlow + downFlow;
    }

    public void setFlowBean(long upFlow, long downFlow) {
        this.upFlow = upFlow;
        this.downFlow = downFlow;
        this.totalFlow = upFlow + downFlow;
    }

    // Serialization: Hadoop serialization is simple, just write out the fields to be transferred
    @Override
    public void write(DataOutput out) throws IOException {
        out.writeLong(upFlow);
        out.writeLong(downFlow);
        out.writeLong(totalFlow);
    }

    // Deserialization: note the field order must exactly match the serialization order
    @Override
    public void readFields(DataInput in) throws IOException {
        this.upFlow = in.readLong();
        this.downFlow = in.readLong();
        this.totalFlow = in.readLong();
    }

    // Override toString so output files are readable
    @Override
    public String toString() {
        return upFlow + "\t" + downFlow + "\t" + totalFlow;
    }

    // getters and setters omitted for brevity
}
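As a quick local sanity check (a minimal sketch, not part of the original job; the sample values are made up and FlowBean is assumed to be on the classpath), the comparator can be exercised with an ordinary in-memory sort, since WritableComparable extends Comparable:

import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

public class FlowBeanSortCheck {
    public static void main(String[] args) {
        List<FlowBean> beans = new ArrayList<>();
        beans.add(new FlowBean(100, 200));   // total 300
        beans.add(new FlowBean(500, 1000));  // total 1500
        beans.add(new FlowBean(10, 20));     // total 30
        Collections.sort(beans);             // uses FlowBean.compareTo
        beans.forEach(System.out::println);  // prints totals 1500, 300, 30 -- descending
    }
}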
import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;

public class FlowCountSort {
    /**
     * KEYIN: by default, the starting byte offset of a line of text read by the MR framework, a Long.
     * Hadoop has its own leaner serialization interface (Java's Serializable would serialize the whole
     * class structure, while we only need the data), so we use LongWritable rather than Long.
     * VALUEIN: by default, the content of that line of text, a String; for the same reason, we use Text.
     * KEYOUT: the key of the output data produced by the user-defined logic
     * VALUEOUT: the value of the output data produced by the user-defined logic
     * @author 12706
     *
     */
    static class FlowCountSortMapper extends Mapper<LongWritable, Text, FlowBean, Text> {
        FlowBean flowBean = new FlowBean();
        Text text = new Text();

        @Override
        protected void map(LongWritable key, Text value, Context context)
                throws IOException, InterruptedException {
            String line = value.toString();
            String[] infos = line.split("\t");
            // Phone number
            String phoneNum = infos[0];
            // Upstream and downstream traffic
            String upFlow = infos[1];
            String downFlow = infos[2];
            // Set the fields (the total is computed inside setFlowBean)
            text.set(phoneNum);
            flowBean.setFlowBean(Long.parseLong(upFlow), Long.parseLong(downFlow));
            // The shuffle sorts by key, which is why FlowBean implements WritableComparable
            context.write(flowBean, text);
        }
    }
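To make the parsing concrete, here is a minimal standalone sketch (a hypothetical harness, not part of the job; it assumes FlowBean is on the classpath) that runs the same split-and-parse logic on one line from the summary output shown in the test section below:

public class ParseCheck {
    public static void main(String[] args) {
        String line = "13480253104\t180\t180\t360"; // phone \t up \t down \t total
        String[] infos = line.split("\t");
        FlowBean bean = new FlowBean(Long.parseLong(infos[1]), Long.parseLong(infos[2]));
        System.out.println(infos[0] + " -> " + bean); // 13480253104 -> 180  180  360
    }
}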
    /**
     * KEYIN, VALUEIN correspond to the mapper's output KEYOUT, VALUEOUT types
     * KEYOUT, VALUEOUT: the output data types of the custom reduce logic,
     * here Text (phone number) and FlowBean
     * @author 12706
     *
     */
    static class FlowCountSortReducer extends Reducer<FlowBean, Text, Text, FlowBean> {
        @Override
        protected void reduce(FlowBean key, Iterable<Text> values, Context context)
                throws IOException, InterruptedException {
            // Beans that compare equal (tied total traffic) arrive in a single reduce
            // call, so write out every phone number rather than only the first
            for (Text phone : values) {
                context.write(phone, key);
            }
        }
    }
    /**
     * Acts as a client to the YARN cluster:
     * packages the relevant runtime parameters of the MR job, specifies the jar,
     * and finally submits everything to YARN
     * @author 12706
     * @param args
     * @throws Exception
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        Job job = Job.getInstance(conf);
        job.setJarByClass(FlowCountSort.class);
        // Specify the mapper and reducer classes this job uses
        job.setMapperClass(FlowCountSortMapper.class);
        job.setReducerClass(FlowCountSortReducer.class);
        // Even though the generics declare the types, set them explicitly
        // so the framework does not substitute other types at runtime
        // Mapper output kv types
        job.setMapOutputKeyClass(FlowBean.class);
        job.setMapOutputValueClass(Text.class);
        // Final output kv types
        job.setOutputKeyClass(Text.class);
        job.setOutputValueClass(FlowBean.class);
        // Location of the job's input files
        FileInputFormat.setInputPaths(job, new Path(args[0]));
        // Location of the job's output directory
        FileOutputFormat.setOutputPath(job, new Path(args[1]));
        // Submit the job's configured parameters and the jar containing its classes to YARN
        boolean b = job.waitForCompletion(true);
        System.exit(b ? 0 : 1);
    }
}
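For quick iteration without a cluster, the same driver can also run in-process against the local filesystem (a sketch under assumptions: the configuration keys below are standard Hadoop 2.x names, and the paths are made up):

// Inside main(), before building the Job
Configuration conf = new Configuration();
conf.set("mapreduce.framework.name", "local"); // run MR in-process instead of on YARN
conf.set("fs.defaultFS", "file:///");          // read and write the local filesystem
// ...then set up and submit the Job exactly as above, e.g. with
// args = {"/tmp/flowcount/output", "/tmp/flowcount/sortoutput"}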
Testing:
Package the project as a jar (flowcount.jar), upload it to the Linux machine, and start the Hadoop cluster.
The summary file from the previous job is under /flowcount/output:
[root@mini2 ~]# hadoop fs -cat /flowcount/output/part-r-00000
13480253104 180 180 360
13502468823 7335 110349 117684
13560436666 1116 954 2070
13560439658 2034 5892 7926
13602846565 1938 2910 4848
13660577991 6960 690 7650
13719199419 240 0 240
13726230503 2481 24681 27162
13726238888 2481 24681 27162
13760778710 120 120 240
13826544101 264 0 264
13922314466 3008 3720 6728
13925057413 11058 48243 59301
13926251106 240 0 240
13926435656 132 1512 1644
15013685858 3659 3538 7197
15920133257 3156 2936 6092
15989002119 1938 180 2118
18211575961 1527 2106 3633
18320173382 9531 2412 11943
84138413 4116 1432 5548
[root@mini2 ~]# hadoop jar flowcount.jar com.scu.hadoop.mr.FlowCountSort /flowcount/output /flowcount/sortoutput
...
[root@mini2 ~]# hadoop fs -ls /flowcount/sortoutput
Found 2 items
-rw-r--r-- 2 root supergroup 0 2017-10-13 04:45 /flowcount/sortoutput/_SUCCESS
-rw-r--r-- 2 root supergroup 551 2017-10-13 04:45 /flowcount/sortoutput/part-r-00000
[root@mini2 ~]# hadoop fs -cat /flowcount/sortoutput/part-r-00000
13502468823 7335 110349 117684
13925057413 11058 48243 59301
13726230503 2481 24681 27162
13726238888 2481 24681 27162
18320173382 9531 2412 11943
13560439658 2034 5892 7926
13660577991 6960 690 7650
15013685858 3659 3538 7197
13922314466 3008 3720 6728
15920133257 3156 2936 6092
84138413 4116 1432 5548
13602846565 1938 2910 4848
18211575961 1527 2106 3633
15989002119 1938 180 2118
13560436666 1116 954 2070
13926435656 132 1512 1644
13480253104 180 180 360
13826544101 264 0 264
13719199419 240 0 240
13760778710 120 120 240
13926251106 240 0 240
The records in the output file /flowcount/sortoutput/part-r-00000 are indeed sorted by total traffic from high to low.