1. Purpose of This Document
Most enterprises today run a Hadoop big data platform, and their ETL jobs write large numbers of files to HDFS every day. To monitor whether that data is still useful and to keep unbounded growth from outrunning physical resources, we have to control cost and make limited resources deliver the full value of big data. This article describes one way to analyze how the files on HDFS change over time, together with the perennial topic of monitoring small files.
2抢肛、實現(xiàn)方式說明
Two approaches were used. The first relies on the HDFS Java API: the FileSystem instance's listStatus method is called recursively to collect every file and directory on HDFS along with its key attributes, such as path, owner, and size. The results are written to a local file, uploaded to HDFS, exposed through an external Hive table, and then analyzed with SQL. The second approach differs only in how the source data is obtained: it uses HDFS's built-in fsimage analysis command, hdfs oiv -i <fsimage file> -o <output file> -p Delimited, which parses the fsimage into a readable CSV-style file; from there the flow is the same, i.e. upload to HDFS, create an external table, and analyze the metrics with SQL. The first approach is covered in detail below.
3藐鹤、代碼講解
3.1 Approach 1: collecting the complete metadata with Java through the HDFS API
package com.mljr.hdfs;

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.net.URI;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileStatus;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;

public class HdfsStatus {

    public static void main(String[] args) {
        FileSystem hdfs = null;
        try {
            Configuration config = new Configuration();
            // Cluster address: the HA nameservice (or the active NameNode host)
            config.set("fs.defaultFS", "hdfs://nameservice1");
            hdfs = FileSystem.get(new URI("hdfs://nameservice1"), config, "hdfs");

            // Start the scan from the HDFS root directory
            Path path = new Path("/");
            String contentCsv = "/tmp/content.csv";

            long startTime = System.currentTimeMillis();
            BufferedOutputStream out =
                    new BufferedOutputStream(new FileOutputStream(new File(contentCsv)));
            iteratorShowFiles(hdfs, path, out);
            out.close();
            long endTime = System.currentTimeMillis();
            long runTime = (endTime - startTime) / 1000 / 60;
            System.out.println("Elapsed time: " + runTime + " min");
        } catch (Exception e) {
            e.printStackTrace();
        } finally {
            if (hdfs != null) {
                try {
                    hdfs.close();
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        }
    }

    /**
     * Recursively walks the directory tree and writes one CSV line per file or directory.
     *
     * @param hdfs the FileSystem instance
     * @param path the path to start the walk from
     * @param out  the output stream the CSV lines are written to
     */
    public static void iteratorShowFiles(FileSystem hdfs, Path path, BufferedOutputStream out) {
        String line = System.getProperty("line.separator");
        try {
            if (hdfs == null || path == null) {
                return;
            }
            // List the direct children of this path
            FileStatus[] files = hdfs.listStatus(path);
            for (int i = 0; i < files.length; i++) {
                try {
                    if (files[i].isDirectory()) {
                        // Directories are written with flag 0 and size 0
                        String text = files[i].getPath().toString().replace("hdfs://nameservice1", "")
                                + "," + files[i].getOwner()
                                + "," + "0"
                                + "," + "0"
                                + "," + files[i].getBlockSize()
                                + "," + files[i].getPermission()
                                + "," + files[i].getAccessTime()
                                + "," + files[i].getModificationTime()
                                + "," + files[i].getReplication() + line;
                        out.write(text.getBytes());
                        // Recurse into the subdirectory
                        iteratorShowFiles(hdfs, files[i].getPath(), out);
                    } else if (files[i].isFile()) {
                        // Files are written with flag 1 and their actual length
                        String text = files[i].getPath().toString().replace("hdfs://nameservice1", "")
                                + "," + files[i].getOwner()
                                + "," + "1"
                                + "," + files[i].getLen()
                                + "," + files[i].getBlockSize()
                                + "," + files[i].getPermission()
                                + "," + files[i].getAccessTime()
                                + "," + files[i].getModificationTime()
                                + "," + files[i].getReplication() + line;
                        out.write(text.getBytes());
                    }
                } catch (Exception e) {
                    e.printStackTrace();
                }
            }
        } catch (Exception e) {
            e.printStackTrace();
        }
    }
}
Next, upload the local file to HDFS and create the Hive external table.
#!/bin/bash
source /etc/profile
cd /home/dmp/hdfs
# generate the HDFS file/directory listing (and node info)
java -cp ./HdfsStatus-1.0-SNAPSHOT.jar com.mljr.hdfs.HdfsStatus
# upload the files to HDFS (the target HDFS directories must be created in advance)
hadoop fs -rm -r /tmp/dfs/content/content.csv /tmp/dfs/nodes/nodes.csv
hadoop fs -put /tmp/content.csv /tmp/dfs/content
Now for the SQL analysis.
# Create the external table
CREATE EXTERNAL TABLE `default.hdfs_info`(
`path` string,
`owner` string,
`is_dir` string,
`filesize` string,
`blocksize` string,
`permission` string,
`acctime` string,
`modificatetime` string,
`replication` string)
ROW FORMAT SERDE
'org.apache.hadoop.hive.serde2.lazy.LazySimpleSerDe'
WITH SERDEPROPERTIES (
'field.delim'=',',
'serialization.format'=',')
STORED AS INPUTFORMAT
'org.apache.hadoop.mapred.TextInputFormat'
OUTPUTFORMAT
'org.apache.hadoop.hive.ql.io.HiveIgnoreKeyTextOutputFormat'
LOCATION
'hdfs://nameservice1/tmp/dfs/content'
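Because this is an external table pointing at /tmp/dfs/content, no separate load step is needed once content.csv has been uploaded. A quick sanity check such as the following (an illustrative query added here, not part of the original article) confirms the rows are visible; recall that the generator writes is_dir='1' for files and '0' for directories:
# Sanity check on the external table
select count(*) as total_entries,
       sum(case when is_dir='1' then 1 else 0 end) as file_entries
from default.hdfs_info;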
# SQL: total size of each first-level directory
select joinedpath, sumsize
from
(
select joinedpath,round(sum(filesize)/1024/1024/1024,2) as sumsize
from
(select concat('/',split(path,'\/')[1]) as joinedpath,accTime,filesize,owner
from default.hdfs_info
)t
group by joinedpath
)h
order by sumsize desc
# SQL: total size of each second-level directory
select joinedpath, sumsize
from
(
select joinedpath,round(sum(filesize)/1024/1024/1024,2) as sumsize
from
(select concat('/',split(path,'\/')[1],'/',split(path,'\/')[2]) as joinedpath,accTime,filesize,owner
from default.hdfs_info
)t
group by joinedpath
)h
order by sumsize desc
### Deeper directory levels follow the same pattern and are not repeated here. Next, the SQL for counting small files under each directory level.
# Count of files smaller than 100 KB under each third-level directory
SELECT concat('/',split(path,'\/')[1],'/',split(path,'\/')[2],'/',split(path,'\/')[3]) as path ,count(*) as small_file_num
FROM
(SELECT relative_size,path
FROM
(SELECT (case filesize < 100*1024 WHEN true THEN 'small' ELSE 'large' end)
AS
relative_size, path
FROM default.hdfs_info WHERE is_dir='1') tmp
WHERE
relative_size='small') tmp2
group by concat('/',split(path,'\/')[1],'/',split(path,'\/')[2],'/',split(path,'\/')[3])
order by small_file_num desc;
### Small-file counts for the other directory levels work the same way. Next, the SQL for table sizes and modification times under a given Hive database.
SELECT joinedpath,
from_unixtime(ceil(acctime/1000),'yyyy-MM-dd HH:mm:ss') AS acctime,
from_unixtime(ceil(modificatetime/1000),'yyyy-MM-dd HH:mm:ss') AS modificatetime,
sumsize
FROM
(SELECT joinedpath,
min(accTime) AS acctime,
max(modificatetime) AS modificatetime,
round(sum(filesize)/1024/1024/1024,2) AS sumsize
FROM
(SELECT concat('/',split(path,'\/')[1],'/',split(path,'\/')[2],'/',split(path,'\/')[3],'/',split(path,'\/')[4],'/',split(path,'\/')[5]) AS joinedpath,
accTime,
modificatetime,
filesize,
OWNER
FROM default.hdfs_info
WHERE concat('/',split(path,'\/')[1],'/',split(path,'\/')[2],'/',split(path,'\/')[3],'/',split(path,'\/')[4])='/user/hive/warehouse/default.db')t
WHERE joinedpath IS NOT NULL
GROUP BY joinedpath)h
ORDER BY sumsize DESC
There is far more on HDFS that can be analyzed; the SQL above is only meant as a starting point, and further analyses can be added later.
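As one more illustration of what this table supports (an example query added here, not one of the original analyses), usage can also be rolled up by owner to see which users hold the most data:
# Total size (GB) and file count per owner
select owner,
       round(sum(filesize)/1024/1024/1024,2) as sumsize_gb,
       count(*) as file_num
from default.hdfs_info
where is_dir='1'
group by owner
order by sumsize_gb desc;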
3.2 The second, fsimage-based approach, implemented mainly as a shell script; the core code is shown below.
prepare_operation()
{
    # get parameters
    t_save_fsimage_path=$1
    # delete any historical fsimage files
    fsimage_tmp_file=`find ${t_save_fsimage_path} -name "fsimage*"`
    if [ ! -z "${fsimage_tmp_file}" ]
    then
        for file in ${fsimage_tmp_file}
        do
            rm -f ${file}
        done
    fi
    # Note: with "set -e", any command that returns non-zero aborts the script, so $? can no
    # longer be inspected afterwards; wrap such commands with || or ! if failures are expected.
}
get_hdfs_fsimage()
{
    # get parameters
    t_save_fsimage_path=$1
    # fetch the latest fsimage from the NameNode
    hdfs dfsadmin -fetchImage ${t_save_fsimage_path}
    # locate the downloaded fsimage file
    t_fsimage_file=`ls ${t_save_fsimage_path}/fsimage*`
    # convert the fsimage into a readable delimited (CSV-style) file
    hdfs oiv -i ${t_fsimage_file} -o ${t_save_fsimage_path}/fsimage.csv -p Delimited
    # drop the header row from fsimage.csv
    sed -i -e "1d" ${t_save_fsimage_path}/fsimage.csv
    # create the target HDFS directory if it does not exist
    hadoop fs -test -e ${t_save_fsimage_path}/fsimage || hdfs dfs -mkdir -p ${t_save_fsimage_path}/fsimage
    # upload fsimage.csv to the target HDFS path
    hdfs dfs -copyFromLocal -f ${t_save_fsimage_path}/fsimage.csv ${t_save_fsimage_path}/fsimage/
}
main()
{
    # start time
    begin_time=`date +%s`
    # local (and HDFS) temporary directory path
    t_save_fsimage_path=/tmp/dfs
    # prepare the temporary directory: remove historical fsimage files, etc.
    prepare_operation ${t_save_fsimage_path}
    # fetch and process the HDFS fsimage
    hdfs_fsimage_update_time=`date "+%Y-%m-%d %H:%M:%S"`
    get_hdfs_fsimage ${t_save_fsimage_path}
    # end time
    end_time=`date +%s`
    # elapsed time in seconds
    result_time=$((end_time-begin_time))
    echo "******************************************************************"
    echo "The script has taken ${result_time} seconds..."
    echo "Result Table: default.hdfs_meta"
    echo "HDFS FSImage update-time before: ${hdfs_fsimage_update_time}"
    echo "******************************************************************"
}

# run the main function
main "$@"
Once the source data is in place, the remaining steps are the same as in the first approach.
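For reference, the external table over the fsimage output can be defined in the same style. The sketch below is only an assumption on my part: it relies on the default tab delimiter of hdfs oiv -p Delimited and on the column layout emitted by Hadoop 2.x, so verify the column list against the header row (the one the script deletes) for your Hadoop version before using it.
# Hypothetical external table over the delimited fsimage output (verify columns per Hadoop version)
CREATE EXTERNAL TABLE `default.hdfs_meta`(
  `path` string,
  `replication` int,
  `modificationtime` string,
  `accesstime` string,
  `preferredblocksize` bigint,
  `blockscount` int,
  `filesize` bigint,
  `nsquota` string,
  `dsquota` string,
  `permission` string,
  `username` string,
  `groupname` string)
ROW FORMAT DELIMITED
  FIELDS TERMINATED BY '\t'
STORED AS TEXTFILE
LOCATION 'hdfs://nameservice1/tmp/dfs/fsimage';
With such a table in place, the same directory-size and small-file queries shown earlier apply, reading filesize and path from default.hdfs_meta instead.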
4勺疼、總結(jié)
There is still a lot of work that can be built on this kind of file and directory analysis: for example, tracking the daily incremental change of each directory level to find out where most of the cluster's growth comes from; analyzing the lifecycle of files on HDFS, e.g. whether a file that has not changed for a long time still has any value; and since Hive tables are ultimately just files on HDFS, small-file analysis can reveal which Hive tables are producing large numbers of small files because they are not using the proper settings. Making sensible use of HDFS storage can save a company a great deal of cost.
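As a rough sketch of the first idea, and purely under my own assumptions (a hypothetical date-partitioned snapshot table default.hdfs_info_daily with partition column ds, loaded from each day's content.csv; the dates below are placeholders), day-over-day growth per first-level directory could be computed like this:
# Day-over-day growth (GB) per first-level directory, assuming daily snapshots
select t1.joinedpath,
       round(t1.sumsize - t2.sumsize, 2) as growth_gb
from
  (select concat('/',split(path,'\/')[1]) as joinedpath,
          sum(filesize)/1024/1024/1024 as sumsize
   from default.hdfs_info_daily where ds='2018-06-02'
   group by concat('/',split(path,'\/')[1])) t1
join
  (select concat('/',split(path,'\/')[1]) as joinedpath,
          sum(filesize)/1024/1024/1024 as sumsize
   from default.hdfs_info_daily where ds='2018-06-01'
   group by concat('/',split(path,'\/')[1])) t2
on t1.joinedpath = t2.joinedpath
order by growth_gb desc;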