# Counts the files with duplicate names (and how many copies of each exist) inside a storage directory, and writes the statistics to a report file.
#!/usr/bin/python
# -*- coding: UTF-8 -*-
import datetime
import hashlib
import os
import platform
import sys
def CalcFileSha256(filname, bufsize=1024 * 1024 * 16):
    """Return the SHA-256 digest of a file as a hexadecimal string.

    The file is read in chunks of ``bufsize`` bytes, so the whole file is
    never held in memory at once.

    :param filname: path of the file to hash; it is opened in binary mode.
    :param bufsize: number of bytes requested per read (default 16 MiB).
    :return: the hex digest produced by ``hashlib.sha256``.
    :raises ValueError: if ``bufsize`` is not a positive number.
    :raises OSError: if the file cannot be opened or read.
    """
    if bufsize <= 0:
        # read(0) always returns b"" and would silently hash nothing.
        raise ValueError("bufsize must be positive")
    sha256obj = hashlib.sha256()
    with open(filname, "rb") as f:
        # read() returns b"" at end of file, which is the iteration sentinel.
        for data in iter(lambda: f.read(bufsize), b""):
            sha256obj.update(data)
    return sha256obj.hexdigest()
class CoSailTextFile():
    """Small wrapper around a file object that reports failures by return value.

    ``open``, ``writeLine``, ``writeLines``, ``readLine`` and ``close`` print
    the exception and return ``False``/``None`` instead of raising.
    ``readAll`` and ``readLines`` do not catch anything and raise if the file
    is not open.

    The object can be used in a ``with`` statement; leaving the block closes
    the underlying file (the file still has to be opened with ``open``).
    """

    def __init__(self, filename=None):
        """Remember ``filename``; no file is opened yet."""
        self.filename = filename
        # Underlying file object, or None while no file is open.
        self.f = None

    def __enter__(self):
        """Return this object so it can be bound by ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the file on leaving the ``with`` block; never suppress errors."""
        self.close()
        return False

    def open(self, m, encoding=None):
        """Open ``self.filename`` with mode ``m``.

        :param m: mode string passed to the built-in ``open``.
        :param encoding: optional text encoding passed to the built-in
            ``open``; ``None`` keeps the platform default.
        :return: ``True`` on success, ``False`` if opening failed.
        """
        # Release a handle left over from an earlier open() so it is not leaked.
        self.close()
        try:
            self.f = open(self.filename, m, encoding=encoding)
        except Exception as e:
            # Best effort by design: report the problem and signal failure.
            print(e)
            self.f = None
            return False
        return True

    def writeLine(self, line):
        """Write ``line`` followed by a newline.

        :return: ``True`` on success, ``False`` on any failure (including the
            file not being open).
        """
        try:
            self.f.write(line + "\n")
            return True
        except Exception as e:
            # Deliberately broad: also covers self.f being None.
            print(e)
            return False

    def writeLines(self, lines):
        """Write every item of ``lines`` with ``writeLine``.

        :return: ``False`` as soon as one write fails, otherwise ``True``.
        """
        for line in lines:
            if not self.writeLine(line):
                return False
        return True

    def readLine(self):
        """Return the next line, or ``None`` if reading failed."""
        try:
            return self.f.readline()
        except Exception as e:
            print(e)
            return None

    def readAll(self):
        """Return the rest of the file; raises if the file is not open."""
        return self.f.read()

    def readLines(self):
        """Return the remaining lines as a list; raises if the file is not open."""
        return self.f.readlines()

    def close(self):
        """Close the file if it is open.

        :return: ``False`` if closing raised, otherwise ``True`` (also when
            nothing was open).
        """
        if self.f is None:
            return True
        try:
            self.f.close()
            self.f = None
        except Exception as e:
            print(e)
            return False
        return True

    def isOpen(self):
        """Return ``True`` while a file object is held."""
        return self.f is not None
def getFilePathExtend(fileName):
    """Split a path into its directory, extension and base name.

    :param fileName: the path to split.
    :return: a tuple ``(directory, extension, base name)`` where the
        extension is given without its leading dot (empty string when
        ``os.path.splitext`` finds none).
    """
    directory, baseName = os.path.split(fileName)
    suffix = os.path.splitext(fileName)[1]
    # suffix is either "" or starts with "."; drop that dot.
    return directory, suffix[1:], baseName
# Mirror of the result of the most recent top-level scansamenamefile() call.
# Kept so existing references to this module-level name keep working; the scan
# itself no longer accumulates into it across calls.
filesdict = {}


def scansamenamefile(path, result=None):
    """Recursively collect the files under ``path``, grouped by file name.

    Every regular entry found is recorded as a dict with the keys
    ``"filepath"``, ``"size"`` and ``"sha256"``; entries that share the same
    file name end up in the same list. Directories that are symbolic links
    are skipped. A file whose size or hash cannot be read is reported with
    ``print`` and skipped, so one bad entry does not abort the scan.

    :param path: directory to scan.
    :param result: dict to add to. Callers normally omit it; it is passed
        along by the recursive calls. When omitted, a fresh dict is used so
        that separate scans do not leak into each other.
    :return: dict mapping file name to a list of attribute dicts, or
        ``None`` if ``path`` does not exist or cannot be listed.
    """
    topLevel = result is None
    if topLevel:
        result = {}
    if not os.path.exists(path):
        print("Store Path :" + path + " isn't exist!")
        return None
    try:
        entries = os.listdir(path)
    except OSError as e:
        # Covers a vanished directory, a non-directory and missing permission.
        print("Cannot list " + path + " : " + str(e))
        return None
    for entry in entries:
        fullPath = os.path.join(path, entry)
        if os.path.isdir(fullPath):
            if os.path.islink(fullPath):
                # Do not follow directory symlinks.
                continue
            scansamenamefile(fullPath, result)
            continue
        try:
            fileattrdict = {
                "filepath": fullPath,
                "size": os.path.getsize(fullPath),
                "sha256": CalcFileSha256(fullPath),
            }
        except OSError as e:
            # e.g. broken symlink or unreadable file: report and keep going.
            print(e)
            continue
        result.setdefault(entry, []).append(fileattrdict)
    if topLevel:
        filesdict.clear()
        filesdict.update(result)
    return result
def statallfiles(d):
    """Return the total number of file entries in ``d``.

    :param d: dict mapping a file name to a list of entries.
    :return: the sum of the lengths of all lists in ``d``.
    """
    return sum(len(entries) for entries in d.values())
def statsamenamefiles(d):
    """Return how many file names in ``d`` occur at least twice.

    :param d: dict mapping a file name to a list of entries.
    :return: the number of keys whose list has two or more entries.
    """
    return sum(1 for entries in d.values() if len(entries) >= 2)
def outduplicatefilemsg(d, textFile):
    """Write the list of duplicated file names and their copies.

    Writes an empty line and a section header, then, for every file name in
    ``d`` that has two or more entries, an empty line, a summary line and one
    line per entry with its path, size and SHA-256.

    :param d: dict mapping a file name to a list of dicts that have the keys
        ``"filepath"``, ``"size"`` and ``"sha256"``.
    :param textFile: object providing ``writeLine(str)``.
    """
    textFile.writeLine("")
    textFile.writeLine("**************Duplicate file list*****************")
    for name, entries in d.items():
        if len(entries) < 2:
            continue
        textFile.writeLine("")
        textFile.writeLine("file name : " + name + " Duplicate : " + str(len(entries)))
        for entry in entries:
            textFile.writeLine(
                "fileputh :" + entry["filepath"]
                + " size : " + str(entry["size"])
                + " SHA256 = " + entry["sha256"]
            )
def outonefilemsg(d, textFile):
    """Write the list of files whose name occurs exactly once.

    Writes an empty line and a section header, then one line with path and
    size for every file name in ``d`` that has exactly one entry.

    :param d: dict mapping a file name to a list of dicts that have the keys
        ``"filepath"`` and ``"size"``.
    :param textFile: object providing ``writeLine(str)``.
    """
    textFile.writeLine("")
    textFile.writeLine("**************Unique file list*****************")
    for entries in d.values():
        if len(entries) != 1:
            continue
        entry = entries[0]
        textFile.writeLine("fileputh :" + entry["filepath"] + " size : " + str(entry["size"]))
def statpathfileattr(pathlistfilename, logpath):
    """Run the duplicate-file statistics for every directory in a list file.

    :param pathlistfilename: text file that names the directories to process,
        one directory per line. Blank lines are ignored.
    :param logpath: directory the log files are written to. When ``None``,
        the directory that contains ``pathlistfilename`` is used.
    :return: ``None``. Problems (missing list file, missing log directory,
        unreadable list file) are reported with ``print``.
    """
    if logpath is None:
        # A bare file name has an empty dirname, which means the current
        # directory; "" itself would fail the existence check below.
        logpath = os.path.dirname(pathlistfilename) or "."
    if not os.path.exists(pathlistfilename):
        print("path list filename isn't exist!")
        return
    if not os.path.exists(logpath):
        print("log path isn't exist")
        return
    try:
        # The with-block guarantees the list file is closed again.
        with open(pathlistfilename, "r") as listFile:
            lines = listFile.readlines()
    except OSError as e:
        print(e)
        return
    for line in lines:
        line = line.rstrip("\r\n")
        if not line.strip():
            # Skip blank lines instead of scanning an empty path.
            continue
        statpathfileattrbypath(line, logpath)
def statpathfileattrbypath(path, logpath):
    """Count the files under ``path``, find duplicated names and write a log.

    The log file is created inside ``logpath``. Its name is built from
    ``path`` (path separators replaced by ``-``, colons removed) followed by
    the current time and the extension ``.log``.

    :param path: directory to analyse.
    :param logpath: directory the log file is written to.
    :return: ``None``. Nothing is written when the scan returns ``None`` or
        the log file cannot be opened.
    """
    d = scansamenamefile(path)
    if d is None:
        return
    totalFiles = statallfiles(d)
    duplicateNames = statsamenamefiles(d)
    print("allfiles", totalFiles)
    print("sameallfiles", duplicateNames)
    t = path.replace("/", "-").replace("\\", "-").replace(":", "")
    if platform.system() != 'Windows' and t.startswith("-"):
        # An absolute POSIX path yields a leading "-"; drop only that one
        # character so relative paths keep their first letter.
        t = t[1:]
    lonfilename = logpath + "/" + t + datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S') + ".log"
    # Collapse the doubled separator that appears when logpath ends in "/".
    lonfilename = lonfilename.replace("//", "/")
    f = CoSailTextFile(lonfilename)
    if not f.open("w"):
        # open() has already printed the reason.
        return
    try:
        f.writeLine("stat path : " + path)
        f.writeLine("all files : " + str(totalFiles))
        f.writeLine("Duplicate files : " + str(duplicateNames))
        outduplicatefilemsg(d, f)
        outonefilemsg(d, f)
    finally:
        f.close()
def main(argv=None):
    """Command-line entry point.

    Usage: ``script <path-list-file> [log-dir]``. With one argument the log
    directory is passed on as ``None``.

    :param argv: argument list including the program name; defaults to
        ``sys.argv``.
    :return: ``0`` after running the statistics, ``2`` when the number of
        arguments is wrong (a usage line is printed in that case).
    """
    if argv is None:
        argv = sys.argv
    if len(argv) == 3:
        statpathfileattr(argv[1], argv[2])
    elif len(argv) == 2:
        statpathfileattr(argv[1], None)
    else:
        prog = argv[0] if argv else "script"
        print("usage: " + prog + " <path-list-file> [log-dir]")
        return 2
    return 0


if __name__ == '__main__':
    sys.exit(main())