今天要處理大量的csv文件,出現UnicodeDecodeError: 'utf-8' codec can't decode byte 0xc9 in position 0: invalid continuation byte 報錯,于是統一將文件轉為utf-8編碼,代碼如下:
# Convert the files' encoding to UTF-8
def _decode_with_first_match(content, encodings):
    """Decode ``content`` with the first codec in ``encodings`` that accepts it.

    Args:
        content: Raw bytes read from a file.
        encodings: Codec names to try, in priority order.

    Returns:
        The decoded text.

    Raises:
        ValueError: If ``encodings`` is empty.
        UnicodeDecodeError: If no candidate can decode ``content``; the error
            from the last candidate tried is re-raised.
    """
    last_error = None
    for encoding in encodings:
        try:
            return content.decode(encoding)
        except UnicodeDecodeError as err:
            # ``err`` is unbound when the except clause ends, so keep a copy.
            last_error = err
    if last_error is None:
        raise ValueError('encodings must contain at least one codec name')
    raise last_error


def change_code(original_file, newfile,
                encodings=('utf-8', 'gbk', 'gb2312', 'gb18030', 'big5',
                           'cp936')):
    """Re-encode every regular file in a directory as UTF-8.

    Each file directly inside ``original_file`` is read as bytes, decoded with
    the first codec in ``encodings`` that accepts the whole content, and
    written under the same name into ``newfile`` as UTF-8. Line endings are
    left untouched because both files are handled in binary mode.

    Args:
        original_file: Path of the directory holding the source files.
        newfile: Path of the directory that receives the converted files.
            It must already exist.
        encodings: Candidate source codecs, tried in order. Order matters:
            bytes that are valid in several codecs are decoded with the
            earliest one, so put the most likely codec first (for example
            move ``'big5'`` ahead of the GB codecs for Traditional Chinese
            data).

    Raises:
        ValueError: If ``encodings`` is empty.
        UnicodeDecodeError: If a file cannot be decoded by any candidate.
    """
    for name in os.listdir(original_file):
        original_path = os.path.join(original_file, name)
        # os.listdir also returns sub-directories, which cannot be opened.
        if not os.path.isfile(original_path):
            continue

        # Read-only access is enough; the source file is never modified.
        with open(original_path, 'rb') as f:
            content = f.read()

        # Decode once and reuse the result instead of re-reading the file
        # with the detected codec.
        text = _decode_with_first_match(content, encodings)

        newfile_path = os.path.join(newfile, name)
        with open(newfile_path, 'wb') as f2:
            f2.write(text.encode('utf-8'))