# 查看文件编码 + 查看文件扩展名 + 文件编码转换
# 参考资料:python中的encode()和decode()函数
# 查看文件编码 + 查看文件扩展名
import codecs
import os
import sys

import chardet
import pandas as pd
def GetFileEncodingFormat(file):
    """Return the character encoding that chardet detects for *file*.

    The file is read as raw bytes so that chardet sees exactly what is on
    disk. Decoding it first (and silently dropping undecodable bytes) and
    then re-encoding would hide the original encoding from the detector.

    Args:
        file: Path of the file to inspect.

    Returns:
        The ``"encoding"`` entry of ``chardet.detect``'s result. chardet may
        report ``None`` here when it cannot decide (e.g. for an empty file).
    """
    # The context manager guarantees the handle is closed even if read() fails.
    with open(file, 'rb') as fileHandle:
        rawBytes = fileHandle.read()
    return chardet.detect(rawBytes)["encoding"]
def GetFileExtension(file):
    """Return the extension of *file*, including the leading dot.

    Args:
        file: A file name or path.

    Returns:
        The extension of the last path component (e.g. ``".c"``), or an
        empty string when that component has no extension.
    """
    # splitext only looks at the last path component, so there is no need to
    # strip the directory part first.
    return os.path.splitext(file)[1]
def _CanonicalEncodingName(name):
    """Return Python's canonical codec name for *name* (lower-cased if unknown)."""
    try:
        return codecs.lookup(name).name
    except LookupError:
        return name.lower()


def CovertFileCodeFormat(file, out_encode):
    """Re-encode a ``.c`` / ``.h`` file in place to *out_encode*.

    Files with any other extension are left untouched, as are files whose
    detected encoding already matches *out_encode*. Characters that cannot
    be decoded or encoded are dropped (``errors='ignore'``).

    Args:
        file: Path of the file to convert.
        out_encode: Name of the target encoding, e.g. ``"utf-8"``.

    I/O errors are reported on stdout instead of being raised.
    """
    try:
        # Check the extension first so unrelated files are never read.
        if GetFileExtension(file) not in ('.c', '.h'):
            return

        encoding = GetFileEncodingFormat(file)
        if encoding is None:
            # Without a detected source encoding the content cannot be
            # decoded reliably, so leave the file as it is.
            print("skip:" + file + " (encoding not detected)")
            return

        # Compare canonical codec names so that spellings such as "UTF-8"
        # and "utf_8" are recognised as the same encoding.
        if _CanonicalEncodingName(encoding) == _CanonicalEncodingName(out_encode):
            return

        # Read everything and close the handle before reopening for writing.
        with codecs.open(file, 'r', encoding, errors='ignore') as source:
            fileContext = source.read()
        # Closing the writer flushes the converted text to disk.
        with codecs.open(file, 'w', out_encode, errors='ignore') as target:
            target.write(fileContext)
        print("convert:" + file + " sucess")
    except IOError as err:
        print("I/O error: {0}".format(err))
def ProcessDir(path):
    """Convert every file found under *path*, recursing into subdirectories.

    Each file is handed to ``CovertFileCodeFormat`` together with the target
    encoding taken from the second command-line argument.

    NOTE: this file defines ``ProcessDir`` a second time further down; once
    the module has been fully executed that later definition is the one
    bound to the name.
    """
    for dirPath, _subDirs, fileNames in os.walk(path):
        for name in fileNames:
            CovertFileCodeFormat(os.path.join(dirPath, name), sys.argv[2])
def main():
    """Convert the file or directory tree named by the first CLI argument.

    The second command-line argument is the target encoding. A path that is
    neither a file nor a directory is ignored.

    NOTE: this file defines ``main`` a second time further down; once the
    module has been fully executed that later definition is the one bound
    to the name.
    """
    target = sys.argv[1]
    if os.path.isfile(target):
        CovertFileCodeFormat(target, sys.argv[2])
        return
    if os.path.isdir(target):
        ProcessDir(target)
# 查看文件编码+扩展名
# Ad-hoc check of one hard-coded local CSV file: detect its encoding and
# extract its extension. Both return values are discarded, so nothing is
# shown when this runs as a plain script (presumably it was written for an
# interactive session that echoes expression results).
filepath = r\'C:\Users\Administrator\Desktop\njhcfx_1205\zjtpymplan_1204.csv\'
GetFileEncodingFormat(filepath)
GetFileExtension(filepath)
def ProcessDir(path):
    """Print the detected encoding of every file found under *path*.

    Subdirectories are walked recursively. This variant only inspects the
    files; it does not convert them.
    """
    for dirPath, _subDirs, fileNames in os.walk(path):
        for name in fileNames:
            print(GetFileEncodingFormat(os.path.join(dirPath, name)))
def main():
    """Report file encodings for the directory named by the first CLI argument.

    A path that names a single file is deliberately left alone, and a path
    that is neither a file nor a directory is ignored.
    """
    target = sys.argv[1]
    if os.path.isfile(target):
        # Single files are not processed in this inspection-only variant.
        return
    if os.path.isdir(target):
        ProcessDir(target)
# 文件编码转换
def GB18030ToUTF8(path, new_path, chunksize):
    """Re-encode every CSV file under *path* from GB18030 to UTF-8 with BOM.

    The directory tree below *path* is walked recursively and each file is
    parsed as a GB18030 CSV in chunks of *chunksize* rows, so large files do
    not have to fit in memory. The converted file is written to *new_path*
    under the same base name.

    Args:
        path: Directory containing the GB18030-encoded CSV files.
        new_path: Directory that receives the converted files; created if
            it does not exist yet.
        chunksize: Number of rows read and written per chunk.

    Note:
        Output files are placed directly in *new_path*, so files that share
        a name in different subdirectories of *path* overwrite one another.
    """
    # to_csv does not create missing directories, so make sure the target
    # directory exists before the first write.
    os.makedirs(new_path, exist_ok=True)

    for root, dirs, files in os.walk(path):
        for file in files:
            sourcePath = os.path.join(root, file)
            targetPath = os.path.join(new_path, file)
            # keep_default_na=False: only truly empty cells are treated as
            # missing. With pandas' default NA list, literal text such as
            # "NA" or "NULL" would be parsed as missing and written back as
            # an empty cell, silently changing the data.
            chunks = pd.read_csv(sourcePath, chunksize=chunksize, encoding='gb18030',
                                 engine='python', dtype=str, na_values='',
                                 keep_default_na=False)
            for index, chunk in enumerate(chunks):
                isFirst = index == 0
                # The first chunk creates/truncates the file and writes the
                # header row; later chunks are appended without a header.
                chunk.to_csv(targetPath, encoding='utf_8_sig',
                             mode='w' if isFirst else 'a',
                             index=False, header=isFirst)
# Test run: convert every file under the hard-coded directory below. The
# output directory is the input path with "1" appended, and chunksize is
# forwarded to GB18030ToUTF8 as the number of rows handled per chunk.
chunksize = 1000000
path = r\'C:\Users\Administrator\Desktop\njhcfx_1205\'
GB18030ToUTF8(path, path+\'1\', chunksize)
# Check whether a converted file can be read back (disabled).
#filepath = r\'C:\Users\Administrator\Desktop\njhcfx_12051\zjtpjl_1204.csv\'
#aa = pd.read_csv(filepath, encoding=\'utf_8_sig\', dtype=str)
#content = open(filepath).read().decode("gb18030")
#open("C:\\Users\\Administrator\\Desktop\\njhcfx_1205\\zjtpymplan_1205.txt","w").write(content.encode("utf8"))