import os
import magic
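# Identification approach: use the libmagic description, which is fairly detailed, and match it against known keywords; when a keyword matches, that keyword is taken as the file type.
# If nothing matches, fall back to the previously determined file format (the extension).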
# a = magic.from_file(path)
# b = magic.from_buffer(open(path).read(1024))
# c = magic.from_file(path, mime=True)
# print(a)
# Keyword table used to interpret a libmagic description string: category
# name -> keywords that may appear in the description.
# NOTE: order matters. The lookup keeps the LAST keyword that matches, so a
# more specific keyword must come after a shorter one it contains
# (e.g. '7-zip' after 'zip', 'gzip' after 'zip').
FILE_TYPE = {
    'Executables': ['exe', 'mz', 'msi', 'coff', 'elf', 'krnl', 'rpm', 'linux', 'macho'],
    'Documents': ['ps', 'rtf', 'odp', 'ods', 'odt', 'hwp', 'gul', 'ebook', 'latex'],
    'Code': ['php', 'python', 'perl', 'ruby', 'cpp', 'java', 'shell', 'pascal', 'awk', 'dyalog',
             'fortran', 'java-bytecode'],
    'Bundles': ['zip', 'gzip', 'bzip', 'rzip', 'dzip', '7-zip', 'cab', 'jar', 'rar', 'mscompress', 'ace',
                'arj', 'asd', 'blackhole', 'kgb'],
    'Other': ['bat', 'cmd'],
}
def identify_file_type(path="D:/scripts/file/CmdHelperService.7z"):
    """Identify a file's type from its extension and its libmagic description.

    The extension is checked first: a fixed set of extensions is trusted
    as-is and returned without inspecting the file. For every other file the
    description returned by ``magic.from_file`` is scanned for known
    keywords, and the extension is only used as a fallback.

    :param path: path of the file to inspect; defaults to the sample archive
        that was previously hard-coded in this function.
    :return: the detected type as a string: 'dll' or 'vmdk' for the special
        cases, otherwise the last matching keyword from ``FILE_TYPE``,
        otherwise the lowercased extension ('' when the path has none).
    """
    # Extensions that are trusted without looking at the file content.
    exclude_file_type = {'txt', 'pdf', 'doc', 'docx', 'ppt', 'pptx', 'xls', 'xlsx', 'com', 'mf'}

    # Always a string (never the splitext tuple), lowercased so that
    # 'REPORT.TXT' is excluded exactly like 'report.txt'.
    file_type = os.path.splitext(path)[1].lstrip('.').lower()
    if file_type in exclude_file_type:
        return file_type

    file_type_info = magic.from_file(path)
    print(file_type_info)

    # TODO: special file types that the keyword table cannot match
    # accurately from the magic description; extend this list as needed.
    # These take priority over the generic keyword scan, otherwise a DLL
    # description would be overwritten by the 'exe' keyword match.
    special_cases = (
        ('DLL', 'dll'),
        ('VMware4 disk image', 'vmdk'),
    )
    for marker, special_type in special_cases:
        if marker in file_type_info:
            return special_type

    # Deliberately no break: the last matching keyword wins, which lets
    # '7-zip' override the shorter 'zip' that also matches.
    for file_types in FILE_TYPE.values():
        for f_type in file_types:
            if (f_type.upper() in file_type_info
                    or f_type.capitalize() in file_type_info
                    or f_type in file_type_info):
                file_type = f_type
    return file_type
# Script driver: run the identification once at module level and print the
# detected type. The result is also kept in the module-level name file_type.
file_type = identify_file_type()
print(file_type)
# Output recorded by the original author (presumably for the sample .7z file):
#   7-zip archive data, version 0.4
#   7-zip
# Related articles: