在机器学习和图像处理中,有时需要对图像数据集进行分割、筛选等处理。这里整理了一些用 Python 编写的常用小工具代码。
目录
1 caffe数据集txt文本制作
很多时候需要建立如下形式的数据集 txt 文本:每行为“图像文件名 类别标签”,类似图片所示(图片来自网络)。
生成代码如下:
"""
caffe数据集txt文本制作
"""
import os
# Write one "<file name> <label>" line per image into the list file.
# The class folders under val/ are named after their numeric label (0 and 1),
# so a single loop over the labels replaces the two copy-pasted loops.
# `with` guarantees the list file is flushed and closed even if a folder
# is missing or unreadable.
with open(r'd:/val.txt', "w") as f:
    for count in (0, 1):
        path = 'D:/smoke_datasets/val/' + str(count)
        for filename in os.listdir(path):
            # Only the bare file name is recorded, not the full path.
            f.write(filename + " " + str(count) + "\n")
            print('{} class: {}'.format(filename, count))
2 jpg图像完整性检测
有时爬虫所获得图像可能不完整,需要进行图像完整性检测。代码如下:
"""
jpg图像完整性检测
"""
from skimage import io
from PIL import Image
import numpy as np
import os
def is_valid_jpg(path):
    """Return True if *path* is a ``.jpg`` file ending with the JPEG EOI marker.

    A truncated download usually lacks the trailing end-of-image bytes
    ``FF D9``, so checking the last two bytes is a cheap completeness test.

    Args:
        path: Path of the file to check. Only names whose extension is
            ``jpg`` (case-insensitive) are inspected; anything else
            returns False without opening the file.

    Returns:
        True when the last two bytes are ``b'\\xff\\xd9'``, else False.
        Files shorter than two bytes return False.
    """
    if path.split('.')[-1].lower() != 'jpg':
        return False
    with open(path, 'rb') as fr:
        try:
            # whence=2: position relative to the end of the file.
            fr.seek(-2, 2)
        except OSError:
            # The file is shorter than two bytes, so it cannot hold the marker.
            return False
        # The file is opened in binary mode, so compare against bytes;
        # comparing with a str literal is always False on Python 3.
        return fr.read() == b'\xff\xd9'
# Header-based check
def is_jpg(path):
    """Return True if the first 11 bytes of *path* look like a JFIF/Exif JPEG.

    The header must start with SOI followed by an APP0 (``FF D8 FF E0``) or
    APP1 (``FF D8 FF E1``) marker, and bytes 6..10 must be the identifier
    ``JFIF\\0`` or ``Exif\\0``. JPEG files that begin with any other segment
    are reported as False by this check.

    Args:
        path: Path of the file to inspect (the extension is not examined).

    Returns:
        True if both the marker and the identifier match, otherwise False.
    """
    # `with` closes the handle; the original leaked it.
    with open(path, 'rb') as fh:
        data = fh.read(11)
    # The data is bytes, so the signatures must be bytes literals as well;
    # str literals never compare equal to bytes on Python 3.
    if data[:4] not in (b'\xff\xd8\xff\xe0', b'\xff\xd8\xff\xe1'):
        return False
    if data[6:] not in (b'JFIF\0', b'Exif\0'):
        return False
    return True
def check_pic_PIL(path):
    """Return True if Pillow can fully decode *path* into a 3-D array.

    The image is opened three times on purpose: once to force a full decode
    with ``load()``, once for ``verify()`` (kept on a separate, freshly
    opened handle as in the original code), and once to convert the pixels
    to a float32 NumPy array.

    Args:
        path: Path of the image file to check.

    Returns:
        False if any step raises or if the resulting array is not
        3-dimensional; True otherwise.
    """
    # `except Exception` instead of a bare `except:` so Ctrl-C
    # (KeyboardInterrupt) and SystemExit still stop a long batch run.
    try:
        # Context managers release the file handle after each step.
        with Image.open(path) as img:
            img.load()
        with Image.open(path) as img:
            img.verify()
    except Exception:
        return False
    try:
        with Image.open(path) as img:
            pixels = np.array(img, dtype=np.float32)
    except Exception:
        return False
    # Only arrays with exactly three dimensions are accepted.
    return len(pixels.shape) == 3
def check_pic_skimage(path):
    """Return True if scikit-image can read *path* into a 3-D float32 array.

    Args:
        path: Path of the image file to check.

    Returns:
        False if ``io.imread`` or the float32 conversion raises, or if the
        resulting array is not 3-dimensional; True otherwise.
    """
    # `except Exception` instead of a bare `except:` so Ctrl-C
    # (KeyboardInterrupt) and SystemExit still stop a long batch run.
    try:
        img = io.imread(path)
    except Exception:
        return False
    try:
        img = np.array(img, dtype=np.float32)
    except Exception:
        return False
    # Only arrays with exactly three dimensions are accepted.
    return len(img.shape) == 3
if __name__ == '__main__':
    # Directories to scan.
    paths = ["d:/train"]
    # Report file: receives the name of every image that fails a check.
    # `with` guarantees it is closed even if a check raises mid-run.
    with open(r'd:/state.txt', "w") as f:
        for path in paths:
            print('the current path is : {}\n'.format(path))
            for filename in os.listdir(path):
                full_path = path + "/" + filename
                # Cheap byte-level checks first (end marker, then header).
                # `and` short-circuits, so the header check only opens
                # entries that already passed the extension/end-marker test.
                if not (is_valid_jpg(full_path) and is_jpg(full_path)):
                    f.write(filename + "\n")
                    print('{} \n'.format(full_path))
                    continue
                # Full decode checks with both Pillow and scikit-image.
                if not (check_pic_PIL(full_path)
                        and check_pic_skimage(full_path)):
                    f.write(filename + "\n")
                    print("=" * 50)
                    print('{} \n'.format(full_path))
                    print("=" * 50)
            # Separator printed after each scanned directory.
            print("*" * 50)
    print("end!")
3 图像随机提取
在 Windows 系统下手动移动大量图像既慢又费时,Linux 下也差不多。通过 Python 可以快速地随机抽取并移动大量图像,代码如下:
"""
图像随机移动
"""
import os, random, shutil
def moveFile(fileDir, tarDir, picknumber):
    """Move a random sample of entries from *fileDir* into *tarDir*.

    Args:
        fileDir: Source directory. A trailing slash is optional.
        tarDir: Destination directory. A trailing slash is optional; the
            directory is created if it does not exist yet.
        picknumber: Number of entries to move. If the source holds fewer
            entries, all of them are moved.

    Returns:
        None. The sampled names are printed before they are moved.
    """
    # Every entry returned by os.listdir is a candidate for the sample.
    pathDir = os.listdir(fileDir)
    # Never ask random.sample for more items than are available.
    picknumber = min(picknumber, len(pathDir))
    sample = random.sample(pathDir, picknumber)
    print(sample)
    os.makedirs(tarDir, exist_ok=True)
    for name in sample:
        # os.path.join works with or without a trailing separator, whereas
        # plain string concatenation silently required one.
        shutil.move(os.path.join(fileDir, name), os.path.join(tarDir, name))
if __name__ == '__main__':
    # Source folder holding the images, and the folder they are moved into.
    fileDir, tarDir = 'D:/datasets/train/', 'D:/datasets/move/'
    # Randomly move 500 images from fileDir into tarDir.
    moveFile(fileDir=fileDir, tarDir=tarDir, picknumber=500)
4 图像尺寸统计
主要是统计图像尺寸,可以添加过滤条件,滤掉尺寸过小或者过大的图像。代码如下:
"""
统计数据集下图像尺寸
"""
import os
from PIL import Image
import pandas as pd
# Dataset directory to scan.
path = 'D:/test/'
# One (width, height) row per file, in os.listdir order.
sizes = []
for name in os.listdir(path):
    oldname = os.path.join(path, name)
    # Pillow reads the size from the header; `with` closes the file handle.
    with Image.open(oldname) as im:
        sizes.append((im.width, im.height))
    print(oldname)
# Build the table once instead of growing it cell by cell.
df = pd.DataFrame(sizes, columns=['width', 'height'])
# Save the result. The original ended with `f.close()` on the os.listdir
# list, which raised AttributeError; no file handle is open at this point.
df.to_csv('test.csv')
5 图像名字后缀重命名
对图像的名字以及后缀名重新命名,代码如下:
"""
图像名称后缀重命名
"""
import os
# Directory holding the images to rename.
path = 'D:/train/'
# Directory that receives the renamed images.
save_path = 'D:/result/'
# os.rename does not create missing directories, so make sure it exists.
os.makedirs(save_path, exist_ok=True)
# os.listdir returns entries in arbitrary order; sorting makes the
# numbering reproducible between runs.
for count, name in enumerate(sorted(os.listdir(path))):
    # Old file name.
    oldname = os.path.join(path, name)
    print(oldname)
    # New file name: "smoke.<index>.jpg". The extension is always set to
    # .jpg, whatever the original suffix was.
    newname = os.path.join(save_path, 'smoke.' + str(count) + '.jpg')
    os.rename(oldname, newname)