Task1：構建自己的圖像分類數據集

圖像分類：input圖像——>output每個類別的概率

數據集的質量直接決定算法的質量
訓練集和測試集的類別文件夾數相同，並且兩者的圖片沒有交集

1. 安裝配置環境

# scipy is required by the image-size density plot (scipy.stats.gaussian_kde).
pip install numpy pandas matplotlib scipy requests tqdm opencv-python

2. 圖像采集

B1、B2代碼用來獲取網絡圖像
若使用手機或者單反拍照，則不需要B1、B2代碼

注：刪除無關圖片；類別均衡；圖片要具有多樣性、代表性、一致性；刪除系統自動生成的多余文件和文件夾（B4）（最好在linux中運行）；刪除GIF文件（B4）

3. 刪除多余文件

# Preview the auto-generated junk files and folders under the current directory.
find . -iname '__MACOSX'
find . -iname '.DS_Store'
find . -iname '.ipynb_checkpoints'

# Delete them.
# -prune keeps find from descending into a directory that is about to be removed;
# -exec ... {} + hands the paths straight to rm, so names containing spaces or
# glob characters are not split or expanded by the shell.
find . -iname '__MACOSX' -prune -exec rm -rf {} +
find . -iname '.DS_Store' -prune -exec rm -rf {} +
find . -iname '.ipynb_checkpoints' -prune -exec rm -rf {} +

4. 下載數據集

# Download the fruit81 dataset archive.
wget https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/dataset/fruit81/fruit81_full.zip

# Extract quietly, and delete the archive only if extraction succeeded, so a
# failed or partial unzip does not also throw away the download.
unzip -q fruit81_full.zip && rm -f fruit81_full.zip

5. 統計圖像尺寸、比例分布

import os
import numpy as np
import pandas as pd
import cv2
from tqdm import tqdm

import matplotlib.pyplot as plt
%matplotlib inline

# Root folder of the dataset; each sub-folder holds the images of one class.
dataset_path = 'fruit81_full'
os.listdir(dataset_path)

# Collect one record per readable image and build the DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and calling it per row copied the
# whole table on every iteration.
# Paths are joined explicitly instead of os.chdir()-ing into each folder, so the
# working directory is never changed, even if the loop is interrupted.
records = []
for fruit in tqdm(os.listdir(dataset_path)):  # iterate over the classes
    class_dir = os.path.join(dataset_path, fruit)
    if not os.path.isdir(class_dir):
        # A stray file in the dataset root is not a class folder.
        continue
    for file in os.listdir(class_dir):  # iterate over the images of this class
        img = cv2.imread(os.path.join(class_dir, file))
        if img is None:
            # cv2.imread reports an unreadable file by returning None
            # rather than raising.
            print(os.path.join(fruit, file), '讀取錯誤')
            continue
        records.append({'類別': fruit, '文件名': file, '圖像寬': img.shape[1], '圖像高': img.shape[0]})

# Naming the columns keeps them present even when no image could be read.
df = pd.DataFrame(records, columns=['類別', '文件名', '圖像寬', '圖像高'])

df

# Visualise the distribution of image sizes as a density-coloured scatter plot
# (one point per image: x = width, y = height).
from scipy.stats import gaussian_kde
from matplotlib.colors import LogNorm  # not used in the visible code; kept for later cells

# Work on plain float arrays rather than pandas Series: Series[int_array] is
# label-based, so the re-ordering below would only be correct while df's index
# is exactly 0..n-1 (it breaks after any row filtering).
x = df['圖像寬'].to_numpy(dtype=float)
y = df['圖像高'].to_numpy(dtype=float)

# Estimate the point density at every (width, height) pair.
xy = np.vstack([x, y])
z = gaussian_kde(xy)(xy)

# Sort the points by density, so that the densest points are plotted last
idx = z.argsort()
x, y, z = x[idx], y[idx], z[idx]

plt.figure(figsize=(10,10))
plt.scatter(x, y, c=z,  s=5, cmap='Spectral_r')

plt.tick_params(labelsize=15)

# Use the same range on both axes so the aspect ratio of the images is readable.
xy_max = max(x.max(), y.max())
plt.xlim(xmin=0, xmax=xy_max)
plt.ylim(ymin=0, ymax=xy_max)

plt.ylabel('height', fontsize=25)
plt.xlabel('width', fontsize=25)

plt.savefig('圖像尺寸分布.pdf', dpi=120, bbox_inches='tight')

plt.show()

圖像尺寸分布

6. 劃分訓練集和測試集

import os
import shutil
import random
import pandas as pd

# Root folder of the un-split dataset; each sub-folder holds one class.
dataset_path = 'fruit81_full'

# The part of the folder name before the first underscore names the dataset.
dataset_name = dataset_path.split('_')[0]
print('數(shù)據集', dataset_name)

# Only sub-folders are classes; a stray file in the dataset root would
# otherwise crash the per-class os.listdir below.
# Sorted because os.listdir returns entries in arbitrary order, which would
# make the seeded shuffle below differ from machine to machine.
classes = sorted(
    entry for entry in os.listdir(dataset_path)
    if os.path.isdir(os.path.join(dataset_path, entry))
)

len(classes)

# Create <dataset>/train/<class> and <dataset>/val/<class>.
# os.mkdir (not makedirs) is deliberate: it fails loudly if the split
# folders already exist, i.e. if this cell is run a second time.
for subset in ('train', 'val'):
    os.mkdir(os.path.join(dataset_path, subset))
    for fruit in classes:
        os.mkdir(os.path.join(dataset_path, subset, fruit))

test_frac = 0.2  # fraction of each class that goes to the test ('val') set
random.seed(123) # fixed seed so the split can be reproduced

print('{:^18} {:^18} {:^18}'.format('類別', '訓練集數(shù)據個數(shù)', '測試集數(shù)據個數(shù)'))

# One row per class; the DataFrame is built once after the loop because
# DataFrame.append was removed in pandas 2.0.
rows = []

for fruit in classes: # iterate over the classes

    # List the class's files in a deterministic order, then shuffle them.
    old_dir = os.path.join(dataset_path, fruit)
    images_filename = sorted(os.listdir(old_dir))
    random.shuffle(images_filename)

    # The first test_frac of the shuffled files go to 'val', the rest to 'train'.
    testset_numer = int(len(images_filename) * test_frac)
    testset_images = images_filename[:testset_numer]
    trainset_images = images_filename[testset_numer:]

    # Move every file into its subset folder.
    for subset, subset_images in (('val', testset_images), ('train', trainset_images)):
        for image in subset_images:
            old_img_path = os.path.join(old_dir, image)
            new_img_path = os.path.join(dataset_path, subset, fruit, image)
            shutil.move(old_img_path, new_img_path)

    # Remove the now-empty class folder. os.rmdir refuses to delete a
    # non-empty directory, so a file that was not moved raises OSError
    # instead of being silently deleted.
    os.rmdir(old_dir)

    # Print the per-class counts in aligned columns.
    print('{:^18} {:^18} {:^18}'.format(fruit, len(trainset_images), len(testset_images)))

    rows.append({'class': fruit, 'trainset': len(trainset_images), 'testset': len(testset_images)})

df = pd.DataFrame(rows, columns=['class', 'trainset', 'testset'])

# Rename the dataset folder to <dataset_name>_split.
shutil.move(dataset_path, dataset_name+'_split')

# Add the per-class total and export the statistics table as CSV.
df['total'] = df['trainset'] + df['testset']
df.to_csv('數(shù)據量統(tǒng)計.csv', index=False)

7. 可視化文件夾中的圖片

import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from mpl_toolkits.axes_grid1 import ImageGrid
%matplotlib inline

import numpy as np
import math
import os

import cv2

from tqdm import tqdm

# Folder whose images are displayed.
folder_path = 'fruit81_split/train/西瓜'
# Number of images to display.
N = 36
# The grid has n rows and n columns.
n = math.floor(np.sqrt(N))
n

# Read up to N images from the folder.
# cv2.imread returns None for files it cannot decode (e.g. GIFs or stray
# non-image files); those are skipped instead of crashing cv2.cvtColor.
images = []
for each_img in os.listdir(folder_path):
    if len(images) == N:
        break
    img_path = os.path.join(folder_path, each_img)
    img_bgr = cv2.imread(img_path)
    if img_bgr is None:
        continue
    # OpenCV loads images as BGR; matplotlib expects RGB.
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    images.append(img_rgb)

len(images)

# Draw the images on an n x n grid.
fig = plt.figure(figsize=(10, 10))
grid = ImageGrid(fig, 111,  # placed like subplot(111)
                 nrows_ncols=(n, n),  # n rows by n columns of axes
                 axes_pad=0.02,  # spacing between cells
                 share_all=True
                 )

# zip stops at the shorter sequence, so extra images or extra cells are ignored.
for ax, im in zip(grid, images):
    ax.imshow(im)
    ax.axis('off')

plt.tight_layout()
plt.show()

8. 圖像分類數據集探索統計

#各類別數(shù)據個數(shù)柱狀圖
#package
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
# Configure a font that can render Chinese labels in matplotlib.
# # Windows
# plt.rcParams['font.sans-serif']=['SimHei']  # render Chinese labels correctly
# plt.rcParams['axes.unicode_minus']=False  # render the minus sign correctly

# macOS: see https://www.ngui.cc/51cto/show-727683.html
# Download the SimHei.ttf font file
# !wget https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/dataset/SimHei.ttf

# Linux, e.g. a cloud GPU platform: https://featurize.cn/?s=d7ce99f842414bfcaea5662a97581bd1
# After these two commands finish, restart the kernel and run the notebook again from the top.
# The first command saves SimHei.ttf into matplotlib's bundled font folder; the
# second deletes matplotlib's cache directory (presumably so the font list is
# rebuilt). NOTE(review): both paths are specific to that platform's Python 3.7
# environment - adjust them for other installations.
!wget https://zihao-openmmlab.obs.cn-east-3.myhuaweicloud.com/20220716-mmclassification/dataset/SimHei.ttf -O /environment/miniconda3/lib/python3.7/site-packages/matplotlib/mpl-data/fonts/ttf/SimHei.ttf
!rm -rf /home/featurize/.cache/matplotlib

import matplotlib
matplotlib.rc("font",family='SimHei') # use SimHei for all text (Chinese font)
# plt.rcParams['font.sans-serif']=['SimHei']  # render Chinese labels correctly
plt.rcParams['axes.unicode_minus']=False  # render the minus sign correctly

# Load the per-class statistics table from its CSV file.
df = pd.read_csv('數(shù)據量統(tǒng)計.csv')
df.shape
df

# Bar charts of the number of images per class.

# Column used to rank the classes: 'total', 'trainset' or 'testset'.
feature = 'total'
df = df.sort_values(by=feature, ascending=False)
df.head()

# Chart 1: one bar per class for the chosen column, largest first.
plt.figure(figsize=(22, 7))
plt.bar(df['class'], df[feature], facecolor='#1f77b4', edgecolor='k')

plt.xticks(rotation=90)
plt.tick_params(labelsize=15)
plt.xlabel('類別', fontsize=20)
plt.ylabel('圖像數(shù)量', fontsize=20)

plt.show()

# Chart 2: stacked bars, test-set count at the bottom and training-set count on top.
bar_width = 0.55
class_names = df['class']
test_counts = df['testset']
train_counts = df['trainset']

plt.figure(figsize=(22, 7))

plt.xticks(rotation=90)  # rotate the class names on the x axis

plt.bar(class_names, test_counts, bar_width, label='測試集')
plt.bar(class_names, train_counts, bar_width, label='訓練集', bottom=test_counts)

plt.xlabel('類別', fontsize=20)
plt.ylabel('圖像數(shù)量', fontsize=20)
plt.tick_params(labelsize=13)  # tick label size

plt.legend(fontsize=16)

# Save the stacked chart as a PDF file.
plt.savefig('各類別圖像數(shù)量.pdf', dpi=120, bbox_inches='tight')

plt.show()

9. 訓練圖像分類模型的方法

PyTorch、MMClassification、fast.ai、TensorFlow、Keras

最后編輯于：2023.01.16 10:37:25

?著作權歸作者所有,轉載或內容合作請聯(lián)系作者

人面猴
序言：七十年代末，一起剝皮案震驚了整個濱河市抹估，隨后出現(xiàn)的幾起案子缠黍，更是在濱河造成了極大的恐慌，老刑警劉巖药蜻，帶你破解...
沈念sama閱讀 218,546評論 6贊 507
死咒
序言：濱河連續(xù)發(fā)生了三起死亡事件瓷式，死亡現(xiàn)場離奇詭異，居然都是意外死亡语泽，警方通過查閱死者的電腦和手機贸典，發(fā)現(xiàn)死者居然都...
沈念sama閱讀 93,224評論 3贊 395
救了他兩次的神仙讓他今天三更去死
文/潘曉璐我一進店門，熙熙樓的掌柜王于貴愁眉苦臉地迎上來踱卵，“玉大人廊驼，你說我怎么就攤上這事⊥锷埃” “怎么了妒挎？”我有些...
開封第一講書人閱讀 164,911評論 0贊 354
道士緝兇錄：失蹤的賣姜人
文/不壞的土叔我叫張陵，是天一觀的道長西饵。經常有香客問我酝掩，道長，這世上最難降的妖魔是什么罗标？我笑而不...
開封第一講書人閱讀 58,737評論 1贊 294
?港島之戀（遺憾婚禮）
正文為了忘掉前任庸队，我火速辦了婚禮积蜻，結果婚禮上闯割，老公的妹妹穿的比我還像新娘。我一直安慰自己竿拆，他們只是感情好宙拉，可當我...
茶點故事閱讀 67,753評論 6贊 392
惡毒庶女頂嫁案：這布局不是一般人想出來的
文/花漫我一把揭開白布。她就那樣靜靜地躺著丙笋，像睡著了一般谢澈。火紅的嫁衣襯著肌膚如雪煌贴。梳的紋絲不亂的頭發(fā)上，一...
開封第一講書人閱讀 51,598評論 1贊 305
城市分裂傳說
那天锥忿，我揣著相機與錄音牛郑，去河邊找鬼。笑死敬鬓，一個胖子當著我的面吹牛淹朋，可吹牛的內容都是我干的。我是一名探鬼主播钉答，決...
沈念sama閱讀 40,338評論 3贊 418
雙鴛鴦連環(huán)套：你想象不到人心有多黑
文/蒼蘭香墨我猛地睜開眼础芍，長吁一口氣：“原來是場噩夢啊……” “哼！你這毒婦竟也來了数尿？” 一聲冷哼從身側響起仑性，我...
開封第一講書人閱讀 39,249評論 0贊 276
萬榮殺人案實錄
序言：老撾萬榮一對情侶失蹤，失蹤者是張志新（化名）和其女友劉穎右蹦，沒想到半個月后诊杆，有當?shù)厝嗽跇淞掷锇l(fā)現(xiàn)了一具尸體，經...
沈念sama閱讀 45,696評論 1贊 314
?護林員之死
正文獨居荒郊野嶺守林人離奇死亡何陆，尸身上長有42處帶血的膿包…… 初始之章·張勛以下內容為張勛視角年9月15日...
茶點故事閱讀 37,888評論 3贊 336
?白月光啟示錄
正文我和宋清朗相戀三年刽辙，在試婚紗的時候發(fā)現(xiàn)自己被綠了。大學時的朋友給我發(fā)了我未婚夫和他白月光在一起吃飯的照片甲献。...
茶點故事閱讀 40,013評論 1贊 348
活死人
序言：一個原本活蹦亂跳的男人離奇死亡宰缤，死狀恐怖，靈堂內的尸體忽然破棺而出晃洒，到底是詐尸還是另有隱情慨灭，我是刑警寧澤，帶...
沈念sama閱讀 35,731評論 5贊 346
?日本核電站爆炸內幕
正文年R本政府宣布球及，位于F島的核電站氧骤，受9級特大地震影響，放射性物質發(fā)生泄漏吃引。R本人自食惡果不足惜筹陵，卻給世界環(huán)境...
茶點故事閱讀 41,348評論 3贊 330
男人毒藥：我在死后第九天來索命
文/蒙蒙一、第九天我趴在偏房一處隱蔽的房頂上張望镊尺。院中可真熱鬧朦佩，春花似錦、人聲如沸庐氮。這莊子的主人今日做“春日...
開封第一講書人閱讀 31,929評論 0贊 22
一樁弒父案，背后竟有這般陰謀
文/蒼蘭香墨我抬頭看了看天上的太陽。三九已至仙畦，卻和暖如春输涕，著一層夾襖步出監(jiān)牢的瞬間，已是汗流浹背慨畸。一陣腳步聲響...
開封第一講書人閱讀 33,048評論 1贊 270
情欲美人皮
我被黑心中介騙來泰國打工莱坎，沒想到剛下飛機就差點兒被人妖公主榨干…… 1. 我叫王不留，地道東北人寸士。一個月前我還...
沈念sama閱讀 48,203評論 3贊 370
代替公主和親
正文我出身青樓型奥，卻偏偏與公主長得像，于是被迫代替她去往敵國和親碉京。傳聞我的和親對象是個殘疾皇子厢汹，可洞房花燭夜當晚...
茶點故事閱讀 44,960評論 2贊 355