本文的目标是对整个验证和预测的过程进行梳理,至于其中出现的一些概念,比如GeM Pooling,ArcFace等,将在后续出一篇文章进行介绍。

导入所需库

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os
import gc
import cv2
import math
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import StratifiedKFold

# For Image Models
import timm

# For Similarity Search
import faiss

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

配置文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Global experiment configuration shared by the dataset, model and loaders.
CONFIG = {
    "seed": 2022,
    "img_size": 448,                         # images resized to img_size x img_size
    "model_name": "tf_efficientnet_b0_ns",   # timm backbone identifier
    "num_classes": 15587,                    # number of distinct individual_id labels
    "embedding_size": 512,                   # dimensionality of the retrieval embedding
    "train_batch_size": 64,
    "valid_batch_size": 64,
    "n_fold": 5,
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    # ArcFace hyper-parameters
    "s": 30.0,         # feature scale
    "m": 0.30,         # additive angular margin
    "ls_eps": 0.0,     # label-smoothing epsilon
    "easy_margin": False,
}

# Competition data locations (Kaggle input layout).
ROOT_DIR = '../input/happy-whale-and-dolphin'
TRAIN_DIR = '../input/happy-whale-and-dolphin/train_images'
TEST_DIR = '../input/happy-whale-and-dolphin/test_images'

读取数据

1
2
3
4
5
6
def get_train_file_path(image_name):
    """Return the full path of a training image given its file name.

    Renamed the parameter from `id` (shadowed the builtin); the only call
    site (`df['image'].apply(...)`) passes it positionally, so callers are
    unaffected.
    """
    return f"{TRAIN_DIR}/{image_name}"

# Load the training metadata (image file name + individual_id label)
df = pd.read_csv(f"{ROOT_DIR}/train.csv")
# Materialise the full path of every training image
df['file_path'] = df['image'].apply(get_train_file_path)
df.head()

标签编码

直接读取训练前存储下来的编码结果:

1
2
3
4
5
6
# Reuse the LabelEncoder fitted during training so that the integer class
# ids match the ones the model was trained with. (The original created a
# fresh LabelEncoder() first, which was immediately overwritten — removed.)
with open("../input/labelencodersss/le.pkl", "rb") as fp:
    encoder = joblib.load(fp)

# individual_id (string) -> integer class id
df['individual_id'] = encoder.transform(df['individual_id'])

数据划分

1
2
3
4
# Stratified K-fold assignment: every row receives a "kfold" column value
# in [0, n_fold) so that the identity distribution is similar across folds.
skf = StratifiedKFold(n_splits=CONFIG['n_fold'])

for fold, (_, val_idx) in enumerate(skf.split(X=df, y=df.individual_id)):
    df.loc[val_idx, "kfold"] = fold

至此,处理后数据格式如下:
Alt text

定义DataSet

和训练时的定义不同,这里添加了id,用于记录每张图片的文件名:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
class HappyWhaleDataset(Dataset):
    """Dataset yielding dicts of image tensor, label and image file name.

    Unlike the training-time dataset, each item also carries the image
    file name under 'id' so that an extracted embedding can be traced
    back to the image it came from.
    """

    def __init__(self, df, transforms=None):
        self.df = df
        self.ids = df['image'].values            # image file names
        self.file_names = df['file_path'].values
        self.labels = df['individual_id'].values
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        image_name = self.ids[index]
        img = cv2.imread(self.file_names[index])
        # OpenCV loads BGR; convert to the RGB order the model expects
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        label = self.labels[index]

        if self.transforms:
            img = self.transforms(image=img)["image"]

        return {
            'image': img,
            'label': torch.tensor(label, dtype=torch.long),
            'id': image_name,  # image file name
        }

数据增强

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Albumentations pipelines. This is the inference/validation script, so no
# augmentation is applied and "train" and "valid" are intentionally
# identical: resize + ImageNet normalisation + conversion to a CHW tensor.
data_transforms = {
"train": A.Compose([
A.Resize(CONFIG['img_size'], CONFIG['img_size']),
A.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
max_pixel_value=255.0,
p=1.0
),
ToTensorV2()], p=1.),

"valid": A.Compose([
A.Resize(CONFIG['img_size'], CONFIG['img_size']),
A.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
max_pixel_value=255.0,
p=1.0
),
ToTensorV2()], p=1.)
}

GeM Pooling

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
class GeM(nn.Module):
    """Generalised-mean (GeM) pooling over the spatial dimensions.

    With p = 1 this reduces to average pooling; as p grows it approaches
    max pooling. The exponent p is a learnable scalar parameter.
    """

    def __init__(self, p=3, eps=1e-6):
        super(GeM, self).__init__()
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps  # lower clamp keeps pow() well-defined for non-positive inputs

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        # mean(x^p) over H and W, then the p-th root -> shape (B, C, 1, 1)
        pooled = F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1)))
        return pooled.pow(1.0 / p)

    def __repr__(self):
        return '{}(p={:.4f}, eps={})'.format(
            self.__class__.__name__, self.p.data.tolist()[0], self.eps)

ArcFace

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class ArcMarginProduct(nn.Module):
    r"""ArcFace head: additive angular margin on the class logits.

    Produces s * cos(theta + m) for the target class and s * cos(theta)
    for every other class, where theta is the angle between the
    L2-normalised embedding and the class weight vector.

    Args:
        in_features: size of each input embedding
        out_features: number of classes
        s: scale applied to the logits
        m: additive angular margin
        easy_margin: if True, apply the margin only where cos(theta) > 0
        ls_eps: label-smoothing epsilon applied to the one-hot targets
    """
    def __init__(self, in_features, out_features, s=30.0,
                 m=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Beyond theta = pi - m, cos(theta + m) is no longer monotonic in
        # theta; below this threshold fall back to a linear penalty.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # cos(theta) between normalised embeddings and class weights
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + m)
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # One-hot targets on the same device/dtype as the logits.
        # FIX: the original allocated on the global CONFIG['device'], which
        # breaks on CPU-only or multi-GPU runs; zeros_like is always correct.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Margin logit for the target class, plain cosine everywhere else
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output

构建模型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class HappyWhaleModel(nn.Module):
    """EfficientNet backbone + GeM pooling + linear embedding + ArcFace head."""

    def __init__(self, model_name, embedding_size, pretrained=True):
        super(HappyWhaleModel, self).__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)
        in_features = self.model.classifier.in_features
        # Strip timm's classifier and pooling so the backbone returns the
        # raw (B, C, H, W) feature map for GeM pooling.
        self.model.classifier = nn.Identity()
        self.model.global_pool = nn.Identity()
        self.pooling = GeM()
        self.embedding = nn.Linear(in_features, embedding_size)
        # FIX: the original passed CONFIG["ls_eps"] as easy_margin.
        self.fc = ArcMarginProduct(embedding_size,
                                   CONFIG["num_classes"],
                                   s=CONFIG["s"],
                                   m=CONFIG["m"],
                                   easy_margin=CONFIG["easy_margin"],
                                   ls_eps=CONFIG["ls_eps"])

    def forward(self, images, labels):
        """Return ArcFace logits (used for training/validation loss)."""
        features = self.model(images)
        pooled_features = self.pooling(features).flatten(1)
        embedding = self.embedding(pooled_features)
        return self.fc(embedding, labels)

    def extract(self, images):
        """Return the retrieval embedding (before L2 normalisation).

        The per-batch debug prints of full feature tensors from the
        original were removed — they flooded the output on every batch.
        """
        features = self.model(images)
        pooled_features = self.pooling(features).flatten(1)
        return self.embedding(pooled_features)


model = HappyWhaleModel(CONFIG['model_name'], CONFIG['embedding_size'])
# Load the fine-tuned weights saved during training
model.load_state_dict(torch.load("../input/arcface-gap-embed/Loss14.0082_epoch10.bin"))
model.to(CONFIG['device']);

准备某个fold的数据集

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
def prepare_loaders(df, fold):
    """Build (train_loader, valid_loader) for one CV fold.

    shuffle=False for BOTH loaders: embeddings are later aligned with
    labels and image ids purely by position, so order must be stable.
    """
    train_part = df[df.kfold != fold].reset_index(drop=True)
    valid_part = df[df.kfold == fold].reset_index(drop=True)

    loaders = []
    for part, split in ((train_part, "train"), (valid_part, "valid")):
        dataset = HappyWhaleDataset(part, transforms=data_transforms[split])
        loaders.append(DataLoader(dataset,
                                  batch_size=CONFIG[f'{split}_batch_size'],
                                  num_workers=2, shuffle=False,
                                  pin_memory=True))

    return loaders[0], loaders[1]


train_loader, valid_loader = prepare_loaders(df, fold=0)

提取训练集和验证集的embedding

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
@torch.inference_mode()
def get_embeddings(model, dataloader, device):
    """Run the model over a loader and collect retrieval embeddings.

    Returns a tuple of numpy arrays:
        embeddings [N, embedding_size], labels [N], image file names [N],
    all aligned by position with the (unshuffled) dataloader order.
    """
    model.eval()

    all_labels = []
    all_embeds = []
    all_ids = []

    progress = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, batch in progress:
        images = batch['image'].to(device, dtype=torch.float)
        labels = batch['label'].to(device, dtype=torch.long)

        embeds = model.extract(images)

        all_labels.append(labels.cpu().numpy())
        all_embeds.append(embeds.cpu().numpy())
        all_ids.append(batch['id'])

    return np.vstack(all_embeds), np.concatenate(all_labels), np.concatenate(all_ids)


# Embed every training and validation image once, up front
train_embeds, train_labels, train_ids = get_embeddings(model, train_loader, CONFIG['device'])
valid_embeds, valid_labels, valid_ids = get_embeddings(model, valid_loader, CONFIG['device'])

embedding的维度是512,即train_embeds.shape[1]和valid_embeds.shape[1]都等于512。

之后对embedding做归一化,对labels由数字编码转回字符串编码:

1
2
3
4
5
# L2-normalise so that inner product == cosine similarity for faiss search
train_embeds = normalize(train_embeds, axis=1, norm='l2')
valid_embeds = normalize(valid_embeds, axis=1, norm='l2')

# Map integer class ids back to the original individual_id strings
train_labels = encoder.inverse_transform(train_labels)
valid_labels = encoder.inverse_transform(valid_labels)

计算距离(验证集)

计算验证集中每张图片与验证集特征向量数据库中的embeddings之间的距离,取出距离最近的前50个对应的距离以及individual_id:

1
2
3
# Exact inner-product index over the (L2-normalised) training embeddings.
# Because the vectors are normalised, D holds cosine similarities
# (HIGHER = more similar), and I holds row indices into train_embeds.
index = faiss.IndexFlatIP(CONFIG['embedding_size'])
index.add(train_embeds)
D, I = index.search(valid_embeds, k=50)

D中存储了验证集中每张图片与最相近的50个embedding之间的相似度(内积),I中存储了这50个embedding在训练集数据库中的行索引,可通过train_labels[I[i]]查到对应的individual_id,即真实标签。

1
2
D.shape:[N, 50]
I.shape:[N,50]

获取验证集中每张图片的真实individual_id

可能测试集中有一些图片并没有在训练集中出现,这些图片的真实individual_id应该设置为new_individual.

ps:在做数据划分时,已经尽可能使得训练集和验证集分布一致,但是许多类别只有1张图,所以仍然会有一些图片只存在于训练集或验证集中。

1
2
3
4
# Identities present in the training split; a validation image whose true
# identity is not among them must be scored as 'new_individual'
allowed_targets = np.unique(train_labels)

val_targets_df = pd.DataFrame(np.stack([valid_ids, valid_labels], axis=1), columns=['image','target'])
val_targets_df.loc[~val_targets_df.target.isin(allowed_targets), 'target'] = 'new_individual'

处理验证集对应的备选答案

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Candidate table for the validation set:
# one row per (validation image, candidate identity, similarity).
valid_df = []
for i, val_id in tqdm(enumerate(valid_ids)):
    targets = train_labels[I[i]]   # identities of the 50 nearest database entries
    distances = D[i]               # their cosine similarities
    subset_preds = pd.DataFrame(np.stack([targets, distances], axis=1),
                                columns=['target', 'distances'])
    subset_preds['image'] = val_id
    valid_df.append(subset_preds)


valid_df = pd.concat(valid_df).reset_index(drop=True)
# Several database images may share one individual_id, producing duplicate
# targets per image; keep only the best (largest-similarity) row for each
# (image, target) pair.
valid_df = valid_df.groupby(['image','target']).distances.max().reset_index()

# FIX: sort by similarity DESCENDING (best candidate first), matching the
# test-set pipeline below. faiss IndexFlatIP returns similarities, and
# get_predictions assumes the first row it sees per image is the best
# candidate; the original sorted ascending here, putting the worst first.
valid_df = valid_df.sort_values('distances', ascending=False).reset_index(drop=True)

此时的valid_df如下:

1
2
3
4
5
6
index	image	target	distances
0 0c42057255dbd6.jpg 987af6968486 0.484859
1 0c42057255dbd6.jpg c7f0ac17fc14 0.487009
2 0c42057255dbd6.jpg 35f898e6595e 0.488048
3 0c42057255dbd6.jpg dad9b2cc8452 0.489151
4 0c42057255dbd6.jpg f65395cdfcc2 0.491308

定义 获取top5答案 的函数

对于验证集中的每张图片,从当前valid_df中选择5个答案作为预测结果:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Five frequent identities used to pad prediction lists up to length 5
sample_list = ['938b7e931166', '5bf17305f073', '7593d2aee842', '7362d7a01d00','956562ff2888']

def get_predictions(test_df, threshold=0.2):
    """Turn the (image, target, distances) table into top-5 predictions.

    `test_df` must be sorted by similarity descending, so the first row
    seen for an image is its best candidate. 'new_individual' is placed
    first whenever the best similarity does not exceed `threshold`,
    second otherwise. Returns {image: [up to 5 individual_ids]}.
    """
    predictions = {}
    for i, row in tqdm(test_df.iterrows()):  # each image appears up to 50 times
        if row.image in predictions:  # subsequent (weaker) candidates
            if len(predictions[row.image]) == 5:
                continue
            predictions[row.image].append(row.target)
        elif row.distances > threshold:
            predictions[row.image] = [row.target, 'new_individual']
        else:
            predictions[row.image] = ['new_individual', row.target]

    # After deduplication an image may have fewer than 5 distinct candidates;
    # pad with the fallback identities.
    for x in tqdm(predictions):
        if len(predictions[x]) < 5:
            # FIX: the original tested `y not in predictions` — membership in
            # the dict of IMAGE names, which is always true for identity
            # strings, so already-predicted ids could be appended again.
            # Test membership in this image's prediction list instead.
            remaining = [y for y in sample_list if y not in predictions[x]]
            predictions[x] = predictions[x] + remaining
            predictions[x] = predictions[x][:5]

    return predictions

定义评估指标

在第一篇文章中已经实现过,直接拿来:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def map_per_image(label, predictions):
    """Compute the precision score of one image for MAP@5.

    Parameters
    ----------
    label : string
        The true label of the image
    predictions : list
        Predicted labels, best first; only the first 5 are scored.

    Returns
    -------
    score : double
        1 / (rank of the true label) if it appears in the top 5,
        otherwise 0.0.
    """
    top5 = predictions[:5]
    if label in top5:
        return 1 / (top5.index(label) + 1)
    return 0.0

计算交叉验证得分

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Sweep the new_individual threshold over [0.0, 1.0] and keep the best CV.
best_th = 0
best_cv = 0
for th in [0.1 * x for x in range(11)]:
    # Top-5 predictions at this threshold
    all_preds = get_predictions(valid_df, threshold=th)
    # MAP@5 of every validation image, stored in a per-threshold column
    # (the float `th` itself is used as the column label)
    for i, row in val_targets_df.iterrows():
        val_targets_df.loc[i, th] = map_per_image(row.target, all_preds[row.image])
    cv = val_targets_df[th].mean()
    print(f"CV at threshold {th}: {cv}")
    if cv > best_cv:
        best_th = th
        best_cv = cv

由于在Public 数据集中有10%的new_individual,所以需要调整下best_th:

1
2
3
4
5
6
# Adjust the threshold for the public test distribution, where roughly 10%
# of the images are new individuals.
val_targets_df['is_new_individual'] = val_targets_df.target=='new_individual'
print(val_targets_df.is_new_individual.value_counts().to_dict())
# Mean MAP@5 per threshold column, split by new vs. known individuals.
# NOTE(review): .mean() over the non-numeric 'image'/'target' columns relies
# on older pandas silently dropping them; newer pandas needs numeric_only=True.
val_scores = val_targets_df.groupby('is_new_individual').mean().T
# Weighted CV: 10% new individuals, 90% known — matching the public set
val_scores['adjusted_cv'] = val_scores[True]*0.1+val_scores[False]*0.9
best_threshold_adjusted = val_scores['adjusted_cv'].idxmax()
print("best_threshold",best_threshold_adjusted)

后续将使用best_threshold_adjusted作为是否将分类结果判定为new_individual的阈值,这一点体现在上面的get_predictions中。

合并train和valid的embedding

将train和valid的embedding合并,作为最终的特征向量数据库:

1
2
# Final retrieval database: embeddings of ALL labelled images (train + valid)
train_embeds = np.concatenate([train_embeds, valid_embeds])
train_labels = np.concatenate([train_labels, valid_labels])

提取测试集的embedding

1
2
3
4
5
6
7
8
9
10
11
12
13
# Collect the test image paths
test = pd.DataFrame()
test["image"] = os.listdir("../input/happy-whale-and-dolphin/test_images")
test["file_path"] = test["image"].apply(lambda x: f"{TEST_DIR}/{x}")
test["individual_id"] = -1 #dummy value

# Build the test loader (no shuffling: order must match test_ids)
test_dataset = HappyWhaleDataset(test, transforms=data_transforms["valid"])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['valid_batch_size'],
num_workers=2, shuffle=False, pin_memory=True)
# Extract test embeddings and L2-normalise them
test_embeds, _, test_ids = get_embeddings(model, test_loader, CONFIG['device'])
test_embeds = normalize(test_embeds, axis=1, norm='l2')

计算距离(测试集)

计算测试集中每张图片与特征向量数据库中的embeddings之间的距离,取出距离最近的前50个对应的距离以及individual_id:

1
2
3
4
# Rebuild the index over the merged train+valid embedding database
index = faiss.IndexFlatIP(CONFIG['embedding_size'])
index.add(train_embeds)

# Top-50 most similar database entries for every test image
D, I = index.search(test_embeds, k=50)

生成提交文件

接下来的操作和上面验证集的操作一样,只是不再需要计算交叉验证得分了。

处理测试集对应的备选答案:

1
2
3
4
5
6
7
8
9
10
11
# Candidate table for the test set: one row per (image, identity, similarity)
test_df = []
for i, test_id in tqdm(enumerate(test_ids)):
    neighbour_targets = train_labels[I[i]]
    neighbour_sims = D[i]
    frame = pd.DataFrame(np.stack([neighbour_targets, neighbour_sims], axis=1),
                         columns=['target', 'distances'])
    frame['image'] = test_id
    test_df.append(frame)

test_df = pd.concat(test_df).reset_index(drop=True)
# Keep only the best similarity per (image, identity) pair
test_df = test_df.groupby(['image', 'target']).distances.max().reset_index()
# Best candidates first, as get_predictions expects
test_df = test_df.sort_values('distances', ascending=False).reset_index(drop=True)

生成提交文件:

1
2
3
4
5
6
# Build the submission: five space-separated predictions per test image
predictions = get_predictions(test_df, best_threshold_adjusted)

predictions = pd.Series(predictions).reset_index()
predictions.columns = ['image','predictions']
predictions['predictions'] = predictions['predictions'].apply(lambda x: ' '.join(x))
predictions.to_csv('submission.csv',index=False)

参考: