本文的目标是对整个验证和预测的过程进行梳理,至于其中出现的一些概念,比如GeM Pooling,ArcFace等,将在后续出一篇文章进行介绍。

导入所需库

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import os
import gc
import cv2
import math
import copy
import time
import random

# For data manipulation
import numpy as np
import pandas as pd

# Pytorch Imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

# Utils
import joblib
from tqdm import tqdm
from collections import defaultdict

# Sklearn Imports
from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import StratifiedKFold

# For Image Models
import timm

# For Similarity Search
import faiss

# Albumentations for augmentations
import albumentations as A
from albumentations.pytorch import ToTensorV2

# For colored terminal text
from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

# For descriptive error messages
os.environ['CUDA_LAUNCH_BLOCKING'] = "1"

配置文件

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
# Global experiment configuration shared by the dataset, model and loaders.
CONFIG = {
    "seed": 2022,
    "img_size": 448,                         # images resized to img_size x img_size
    "model_name": "tf_efficientnet_b0_ns",   # timm backbone identifier
    "num_classes": 15587,                    # number of distinct individual_id labels
    "embedding_size": 512,                   # dimensionality of the retrieval embedding
    "train_batch_size": 64,
    "valid_batch_size": 64,
    "n_fold": 5,
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    # ArcFace hyper-parameters
    "s": 30.0,         # feature scale
    "m": 0.30,         # additive angular margin
    "ls_eps": 0.0,     # label-smoothing epsilon
    "easy_margin": False,
}

# Competition data locations (Kaggle input layout).
ROOT_DIR = '../input/happy-whale-and-dolphin'
TRAIN_DIR = '../input/happy-whale-and-dolphin/train_images'
TEST_DIR = '../input/happy-whale-and-dolphin/test_images'

读取数据

1
2
3
4
5
6
def get_train_file_path(image_name):
    """Return the full path of a training image given its file name.

    Renamed the parameter from `id` (shadowed the builtin); the only call
    site (`df['image'].apply(...)`) passes it positionally, so callers are
    unaffected.
    """
    return f"{TRAIN_DIR}/{image_name}"

# Load the training metadata (image file name + individual_id label)
df = pd.read_csv(f"{ROOT_DIR}/train.csv")
# Materialise the full path of every training image
df['file_path'] = df['image'].apply(get_train_file_path)
df.head()

标签编码

直接读取训练前存储下来的编码结果:

1
2
3
4
5
6
# Reuse the LabelEncoder fitted during training so that the integer class
# ids match the ones the model was trained with. (The original created a
# fresh LabelEncoder() first, which was immediately overwritten — removed.)
with open("../input/labelencodersss/le.pkl", "rb") as fp:
    encoder = joblib.load(fp)

# individual_id (string) -> integer class id
df['individual_id'] = encoder.transform(df['individual_id'])

数据划分

1
2
3
4
# Stratified K-fold assignment: every row receives a "kfold" column value
# in [0, n_fold) so that the identity distribution is similar across folds.
skf = StratifiedKFold(n_splits=CONFIG['n_fold'])

for fold, (_, val_idx) in enumerate(skf.split(X=df, y=df.individual_id)):
    df.loc[val_idx, "kfold"] = fold

至此,处理后数据格式如下:
Alt text

定义DataSet

和训练时的定义不同,这里添加了id,用于记录每张图片的文件名:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
class HappyWhaleDataset(Dataset):
    """Dataset yielding dicts of image tensor, label and image file name.

    Unlike the training-time dataset, each item also carries the image
    file name under 'id' so that an extracted embedding can be traced
    back to the image it came from.
    """

    def __init__(self, df, transforms=None):
        self.df = df
        self.ids = df['image'].values            # image file names
        self.file_names = df['file_path'].values
        self.labels = df['individual_id'].values
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        image_name = self.ids[index]
        img = cv2.imread(self.file_names[index])
        # OpenCV loads BGR; convert to the RGB order the model expects
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        label = self.labels[index]

        if self.transforms:
            img = self.transforms(image=img)["image"]

        return {
            'image': img,
            'label': torch.tensor(label, dtype=torch.long),
            'id': image_name,  # image file name
        }

数据增强

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Albumentations pipelines. This is the inference/validation script, so no
# augmentation is applied and "train" and "valid" are intentionally
# identical: resize + ImageNet normalisation + conversion to a CHW tensor.
data_transforms = {
"train": A.Compose([
A.Resize(CONFIG['img_size'], CONFIG['img_size']),
A.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
max_pixel_value=255.0,
p=1.0
),
ToTensorV2()], p=1.),

"valid": A.Compose([
A.Resize(CONFIG['img_size'], CONFIG['img_size']),
A.Normalize(
mean=[0.485, 0.456, 0.406],
std=[0.229, 0.224, 0.225],
max_pixel_value=255.0,
p=1.0
),
ToTensorV2()], p=1.)
}

GeM Pooling

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
class GeM(nn.Module):
    """Generalised-mean (GeM) pooling over the spatial dimensions.

    With p = 1 this reduces to average pooling; as p grows it approaches
    max pooling. The exponent p is a learnable scalar parameter.
    """

    def __init__(self, p=3, eps=1e-6):
        super(GeM, self).__init__()
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps  # lower clamp keeps pow() well-defined for non-positive inputs

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        # mean(x^p) over H and W, then the p-th root -> shape (B, C, 1, 1)
        pooled = F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1)))
        return pooled.pow(1.0 / p)

    def __repr__(self):
        return '{}(p={:.4f}, eps={})'.format(
            self.__class__.__name__, self.p.data.tolist()[0], self.eps)

ArcFace

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
class ArcMarginProduct(nn.Module):
    r"""ArcFace head: additive angular margin on the class logits.

    Produces s * cos(theta + m) for the target class and s * cos(theta)
    for every other class, where theta is the angle between the
    L2-normalised embedding and the class weight vector.

    Args:
        in_features: size of each input embedding
        out_features: number of classes
        s: scale applied to the logits
        m: additive angular margin
        easy_margin: if True, apply the margin only where cos(theta) > 0
        ls_eps: label-smoothing epsilon applied to the one-hot targets
    """
    def __init__(self, in_features, out_features, s=30.0,
                 m=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        # Beyond theta = pi - m, cos(theta + m) is no longer monotonic in
        # theta; below this threshold fall back to a linear penalty.
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # cos(theta) between normalised embeddings and class weights
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        phi = cosine * self.cos_m - sine * self.sin_m  # cos(theta + m)
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # One-hot targets on the same device/dtype as the logits.
        # FIX: the original allocated on the global CONFIG['device'], which
        # breaks on CPU-only or multi-GPU runs; zeros_like is always correct.
        one_hot = torch.zeros_like(cosine)
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # Margin logit for the target class, plain cosine everywhere else
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output

构建模型

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
class HappyWhaleModel(nn.Module):
    """EfficientNet backbone + GeM pooling + linear embedding + ArcFace head."""

    def __init__(self, model_name, embedding_size, pretrained=True):
        super(HappyWhaleModel, self).__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)
        in_features = self.model.classifier.in_features
        # Strip timm's classifier and pooling so the backbone returns the
        # raw (B, C, H, W) feature map for GeM pooling.
        self.model.classifier = nn.Identity()
        self.model.global_pool = nn.Identity()
        self.pooling = GeM()
        self.embedding = nn.Linear(in_features, embedding_size)
        # FIX: the original passed CONFIG["ls_eps"] as easy_margin.
        self.fc = ArcMarginProduct(embedding_size,
                                   CONFIG["num_classes"],
                                   s=CONFIG["s"],
                                   m=CONFIG["m"],
                                   easy_margin=CONFIG["easy_margin"],
                                   ls_eps=CONFIG["ls_eps"])

    def forward(self, images, labels):
        """Return ArcFace logits (used for training/validation loss)."""
        features = self.model(images)
        pooled_features = self.pooling(features).flatten(1)
        embedding = self.embedding(pooled_features)
        return self.fc(embedding, labels)

    def extract(self, images):
        """Return the retrieval embedding (before L2 normalisation).

        The per-batch debug prints of full feature tensors from the
        original were removed — they flooded the output on every batch.
        """
        features = self.model(images)
        pooled_features = self.pooling(features).flatten(1)
        return self.embedding(pooled_features)


model = HappyWhaleModel(CONFIG['model_name'], CONFIG['embedding_size'])
# Load the fine-tuned weights saved during training
model.load_state_dict(torch.load("../input/arcface-gap-embed/Loss14.0082_epoch10.bin"))
model.to(CONFIG['device']);

准备某个fold的数据集

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
def prepare_loaders(df, fold):
    """Build (train_loader, valid_loader) for one CV fold.

    shuffle=False for BOTH loaders: embeddings are later aligned with
    labels and image ids purely by position, so order must be stable.
    """
    train_part = df[df.kfold != fold].reset_index(drop=True)
    valid_part = df[df.kfold == fold].reset_index(drop=True)

    loaders = []
    for part, split in ((train_part, "train"), (valid_part, "valid")):
        dataset = HappyWhaleDataset(part, transforms=data_transforms[split])
        loaders.append(DataLoader(dataset,
                                  batch_size=CONFIG[f'{split}_batch_size'],
                                  num_workers=2, shuffle=False,
                                  pin_memory=True))

    return loaders[0], loaders[1]


train_loader, valid_loader = prepare_loaders(df, fold=0)

提取训练集和验证集的embedding

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
@torch.inference_mode()
def get_embeddings(model, dataloader, device):
    """Run the model over a loader and collect retrieval embeddings.

    Returns a tuple of numpy arrays:
        embeddings [N, embedding_size], labels [N], image file names [N],
    all aligned by position with the (unshuffled) dataloader order.
    """
    model.eval()

    all_labels = []
    all_embeds = []
    all_ids = []

    progress = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, batch in progress:
        images = batch['image'].to(device, dtype=torch.float)
        labels = batch['label'].to(device, dtype=torch.long)

        embeds = model.extract(images)

        all_labels.append(labels.cpu().numpy())
        all_embeds.append(embeds.cpu().numpy())
        all_ids.append(batch['id'])

    return np.vstack(all_embeds), np.concatenate(all_labels), np.concatenate(all_ids)


# Embed every training and validation image once, up front
train_embeds, train_labels, train_ids = get_embeddings(model, train_loader, CONFIG['device'])
valid_embeds, valid_labels, valid_ids = get_embeddings(model, valid_loader, CONFIG['device'])

embedding的维度是512,即train_embeds.shape[1]和valid_embeds.shape[1]都等于512。

之后对embedding做归一化,对labels由数字编码转回字符串编码:

1
2
3
4
5
# L2-normalise so that inner product == cosine similarity for faiss search
train_embeds = normalize(train_embeds, axis=1, norm='l2')
valid_embeds = normalize(valid_embeds, axis=1, norm='l2')

# Map integer class ids back to the original individual_id strings
train_labels = encoder.inverse_transform(train_labels)
valid_labels = encoder.inverse_transform(valid_labels)

计算距离(验证集)

计算验证集中每张图片与验证集特征向量数据库中的embeddings之间的距离,取出距离最近的前50个对应的距离以及individual_id:

1
2
3
# Exact inner-product index over the (L2-normalised) training embeddings.
# Because the vectors are normalised, D holds cosine similarities
# (HIGHER = more similar), and I holds row indices into train_embeds.
index = faiss.IndexFlatIP(CONFIG['embedding_size'])
index.add(train_embeds)
D, I = index.search(valid_embeds, k=50)

D中存储了验证集中每张图片与最相近的50个embedding之间的相似度(内积),I中存储了这50个embedding在训练集数据库中的行索引,可通过train_labels[I[i]]查到对应的individual_id,即真实标签。

1
2
D.shape:[N, 50]
I.shape:[N,50]

获取验证集中每张图片的真实individual_id

可能测试集中有一些图片并没有在训练集中出现,这些图片的真实individual_id应该设置为new_individual.

ps:在做数据划分时,已经尽可能使得训练集和验证集分布一致,但是许多类别只有1张图,所以仍然会有一些图片只存在于训练集或验证集中。

1
2
3
4
# Identities present in the training split; a validation image whose true
# identity is not among them must be scored as 'new_individual'
allowed_targets = np.unique(train_labels)

val_targets_df = pd.DataFrame(np.stack([valid_ids, valid_labels], axis=1), columns=['image','target'])
val_targets_df.loc[~val_targets_df.target.isin(allowed_targets), 'target'] = 'new_individual'

处理验证集对应的备选答案

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
# Candidate table for the validation set:
# one row per (validation image, candidate identity, similarity).
valid_df = []
for i, val_id in tqdm(enumerate(valid_ids)):
    targets = train_labels[I[i]]   # identities of the 50 nearest database entries
    distances = D[i]               # their cosine similarities
    subset_preds = pd.DataFrame(np.stack([targets, distances], axis=1),
                                columns=['target', 'distances'])
    subset_preds['image'] = val_id
    valid_df.append(subset_preds)


valid_df = pd.concat(valid_df).reset_index(drop=True)
# Several database images may share one individual_id, producing duplicate
# targets per image; keep only the best (largest-similarity) row for each
# (image, target) pair.
valid_df = valid_df.groupby(['image','target']).distances.max().reset_index()

# FIX: sort by similarity DESCENDING (best candidate first), matching the
# test-set pipeline below. faiss IndexFlatIP returns similarities, and
# get_predictions assumes the first row it sees per image is the best
# candidate; the original sorted ascending here, putting the worst first.
valid_df = valid_df.sort_values('distances', ascending=False).reset_index(drop=True)

此时的valid_df如下:

1
2
3
4
5
6
index	image	target	distances
0 0c42057255dbd6.jpg 987af6968486 0.484859
1 0c42057255dbd6.jpg c7f0ac17fc14 0.487009
2 0c42057255dbd6.jpg 35f898e6595e 0.488048
3 0c42057255dbd6.jpg dad9b2cc8452 0.489151
4 0c42057255dbd6.jpg f65395cdfcc2 0.491308

定义 获取top5答案 的函数

对于验证集中的每张图片,从当前valid_df中选择5个答案作为预测结果:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
# Five frequent identities used to pad prediction lists up to length 5
sample_list = ['938b7e931166', '5bf17305f073', '7593d2aee842', '7362d7a01d00','956562ff2888']

def get_predictions(test_df, threshold=0.2):
    """Turn the (image, target, distances) table into top-5 predictions.

    `test_df` must be sorted by similarity descending, so the first row
    seen for an image is its best candidate. 'new_individual' is placed
    first whenever the best similarity does not exceed `threshold`,
    second otherwise. Returns {image: [up to 5 individual_ids]}.
    """
    predictions = {}
    for i, row in tqdm(test_df.iterrows()):  # each image appears up to 50 times
        if row.image in predictions:  # subsequent (weaker) candidates
            if len(predictions[row.image]) == 5:
                continue
            predictions[row.image].append(row.target)
        elif row.distances > threshold:
            predictions[row.image] = [row.target, 'new_individual']
        else:
            predictions[row.image] = ['new_individual', row.target]

    # After deduplication an image may have fewer than 5 distinct candidates;
    # pad with the fallback identities.
    for x in tqdm(predictions):
        if len(predictions[x]) < 5:
            # FIX: the original tested `y not in predictions` — membership in
            # the dict of IMAGE names, which is always true for identity
            # strings, so already-predicted ids could be appended again.
            # Test membership in this image's prediction list instead.
            remaining = [y for y in sample_list if y not in predictions[x]]
            predictions[x] = predictions[x] + remaining
            predictions[x] = predictions[x][:5]

    return predictions

定义评估指标

在第一篇文章中已经实现过,直接拿来:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
def map_per_image(label, predictions):
    """Compute the precision score of one image for MAP@5.

    Parameters
    ----------
    label : string
        The true label of the image
    predictions : list
        Predicted labels, best first; only the first 5 are scored.

    Returns
    -------
    score : double
        1 / (rank of the true label) if it appears in the top 5,
        otherwise 0.0.
    """
    top5 = predictions[:5]
    if label in top5:
        return 1 / (top5.index(label) + 1)
    return 0.0

计算交叉验证得分

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
# Sweep the new_individual threshold over [0.0, 1.0] and keep the best CV.
best_th = 0
best_cv = 0
for th in [0.1 * x for x in range(11)]:
    # Top-5 predictions at this threshold
    all_preds = get_predictions(valid_df, threshold=th)
    # MAP@5 of every validation image, stored in a per-threshold column
    # (the float `th` itself is used as the column label)
    for i, row in val_targets_df.iterrows():
        val_targets_df.loc[i, th] = map_per_image(row.target, all_preds[row.image])
    cv = val_targets_df[th].mean()
    print(f"CV at threshold {th}: {cv}")
    if cv > best_cv:
        best_th = th
        best_cv = cv

由于在Public 数据集中有10%的new_individual,所以需要调整下best_th:

1
2
3
4
5
6
# Adjust the threshold for the public test distribution, where roughly 10%
# of the images are new individuals.
val_targets_df['is_new_individual'] = val_targets_df.target=='new_individual'
print(val_targets_df.is_new_individual.value_counts().to_dict())
# Mean MAP@5 per threshold column, split by new vs. known individuals.
# NOTE(review): .mean() over the non-numeric 'image'/'target' columns relies
# on older pandas silently dropping them; newer pandas needs numeric_only=True.
val_scores = val_targets_df.groupby('is_new_individual').mean().T
# Weighted CV: 10% new individuals, 90% known — matching the public set
val_scores['adjusted_cv'] = val_scores[True]*0.1+val_scores[False]*0.9
best_threshold_adjusted = val_scores['adjusted_cv'].idxmax()
print("best_threshold",best_threshold_adjusted)

后续将使用best_threshold_adjusted作为是否将分类结果判定为new_individual的阈值,这一点体现在上面的get_predictions中。

合并train和valid的embedding

将train和valid的embedding合并,作为最终的特征向量数据库:

1
2
# Final retrieval database: embeddings of ALL labelled images (train + valid)
train_embeds = np.concatenate([train_embeds, valid_embeds])
train_labels = np.concatenate([train_labels, valid_labels])

提取测试集的embedding

1
2
3
4
5
6
7
8
9
10
11
12
13
# Collect the test image paths
test = pd.DataFrame()
test["image"] = os.listdir("../input/happy-whale-and-dolphin/test_images")
test["file_path"] = test["image"].apply(lambda x: f"{TEST_DIR}/{x}")
test["individual_id"] = -1 #dummy value

# Build the test loader (no shuffling: order must match test_ids)
test_dataset = HappyWhaleDataset(test, transforms=data_transforms["valid"])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['valid_batch_size'],
num_workers=2, shuffle=False, pin_memory=True)
# Extract test embeddings and L2-normalise them
test_embeds, _, test_ids = get_embeddings(model, test_loader, CONFIG['device'])
test_embeds = normalize(test_embeds, axis=1, norm='l2')

计算距离(测试集)

计算测试集中每张图片与特征向量数据库中的embeddings之间的距离,取出距离最近的前50个对应的距离以及individual_id:

1
2
3
4
# Rebuild the index over the merged train+valid embedding database
index = faiss.IndexFlatIP(CONFIG['embedding_size'])
index.add(train_embeds)

# Top-50 most similar database entries for every test image
D, I = index.search(test_embeds, k=50)

生成提交文件

接下来的操作和上面验证集的操作一样,只是不再需要计算交叉验证得分了。

处理测试集对应的备选答案:

1
2
3
4
5
6
7
8
9
10
11
# Candidate table for the test set: one row per (image, identity, similarity)
test_df = []
for i, test_id in tqdm(enumerate(test_ids)):
    neighbour_targets = train_labels[I[i]]
    neighbour_sims = D[i]
    frame = pd.DataFrame(np.stack([neighbour_targets, neighbour_sims], axis=1),
                         columns=['target', 'distances'])
    frame['image'] = test_id
    test_df.append(frame)

test_df = pd.concat(test_df).reset_index(drop=True)
# Keep only the best similarity per (image, identity) pair
test_df = test_df.groupby(['image', 'target']).distances.max().reset_index()
# Best candidates first, as get_predictions expects
test_df = test_df.sort_values('distances', ascending=False).reset_index(drop=True)

生成提交文件:

1
2
3
4
5
6
# Build the submission: five space-separated predictions per test image
predictions = get_predictions(test_df, best_threshold_adjusted)

predictions = pd.Series(predictions).reset_index()
predictions.columns = ['image','predictions']
predictions['predictions'] = predictions['predictions'].apply(lambda x: ' '.join(x))
predictions.to_csv('submission.csv',index=False)

参考: