This article walks through the full validation and inference pipeline. Concepts that appear along the way, such as GeM Pooling and ArcFace, will be covered in detail in a follow-up article.
## Import required libraries

```python
import os
import gc
import cv2
import math
import copy
import time
import random

import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.cuda import amp

import joblib
from tqdm import tqdm
from collections import defaultdict

from sklearn.preprocessing import LabelEncoder, normalize
from sklearn.model_selection import StratifiedKFold

import timm
import faiss

import albumentations as A
from albumentations.pytorch import ToTensorV2

from colorama import Fore, Back, Style
b_ = Fore.BLUE
y_ = Fore.YELLOW
sr_ = Style.RESET_ALL

import warnings
warnings.filterwarnings("ignore")

os.environ['CUDA_LAUNCH_BLOCKING'] = "1"
```
## Configuration

```python
CONFIG = {
    "seed": 2022,
    "img_size": 448,
    "model_name": "tf_efficientnet_b0_ns",
    "num_classes": 15587,
    "embedding_size": 512,
    "train_batch_size": 64,
    "valid_batch_size": 64,
    "n_fold": 5,
    "device": torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    "s": 30.0,
    "m": 0.30,
    "ls_eps": 0.0,
    "easy_margin": False
}

ROOT_DIR = '../input/happy-whale-and-dolphin'
TRAIN_DIR = '../input/happy-whale-and-dolphin/train_images'
TEST_DIR = '../input/happy-whale-and-dolphin/test_images'
```
## Read the data

```python
def get_train_file_path(id):
    return f"{TRAIN_DIR}/{id}"

df = pd.read_csv(f"{ROOT_DIR}/train.csv")
df['file_path'] = df['image'].apply(get_train_file_path)
df.head()
```
## Label encoding

Load the encoder that was fitted and saved before training:
```python
encoder = LabelEncoder()
with open("../input/labelencodersss/le.pkl", "rb") as fp:
    encoder = joblib.load(fp)

df['individual_id'] = encoder.transform(df['individual_id'])
```
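For context, a minimal sketch of how such an encoder could have been fitted and saved during training (the training code itself is not part of this article; the paths below assume the same Kaggle dataset layout used above):

```python
# Hypothetical reconstruction of how le.pkl was produced at training time.
import joblib
import pandas as pd
from sklearn.preprocessing import LabelEncoder

train = pd.read_csv("../input/happy-whale-and-dolphin/train.csv")
le = LabelEncoder()
le.fit(train["individual_id"])  # map each individual_id string to an integer class
joblib.dump(le, "le.pkl")       # persisted here, reloaded at inference time above
```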
## Data split

```python
skf = StratifiedKFold(n_splits=CONFIG['n_fold'])

for fold, (_, val_) in enumerate(skf.split(X=df, y=df.individual_id)):
    df.loc[val_, "kfold"] = fold
```
At this point the processed dataframe looks like this:
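Illustratively (the values below are made up), each row now carries the original columns plus `file_path` and `kfold`:

```
   image               species             individual_id  file_path                                                          kfold
0  00021adfb725ed.jpg  melon_headed_whale  12345          ../input/happy-whale-and-dolphin/train_images/00021adfb725ed.jpg  0.0
```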
## Define the Dataset

Unlike the Dataset used during training, this one additionally returns `id`, which records each image's file name:
```python
class HappyWhaleDataset(Dataset):
    def __init__(self, df, transforms=None):
        self.df = df
        self.ids = df['image'].values
        self.file_names = df['file_path'].values
        self.labels = df['individual_id'].values
        self.transforms = transforms

    def __len__(self):
        return len(self.df)

    def __getitem__(self, index):
        idx = self.ids[index]
        img_path = self.file_names[index]
        img = cv2.imread(img_path)
        img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
        label = self.labels[index]

        if self.transforms:
            img = self.transforms(image=img)["image"]

        return {
            'image': img,
            'label': torch.tensor(label, dtype=torch.long),
            'id': idx
        }
```
## Data augmentation

```python
data_transforms = {
    "train": A.Compose([
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0
        ),
        ToTensorV2()], p=1.
    ),

    "valid": A.Compose([
        A.Resize(CONFIG['img_size'], CONFIG['img_size']),
        A.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
            max_pixel_value=255.0,
            p=1.0
        ),
        ToTensorV2()], p=1.
    )
}
```
## GeM Pooling

```python
class GeM(nn.Module):
    def __init__(self, p=3, eps=1e-6):
        super(GeM, self).__init__()
        self.p = nn.Parameter(torch.ones(1) * p)
        self.eps = eps

    def forward(self, x):
        return self.gem(x, p=self.p, eps=self.eps)

    def gem(self, x, p=3, eps=1e-6):
        return F.avg_pool2d(x.clamp(min=eps).pow(p), (x.size(-2), x.size(-1))).pow(1. / p)

    def __repr__(self):
        return self.__class__.__name__ + \
               '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + \
               ', ' + 'eps=' + str(self.eps) + ')'
```
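As a quick reference (the follow-up article covers this in depth), GeM pooling reduces each channel of the feature map over its spatial positions $\mathcal{X}$ as

$$
f = \Big( \frac{1}{|\mathcal{X}|} \sum_{x \in \mathcal{X}} x^{p} \Big)^{1/p}
$$

where $p$ is learnable: $p = 1$ recovers average pooling and $p \to \infty$ approaches max pooling. The `clamp(min=eps)` in the code keeps the fractional power numerically stable.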
## ArcFace

```python
class ArcMarginProduct(nn.Module):
    r"""Implement of large margin arc distance:
    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        s: norm of input feature
        m: margin
        cos(theta + m)
    """
    def __init__(self, in_features, out_features, s=30.0,
                 m=0.50, easy_margin=False, ls_eps=0.0):
        super(ArcMarginProduct, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.s = s
        self.m = m
        self.ls_eps = ls_eps  # label smoothing
        self.weight = nn.Parameter(torch.FloatTensor(out_features, in_features))
        nn.init.xavier_uniform_(self.weight)

        self.easy_margin = easy_margin
        self.cos_m = math.cos(m)
        self.sin_m = math.sin(m)
        self.th = math.cos(math.pi - m)
        self.mm = math.sin(math.pi - m) * m

    def forward(self, input, label):
        # cos(theta) between normalized embeddings and class weights
        cosine = F.linear(F.normalize(input), F.normalize(self.weight))
        sine = torch.sqrt(1.0 - torch.pow(cosine, 2))
        # cos(theta + m)
        phi = cosine * self.cos_m - sine * self.sin_m
        if self.easy_margin:
            phi = torch.where(cosine > 0, phi, cosine)
        else:
            phi = torch.where(cosine > self.th, phi, cosine - self.mm)
        # one-hot encode the labels
        one_hot = torch.zeros(cosine.size(), device=CONFIG['device'])
        one_hot.scatter_(1, label.view(-1, 1).long(), 1)
        if self.ls_eps > 0:
            one_hot = (1 - self.ls_eps) * one_hot + self.ls_eps / self.out_features
        # apply the margin only to the target class, then scale
        output = (one_hot * phi) + ((1.0 - one_hot) * cosine)
        output *= self.s

        return output
```
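Likewise, for a sample whose target class is $y$, `ArcMarginProduct` computes the cosine between the L2-normalized embedding and each class weight vector, adds the angular margin $m$ to the target class only, and scales by $s$:

$$
\text{logit}_j =
\begin{cases}
s \cdot \cos(\theta_j + m), & j = y \\
s \cdot \cos\theta_j, & j \neq y
\end{cases}
$$

The `easy_margin`/`th` branch in the code only handles the corner case where $\theta_y + m$ would exceed $\pi$. The margin pushes embeddings of the same individual closer together on the hypersphere, which is what makes the nearest-neighbor retrieval later in this article work.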
## Build the model

```python
class HappyWhaleModel(nn.Module):
    def __init__(self, model_name, embedding_size, pretrained=True):
        super(HappyWhaleModel, self).__init__()
        self.model = timm.create_model(model_name, pretrained=pretrained)
        in_features = self.model.classifier.in_features
        # drop the backbone's classifier and global pooling; GeM replaces them
        self.model.classifier = nn.Identity()
        self.model.global_pool = nn.Identity()
        self.pooling = GeM()
        self.embedding = nn.Linear(in_features, embedding_size)
        self.fc = ArcMarginProduct(embedding_size,
                                   CONFIG["num_classes"],
                                   s=CONFIG["s"],
                                   m=CONFIG["m"],
                                   easy_margin=CONFIG["easy_margin"],
                                   ls_eps=CONFIG["ls_eps"])

    def forward(self, images, labels):
        features = self.model(images)
        pooled_features = self.pooling(features).flatten(1)
        embedding = self.embedding(pooled_features)
        output = self.fc(embedding, labels)
        return output

    def extract(self, images):
        # embedding only, without the ArcFace head -- used at inference time
        features = self.model(images)
        pooled_features = self.pooling(features).flatten(1)
        embedding = self.embedding(pooled_features)
        return embedding

model = HappyWhaleModel(CONFIG['model_name'], CONFIG['embedding_size'])
model.load_state_dict(torch.load("../input/arcface-gap-embed/Loss14.0082_epoch10.bin"))
model.to(CONFIG['device']);
```
## Prepare the data for one fold

```python
def prepare_loaders(df, fold):
    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    train_dataset = HappyWhaleDataset(df_train, transforms=data_transforms["train"])
    valid_dataset = HappyWhaleDataset(df_valid, transforms=data_transforms["valid"])

    train_loader = DataLoader(train_dataset, batch_size=CONFIG['train_batch_size'],
                              num_workers=2, shuffle=False, pin_memory=True)
    valid_loader = DataLoader(valid_dataset, batch_size=CONFIG['valid_batch_size'],
                              num_workers=2, shuffle=False, pin_memory=True)

    return train_loader, valid_loader

train_loader, valid_loader = prepare_loaders(df, fold=0)
```
## Extract train and validation embeddings

```python
@torch.inference_mode()
def get_embeddings(model, dataloader, device):
    model.eval()

    LABELS = []
    EMBEDS = []
    IDS = []

    bar = tqdm(enumerate(dataloader), total=len(dataloader))
    for step, data in bar:
        images = data['image'].to(device, dtype=torch.float)
        labels = data['label'].to(device, dtype=torch.long)
        ids = data['id']

        outputs = model.extract(images)

        LABELS.append(labels.cpu().numpy())
        EMBEDS.append(outputs.cpu().numpy())
        IDS.append(ids)

    EMBEDS = np.vstack(EMBEDS)
    LABELS = np.concatenate(LABELS)
    IDS = np.concatenate(IDS)

    return EMBEDS, LABELS, IDS

train_embeds, train_labels, train_ids = get_embeddings(model, train_loader, CONFIG['device'])
valid_embeds, valid_labels, valid_ids = get_embeddings(model, valid_loader, CONFIG['device'])
```
The embedding dimension is 512, i.e. both `train_embeds.shape[1]` and `valid_embeds.shape[1]` equal 512.

Next, L2-normalize the embeddings and convert the labels back from integer codes to the original `individual_id` strings:
```python
train_embeds = normalize(train_embeds, axis=1, norm='l2')
valid_embeds = normalize(valid_embeds, axis=1, norm='l2')

train_labels = encoder.inverse_transform(train_labels)
valid_labels = encoder.inverse_transform(valid_labels)
```
## Compute similarities (validation set)

For each validation image, search the database of training-set embeddings and retrieve the 50 closest matches together with their similarity scores and `individual_id`s:
```python
index = faiss.IndexFlatIP(CONFIG['embedding_size'])
index.add(train_embeds)
D, I = index.search(valid_embeds, k=50)
```
`D` holds, for each validation image, its scores against the 50 closest training embeddings; since the embeddings are L2-normalized, the inner product computed by `IndexFlatIP` is cosine similarity, so larger means closer. `I` holds the row indices of those 50 embeddings within `train_embeds`, from which the true `individual_id`s can be looked up via `train_labels`.

```python
D.shape  # (N, 50)
I.shape  # (N, 50)
```
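For instance, the best database match for validation image `i` can be read off like this:

```python
i = 0
# individual_id of the closest training image and its cosine similarity
print(train_labels[I[i, 0]], D[i, 0])
```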
## Get the true individual_id of each validation image

Just as will happen with the test set, some validation images belong to individuals that never appear in the training split; the true `individual_id` of those images should be set to `new_individual`.

Note: the fold split already keeps the train and validation distributions as consistent as possible, but many individuals have only a single image, so some ids inevitably appear in only one of the two splits.
```python
allowed_targets = np.unique(train_labels)
val_targets_df = pd.DataFrame(np.stack([valid_ids, valid_labels], axis=1), columns=['image', 'target'])
val_targets_df.loc[~val_targets_df.target.isin(allowed_targets), 'target'] = 'new_individual'
```
## Build the candidate answers for the validation set

```python
valid_df = []
for i, val_id in tqdm(enumerate(valid_ids)):
    targets = train_labels[I[i]]
    distances = D[i]
    # build from a dict so 'distances' stays float
    # (np.stack with the string targets would coerce it to str)
    subset_preds = pd.DataFrame({'target': targets, 'distances': distances})
    subset_preds['image'] = val_id
    valid_df.append(subset_preds)

valid_df = pd.concat(valid_df).reset_index(drop=True)
# keep the best similarity per (image, candidate) pair
valid_df = valid_df.groupby(['image', 'target']).distances.max().reset_index()
# most similar candidates first ('distances' are cosine similarities),
# consistent with the test-set code below
valid_df = valid_df.sort_values('distances', ascending=False).reset_index(drop=True)
```
A sample of valid_df at this point:
```
   image               target        distances
0  0c42057255dbd6.jpg  987af6968486  0.484859
1  0c42057255dbd6.jpg  c7f0ac17fc14  0.487009
2  0c42057255dbd6.jpg  35f898e6595e  0.488048
3  0c42057255dbd6.jpg  dad9b2cc8452  0.489151
4  0c42057255dbd6.jpg  f65395cdfcc2  0.491308
```
## Define a function that picks the top-5 answers

For each image, select 5 answers from the candidate dataframe as its prediction. The first time an image is seen, its best candidate is compared against the threshold: above it, the candidate leads and `new_individual` comes second; below it, `new_individual` leads. Images left with fewer than 5 answers are padded from a fixed fallback list:
```python
# fallback ids used to pad predictions with fewer than 5 answers
sample_list = ['938b7e931166', '5bf17305f073', '7593d2aee842', '7362d7a01d00', '956562ff2888']

def get_predictions(test_df, threshold=0.2):
    predictions = {}
    for i, row in tqdm(test_df.iterrows()):
        if row.image in predictions:
            if len(predictions[row.image]) == 5:
                continue
            predictions[row.image].append(row.target)
        elif row.distances > threshold:
            predictions[row.image] = [row.target, 'new_individual']
        else:
            predictions[row.image] = ['new_individual', row.target]

    for x in tqdm(predictions):
        if len(predictions[x]) < 5:
            # avoid duplicating ids already predicted for this image
            remaining = [y for y in sample_list if y not in predictions[x]]
            predictions[x] = predictions[x] + remaining
            predictions[x] = predictions[x][:5]

    return predictions
```
## Define the evaluation metric

This was already implemented in the first article of the series, so it is reused as-is:
```python
def map_per_image(label, predictions):
    """Computes the precision score of one image.

    Parameters
    ----------
    label : string
            The true label of the image
    predictions : list
            A list of predicted elements (order does matter, 5 predictions allowed per image)

    Returns
    -------
    score : double
    """
    try:
        return 1 / (predictions[:5].index(label) + 1)
    except ValueError:
        return 0.0
```
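For example, if the true id sits in the second position the image scores 1/2, and if it is missing from the top 5 it scores 0:

```python
print(map_per_image('a', ['b', 'a', 'c', 'd', 'e']))  # 0.5
print(map_per_image('a', ['v', 'w', 'x', 'y', 'z']))  # 0.0
```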
## Compute the cross-validation score

```python
best_th = 0
best_cv = 0
for th in [0.1 * x for x in range(11)]:
    all_preds = get_predictions(valid_df, threshold=th)

    cv = 0
    for i, row in val_targets_df.iterrows():
        target = row.target
        preds = all_preds[row.image]
        val_targets_df.loc[i, th] = map_per_image(target, preds)

    cv = val_targets_df[th].mean()
    print(f"CV at threshold {th}: {cv}")
    if cv > best_cv:
        best_th = th
        best_cv = cv
```
Since about 10% of the public test set are `new_individual`, `best_th` needs a corresponding adjustment:
```python
val_targets_df['is_new_individual'] = val_targets_df.target == 'new_individual'
print(val_targets_df.is_new_individual.value_counts().to_dict())

# numeric_only so the string columns (image, target) don't break the mean
val_scores = val_targets_df.groupby('is_new_individual').mean(numeric_only=True).T
# reweight: 10% new_individual, 90% known individuals
val_scores['adjusted_cv'] = val_scores[True] * 0.1 + val_scores[False] * 0.9
best_threshold_adjusted = val_scores['adjusted_cv'].idxmax()
print("best_threshold", best_threshold_adjusted)
```
From here on, `best_threshold_adjusted` serves as the threshold for deciding whether a prediction should be `new_individual`, exactly as used in `get_predictions` above.
## Merge the train and valid embeddings

Concatenate the train and validation embeddings to form the final feature database:
```python
train_embeds = np.concatenate([train_embeds, valid_embeds])
train_labels = np.concatenate([train_labels, valid_labels])
```
## Extract embeddings for the test set

```python
test = pd.DataFrame()
test["image"] = os.listdir("../input/happy-whale-and-dolphin/test_images")
test["file_path"] = test["image"].apply(lambda x: f"{TEST_DIR}/{x}")
test["individual_id"] = -1  # dummy label; the test set has no ground truth

test_dataset = HappyWhaleDataset(test, transforms=data_transforms["valid"])
test_loader = DataLoader(test_dataset, batch_size=CONFIG['valid_batch_size'],
                         num_workers=2, shuffle=False, pin_memory=True)

test_embeds, _, test_ids = get_embeddings(model, test_loader, CONFIG['device'])
test_embeds = normalize(test_embeds, axis=1, norm='l2')
```
## Compute similarities (test set)

For each test image, search the merged feature database and retrieve the 50 closest matches together with their similarity scores and `individual_id`s:
```python
index = faiss.IndexFlatIP(CONFIG['embedding_size'])
index.add(train_embeds)
D, I = index.search(test_embeds, k=50)
```
## Generate the submission file

The steps mirror the validation ones above, minus the cross-validation scoring.

Build the candidate answers for the test set:
```python
test_df = []
for i, test_id in tqdm(enumerate(test_ids)):
    targets = train_labels[I[i]]
    distances = D[i]
    # same dtype-preserving construction as for the validation set
    subset_preds = pd.DataFrame({'target': targets, 'distances': distances})
    subset_preds['image'] = test_id
    test_df.append(subset_preds)

test_df = pd.concat(test_df).reset_index(drop=True)
test_df = test_df.groupby(['image', 'target']).distances.max().reset_index()
test_df = test_df.sort_values('distances', ascending=False).reset_index(drop=True)
```
Write out the predictions:
```python
predictions = get_predictions(test_df, best_threshold_adjusted)

predictions = pd.Series(predictions).reset_index()
predictions.columns = ['image', 'predictions']
predictions['predictions'] = predictions['predictions'].apply(lambda x: ' '.join(x))
predictions.to_csv('submission.csv', index=False)
```
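Each row of `submission.csv` pairs a test image with its five space-separated candidate ids. Illustratively (the file name and ids are reused from the validation sample above, purely for illustration):

```
image,predictions
0c42057255dbd6.jpg,f65395cdfcc2 dad9b2cc8452 35f898e6595e c7f0ac17fc14 new_individual
```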