torchsight.losses.ccs
module
Implementation of the Classification vector-centered Cosine Similarity from the paper One-shot Face Recognition by Promoting Underrepresented Classes.
Source code
"""Implementation of the Classification vector-centered Cosine Similarity from the paper
[One-shot Face Recognition by Promoting Underrepresented Classes](https://arxiv.org/pdf/1707.05574.pdf).
"""
import torch
from torch import nn
from ..models import Anchors
class CCSLoss(nn.Module):
    """Classification vector-centered Cosine Similarity Loss.

    As indicated in equation 5 of the paper, this loss tries to minimize the angular distance
    between the embeddings (the features before the classification) and the weight vector that
    performs the classification.

    This is done by computing the dot product between the embedding and the classification vector
    and normalizing it by their norms.

    The loss term is applied only to those embeddings that are assigned to an object.

    As there could be a lot of assigned anchors, it is a little naive to rely on a threshold alone,
    so this loss provides a 'soft' option that weights the similarity by the IoU between the
    anchor and the ground truth annotation.
    Suppose the IoU threshold for objects is 0.5: should an anchor with IoU 0.51 and another with
    IoU 0.99 weight the same in the loss?
    The soft version of the loss decreases the weight of an anchor in the final loss according
    to its IoU, so the final loss of an anchor is its IoU * similarity.
    """

    def __init__(self, iou_thresholds=None, soft=False):
        """Initialize the loss.

        Arguments:
            iou_thresholds (dict, optional): Indicates the thresholds to assign an anchor as background or object.
            soft (bool, optional): Apply the soft version of the loss.
        """
        super().__init__()

        if iou_thresholds is None:
            iou_thresholds = {'background': 0.4, 'object': 0.5}
        self.iou_thresholds = iou_thresholds
        self.soft = soft

    def forward(self, anchors, embeddings, weights, annotations):
        """Get the mean CCS loss.

        Arguments:
            anchors (torch.Tensor): The base anchors (without the transformation to adjust the
                bounding boxes).
                Shape:
                    (batch size, total boxes, 4)
            embeddings (torch.Tensor): The embeddings generated for each anchor.
                Shape:
                    (batch size, number of anchors, embedding size)
            weights (torch.Tensor): The weight vectors of the classification layer, one column per class.
                Shape:
                    (embedding size, number of classes)
            annotations (torch.Tensor): Ground truth. Tensor with the bounding boxes and the label for
                the object. The values must be x1, y1 (top left corner), x2, y2 (bottom right corner),
                and the last value is the label.
                Shape:
                    (batch size, maximum objects in any image, 5)
                Why "maximum objects in any image"? Because when there is more than one image, each
                image can contain a different number of objects, so their ground truths would have
                different sizes in dim 1 of the batch. To keep a single tensor, every image's ground
                truth is padded up to the maximum number of objects with rows filled with -1.0.
                When this loss finds a ground truth box filled with -1.0 it understands that the row
                is only padding and ignores it.

        Returns:
            torch.Tensor: The mean CCS loss.
        """
        # We want to use the weights but not backprop through them; we only backprop through the embeddings
        original_weights = weights.detach()

        batch_anchors = anchors
        batch_embeddings = embeddings
        batch_annotations = annotations

        losses = []

        for i, anchors in enumerate(batch_anchors):
            embeddings = batch_embeddings[i]
            annotations = batch_annotations[i]
            weights = original_weights.clone()

            # Keep only the real labels
            annotations = annotations[annotations[:, -1] != -1]

            # Zero loss for this image if it does not have any annotation
            if annotations.shape[0] == 0:
                losses.append(embeddings.new_zeros(1).mean())
                continue

            # Get the assignments of the annotations to the anchors:
            # the assigned annotations (the i-th assigned annotation is the annotation assigned to the i-th anchor),
            # the mask to select the anchors assigned to an object (IoU above the 'object' threshold),
            # and the IoU values to weight their loss.
            assignations = Anchors.assign(anchors, annotations, thresholds=self.iou_thresholds)
            assigned_annotations, selected_anchors_objects, _, iou = assignations

            # Continue with the next image if there are no selected objects
            if selected_anchors_objects.sum() == 0:
                losses.append(embeddings.new_zeros(1).mean())
                continue

            # We must compute the cosine similarity between each embedding and the weight vector of its
            # assigned annotation. We can do this with a single batched matrix multiplication between the
            # embeddings of the anchors selected as objects and their corresponding class vectors.

            # Shape (selected embeddings, embedding size)
            embeddings = embeddings[selected_anchors_objects]
            # Shape (embedding size, number of selected embeddings)
            weights = weights[:, assigned_annotations[selected_anchors_objects, -1].long()]

            # We need to do a batch matrix multiplication with shapes:
            # (number of selected anchors, 1, embedding size) * (number of selected anchors, embedding size, 1)
            # Reshape the embeddings to have shape (number of selected embeddings, 1, embedding size)
            embeddings = embeddings.unsqueeze(dim=1)
            # Reshape the weights to have shape (number of selected embeddings, embedding size, 1)
            weights = weights.t().unsqueeze(dim=2)

            # Compute the loss
            loss = -1 * torch.matmul(embeddings, weights).view(-1)  # Shape (selected embeddings,)
            loss /= embeddings.squeeze(dim=1).norm(dim=1)  # Normalize by the embeddings' norms
            loss /= weights.squeeze(dim=2).norm(dim=1)  # Normalize by the weights' norms

            if self.soft:
                loss *= iou  # Weight each loss term according to its IoU

            # Add one to have a minimum loss of zero (because the cosine similarity ranges from -1 to 1)
            # and normalize the value between 0 and 1 to have a more meaningful loss
            loss = (loss + 1) / 2

            losses.append(loss.mean())

        return torch.stack(losses).mean()
Classes
class CCSLoss (ancestors: torch.nn.modules.module.Module)
-
Classification vector-centered Cosine Similarity Loss.
As indicated in equation 5 of the paper, this loss tries to minimize the angular distance between the embeddings (the features before the classification) and the weight vector that performs the classification.
This is done by computing the dot product between the embedding and the classification vector and normalizing it by their norms.
The loss term is applied only to those embeddings that are assigned to an object.
As there could be a lot of assigned anchors, it is a little naive to rely on a threshold alone, so this loss provides a 'soft' option that weights the similarity by the IoU between the anchor and the ground truth annotation. Suppose the IoU threshold for objects is 0.5: should an anchor with IoU 0.51 and another with IoU 0.99 weight the same in the loss? The soft version of the loss decreases the weight of an anchor in the final loss according to its IoU, so the final loss of an anchor is its IoU * similarity.
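To make the computation concrete, here is a minimal, self-contained sketch of the core term described above, outside of torchsight's anchor-assignment machinery. The tensor sizes, the dummy IoU values, and the use of torch.nn.functional.cosine_similarity (equivalent to the explicit dot product divided by both norms in the source) are illustrative assumptions, not part of the library's API.

import torch
from torch.nn.functional import cosine_similarity

# Illustrative sizes: 3 anchors assigned to objects, embedding size 8, 5 classes.
embeddings = torch.randn(3, 8)            # embeddings of the anchors assigned to objects
weights = torch.randn(8, 5)               # classification weights, one column per class
labels = torch.tensor([0, 2, 2])          # class assigned to each selected anchor
iou = torch.tensor([0.6, 0.9, 0.7])       # IoU of each anchor with its assigned annotation

# Cosine similarity between each embedding and the weight vector of its class.
class_vectors = weights[:, labels].t()    # shape (3, 8)
similarity = cosine_similarity(embeddings, class_vectors, dim=1)

hard_loss = (-similarity + 1) / 2         # map the [-1, 1] similarity to a [0, 1] loss
soft_loss = (-iou * similarity + 1) / 2   # soft variant: weight each term by its IoU
print(hard_loss.mean().item(), soft_loss.mean().item())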
Methods
def __init__(self, iou_thresholds=None, soft=False)
-
Initialize the loss.
Arguments
iou_thresholds : dict, optional
- Indicates the thresholds to assign an anchor as background or object.
soft : bool, optional
- Apply the soft version of the loss.
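A short usage sketch, assuming the class is importable under the module path shown at the top of this page; the threshold values in the second line are arbitrary examples.

from torchsight.losses.ccs import CCSLoss

criterion = CCSLoss(soft=True)  # uses the default thresholds {'background': 0.4, 'object': 0.5}
stricter = CCSLoss(iou_thresholds={'background': 0.3, 'object': 0.6})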
def forward(self, anchors, embeddings, weights, annotations)
-
Get the mean CCS loss.
Arguments
anchors : torch.Tensor
- The base anchors (without the transformation to adjust the bounding boxes). Shape: (batch size, total boxes, 4)
embeddings : torch.Tensor
- The embeddings generated for each anchor. Shape: (batch size, number of anchors, embedding size)
weights : torch.Tensor
- The weight vectors of the classification layer, one column per class. Shape: (embedding size, number of classes)
annotations : torch.Tensor
- Ground truth. Tensor with the bounding boxes and the label for the object. The values must be x1, y1 (top left corner), x2, y2 (bottom right corner), and the last value is the label. Shape: (batch size, maximum objects in any image, 5).
Why "maximum objects in any image"? Because when there is more than one image, each image can contain a different number of objects, so their ground truths would have different sizes in dim 1 of the batch. To keep a single tensor, every image's ground truth is padded up to the maximum number of objects with rows filled with -1.0; when this loss finds a ground truth box filled with -1.0 it understands that the row is only padding and ignores it. A padding sketch is shown after the Returns entry below.
Returns
torch.Tensor: The mean CCS loss.
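As a concrete illustration of the -1.0 padding scheme described for the annotations argument above, here is a minimal sketch; the box coordinates and labels are made-up values.

import torch

# Two images: the first has two annotated boxes, the second has one.
boxes_per_image = [
    torch.tensor([[10., 10., 50., 50., 3.],
                  [20., 30., 80., 90., 1.]]),
    torch.tensor([[5., 12., 40., 60., 0.]]),
]
max_objects = max(boxes.shape[0] for boxes in boxes_per_image)

# Pad every image's ground truth with -1.0 rows up to the maximum object count.
annotations = torch.full((len(boxes_per_image), max_objects, 5), -1.0)
for i, boxes in enumerate(boxes_per_image):
    annotations[i, :boxes.shape[0]] = boxes

print(annotations.shape)  # torch.Size([2, 2, 5])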