Module torchsight.losses.dlde
The criterion for the weighted DLDENet.
Source code
"""The criterion for the weighted DLDENet."""
import torch
from torch import nn
from .ccs import CCSLoss
from .focal import FocalLoss
class DLDENetLoss(nn.Module):
"""Join the CCS and the Focal losses in one single module."""
def __init__(self, alpha=0.25, gamma=2.0, sigma=3.0, iou_thresholds=None, increase_foreground_by=1, soft=False, device=None):
"""Initialize the losses.
See their corresponding docs for more information.
Arguments:
alpha (float): Alpha parameter for the focal loss.
gamma (float): Gamma parameter for the focal loss.
sigma (float): Point that defines the change from L1 loss to L2 loss (smooth L1).
iou_thresholds (dict): Indicates the thresholds to assign an anchor as background or object.
increase_foreground_by (int): Passed directly to the FocalLoss; see its docs.
soft (bool, optional): Apply soft Focal and soft Cosine similarity losses.
device (str, optional): Indicates the device where to run the loss.
"""
super().__init__()
if iou_thresholds is None:
iou_thresholds = {'background': 0.4, 'object': 0.5}
device = device if device is not None else 'cuda:0' if torch.cuda.is_available() else 'cpu'
self.focal = FocalLoss(alpha, gamma, sigma, iou_thresholds, increase_foreground_by, soft, device)
self.ccs = CCSLoss(iou_thresholds, soft)
def forward(self, anchors, regressions, classifications, annotations, model):
"""Compute the different losses for the batch.
Arguments:
anchors (torch.Tensor): The base anchors (without the transformation to adjust the
bounding boxes).
Shape:
`(batch size, total boxes, 4)`
regressions (torch.Tensor): The regression values to adjust the anchors to the predicted
bounding boxes.
Shape:
`(batch size, total boxes, 4)`
classifications (torch.Tensor): The probabilities for each class at each bounding box.
Shape:
`(batch size, total boxes, number of classes)`
annotations (torch.Tensor): Ground truth. Tensor with the bounding boxes and the label for
the object. The values must be x1, y1 (top left corner), x2, y2 (bottom right corner)
and the last value is the label.
Shape:
`(batch size, maximum objects in any image, 5)`
model (torch.nn.Module): The DLDENet model with its embeddings and weights.
Why `maximum objects in any image`? Each image in a batch can contain a different
number of objects, so the per-image ground truth tensors would differ in size
along dim 1. To stack them into a single tensor, every image's annotations are
padded with rows of -1.0 up to the maximum object count in the batch. When this
loss finds an annotation filled with -1.0 it treats it as padding and ignores it.
Returns:
tuple: A tuple with the classification, regression and cosine similarity loss tensors.
"""
classification, regression = self.focal(anchors, regressions, classifications, annotations)
similarity = self.ccs(anchors, model.classification.embeddings, model.classification.weights, annotations)
return classification, regression, similarity
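Example usage (a minimal sketch, not part of the module: the tensor sizes, the stub model, and the embedding shapes are illustrative assumptions; forward() only reads model.classification.embeddings and model.classification.weights, as seen in the source above):

import torch
from types import SimpleNamespace
from torchsight.losses.dlde import DLDENetLoss

# Illustrative sizes: 2 images, 100 anchors per image, 10 classes.
batch_size, total_boxes, num_classes = 2, 100, 10

criterion = DLDENetLoss(alpha=0.25, gamma=2.0, sigma=3.0, device='cpu')

anchors = torch.rand(batch_size, total_boxes, 4)      # base anchors (x1, y1, x2, y2)
regressions = torch.rand(batch_size, total_boxes, 4)  # predicted adjustments
classifications = torch.rand(batch_size, total_boxes, num_classes)  # per-class probabilities

# One ground truth box per image: x1, y1, x2, y2, label.
annotations = torch.tensor([[[10., 10., 50., 50., 0.]],
                            [[20., 20., 80., 80., 3.]]])

# Stand-in for a trained DLDENet: only the two attributes read by forward()
# are provided, and their shapes here are assumptions for the sketch.
model = SimpleNamespace(classification=SimpleNamespace(
    embeddings=torch.rand(batch_size, 256, total_boxes),
    weights=torch.rand(256, num_classes),
))

classification_loss, regression_loss, similarity_loss = criterion(
    anchors, regressions, classifications, annotations, model)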
Classes
class DLDENetLoss (ancestors: torch.nn.modules.module.Module)

Join the CCS and the Focal losses in one single module.
Methods
def __init__(self, alpha=0.25, gamma=2.0, sigma=3.0, iou_thresholds=None, increase_foreground_by=1, soft=False, device=None)

Initialize the losses. See their corresponding docs for more information.
Arguments

alpha : float
    Alpha parameter for the focal loss.
gamma : float
    Gamma parameter for the focal loss.
sigma : float
    Point that defines the change from L1 loss to L2 loss (smooth L1).
iou_thresholds : dict
    Indicates the thresholds to assign an anchor as background or object.
increase_foreground_by : int
    Passed directly to the FocalLoss; see its docs.
soft : bool, optional
    Apply soft Focal and soft Cosine similarity losses.
device : str, optional
    Indicates the device where to run the loss.
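For instance (a sketch; the default thresholds and the device fallback are taken from the source above):

from torchsight.losses.dlde import DLDENetLoss

# Defaults: iou_thresholds = {'background': 0.4, 'object': 0.5} and the
# device falls back to 'cuda:0' when available, otherwise 'cpu'.
criterion = DLDENetLoss()

# Custom assignment thresholds, pinned to the CPU.
strict_criterion = DLDENetLoss(
    iou_thresholds={'background': 0.3, 'object': 0.6},
    device='cpu',
)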
def forward(self, anchors, regressions, classifications, annotations, model)

Compute the different losses for the batch.
Arguments

anchors : torch.Tensor
    The base anchors (without the transformation to adjust the bounding boxes).
    Shape: `(batch size, total boxes, 4)`
regressions : torch.Tensor
    The regression values to adjust the anchors to the predicted bounding boxes.
    Shape: `(batch size, total boxes, 4)`
classifications : torch.Tensor
    The probabilities for each class at each bounding box.
    Shape: `(batch size, total boxes, number of classes)`
annotations : torch.Tensor
    Ground truth. Tensor with the bounding boxes and the label for the object.
    The values must be x1, y1 (top left corner), x2, y2 (bottom right corner), and the last value is the label.
    Shape: `(batch size, maximum objects in any image, 5)`
model : torch.nn.Module
    The DLDENet model with its embeddings and weights.

Why `maximum objects in any image`? Each image in a batch can contain a different number of objects, so the per-image ground truth tensors would differ in size along dim 1. To stack them into a single tensor, every image's annotations are padded with rows of -1.0 up to the maximum object count in the batch. When this loss finds an annotation filled with -1.0 it treats it as padding and ignores it (a collate sketch follows below).

Returns

tuple
    A tuple with the classification, regression and cosine similarity loss tensors.
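The -1.0 padding described above can be produced with a small collate helper; this is an illustrative sketch, not a torchsight function:

import torch

def pad_annotations(per_image):
    """Stack variable-length (num objects, 5) annotation tensors into one
    (batch size, max objects, 5) tensor, padded with -1.0 rows that the
    loss will ignore."""
    max_objects = max(a.shape[0] for a in per_image)
    batch = torch.full((len(per_image), max_objects, 5), -1.0)
    for i, annotations in enumerate(per_image):
        batch[i, :annotations.shape[0]] = annotations
    return batch

# Two images with different numbers of ground truth boxes.
first = torch.tensor([[10., 10., 50., 50., 0.],
                      [30., 30., 90., 90., 2.]])
second = torch.tensor([[20., 20., 80., 80., 3.]])

annotations = pad_annotations([first, second])
# annotations.shape == (2, 2, 5); the second row of the second image is all -1.0.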