IOPaint/inpaint/plugins/segment_anything2/modeling/sam/prompt_encoder.py

# Copyright (c) Meta Platforms, Inc. and affiliates.
# All rights reserved.

# This source code is licensed under the license found in the
# LICENSE file in the root directory of this source tree.

from typing import Optional, Tuple, Type

import torch
from torch import nn

from ..position_encoding import PositionEmbeddingRandom

from ..sam2_utils import LayerNorm2d


class PromptEncoder(nn.Module):
    """Turns point, box and mask prompts into embeddings for SAM's mask decoder."""

    def __init__(
        self,
        embed_dim: int,
        image_embedding_size: Tuple[int, int],
        input_image_size: Tuple[int, int],
        mask_in_chans: int,
        activation: Type[nn.Module] = nn.GELU,
    ) -> None:
        """
        Builds the learned embeddings and the mask down-scaling stack.

        Arguments:
          embed_dim (int): Channel width of every prompt embedding.
          image_embedding_size (tuple(int, int)): Spatial size (H, W) of the
            image embedding.
          input_image_size (tuple(int, int)): Padded size (H, W) of the image
            that is fed to the image encoder.
          mask_in_chans (int): Hidden channel count used while encoding
            input masks.
          activation (nn.Module): Activation class instantiated between the
            mask-encoding convolutions.
        """
        super().__init__()
        self.embed_dim = embed_dim
        self.input_image_size = input_image_size
        self.image_embedding_size = image_embedding_size
        self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)

        # pos/neg point + 2 box corners; indices 2 and 3 are the ones that
        # _embed_boxes adds to the first and second box corner.
        self.num_point_embeddings: int = 4
        self.point_embeddings = nn.ModuleList(
            [nn.Embedding(1, embed_dim) for _ in range(self.num_point_embeddings)]
        )
        self.not_a_point_embed = nn.Embedding(1, embed_dim)

        # Mask prompts are expected at 4x the embedding resolution: the two
        # stride-2 convolutions below bring them back down by that factor.
        emb_h, emb_w = image_embedding_size[0], image_embedding_size[1]
        self.mask_input_size = (4 * emb_h, 4 * emb_w)

        hidden_chans = mask_in_chans // 4
        downscale_layers = [
            nn.Conv2d(1, hidden_chans, kernel_size=2, stride=2),
            LayerNorm2d(hidden_chans),
            activation(),
            nn.Conv2d(hidden_chans, mask_in_chans, kernel_size=2, stride=2),
            LayerNorm2d(mask_in_chans),
            activation(),
            nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),
        ]
        self.mask_downscaling = nn.Sequential(*downscale_layers)
        self.no_mask_embed = nn.Embedding(1, embed_dim)

    def get_dense_pe(self) -> torch.Tensor:
        """
        Evaluates the point positional encoding on the full embedding grid.

        Returns:
          torch.Tensor: Positional encoding with shape
            1x(embed_dim)x(embedding_h)x(embedding_w)
        """
        dense_pe = self.pe_layer(self.image_embedding_size)
        return dense_pe.unsqueeze(0)

    def _embed_points(
        self,
        points: torch.Tensor,
        labels: torch.Tensor,
        pad: bool,
    ) -> torch.Tensor:
        """Embeds point prompts; when `pad` is set, one extra point labelled -1 is appended."""
        points = points + 0.5  # Shift to center of pixel
        if pad:
            extra_point = torch.zeros((points.shape[0], 1, 2), device=points.device)
            extra_label = -torch.ones((labels.shape[0], 1), device=labels.device)
            points = torch.cat([points, extra_point], dim=1)
            labels = torch.cat([labels, extra_label], dim=1)
        embedded = self.pe_layer.forward_with_coords(points, self.input_image_size)

        # Entries labelled -1 drop their positional encoding entirely and
        # carry only the learned "not a point" vector.
        is_padding = labels == -1
        embedded[is_padding] = 0.0
        embedded[is_padding] += self.not_a_point_embed.weight

        # Every other label adds its own learned vector on top of the
        # positional encoding.
        for label_value in (0, 1, 2, 3):
            embedded[labels == label_value] += self.point_embeddings[label_value].weight
        return embedded

    def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:
        """Embeds box prompts as two corner points per box."""
        # Shift to center of pixel, then view each box as two (x, y) corners.
        corners = (boxes + 0.5).reshape(-1, 2, 2)
        embedded = self.pe_layer.forward_with_coords(corners, self.input_image_size)
        for corner_idx, embed_idx in ((0, 2), (1, 3)):
            embedded[:, corner_idx, :] += self.point_embeddings[embed_idx].weight
        return embedded

    def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:
        """Embeds mask inputs by running them through the down-scaling stack."""
        return self.mask_downscaling(masks)

    def _get_batch_size(
        self,
        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
        boxes: Optional[torch.Tensor],
        masks: Optional[torch.Tensor],
    ) -> int:
        """
        Derives the output batch size from the first prompt that is present
        (points, then boxes, then masks); falls back to 1 when none is given.
        """
        if points is not None:
            return points[0].shape[0]
        if boxes is not None:
            return boxes.shape[0]
        if masks is not None:
            return masks.shape[0]
        return 1

    def _get_device(self) -> torch.device:
        """Returns the device that holds the first point embedding's weight."""
        first_embedding = self.point_embeddings[0]
        return first_embedding.weight.device

    def forward(
        self,
        points: Optional[Tuple[torch.Tensor, torch.Tensor]],
        boxes: Optional[torch.Tensor],
        masks: Optional[torch.Tensor],
    ) -> Tuple[torch.Tensor, torch.Tensor]:
        """
        Embeds the supplied prompts into a sparse and a dense embedding.

        Arguments:
          points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates
            and labels to embed.
          boxes (torch.Tensor or none): boxes to embed
          masks (torch.Tensor or none): masks to embed

        Returns:
          torch.Tensor: sparse embeddings for the points and boxes, with shape
            BxNx(embed_dim), where N is determined by the number of input points
            and boxes.
          torch.Tensor: dense embeddings for the masks, in the shape
            Bx(embed_dim)x(embed_H)x(embed_W)
        """
        bs = self._get_batch_size(points, boxes, masks)

        # Start from a zero-length sequence so that N == 0 when neither points
        # nor boxes are supplied.
        sparse = torch.empty((bs, 0, self.embed_dim), device=self._get_device())
        if points is not None:
            coords, labels = points
            # The padding point is only appended when no box accompanies the points.
            point_part = self._embed_points(coords, labels, pad=(boxes is None))
            sparse = torch.cat((sparse, point_part), dim=1)
        if boxes is not None:
            box_part = self._embed_boxes(boxes)
            sparse = torch.cat((sparse, box_part), dim=1)

        if masks is None:
            # No mask prompt: broadcast the learned "no mask" vector over the
            # whole embedding grid.
            emb_h = self.image_embedding_size[0]
            emb_w = self.image_embedding_size[1]
            no_mask = self.no_mask_embed.weight.reshape(1, -1, 1, 1)
            dense = no_mask.expand(bs, -1, emb_h, emb_w)
        else:
            dense = self._embed_masks(masks)

        return sparse, dense
new file: inpaint/__init__.py new file: inpaint/__main__.py new file: inpaint/api.py new file: inpaint/batch_processing.py new file: inpaint/benchmark.py new file: inpaint/cli.py new file: inpaint/const.py new file: inpaint/download.py new file: inpaint/file_manager/__init__.py new file: inpaint/file_manager/file_manager.py new file: inpaint/file_manager/storage_backends.py new file: inpaint/file_manager/utils.py new file: inpaint/helper.py new file: inpaint/installer.py new file: inpaint/model/__init__.py new file: inpaint/model/anytext/__init__.py new file: inpaint/model/anytext/anytext_model.py new file: inpaint/model/anytext/anytext_pipeline.py new file: inpaint/model/anytext/anytext_sd15.yaml new file: inpaint/model/anytext/cldm/__init__.py new file: inpaint/model/anytext/cldm/cldm.py new file: inpaint/model/anytext/cldm/ddim_hacked.py new file: inpaint/model/anytext/cldm/embedding_manager.py new file: inpaint/model/anytext/cldm/hack.py new file: inpaint/model/anytext/cldm/model.py new file: inpaint/model/anytext/cldm/recognizer.py new file: inpaint/model/anytext/ldm/__init__.py new file: inpaint/model/anytext/ldm/models/__init__.py new file: inpaint/model/anytext/ldm/models/autoencoder.py new file: inpaint/model/anytext/ldm/models/diffusion/__init__.py new file: inpaint/model/anytext/ldm/models/diffusion/ddim.py new file: inpaint/model/anytext/ldm/models/diffusion/ddpm.py new file: inpaint/model/anytext/ldm/models/diffusion/dpm_solver/__init__.py new file: inpaint/model/anytext/ldm/models/diffusion/dpm_solver/dpm_solver.py new file: inpaint/model/anytext/ldm/models/diffusion/dpm_solver/sampler.py new file: inpaint/model/anytext/ldm/models/diffusion/plms.py new file: inpaint/model/anytext/ldm/models/diffusion/sampling_util.py new file: inpaint/model/anytext/ldm/modules/__init__.py new file: inpaint/model/anytext/ldm/modules/attention.py new file: inpaint/model/anytext/ldm/modules/diffusionmodules/__init__.py new file: 
inpaint/model/anytext/ldm/modules/diffusionmodules/model.py new file: inpaint/model/anytext/ldm/modules/diffusionmodules/openaimodel.py new file: inpaint/model/anytext/ldm/modules/diffusionmodules/upscaling.py new file: inpaint/model/anytext/ldm/modules/diffusionmodules/util.py new file: inpaint/model/anytext/ldm/modules/distributions/__init__.py new file: inpaint/model/anytext/ldm/modules/distributions/distributions.py new file: inpaint/model/anytext/ldm/modules/ema.py new file: inpaint/model/anytext/ldm/modules/encoders/__init__.py new file: inpaint/model/anytext/ldm/modules/encoders/modules.py new file: inpaint/model/anytext/ldm/util.py new file: inpaint/model/anytext/main.py new file: inpaint/model/anytext/ocr_recog/RNN.py new file: inpaint/model/anytext/ocr_recog/RecCTCHead.py new file: inpaint/model/anytext/ocr_recog/RecModel.py new file: inpaint/model/anytext/ocr_recog/RecMv1_enhance.py new file: inpaint/model/anytext/ocr_recog/RecSVTR.py new file: inpaint/model/anytext/ocr_recog/__init__.py new file: inpaint/model/anytext/ocr_recog/common.py new file: inpaint/model/anytext/ocr_recog/en_dict.txt new file: inpaint/model/anytext/ocr_recog/ppocr_keys_v1.txt new file: inpaint/model/anytext/utils.py new file: inpaint/model/base.py new file: inpaint/model/brushnet/__init__.py new file: inpaint/model/brushnet/brushnet.py new file: inpaint/model/brushnet/brushnet_unet_forward.py new file: inpaint/model/brushnet/brushnet_wrapper.py new file: inpaint/model/brushnet/pipeline_brushnet.py new file: inpaint/model/brushnet/unet_2d_blocks.py new file: inpaint/model/controlnet.py new file: inpaint/model/ddim_sampler.py new file: inpaint/model/fcf.py new file: inpaint/model/helper/__init__.py new file: inpaint/model/helper/controlnet_preprocess.py new file: inpaint/model/helper/cpu_text_encoder.py new file: inpaint/model/helper/g_diffuser_bot.py new file: inpaint/model/instruct_pix2pix.py new file: inpaint/model/kandinsky.py new file: inpaint/model/lama.py new file: 
inpaint/model/ldm.py new file: inpaint/model/manga.py new file: inpaint/model/mat.py new file: inpaint/model/mi_gan.py new file: inpaint/model/opencv2.py new file: inpaint/model/original_sd_configs/__init__.py new file: inpaint/model/original_sd_configs/sd_xl_base.yaml new file: inpaint/model/original_sd_configs/sd_xl_refiner.yaml new file: inpaint/model/original_sd_configs/v1-inference.yaml new file: inpaint/model/original_sd_configs/v2-inference-v.yaml new file: inpaint/model/paint_by_example.py new file: inpaint/model/plms_sampler.py new file: inpaint/model/power_paint/__init__.py new file: inpaint/model/power_paint/pipeline_powerpaint.py new file: inpaint/model/power_paint/power_paint.py new file: inpaint/model/power_paint/power_paint_v2.py new file: inpaint/model/power_paint/powerpaint_tokenizer.py 2024-08-20 21:17:33 +02:00			`# Copyright (c) Meta Platforms, Inc. and affiliates.`
			`# All rights reserved.`

			`# This source code is licensed under the license found in the`
			`# LICENSE file in the root directory of this source tree.`

			`from typing import Optional, Tuple, Type`

			`import torch`
			`from torch import nn`

			`from ..position_encoding import PositionEmbeddingRandom`

			`from ..sam2_utils import LayerNorm2d`


			`class PromptEncoder(nn.Module):`
			`def __init__(`
			`self,`
			`embed_dim: int,`
			`image_embedding_size: Tuple[int, int],`
			`input_image_size: Tuple[int, int],`
			`mask_in_chans: int,`
			`activation: Type[nn.Module] = nn.GELU,`
			`) -> None:`
			`"""`
			`Encodes prompts for input to SAM's mask decoder.`

			`Arguments:`
			`embed_dim (int): The prompts' embedding dimension`
			`image_embedding_size (tuple(int, int)): The spatial size of the`
			`image embedding, as (H, W).`
			`input_image_size (int): The padded size of the image as input`
			`to the image encoder, as (H, W).`
			`mask_in_chans (int): The number of hidden channels used for`
			`encoding input masks.`
			`activation (nn.Module): The activation to use when encoding`
			`input masks.`
			`"""`
			`super().__init__()`
			`self.embed_dim = embed_dim`
			`self.input_image_size = input_image_size`
			`self.image_embedding_size = image_embedding_size`
			`self.pe_layer = PositionEmbeddingRandom(embed_dim // 2)`

			`self.num_point_embeddings: int = 4 # pos/neg point + 2 box corners`
			`point_embeddings = [`
			`nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings)`
			`]`
			`self.point_embeddings = nn.ModuleList(point_embeddings)`
			`self.not_a_point_embed = nn.Embedding(1, embed_dim)`

			`self.mask_input_size = (`
			`4 * image_embedding_size[0],`
			`4 * image_embedding_size[1],`
			`)`
			`self.mask_downscaling = nn.Sequential(`
			`nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2),`
			`LayerNorm2d(mask_in_chans // 4),`
			`activation(),`
			`nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2),`
			`LayerNorm2d(mask_in_chans),`
			`activation(),`
			`nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1),`
			`)`
			`self.no_mask_embed = nn.Embedding(1, embed_dim)`

			`def get_dense_pe(self) -> torch.Tensor:`
			`"""`
			`Returns the positional encoding used to encode point prompts,`
			`applied to a dense set of points the shape of the image encoding.`

			`Returns:`
			`torch.Tensor: Positional encoding with shape`
			`1x(embed_dim)x(embedding_h)x(embedding_w)`
			`"""`
			`return self.pe_layer(self.image_embedding_size).unsqueeze(0)`

			`def _embed_points(`
			`self,`
			`points: torch.Tensor,`
			`labels: torch.Tensor,`
			`pad: bool,`
			`) -> torch.Tensor:`
			`"""Embeds point prompts."""`
			`points = points + 0.5 # Shift to center of pixel`
			`if pad:`
			`padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device)`
			`padding_label = -torch.ones((labels.shape[0], 1), device=labels.device)`
			`points = torch.cat([points, padding_point], dim=1)`
			`labels = torch.cat([labels, padding_label], dim=1)`
			`point_embedding = self.pe_layer.forward_with_coords(`
			`points, self.input_image_size`
			`)`
			`point_embedding[labels == -1] = 0.0`
			`point_embedding[labels == -1] += self.not_a_point_embed.weight`
			`point_embedding[labels == 0] += self.point_embeddings[0].weight`
			`point_embedding[labels == 1] += self.point_embeddings[1].weight`
			`point_embedding[labels == 2] += self.point_embeddings[2].weight`
			`point_embedding[labels == 3] += self.point_embeddings[3].weight`
			`return point_embedding`

			`def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor:`
			`"""Embeds box prompts."""`
			`boxes = boxes + 0.5 # Shift to center of pixel`
			`coords = boxes.reshape(-1, 2, 2)`
			`corner_embedding = self.pe_layer.forward_with_coords(`
			`coords, self.input_image_size`
			`)`
			`corner_embedding[:, 0, :] += self.point_embeddings[2].weight`
			`corner_embedding[:, 1, :] += self.point_embeddings[3].weight`
			`return corner_embedding`

			`def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor:`
			`"""Embeds mask inputs."""`
			`mask_embedding = self.mask_downscaling(masks)`
			`return mask_embedding`

			`def _get_batch_size(`
			`self,`
			`points: Optional[Tuple[torch.Tensor, torch.Tensor]],`
			`boxes: Optional[torch.Tensor],`
			`masks: Optional[torch.Tensor],`
			`) -> int:`
			`"""`
			`Gets the batch size of the output given the batch size of the input prompts.`
			`"""`
			`if points is not None:`
			`return points[0].shape[0]`
			`elif boxes is not None:`
			`return boxes.shape[0]`
			`elif masks is not None:`
			`return masks.shape[0]`
			`else:`
			`return 1`

			`def _get_device(self) -> torch.device:`
			`return self.point_embeddings[0].weight.device`

			`def forward(`
			`self,`
			`points: Optional[Tuple[torch.Tensor, torch.Tensor]],`
			`boxes: Optional[torch.Tensor],`
			`masks: Optional[torch.Tensor],`
			`) -> Tuple[torch.Tensor, torch.Tensor]:`
			`"""`
			`Embeds different types of prompts, returning both sparse and dense`
			`embeddings.`

			`Arguments:`
			`points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates`
			`and labels to embed.`
			`boxes (torch.Tensor or none): boxes to embed`
			`masks (torch.Tensor or none): masks to embed`

			`Returns:`
			`torch.Tensor: sparse embeddings for the points and boxes, with shape`
			`BxNx(embed_dim), where N is determined by the number of input points`
			`and boxes.`
			`torch.Tensor: dense embeddings for the masks, in the shape`
			`Bx(embed_dim)x(embed_H)x(embed_W)`
			`"""`
			`bs = self._get_batch_size(points, boxes, masks)`
			`sparse_embeddings = torch.empty(`
			`(bs, 0, self.embed_dim), device=self._get_device()`
			`)`
			`if points is not None:`
			`coords, labels = points`
			`point_embeddings = self._embed_points(coords, labels, pad=(boxes is None))`
			`sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1)`
			`if boxes is not None:`
			`box_embeddings = self._embed_boxes(boxes)`
			`sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1)`

			`if masks is not None:`
			`dense_embeddings = self._embed_masks(masks)`
			`else:`
			`dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand(`
			`bs, -1, self.image_embedding_size[0], self.image_embedding_size[1]`
			`)`

			`return sparse_embeddings, dense_embeddings`