sd: load single file model using local config

2024-01-30 13:19:13 +08:00 · 2024-01-30 13:19:13 +08:00 · e5f71ae535
commit e5f71ae535
parent 934b0fc455
12 changed files with 352 additions and 2 deletions
--- a/iopaint/download.py
+++ b/iopaint/download.py
@ -14,6 +14,7 @@ from iopaint.const import (
    DIFFUSERS_SDXL_INPAINT_CLASS_NAME,
    ANYTEXT_NAME,
 )
 from iopaint.model.original_sd_configs import get_config_files
 from iopaint.model_info import ModelInfo, ModelType
@ -60,6 +61,7 @@ def get_sd_model_type(model_abs_path: str) -> ModelType:
                load_safety_checker=False,
                local_files_only=True,
                num_in_channels=9,
                config_files=get_config_files()
            )
            model_type = ModelType.DIFFUSERS_SD_INPAINT
        except ValueError as e:
@ -84,6 +86,7 @@ def get_sdxl_model_type(model_abs_path: str) -> ModelType:
                load_safety_checker=False,
                local_files_only=True,
                num_in_channels=9,
                config_files=get_config_files()
            )
            if model.unet.config.in_channels == 9:
                # https://github.com/huggingface/diffusers/issues/6610
--- a/iopaint/model/controlnet.py
+++ b/iopaint/model/controlnet.py
@ -13,6 +13,7 @@ from .helper.controlnet_preprocess import (
    make_inpaint_control_image,
 )
 from .helper.cpu_text_encoder import CPUTextEncoderWrapper
 from .original_sd_configs import get_config_files
 from .utils import (
    get_scheduler,
    handle_from_pretrained_exceptions,
@ -101,6 +102,7 @@ class ControlNet(DiffusionInpaintModel):
                controlnet=controlnet,
                load_safety_checker=not disable_nsfw_checker,
                torch_dtype=torch_dtype,
                config_files=get_config_files(),
                **model_kwargs,
            )
        else:
--- a/iopaint/model/original_sd_configs/__init__.py
+++ b/iopaint/model/original_sd_configs/__init__.py
@ -0,0 +1,19 @@
 from pathlib import Path
 from typing import Dict
 # Absolute directory that holds this module and the bundled YAML configs.
 CURRENT_DIR = Path(__file__).parent.absolute()
 def get_config_files() -> Dict[str, Path]:
    """
    Return the bundled original Stable Diffusion config files, keyed by variant.

    - `v1`: Stable Diffusion v1 (`v1-inference.yaml`)
    - `v2`: Stable Diffusion v2 (`v2-inference-v.yaml`)
    - `xl`: Stable Diffusion XL (`sd_xl_base.yaml`)
    - `xl_refiner`: Stable Diffusion XL Refiner (`sd_xl_refiner.yaml`)

    Each value is `CURRENT_DIR` joined with the file name. A fresh dict is
    built on every call, and the paths are not checked for existence.
    """
    yaml_name_by_variant = (
        ("v1", "v1-inference.yaml"),
        ("v2", "v2-inference-v.yaml"),
        ("xl", "sd_xl_base.yaml"),
        ("xl_refiner", "sd_xl_refiner.yaml"),
    )
    return {
        variant: CURRENT_DIR / yaml_name
        for variant, yaml_name in yaml_name_by_variant
    }
--- a/iopaint/model/original_sd_configs/sd_xl_base.yaml
+++ b/iopaint/model/original_sd_configs/sd_xl_base.yaml
@ -0,0 +1,93 @@
 model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: [1, 2, 10]
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              freeze: True
              layer: penultimate
              always_return_pooled: True
              legacy: False
          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
          - is_trainable: False
            input_key: target_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
--- a/iopaint/model/original_sd_configs/sd_xl_refiner.yaml
+++ b/iopaint/model/original_sd_configs/sd_xl_refiner.yaml
@ -0,0 +1,86 @@
 model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True
    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000
        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2560
        num_classes: sequential
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 384
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 4
        context_dim: [1280, 1280, 1280, 1280]
        spatial_transformer_attn_type: softmax-xformers
    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              legacy: False
              freeze: True
              layer: penultimate
              always_return_pooled: True
          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
          - is_trainable: False
            input_key: aesthetic_score
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256
    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
--- a/iopaint/model/original_sd_configs/v1-inference.yaml
+++ b/iopaint/model/original_sd_configs/v1-inference.yaml
@ -0,0 +1,70 @@
 model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False
    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 10000 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
--- a/iopaint/model/original_sd_configs/v2-inference-v.yaml
+++ b/iopaint/model/original_sd_configs/v2-inference-v.yaml
@ -0,0 +1,68 @@
 model:
  base_learning_rate: 1.0e-4
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    parameterization: "v"
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False # we set this to false because this is an inference only config
    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        use_checkpoint: True
        use_fp16: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False
    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          #attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"
--- a/iopaint/model/sd.py
+++ b/iopaint/model/sd.py
@ -5,6 +5,7 @@ from loguru import logger
 from .base import DiffusionInpaintModel
 from .helper.cpu_text_encoder import CPUTextEncoderWrapper
 from .original_sd_configs import get_config_files
 from .utils import (
    handle_from_pretrained_exceptions,
    get_torch_dtype,
@ -51,6 +52,7 @@ class SD(DiffusionInpaintModel):
                self.model_id_or_path,
                dtype=torch_dtype,
                load_safety_checker=not disable_nsfw_checker,
                config_files=get_config_files(),
                **model_kwargs,
            )
        else:
--- a/iopaint/model/sdxl.py
+++ b/iopaint/model/sdxl.py
@ -10,6 +10,7 @@ from iopaint.schema import InpaintRequest, ModelType
 from .base import DiffusionInpaintModel
 from .helper.cpu_text_encoder import CPUTextEncoderWrapper
 from .original_sd_configs import get_config_files
 from .utils import (
    handle_from_pretrained_exceptions,
    get_torch_dtype,
@ -41,6 +42,7 @@ class SDXL(DiffusionInpaintModel):
                dtype=torch_dtype,
                num_in_channels=num_in_channels,
                load_safety_checker=False,
                config_files=get_config_files()
            )
        else:
            model_kwargs = {
--- a/iopaint/tests/test_sd_model.py
+++ b/iopaint/tests/test_sd_model.py
@ -232,9 +232,10 @@ def test_runway_sd_1_5_cpu_offload(device, strategy, sampler):
@pytest.mark.parametrize(
    "name",
    [
        "sd-v1-5-inpainting.ckpt",
        "sd-v1-5-inpainting.safetensors",
        "v1-5-pruned-emaonly.safetensors",
        "sd_xl_base_1.0.safetensors",
        "sd_xl_base_1.0_inpainting_0.1.safetensors",
    ],
 )
 def test_local_file_path(device, sampler, name):
--- a/setup.py
+++ b/setup.py
@ -5,6 +5,10 @@ package_files = Path("iopaint/web_app").glob("**/*")
 package_files = [str(it).replace("iopaint/", "") for it in package_files]
 package_files += ["model/anytext/ocr_recog/ppocr_keys_v1.txt"]
 package_files += ["model/anytext/anytext_sd15.yaml"]
 package_files += ["model/original_sd_configs/sd_xl_base.yaml"]
 package_files += ["model/original_sd_configs/sd_xl_refiner.yaml"]
 package_files += ["model/original_sd_configs/v1-inference.yaml"]
 package_files += ["model/original_sd_configs/v2-inference-v.yaml"]
 with open("README.md", "r", encoding="utf-8") as fh:
--- a/web_app/src/components/SidePanel/DiffusionOptions.tsx
+++ b/web_app/src/components/SidePanel/DiffusionOptions.tsx
@ -734,7 +734,7 @@ const DiffusionOptions = () => {
      <div className="flex flex-col gap-1">
        <LabelTitle
          text="Mask blur"
-          toolTip="How much to blur the mask before processing, in pixels."
+          toolTip="How much to blur the mask before processing, in pixels. Make the generated inpainting boundaries appear more natural."
        />
        <RowContainer>
          <Slider