sd: load single file model using local config

commit e5f71ae535 (parent 934b0fc455)
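
In short: the commit vendors the original Stable Diffusion YAML configs (v1, v2, SDXL base, SDXL refiner) under iopaint/model/original_sd_configs, ships them as package data, and threads them into the single-file checkpoint loaders via a new get_config_files() helper, together with local_files_only=True where checkpoints are probed. The point is that a local .ckpt/.safetensors model can now be loaded without fetching the original config files from the network.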
@@ -14,6 +14,7 @@ from iopaint.const import (
    DIFFUSERS_SDXL_INPAINT_CLASS_NAME,
    ANYTEXT_NAME,
)
from iopaint.model.original_sd_configs import get_config_files
from iopaint.model_info import ModelInfo, ModelType


@@ -60,6 +61,7 @@ def get_sd_model_type(model_abs_path: str) -> ModelType:
            load_safety_checker=False,
            local_files_only=True,
            num_in_channels=9,
            config_files=get_config_files()
        )
        model_type = ModelType.DIFFUSERS_SD_INPAINT
    except ValueError as e:

@@ -84,6 +86,7 @@ def get_sdxl_model_type(model_abs_path: str) -> ModelType:
            load_safety_checker=False,
            local_files_only=True,
            num_in_channels=9,
            config_files=get_config_files()
        )
        if model.unet.config.in_channels == 9:
            # https://github.com/huggingface/diffusers/issues/6610
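
The two hunks above share one probing pattern: force a 9-channel UNet while loading the single-file checkpoint, and treat a ValueError as "this is not an inpainting model". A minimal sketch of that flow, assuming StableDiffusionInpaintPipeline.from_single_file is the loader behind the truncated call (the call target is not visible in this diff) and that the fallback model type is DIFFUSERS_SD:

    from diffusers import StableDiffusionInpaintPipeline

    from iopaint.model.original_sd_configs import get_config_files
    from iopaint.model_info import ModelType

    def probe_sd_model_type(model_abs_path: str) -> ModelType:
        # Hypothetical reconstruction; only the keyword arguments are taken from the diff.
        try:
            StableDiffusionInpaintPipeline.from_single_file(
                model_abs_path,
                load_safety_checker=False,
                local_files_only=True,  # resolve configs from the bundled YAMLs, never the network
                num_in_channels=9,      # inpainting UNets take latent + mask + masked image
                config_files=get_config_files(),
            )
            return ModelType.DIFFUSERS_SD_INPAINT
        except ValueError:
            return ModelType.DIFFUSERS_SD  # assumed fallback; the except branch is truncated above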
@@ -13,6 +13,7 @@ from .helper.controlnet_preprocess import (
    make_inpaint_control_image,
)
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
from .original_sd_configs import get_config_files
from .utils import (
    get_scheduler,
    handle_from_pretrained_exceptions,

@@ -101,6 +102,7 @@ class ControlNet(DiffusionInpaintModel):
                controlnet=controlnet,
                load_safety_checker=not disable_nsfw_checker,
                torch_dtype=torch_dtype,
                config_files=get_config_files(),
                **model_kwargs,
            )
        else:
iopaint/model/original_sd_configs/__init__.py (new file, 19 lines)
@@ -0,0 +1,19 @@
from pathlib import Path
from typing import Dict

CURRENT_DIR = Path(__file__).parent.absolute()


def get_config_files() -> Dict[str, Path]:
    """
    - `v1`: Config file for Stable Diffusion v1
    - `v2`: Config file for Stable Diffusion v2
    - `xl`: Config file for Stable Diffusion XL
    - `xl_refiner`: Config file for Stable Diffusion XL Refiner
    """
    return {
        "v1": CURRENT_DIR / "v1-inference.yaml",
        "v2": CURRENT_DIR / "v2-inference-v.yaml",
        "xl": CURRENT_DIR / "sd_xl_base.yaml",
        "xl_refiner": CURRENT_DIR / "sd_xl_refiner.yaml",
    }
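
The helper just maps short version keys to absolute paths next to its own __init__.py, so a quick interactive check looks like this (illustrative, not part of the commit):

    from iopaint.model.original_sd_configs import get_config_files

    for name, path in get_config_files().items():
        print(name, path, path.exists())
    # v1 .../original_sd_configs/v1-inference.yaml True
    # ...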
iopaint/model/original_sd_configs/sd_xl_base.yaml (new file, 93 lines)
@@ -0,0 +1,93 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: [1, 2, 10]
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11

          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              freeze: True
              layer: penultimate
              always_return_pooled: True
              legacy: False

          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: target_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
iopaint/model/original_sd_configs/sd_xl_refiner.yaml (new file, 86 lines)
@@ -0,0 +1,86 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2560
        num_classes: sequential
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 384
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 4
        context_dim: [1280, 1280, 1280, 1280]
        spatial_transformer_attn_type: softmax-xformers

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              legacy: False
              freeze: True
              layer: penultimate
              always_return_pooled: True

          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: aesthetic_score
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
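
Read together, the base and refiner configs differ mainly in the UNet: adm_in_channels 2816 vs 2560, model_channels 320 vs 384, channel_mult [1, 2, 4] vs [1, 2, 4, 4], transformer_depth [1, 2, 10] vs 4, and context_dim 2048 vs [1280, 1280, 1280, 1280]. The refiner also conditions on aesthetic_score instead of target_size_as_tuple and drops the CLIP ViT-L text encoder. A throwaway script to print those deltas, assuming both YAMLs are in the working directory:

    import yaml  # PyYAML

    def unet_params(path):
        with open(path) as f:
            return yaml.safe_load(f)["model"]["params"]["network_config"]["params"]

    base = unet_params("sd_xl_base.yaml")
    refiner = unet_params("sd_xl_refiner.yaml")
    for key in ("adm_in_channels", "model_channels", "channel_mult",
                "transformer_depth", "context_dim"):
        print(f"{key}: base={base[key]!r} refiner={refiner[key]!r}")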
iopaint/model/original_sd_configs/v1-inference.yaml (new file, 70 lines)
@@ -0,0 +1,70 @@
model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 10000 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
iopaint/model/original_sd_configs/v2-inference-v.yaml (new file, 68 lines)
@@ -0,0 +1,68 @@
model:
  base_learning_rate: 1.0e-4
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    parameterization: "v"
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False # we set this to false because this is an inference only config

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        use_checkpoint: True
        use_fp16: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          #attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"
@@ -5,6 +5,7 @@ from loguru import logger

from .base import DiffusionInpaintModel
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
from .original_sd_configs import get_config_files
from .utils import (
    handle_from_pretrained_exceptions,
    get_torch_dtype,

@@ -51,6 +52,7 @@ class SD(DiffusionInpaintModel):
                self.model_id_or_path,
                dtype=torch_dtype,
                load_safety_checker=not disable_nsfw_checker,
                config_files=get_config_files(),
                **model_kwargs,
            )
        else:
@@ -10,6 +10,7 @@ from iopaint.schema import InpaintRequest, ModelType

from .base import DiffusionInpaintModel
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
from .original_sd_configs import get_config_files
from .utils import (
    handle_from_pretrained_exceptions,
    get_torch_dtype,

@@ -41,6 +42,7 @@ class SDXL(DiffusionInpaintModel):
                dtype=torch_dtype,
                num_in_channels=num_in_channels,
                load_safety_checker=False,
                config_files=get_config_files()
            )
        else:
            model_kwargs = {
@@ -232,9 +232,10 @@ def test_runway_sd_1_5_cpu_offload(device, strategy, sampler):
@pytest.mark.parametrize(
    "name",
    [
        "sd-v1-5-inpainting.ckpt",
        "sd-v1-5-inpainting.safetensors",
        "v1-5-pruned-emaonly.safetensors",
        "sd_xl_base_1.0.safetensors",
        "sd_xl_base_1.0_inpainting_0.1.safetensors",
    ],
)
def test_local_file_path(device, sampler, name):
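
The parametrized test now covers one checkpoint per config family shipped above, except xl_refiner: v1 inpainting in both .ckpt and .safetensors form, a plain v1.5 checkpoint, and SDXL base plus its inpainting variant. Assuming the suite lives under iopaint/tests (the path is not shown in this diff), these cases run with:

    pytest iopaint/tests -k test_local_file_path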
setup.py
@@ -5,6 +5,10 @@ package_files = Path("iopaint/web_app").glob("**/*")
package_files = [str(it).replace("iopaint/", "") for it in package_files]
package_files += ["model/anytext/ocr_recog/ppocr_keys_v1.txt"]
package_files += ["model/anytext/anytext_sd15.yaml"]
package_files += ["model/original_sd_configs/sd_xl_base.yaml"]
package_files += ["model/original_sd_configs/sd_xl_refiner.yaml"]
package_files += ["model/original_sd_configs/v1-inference.yaml"]
package_files += ["model/original_sd_configs/v2-inference-v.yaml"]


with open("README.md", "r", encoding="utf-8") as fh:
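
Packaging the four YAMLs matters because CURRENT_DIR in get_config_files() resolves inside the installed package (site-packages), not the source tree; without these entries an installed wheel would return paths to files that do not exist. A post-install sanity check, as a sketch:

    from iopaint.model.original_sd_configs import get_config_files

    missing = [p for p in get_config_files().values() if not p.exists()]
    assert not missing, f"config files missing from the wheel: {missing}"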
@@ -734,7 +734,7 @@ const DiffusionOptions = () => {
        <div className="flex flex-col gap-1">
          <LabelTitle
            text="Mask blur"
            toolTip="How much to blur the mask before processing, in pixels."
            toolTip="How much to blur the mask before processing, in pixels. Make the generated inpainting boundaries appear more natural."
          />
          <RowContainer>
            <Slider