From e5f71ae5355246b9560aaa9449a4b06f9fae4887 Mon Sep 17 00:00:00 2001
From: Qing <cwq1913@gmail.com>
Date: Tue, 30 Jan 2024 13:19:13 +0800
Subject: [PATCH] sd: load single file model using local config

---
 iopaint/download.py                           |  3 +
 iopaint/model/controlnet.py                   |  2 +
 iopaint/model/original_sd_configs/__init__.py | 19 ++++
 .../model/original_sd_configs/sd_xl_base.yaml | 93 +++++++++++++++++++
 .../original_sd_configs/sd_xl_refiner.yaml    | 86 +++++++++++++++++
 .../original_sd_configs/v1-inference.yaml     | 70 ++++++++++++++
 .../original_sd_configs/v2-inference-v.yaml   | 68 ++++++++++++++
 iopaint/model/sd.py                           |  2 +
 iopaint/model/sdxl.py                         |  2 +
 iopaint/tests/test_sd_model.py                |  3 +-
 setup.py                                      |  4 +
 .../components/SidePanel/DiffusionOptions.tsx |  2 +-
 12 files changed, 352 insertions(+), 2 deletions(-)
 create mode 100644 iopaint/model/original_sd_configs/__init__.py
 create mode 100644 iopaint/model/original_sd_configs/sd_xl_base.yaml
 create mode 100644 iopaint/model/original_sd_configs/sd_xl_refiner.yaml
 create mode 100644 iopaint/model/original_sd_configs/v1-inference.yaml
 create mode 100644 iopaint/model/original_sd_configs/v2-inference-v.yaml

diff --git a/iopaint/download.py b/iopaint/download.py
index 5253206..ce916aa 100644
--- a/iopaint/download.py
+++ b/iopaint/download.py
@@ -14,6 +14,7 @@ from iopaint.const import (
     DIFFUSERS_SDXL_INPAINT_CLASS_NAME,
     ANYTEXT_NAME,
 )
+from iopaint.model.original_sd_configs import get_config_files
 from iopaint.model_info import ModelInfo, ModelType
 
 
@@ -60,6 +61,7 @@ def get_sd_model_type(model_abs_path: str) -> ModelType:
                 load_safety_checker=False,
                 local_files_only=True,
                 num_in_channels=9,
+                config_files=get_config_files()
             )
             model_type = ModelType.DIFFUSERS_SD_INPAINT
         except ValueError as e:
@@ -84,6 +86,7 @@ def get_sdxl_model_type(model_abs_path: str) -> ModelType:
                 load_safety_checker=False,
                 local_files_only=True,
                 num_in_channels=9,
+                config_files=get_config_files()
             )
             if model.unet.config.in_channels == 9:
                 # https://github.com/huggingface/diffusers/issues/6610
diff --git a/iopaint/model/controlnet.py b/iopaint/model/controlnet.py
index 6ea80fd..c738b13 100644
--- a/iopaint/model/controlnet.py
+++ b/iopaint/model/controlnet.py
@@ -13,6 +13,7 @@ from .helper.controlnet_preprocess import (
     make_inpaint_control_image,
 )
 from .helper.cpu_text_encoder import CPUTextEncoderWrapper
+from .original_sd_configs import get_config_files
 from .utils import (
     get_scheduler,
     handle_from_pretrained_exceptions,
@@ -101,6 +102,7 @@ class ControlNet(DiffusionInpaintModel):
                 controlnet=controlnet,
                 load_safety_checker=not disable_nsfw_checker,
                 torch_dtype=torch_dtype,
+                config_files=get_config_files(),
                 **model_kwargs,
             )
         else:
diff --git a/iopaint/model/original_sd_configs/__init__.py b/iopaint/model/original_sd_configs/__init__.py
new file mode 100644
index 0000000..23896a7
--- /dev/null
+++ b/iopaint/model/original_sd_configs/__init__.py
@@ -0,0 +1,19 @@
+from pathlib import Path
+from typing import Dict
+
+CURRENT_DIR = Path(__file__).parent.absolute()
+
+
+def get_config_files() -> Dict[str, Path]:
+    """Return the bundled original Stable Diffusion config path for each key.
+
+    Keys: `v1` (Stable Diffusion v1), `v2` (Stable Diffusion v2),
+    `xl` (Stable Diffusion XL), `xl_refiner` (Stable Diffusion XL Refiner).
+    """
+    yaml_names = {
+        "v1": "v1-inference.yaml",
+        "v2": "v2-inference-v.yaml",
+        "xl": "sd_xl_base.yaml",
+        "xl_refiner": "sd_xl_refiner.yaml",
+    }
+    return {key: CURRENT_DIR / name for key, name in yaml_names.items()}
diff --git a/iopaint/model/original_sd_configs/sd_xl_base.yaml b/iopaint/model/original_sd_configs/sd_xl_base.yaml
new file mode 100644
index 0000000..6047379
--- /dev/null
+++ b/iopaint/model/original_sd_configs/sd_xl_base.yaml
@@ -0,0 +1,93 @@
+model:
+  target: sgm.models.diffusion.DiffusionEngine
+  params:
+    scale_factor: 0.13025
+    disable_first_stage_autocast: True
+
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
+      params:
+        num_idx: 1000
+
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
+
+    network_config:
+      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        adm_in_channels: 2816
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [4, 2]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: [1, 2, 10]
+        context_dim: 2048
+        spatial_transformer_attn_type: softmax-xformers
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:
+          - is_trainable: False
+            input_key: txt
+            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
+            params:
+              layer: hidden
+              layer_idx: 11
+
+          - is_trainable: False
+            input_key: txt
+            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
+            params:
+              arch: ViT-bigG-14
+              version: laion2b_s39b_b160k
+              freeze: True
+              layer: penultimate
+              always_return_pooled: True
+              legacy: False
+
+          - is_trainable: False
+            input_key: original_size_as_tuple
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - is_trainable: False
+            input_key: crop_coords_top_left
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - is_trainable: False
+            input_key: target_size_as_tuple
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          attn_type: vanilla-xformers
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
diff --git a/iopaint/model/original_sd_configs/sd_xl_refiner.yaml b/iopaint/model/original_sd_configs/sd_xl_refiner.yaml
new file mode 100644
index 0000000..2d5ab44
--- /dev/null
+++ b/iopaint/model/original_sd_configs/sd_xl_refiner.yaml
@@ -0,0 +1,86 @@
+model:
+  target: sgm.models.diffusion.DiffusionEngine
+  params:
+    scale_factor: 0.13025
+    disable_first_stage_autocast: True
+
+    denoiser_config:
+      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
+      params:
+        num_idx: 1000
+
+        scaling_config:
+          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
+        discretization_config:
+          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
+
+    network_config:
+      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        adm_in_channels: 2560
+        num_classes: sequential
+        use_checkpoint: True
+        in_channels: 4
+        out_channels: 4
+        model_channels: 384
+        attention_resolutions: [4, 2]
+        num_res_blocks: 2
+        channel_mult: [1, 2, 4, 4]
+        num_head_channels: 64
+        use_linear_in_transformer: True
+        transformer_depth: 4
+        context_dim: [1280, 1280, 1280, 1280]
+        spatial_transformer_attn_type: softmax-xformers
+
+    conditioner_config:
+      target: sgm.modules.GeneralConditioner
+      params:
+        emb_models:
+          - is_trainable: False
+            input_key: txt
+            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
+            params:
+              arch: ViT-bigG-14
+              version: laion2b_s39b_b160k
+              legacy: False
+              freeze: True
+              layer: penultimate
+              always_return_pooled: True
+
+          - is_trainable: False
+            input_key: original_size_as_tuple
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - is_trainable: False
+            input_key: crop_coords_top_left
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+          - is_trainable: False
+            input_key: aesthetic_score
+            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
+            params:
+              outdim: 256
+
+    first_stage_config:
+      target: sgm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          attn_type: vanilla-xformers
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult: [1, 2, 4, 4]
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
diff --git a/iopaint/model/original_sd_configs/v1-inference.yaml b/iopaint/model/original_sd_configs/v1-inference.yaml
new file mode 100644
index 0000000..d4effe5
--- /dev/null
+++ b/iopaint/model/original_sd_configs/v1-inference.yaml
@@ -0,0 +1,70 @@
+model:
+  base_learning_rate: 1.0e-04
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false   # Note: different from the one we trained before
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False
+
+    scheduler_config: # 10000 warmup steps
+      target: ldm.lr_scheduler.LambdaLinearScheduler
+      params:
+        warm_up_steps: [ 10000 ]
+        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
+        f_start: [ 1.e-6 ]
+        f_max: [ 1. ]
+        f_min: [ 1. ]
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_heads: 8
+        use_spatial_transformer: True
+        transformer_depth: 1
+        context_dim: 768
+        use_checkpoint: True
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
diff --git a/iopaint/model/original_sd_configs/v2-inference-v.yaml b/iopaint/model/original_sd_configs/v2-inference-v.yaml
new file mode 100644
index 0000000..8ec8dfb
--- /dev/null
+++ b/iopaint/model/original_sd_configs/v2-inference-v.yaml
@@ -0,0 +1,68 @@
+model:
+  base_learning_rate: 1.0e-4
+  target: ldm.models.diffusion.ddpm.LatentDiffusion
+  params:
+    parameterization: "v"
+    linear_start: 0.00085
+    linear_end: 0.0120
+    num_timesteps_cond: 1
+    log_every_t: 200
+    timesteps: 1000
+    first_stage_key: "jpg"
+    cond_stage_key: "txt"
+    image_size: 64
+    channels: 4
+    cond_stage_trainable: false
+    conditioning_key: crossattn
+    monitor: val/loss_simple_ema
+    scale_factor: 0.18215
+    use_ema: False # we set this to false because this is an inference only config
+
+    unet_config:
+      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
+      params:
+        use_checkpoint: True
+        use_fp16: True
+        image_size: 32 # unused
+        in_channels: 4
+        out_channels: 4
+        model_channels: 320
+        attention_resolutions: [ 4, 2, 1 ]
+        num_res_blocks: 2
+        channel_mult: [ 1, 2, 4, 4 ]
+        num_head_channels: 64 # need to fix for flash-attn
+        use_spatial_transformer: True
+        use_linear_in_transformer: True
+        transformer_depth: 1
+        context_dim: 1024
+        legacy: False
+
+    first_stage_config:
+      target: ldm.models.autoencoder.AutoencoderKL
+      params:
+        embed_dim: 4
+        monitor: val/rec_loss
+        ddconfig:
+          #attn_type: "vanilla-xformers"
+          double_z: true
+          z_channels: 4
+          resolution: 256
+          in_channels: 3
+          out_ch: 3
+          ch: 128
+          ch_mult:
+          - 1
+          - 2
+          - 4
+          - 4
+          num_res_blocks: 2
+          attn_resolutions: []
+          dropout: 0.0
+        lossconfig:
+          target: torch.nn.Identity
+
+    cond_stage_config:
+      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
+      params:
+        freeze: True
+        layer: "penultimate"
diff --git a/iopaint/model/sd.py b/iopaint/model/sd.py
index 3e78cad..4f20a41 100644
--- a/iopaint/model/sd.py
+++ b/iopaint/model/sd.py
@@ -5,6 +5,7 @@ from loguru import logger
 
 from .base import DiffusionInpaintModel
 from .helper.cpu_text_encoder import CPUTextEncoderWrapper
+from .original_sd_configs import get_config_files
 from .utils import (
     handle_from_pretrained_exceptions,
     get_torch_dtype,
@@ -51,6 +52,7 @@ class SD(DiffusionInpaintModel):
                 self.model_id_or_path,
                 dtype=torch_dtype,
                 load_safety_checker=not disable_nsfw_checker,
+                config_files=get_config_files(),
                 **model_kwargs,
             )
         else:
diff --git a/iopaint/model/sdxl.py b/iopaint/model/sdxl.py
index 716d4ec..2557e71 100644
--- a/iopaint/model/sdxl.py
+++ b/iopaint/model/sdxl.py
@@ -10,6 +10,7 @@ from iopaint.schema import InpaintRequest, ModelType
 
 from .base import DiffusionInpaintModel
 from .helper.cpu_text_encoder import CPUTextEncoderWrapper
+from .original_sd_configs import get_config_files
 from .utils import (
     handle_from_pretrained_exceptions,
     get_torch_dtype,
@@ -41,6 +42,7 @@ class SDXL(DiffusionInpaintModel):
                 dtype=torch_dtype,
                 num_in_channels=num_in_channels,
                 load_safety_checker=False,
+                config_files=get_config_files()
             )
         else:
             model_kwargs = {
diff --git a/iopaint/tests/test_sd_model.py b/iopaint/tests/test_sd_model.py
index c6e4d11..6865e5a 100644
--- a/iopaint/tests/test_sd_model.py
+++ b/iopaint/tests/test_sd_model.py
@@ -232,9 +232,10 @@ def test_runway_sd_1_5_cpu_offload(device, strategy, sampler):
 @pytest.mark.parametrize(
     "name",
     [
-        "sd-v1-5-inpainting.ckpt",
         "sd-v1-5-inpainting.safetensors",
         "v1-5-pruned-emaonly.safetensors",
+        "sd_xl_base_1.0.safetensors",
+        "sd_xl_base_1.0_inpainting_0.1.safetensors",
     ],
 )
 def test_local_file_path(device, sampler, name):
diff --git a/setup.py b/setup.py
index 02dc088..a358072 100644
--- a/setup.py
+++ b/setup.py
@@ -5,6 +5,10 @@ package_files = Path("iopaint/web_app").glob("**/*")
 package_files = [str(it).replace("iopaint/", "") for it in package_files]
 package_files += ["model/anytext/ocr_recog/ppocr_keys_v1.txt"]
 package_files += ["model/anytext/anytext_sd15.yaml"]
+package_files += ["model/original_sd_configs/sd_xl_base.yaml"]
+package_files += ["model/original_sd_configs/sd_xl_refiner.yaml"]
+package_files += ["model/original_sd_configs/v1-inference.yaml"]
+package_files += ["model/original_sd_configs/v2-inference-v.yaml"]
 
 
 with open("README.md", "r", encoding="utf-8") as fh:
diff --git a/web_app/src/components/SidePanel/DiffusionOptions.tsx b/web_app/src/components/SidePanel/DiffusionOptions.tsx
index 265e1c7..19d6820 100644
--- a/web_app/src/components/SidePanel/DiffusionOptions.tsx
+++ b/web_app/src/components/SidePanel/DiffusionOptions.tsx
@@ -734,7 +734,7 @@ const DiffusionOptions = () => {
       <div className="flex flex-col gap-1">
         <LabelTitle
           text="Mask blur"
-          toolTip="How much to blur the mask before processing, in pixels."
+          toolTip="How much to blur the mask before processing, in pixels. Make the generated inpainting boundaries appear more natural."
         />
         <RowContainer>
           <Slider