sd: load single file model using local config
This commit is contained in:
parent
934b0fc455
commit
e5f71ae535
@ -14,6 +14,7 @@ from iopaint.const import (
|
|||||||
DIFFUSERS_SDXL_INPAINT_CLASS_NAME,
|
DIFFUSERS_SDXL_INPAINT_CLASS_NAME,
|
||||||
ANYTEXT_NAME,
|
ANYTEXT_NAME,
|
||||||
)
|
)
|
||||||
|
from iopaint.model.original_sd_configs import get_config_files
|
||||||
from iopaint.model_info import ModelInfo, ModelType
|
from iopaint.model_info import ModelInfo, ModelType
|
||||||
|
|
||||||
|
|
||||||
@ -60,6 +61,7 @@ def get_sd_model_type(model_abs_path: str) -> ModelType:
|
|||||||
load_safety_checker=False,
|
load_safety_checker=False,
|
||||||
local_files_only=True,
|
local_files_only=True,
|
||||||
num_in_channels=9,
|
num_in_channels=9,
|
||||||
|
config_files=get_config_files()
|
||||||
)
|
)
|
||||||
model_type = ModelType.DIFFUSERS_SD_INPAINT
|
model_type = ModelType.DIFFUSERS_SD_INPAINT
|
||||||
except ValueError as e:
|
except ValueError as e:
|
||||||
@ -84,6 +86,7 @@ def get_sdxl_model_type(model_abs_path: str) -> ModelType:
|
|||||||
load_safety_checker=False,
|
load_safety_checker=False,
|
||||||
local_files_only=True,
|
local_files_only=True,
|
||||||
num_in_channels=9,
|
num_in_channels=9,
|
||||||
|
config_files=get_config_files()
|
||||||
)
|
)
|
||||||
if model.unet.config.in_channels == 9:
|
if model.unet.config.in_channels == 9:
|
||||||
# https://github.com/huggingface/diffusers/issues/6610
|
# https://github.com/huggingface/diffusers/issues/6610
|
||||||
|
@ -13,6 +13,7 @@ from .helper.controlnet_preprocess import (
|
|||||||
make_inpaint_control_image,
|
make_inpaint_control_image,
|
||||||
)
|
)
|
||||||
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
|
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
|
||||||
|
from .original_sd_configs import get_config_files
|
||||||
from .utils import (
|
from .utils import (
|
||||||
get_scheduler,
|
get_scheduler,
|
||||||
handle_from_pretrained_exceptions,
|
handle_from_pretrained_exceptions,
|
||||||
@ -101,6 +102,7 @@ class ControlNet(DiffusionInpaintModel):
|
|||||||
controlnet=controlnet,
|
controlnet=controlnet,
|
||||||
load_safety_checker=not disable_nsfw_checker,
|
load_safety_checker=not disable_nsfw_checker,
|
||||||
torch_dtype=torch_dtype,
|
torch_dtype=torch_dtype,
|
||||||
|
config_files=get_config_files(),
|
||||||
**model_kwargs,
|
**model_kwargs,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
19
iopaint/model/original_sd_configs/__init__.py
Normal file
19
iopaint/model/original_sd_configs/__init__.py
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict
|
||||||
|
|
||||||
|
CURRENT_DIR = Path(__file__).parent.absolute()
|
||||||
|
|
||||||
|
|
||||||
|
def get_config_files() -> Dict[str, Path]:
    """Return the bundled original Stable Diffusion config files by key.

    Every path points at a YAML file that sits next to this module
    (``CURRENT_DIR``). The keys are:

    - `v1`: Config file for Stable Diffusion v1
    - `v2`: Config file for Stable Diffusion v2
    - `xl`: Config file for Stable Diffusion XL
    - `xl_refiner`: Config file for Stable Diffusion XL Refiner
    """
    # (key, file name) pairs; insertion order matches the documented key order.
    bundled_configs = (
        ("v1", "v1-inference.yaml"),
        ("v2", "v2-inference-v.yaml"),
        ("xl", "sd_xl_base.yaml"),
        ("xl_refiner", "sd_xl_refiner.yaml"),
    )
    return {key: CURRENT_DIR / file_name for key, file_name in bundled_configs}
|
93
iopaint/model/original_sd_configs/sd_xl_base.yaml
Normal file
93
iopaint/model/original_sd_configs/sd_xl_base.yaml
Normal file
@ -0,0 +1,93 @@
|
|||||||
|
model:
|
||||||
|
target: sgm.models.diffusion.DiffusionEngine
|
||||||
|
params:
|
||||||
|
scale_factor: 0.13025
|
||||||
|
disable_first_stage_autocast: True
|
||||||
|
|
||||||
|
denoiser_config:
|
||||||
|
target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
|
||||||
|
params:
|
||||||
|
num_idx: 1000
|
||||||
|
|
||||||
|
scaling_config:
|
||||||
|
target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
|
||||||
|
discretization_config:
|
||||||
|
target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
|
||||||
|
|
||||||
|
network_config:
|
||||||
|
target: sgm.modules.diffusionmodules.openaimodel.UNetModel
|
||||||
|
params:
|
||||||
|
adm_in_channels: 2816
|
||||||
|
num_classes: sequential
|
||||||
|
use_checkpoint: True
|
||||||
|
in_channels: 4
|
||||||
|
out_channels: 4
|
||||||
|
model_channels: 320
|
||||||
|
attention_resolutions: [4, 2]
|
||||||
|
num_res_blocks: 2
|
||||||
|
channel_mult: [1, 2, 4]
|
||||||
|
num_head_channels: 64
|
||||||
|
use_linear_in_transformer: True
|
||||||
|
transformer_depth: [1, 2, 10]
|
||||||
|
context_dim: 2048
|
||||||
|
spatial_transformer_attn_type: softmax-xformers
|
||||||
|
|
||||||
|
conditioner_config:
|
||||||
|
target: sgm.modules.GeneralConditioner
|
||||||
|
params:
|
||||||
|
emb_models:
|
||||||
|
- is_trainable: False
|
||||||
|
input_key: txt
|
||||||
|
target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
|
||||||
|
params:
|
||||||
|
layer: hidden
|
||||||
|
layer_idx: 11
|
||||||
|
|
||||||
|
- is_trainable: False
|
||||||
|
input_key: txt
|
||||||
|
target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
|
||||||
|
params:
|
||||||
|
arch: ViT-bigG-14
|
||||||
|
version: laion2b_s39b_b160k
|
||||||
|
freeze: True
|
||||||
|
layer: penultimate
|
||||||
|
always_return_pooled: True
|
||||||
|
legacy: False
|
||||||
|
|
||||||
|
- is_trainable: False
|
||||||
|
input_key: original_size_as_tuple
|
||||||
|
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
|
||||||
|
params:
|
||||||
|
outdim: 256
|
||||||
|
|
||||||
|
- is_trainable: False
|
||||||
|
input_key: crop_coords_top_left
|
||||||
|
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
|
||||||
|
params:
|
||||||
|
outdim: 256
|
||||||
|
|
||||||
|
- is_trainable: False
|
||||||
|
input_key: target_size_as_tuple
|
||||||
|
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
|
||||||
|
params:
|
||||||
|
outdim: 256
|
||||||
|
|
||||||
|
first_stage_config:
|
||||||
|
target: sgm.models.autoencoder.AutoencoderKL
|
||||||
|
params:
|
||||||
|
embed_dim: 4
|
||||||
|
monitor: val/rec_loss
|
||||||
|
ddconfig:
|
||||||
|
attn_type: vanilla-xformers
|
||||||
|
double_z: true
|
||||||
|
z_channels: 4
|
||||||
|
resolution: 256
|
||||||
|
in_channels: 3
|
||||||
|
out_ch: 3
|
||||||
|
ch: 128
|
||||||
|
ch_mult: [1, 2, 4, 4]
|
||||||
|
num_res_blocks: 2
|
||||||
|
attn_resolutions: []
|
||||||
|
dropout: 0.0
|
||||||
|
lossconfig:
|
||||||
|
target: torch.nn.Identity
|
86
iopaint/model/original_sd_configs/sd_xl_refiner.yaml
Normal file
86
iopaint/model/original_sd_configs/sd_xl_refiner.yaml
Normal file
@ -0,0 +1,86 @@
|
|||||||
|
model:
|
||||||
|
target: sgm.models.diffusion.DiffusionEngine
|
||||||
|
params:
|
||||||
|
scale_factor: 0.13025
|
||||||
|
disable_first_stage_autocast: True
|
||||||
|
|
||||||
|
denoiser_config:
|
||||||
|
target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
|
||||||
|
params:
|
||||||
|
num_idx: 1000
|
||||||
|
|
||||||
|
scaling_config:
|
||||||
|
target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
|
||||||
|
discretization_config:
|
||||||
|
target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization
|
||||||
|
|
||||||
|
network_config:
|
||||||
|
target: sgm.modules.diffusionmodules.openaimodel.UNetModel
|
||||||
|
params:
|
||||||
|
adm_in_channels: 2560
|
||||||
|
num_classes: sequential
|
||||||
|
use_checkpoint: True
|
||||||
|
in_channels: 4
|
||||||
|
out_channels: 4
|
||||||
|
model_channels: 384
|
||||||
|
attention_resolutions: [4, 2]
|
||||||
|
num_res_blocks: 2
|
||||||
|
channel_mult: [1, 2, 4, 4]
|
||||||
|
num_head_channels: 64
|
||||||
|
use_linear_in_transformer: True
|
||||||
|
transformer_depth: 4
|
||||||
|
context_dim: [1280, 1280, 1280, 1280]
|
||||||
|
spatial_transformer_attn_type: softmax-xformers
|
||||||
|
|
||||||
|
conditioner_config:
|
||||||
|
target: sgm.modules.GeneralConditioner
|
||||||
|
params:
|
||||||
|
emb_models:
|
||||||
|
- is_trainable: False
|
||||||
|
input_key: txt
|
||||||
|
target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
|
||||||
|
params:
|
||||||
|
arch: ViT-bigG-14
|
||||||
|
version: laion2b_s39b_b160k
|
||||||
|
legacy: False
|
||||||
|
freeze: True
|
||||||
|
layer: penultimate
|
||||||
|
always_return_pooled: True
|
||||||
|
|
||||||
|
- is_trainable: False
|
||||||
|
input_key: original_size_as_tuple
|
||||||
|
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
|
||||||
|
params:
|
||||||
|
outdim: 256
|
||||||
|
|
||||||
|
- is_trainable: False
|
||||||
|
input_key: crop_coords_top_left
|
||||||
|
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
|
||||||
|
params:
|
||||||
|
outdim: 256
|
||||||
|
|
||||||
|
- is_trainable: False
|
||||||
|
input_key: aesthetic_score
|
||||||
|
target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
|
||||||
|
params:
|
||||||
|
outdim: 256
|
||||||
|
|
||||||
|
first_stage_config:
|
||||||
|
target: sgm.models.autoencoder.AutoencoderKL
|
||||||
|
params:
|
||||||
|
embed_dim: 4
|
||||||
|
monitor: val/rec_loss
|
||||||
|
ddconfig:
|
||||||
|
attn_type: vanilla-xformers
|
||||||
|
double_z: true
|
||||||
|
z_channels: 4
|
||||||
|
resolution: 256
|
||||||
|
in_channels: 3
|
||||||
|
out_ch: 3
|
||||||
|
ch: 128
|
||||||
|
ch_mult: [1, 2, 4, 4]
|
||||||
|
num_res_blocks: 2
|
||||||
|
attn_resolutions: []
|
||||||
|
dropout: 0.0
|
||||||
|
lossconfig:
|
||||||
|
target: torch.nn.Identity
|
70
iopaint/model/original_sd_configs/v1-inference.yaml
Normal file
70
iopaint/model/original_sd_configs/v1-inference.yaml
Normal file
@ -0,0 +1,70 @@
|
|||||||
|
model:
|
||||||
|
base_learning_rate: 1.0e-04
|
||||||
|
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||||
|
params:
|
||||||
|
linear_start: 0.00085
|
||||||
|
linear_end: 0.0120
|
||||||
|
num_timesteps_cond: 1
|
||||||
|
log_every_t: 200
|
||||||
|
timesteps: 1000
|
||||||
|
first_stage_key: "jpg"
|
||||||
|
cond_stage_key: "txt"
|
||||||
|
image_size: 64
|
||||||
|
channels: 4
|
||||||
|
cond_stage_trainable: false # Note: different from the one we trained before
|
||||||
|
conditioning_key: crossattn
|
||||||
|
monitor: val/loss_simple_ema
|
||||||
|
scale_factor: 0.18215
|
||||||
|
use_ema: False
|
||||||
|
|
||||||
|
scheduler_config: # 10000 warmup steps
|
||||||
|
target: ldm.lr_scheduler.LambdaLinearScheduler
|
||||||
|
params:
|
||||||
|
warm_up_steps: [ 10000 ]
|
||||||
|
cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
|
||||||
|
f_start: [ 1.e-6 ]
|
||||||
|
f_max: [ 1. ]
|
||||||
|
f_min: [ 1. ]
|
||||||
|
|
||||||
|
unet_config:
|
||||||
|
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||||
|
params:
|
||||||
|
image_size: 32 # unused
|
||||||
|
in_channels: 4
|
||||||
|
out_channels: 4
|
||||||
|
model_channels: 320
|
||||||
|
attention_resolutions: [ 4, 2, 1 ]
|
||||||
|
num_res_blocks: 2
|
||||||
|
channel_mult: [ 1, 2, 4, 4 ]
|
||||||
|
num_heads: 8
|
||||||
|
use_spatial_transformer: True
|
||||||
|
transformer_depth: 1
|
||||||
|
context_dim: 768
|
||||||
|
use_checkpoint: True
|
||||||
|
legacy: False
|
||||||
|
|
||||||
|
first_stage_config:
|
||||||
|
target: ldm.models.autoencoder.AutoencoderKL
|
||||||
|
params:
|
||||||
|
embed_dim: 4
|
||||||
|
monitor: val/rec_loss
|
||||||
|
ddconfig:
|
||||||
|
double_z: true
|
||||||
|
z_channels: 4
|
||||||
|
resolution: 256
|
||||||
|
in_channels: 3
|
||||||
|
out_ch: 3
|
||||||
|
ch: 128
|
||||||
|
ch_mult:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 4
|
||||||
|
num_res_blocks: 2
|
||||||
|
attn_resolutions: []
|
||||||
|
dropout: 0.0
|
||||||
|
lossconfig:
|
||||||
|
target: torch.nn.Identity
|
||||||
|
|
||||||
|
cond_stage_config:
|
||||||
|
target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
|
68
iopaint/model/original_sd_configs/v2-inference-v.yaml
Normal file
68
iopaint/model/original_sd_configs/v2-inference-v.yaml
Normal file
@ -0,0 +1,68 @@
|
|||||||
|
model:
|
||||||
|
base_learning_rate: 1.0e-4
|
||||||
|
target: ldm.models.diffusion.ddpm.LatentDiffusion
|
||||||
|
params:
|
||||||
|
parameterization: "v"
|
||||||
|
linear_start: 0.00085
|
||||||
|
linear_end: 0.0120
|
||||||
|
num_timesteps_cond: 1
|
||||||
|
log_every_t: 200
|
||||||
|
timesteps: 1000
|
||||||
|
first_stage_key: "jpg"
|
||||||
|
cond_stage_key: "txt"
|
||||||
|
image_size: 64
|
||||||
|
channels: 4
|
||||||
|
cond_stage_trainable: false
|
||||||
|
conditioning_key: crossattn
|
||||||
|
monitor: val/loss_simple_ema
|
||||||
|
scale_factor: 0.18215
|
||||||
|
use_ema: False # we set this to false because this is an inference only config
|
||||||
|
|
||||||
|
unet_config:
|
||||||
|
target: ldm.modules.diffusionmodules.openaimodel.UNetModel
|
||||||
|
params:
|
||||||
|
use_checkpoint: True
|
||||||
|
use_fp16: True
|
||||||
|
image_size: 32 # unused
|
||||||
|
in_channels: 4
|
||||||
|
out_channels: 4
|
||||||
|
model_channels: 320
|
||||||
|
attention_resolutions: [ 4, 2, 1 ]
|
||||||
|
num_res_blocks: 2
|
||||||
|
channel_mult: [ 1, 2, 4, 4 ]
|
||||||
|
num_head_channels: 64 # need to fix for flash-attn
|
||||||
|
use_spatial_transformer: True
|
||||||
|
use_linear_in_transformer: True
|
||||||
|
transformer_depth: 1
|
||||||
|
context_dim: 1024
|
||||||
|
legacy: False
|
||||||
|
|
||||||
|
first_stage_config:
|
||||||
|
target: ldm.models.autoencoder.AutoencoderKL
|
||||||
|
params:
|
||||||
|
embed_dim: 4
|
||||||
|
monitor: val/rec_loss
|
||||||
|
ddconfig:
|
||||||
|
#attn_type: "vanilla-xformers"
|
||||||
|
double_z: true
|
||||||
|
z_channels: 4
|
||||||
|
resolution: 256
|
||||||
|
in_channels: 3
|
||||||
|
out_ch: 3
|
||||||
|
ch: 128
|
||||||
|
ch_mult:
|
||||||
|
- 1
|
||||||
|
- 2
|
||||||
|
- 4
|
||||||
|
- 4
|
||||||
|
num_res_blocks: 2
|
||||||
|
attn_resolutions: []
|
||||||
|
dropout: 0.0
|
||||||
|
lossconfig:
|
||||||
|
target: torch.nn.Identity
|
||||||
|
|
||||||
|
cond_stage_config:
|
||||||
|
target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
|
||||||
|
params:
|
||||||
|
freeze: True
|
||||||
|
layer: "penultimate"
|
@ -5,6 +5,7 @@ from loguru import logger
|
|||||||
|
|
||||||
from .base import DiffusionInpaintModel
|
from .base import DiffusionInpaintModel
|
||||||
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
|
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
|
||||||
|
from .original_sd_configs import get_config_files
|
||||||
from .utils import (
|
from .utils import (
|
||||||
handle_from_pretrained_exceptions,
|
handle_from_pretrained_exceptions,
|
||||||
get_torch_dtype,
|
get_torch_dtype,
|
||||||
@ -51,6 +52,7 @@ class SD(DiffusionInpaintModel):
|
|||||||
self.model_id_or_path,
|
self.model_id_or_path,
|
||||||
dtype=torch_dtype,
|
dtype=torch_dtype,
|
||||||
load_safety_checker=not disable_nsfw_checker,
|
load_safety_checker=not disable_nsfw_checker,
|
||||||
|
config_files=get_config_files(),
|
||||||
**model_kwargs,
|
**model_kwargs,
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
|
@ -10,6 +10,7 @@ from iopaint.schema import InpaintRequest, ModelType
|
|||||||
|
|
||||||
from .base import DiffusionInpaintModel
|
from .base import DiffusionInpaintModel
|
||||||
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
|
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
|
||||||
|
from .original_sd_configs import get_config_files
|
||||||
from .utils import (
|
from .utils import (
|
||||||
handle_from_pretrained_exceptions,
|
handle_from_pretrained_exceptions,
|
||||||
get_torch_dtype,
|
get_torch_dtype,
|
||||||
@ -41,6 +42,7 @@ class SDXL(DiffusionInpaintModel):
|
|||||||
dtype=torch_dtype,
|
dtype=torch_dtype,
|
||||||
num_in_channels=num_in_channels,
|
num_in_channels=num_in_channels,
|
||||||
load_safety_checker=False,
|
load_safety_checker=False,
|
||||||
|
config_files=get_config_files()
|
||||||
)
|
)
|
||||||
else:
|
else:
|
||||||
model_kwargs = {
|
model_kwargs = {
|
||||||
|
@ -232,9 +232,10 @@ def test_runway_sd_1_5_cpu_offload(device, strategy, sampler):
|
|||||||
@pytest.mark.parametrize(
|
@pytest.mark.parametrize(
|
||||||
"name",
|
"name",
|
||||||
[
|
[
|
||||||
"sd-v1-5-inpainting.ckpt",
|
|
||||||
"sd-v1-5-inpainting.safetensors",
|
"sd-v1-5-inpainting.safetensors",
|
||||||
"v1-5-pruned-emaonly.safetensors",
|
"v1-5-pruned-emaonly.safetensors",
|
||||||
|
"sd_xl_base_1.0.safetensors",
|
||||||
|
"sd_xl_base_1.0_inpainting_0.1.safetensors",
|
||||||
],
|
],
|
||||||
)
|
)
|
||||||
def test_local_file_path(device, sampler, name):
|
def test_local_file_path(device, sampler, name):
|
||||||
|
4
setup.py
4
setup.py
@ -5,6 +5,10 @@ package_files = Path("iopaint/web_app").glob("**/*")
|
|||||||
package_files = [str(it).replace("iopaint/", "") for it in package_files]
|
package_files = [str(it).replace("iopaint/", "") for it in package_files]
|
||||||
package_files += ["model/anytext/ocr_recog/ppocr_keys_v1.txt"]
|
package_files += ["model/anytext/ocr_recog/ppocr_keys_v1.txt"]
|
||||||
package_files += ["model/anytext/anytext_sd15.yaml"]
|
package_files += ["model/anytext/anytext_sd15.yaml"]
|
||||||
|
package_files += ["model/original_sd_configs/sd_xl_base.yaml"]
|
||||||
|
package_files += ["model/original_sd_configs/sd_xl_refiner.yaml"]
|
||||||
|
package_files += ["model/original_sd_configs/v1-inference.yaml"]
|
||||||
|
package_files += ["model/original_sd_configs/v2-inference-v.yaml"]
|
||||||
|
|
||||||
|
|
||||||
with open("README.md", "r", encoding="utf-8") as fh:
|
with open("README.md", "r", encoding="utf-8") as fh:
|
||||||
|
@ -734,7 +734,7 @@ const DiffusionOptions = () => {
|
|||||||
<div className="flex flex-col gap-1">
|
<div className="flex flex-col gap-1">
|
||||||
<LabelTitle
|
<LabelTitle
|
||||||
text="Mask blur"
|
text="Mask blur"
|
||||||
toolTip="How much to blur the mask before processing, in pixels."
|
toolTip="How much to blur the mask before processing, in pixels. Make the generated inpainting boundaries appear more natural."
|
||||||
/>
|
/>
|
||||||
<RowContainer>
|
<RowContainer>
|
||||||
<Slider
|
<Slider
|
||||||
|
Loading…
Reference in New Issue
Block a user