sd: load single file model using local config

commit e5f71ae535 (parent 934b0fc455)
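
In short: the commit vendors the original Stable Diffusion YAML configs (v1, v2, SDXL base, SDXL refiner) under iopaint/model/original_sd_configs, ships them as package data, and threads them into the single-file checkpoint loaders via a new get_config_files() helper, together with local_files_only=True where checkpoints are probed. The point is that a local .ckpt/.safetensors model can now be loaded without fetching the original config files from the network.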
@@ -14,6 +14,7 @@ from iopaint.const import (
    DIFFUSERS_SDXL_INPAINT_CLASS_NAME,
    ANYTEXT_NAME,
)
from iopaint.model.original_sd_configs import get_config_files
from iopaint.model_info import ModelInfo, ModelType


@@ -60,6 +61,7 @@ def get_sd_model_type(model_abs_path: str) -> ModelType:
            load_safety_checker=False,
            local_files_only=True,
            num_in_channels=9,
            config_files=get_config_files()
        )
        model_type = ModelType.DIFFUSERS_SD_INPAINT
    except ValueError as e:

@@ -84,6 +86,7 @@ def get_sdxl_model_type(model_abs_path: str) -> ModelType:
            load_safety_checker=False,
            local_files_only=True,
            num_in_channels=9,
            config_files=get_config_files()
        )
        if model.unet.config.in_channels == 9:
            # https://github.com/huggingface/diffusers/issues/6610
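
The two hunks above share one probing pattern: force a 9-channel UNet while loading the single-file checkpoint, and treat a ValueError as "this is not an inpainting model". A minimal sketch of that flow, assuming StableDiffusionInpaintPipeline.from_single_file is the loader behind the truncated call (the call target is not visible in this diff) and that the fallback model type is DIFFUSERS_SD:

    from diffusers import StableDiffusionInpaintPipeline

    from iopaint.model.original_sd_configs import get_config_files
    from iopaint.model_info import ModelType

    def probe_sd_model_type(model_abs_path: str) -> ModelType:
        # Hypothetical reconstruction; only the keyword arguments are taken from the diff.
        try:
            StableDiffusionInpaintPipeline.from_single_file(
                model_abs_path,
                load_safety_checker=False,
                local_files_only=True,  # resolve configs from the bundled YAMLs, never the network
                num_in_channels=9,      # inpainting UNets take latent + mask + masked image
                config_files=get_config_files(),
            )
            return ModelType.DIFFUSERS_SD_INPAINT
        except ValueError:
            return ModelType.DIFFUSERS_SD  # assumed fallback; the except branch is truncated above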
@@ -13,6 +13,7 @@ from .helper.controlnet_preprocess import (
    make_inpaint_control_image,
)
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
from .original_sd_configs import get_config_files
from .utils import (
    get_scheduler,
    handle_from_pretrained_exceptions,

@@ -101,6 +102,7 @@ class ControlNet(DiffusionInpaintModel):
                controlnet=controlnet,
                load_safety_checker=not disable_nsfw_checker,
                torch_dtype=torch_dtype,
                config_files=get_config_files(),
                **model_kwargs,
            )
        else:
iopaint/model/original_sd_configs/__init__.py (new file, 19 lines)
@@ -0,0 +1,19 @@
from pathlib import Path
from typing import Dict

CURRENT_DIR = Path(__file__).parent.absolute()


def get_config_files() -> Dict[str, Path]:
    """
    - `v1`: Config file for Stable Diffusion v1
    - `v2`: Config file for Stable Diffusion v2
    - `xl`: Config file for Stable Diffusion XL
    - `xl_refiner`: Config file for Stable Diffusion XL Refiner
    """
    return {
        "v1": CURRENT_DIR / "v1-inference.yaml",
        "v2": CURRENT_DIR / "v2-inference-v.yaml",
        "xl": CURRENT_DIR / "sd_xl_base.yaml",
        "xl_refiner": CURRENT_DIR / "sd_xl_refiner.yaml",
    }
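
The helper just maps short version keys to absolute paths next to its own __init__.py, so a quick interactive check looks like this (illustrative, not part of the commit):

    from iopaint.model.original_sd_configs import get_config_files

    for name, path in get_config_files().items():
        print(name, path, path.exists())
    # v1 .../original_sd_configs/v1-inference.yaml True
    # ...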
iopaint/model/original_sd_configs/sd_xl_base.yaml (new file, 93 lines)
@@ -0,0 +1,93 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: [1, 2, 10]
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11

          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              freeze: True
              layer: penultimate
              always_return_pooled: True
              legacy: False

          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: target_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
iopaint/model/original_sd_configs/sd_xl_refiner.yaml (new file, 86 lines)
@@ -0,0 +1,86 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2560
        num_classes: sequential
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 384
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 4
        context_dim: [1280, 1280, 1280, 1280]
        spatial_transformer_attn_type: softmax-xformers

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              legacy: False
              freeze: True
              layer: penultimate
              always_return_pooled: True

          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: aesthetic_score
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
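
Read together, the base and refiner configs differ mainly in the UNet: adm_in_channels 2816 vs 2560, model_channels 320 vs 384, channel_mult [1, 2, 4] vs [1, 2, 4, 4], transformer_depth [1, 2, 10] vs 4, and context_dim 2048 vs [1280, 1280, 1280, 1280]. The refiner also conditions on aesthetic_score instead of target_size_as_tuple and drops the CLIP ViT-L text encoder. A throwaway script to print those deltas, assuming both YAMLs are in the working directory:

    import yaml  # PyYAML

    def unet_params(path):
        with open(path) as f:
            return yaml.safe_load(f)["model"]["params"]["network_config"]["params"]

    base = unet_params("sd_xl_base.yaml")
    refiner = unet_params("sd_xl_refiner.yaml")
    for key in ("adm_in_channels", "model_channels", "channel_mult",
                "transformer_depth", "context_dim"):
        print(f"{key}: base={base[key]!r} refiner={refiner[key]!r}")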
iopaint/model/original_sd_configs/v1-inference.yaml (new file, 70 lines)
@@ -0,0 +1,70 @@
model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 10000 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder
iopaint/model/original_sd_configs/v2-inference-v.yaml (new file, 68 lines)
@@ -0,0 +1,68 @@
model:
  base_learning_rate: 1.0e-4
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    parameterization: "v"
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False # we set this to false because this is an inference only config

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        use_checkpoint: True
        use_fp16: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          #attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"
@@ -5,6 +5,7 @@ from loguru import logger

from .base import DiffusionInpaintModel
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
from .original_sd_configs import get_config_files
from .utils import (
    handle_from_pretrained_exceptions,
    get_torch_dtype,

@@ -51,6 +52,7 @@ class SD(DiffusionInpaintModel):
                self.model_id_or_path,
                dtype=torch_dtype,
                load_safety_checker=not disable_nsfw_checker,
                config_files=get_config_files(),
                **model_kwargs,
            )
        else:
@@ -10,6 +10,7 @@ from iopaint.schema import InpaintRequest, ModelType

from .base import DiffusionInpaintModel
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
from .original_sd_configs import get_config_files
from .utils import (
    handle_from_pretrained_exceptions,
    get_torch_dtype,

@@ -41,6 +42,7 @@ class SDXL(DiffusionInpaintModel):
                dtype=torch_dtype,
                num_in_channels=num_in_channels,
                load_safety_checker=False,
                config_files=get_config_files()
            )
        else:
            model_kwargs = {
@@ -232,9 +232,10 @@ def test_runway_sd_1_5_cpu_offload(device, strategy, sampler):
@pytest.mark.parametrize(
    "name",
    [
        "sd-v1-5-inpainting.ckpt",
        "sd-v1-5-inpainting.safetensors",
        "v1-5-pruned-emaonly.safetensors",
        "sd_xl_base_1.0.safetensors",
        "sd_xl_base_1.0_inpainting_0.1.safetensors",
    ],
)
def test_local_file_path(device, sampler, name):
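
The parametrized test now covers one checkpoint per config family shipped above, except xl_refiner: v1 inpainting in both .ckpt and .safetensors form, a plain v1.5 checkpoint, and SDXL base plus its inpainting variant. Assuming the suite lives under iopaint/tests (the path is not shown in this diff), these cases run with:

    pytest iopaint/tests -k test_local_file_path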
setup.py
@@ -5,6 +5,10 @@ package_files = Path("iopaint/web_app").glob("**/*")
package_files = [str(it).replace("iopaint/", "") for it in package_files]
package_files += ["model/anytext/ocr_recog/ppocr_keys_v1.txt"]
package_files += ["model/anytext/anytext_sd15.yaml"]
package_files += ["model/original_sd_configs/sd_xl_base.yaml"]
package_files += ["model/original_sd_configs/sd_xl_refiner.yaml"]
package_files += ["model/original_sd_configs/v1-inference.yaml"]
package_files += ["model/original_sd_configs/v2-inference-v.yaml"]


with open("README.md", "r", encoding="utf-8") as fh:
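
Packaging the four YAMLs matters because CURRENT_DIR in get_config_files() resolves inside the installed package (site-packages), not the source tree; without these entries an installed wheel would return paths to files that do not exist. A post-install sanity check, as a sketch:

    from iopaint.model.original_sd_configs import get_config_files

    missing = [p for p in get_config_files().values() if not p.exists()]
    assert not missing, f"config files missing from the wheel: {missing}"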
@@ -734,7 +734,7 @@ const DiffusionOptions = () => {
        <div className="flex flex-col gap-1">
          <LabelTitle
            text="Mask blur"
            toolTip="How much to blur the mask before processing, in pixels."
            toolTip="How much to blur the mask before processing, in pixels. Make the generated inpainting boundaries appear more natural."
          />
          <RowContainer>
            <Slider