sd: load single file model using local config

Qing 2024-01-30 13:19:13 +08:00
parent 934b0fc455
commit e5f71ae535
12 changed files with 352 additions and 2 deletions

View File

@@ -14,6 +14,7 @@ from iopaint.const import (
DIFFUSERS_SDXL_INPAINT_CLASS_NAME,
ANYTEXT_NAME,
)
from iopaint.model.original_sd_configs import get_config_files
from iopaint.model_info import ModelInfo, ModelType
@@ -60,6 +61,7 @@ def get_sd_model_type(model_abs_path: str) -> ModelType:
load_safety_checker=False,
local_files_only=True,
num_in_channels=9,
config_files=get_config_files()
)
model_type = ModelType.DIFFUSERS_SD_INPAINT
except ValueError as e:
@@ -84,6 +86,7 @@ def get_sdxl_model_type(model_abs_path: str) -> ModelType:
load_safety_checker=False,
local_files_only=True,
num_in_channels=9,
config_files=get_config_files()
)
if model.unet.config.in_channels == 9:
# https://github.com/huggingface/diffusers/issues/6610
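The two hunks above are where the scanner decides whether a single-file checkpoint is an inpainting model: it asks `from_single_file` to build a 9-channel UNet and treats a `ValueError` as "this is a plain 4-channel SD checkpoint". Below is a minimal sketch of that probe using the bundled configs; the function name and the pipeline class are assumptions based on the surrounding context, not a verbatim copy of the project's code.

```python
from pathlib import Path

from diffusers import StableDiffusionInpaintPipeline
from iopaint.model.original_sd_configs import get_config_files
from iopaint.model_info import ModelType


def guess_sd_model_type(model_abs_path: str) -> ModelType:
    """Hypothetical probe: try to load the checkpoint with a 9-channel inpaint UNet."""
    if "inpaint" in Path(model_abs_path).name.lower():
        return ModelType.DIFFUSERS_SD_INPAINT
    try:
        # config_files keeps diffusers from fetching the original SD yaml over the network
        StableDiffusionInpaintPipeline.from_single_file(
            model_abs_path,
            load_safety_checker=False,
            local_files_only=True,
            num_in_channels=9,
            config_files=get_config_files(),
        )
        return ModelType.DIFFUSERS_SD_INPAINT
    except ValueError:
        # 4-channel weights cannot fill a 9-channel input conv, so this is a normal SD checkpoint
        return ModelType.DIFFUSERS_SD
```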

View File

@@ -13,6 +13,7 @@ from .helper.controlnet_preprocess import (
make_inpaint_control_image,
)
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
from .original_sd_configs import get_config_files
from .utils import (
get_scheduler,
handle_from_pretrained_exceptions,
@@ -101,6 +102,7 @@ class ControlNet(DiffusionInpaintModel):
controlnet=controlnet,
load_safety_checker=not disable_nsfw_checker,
torch_dtype=torch_dtype,
config_files=get_config_files(),
**model_kwargs,
)
else:
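In the ControlNet path the same local config mapping is passed alongside an already-instantiated ControlNet. A hedged sketch follows; the pipeline class, ControlNet repo id, and checkpoint path are illustrative assumptions rather than values taken from this diff.

```python
import torch
from diffusers import ControlNetModel, StableDiffusionControlNetInpaintPipeline

from iopaint.model.original_sd_configs import get_config_files

# illustrative ControlNet and checkpoint; swap in whatever the user selected
controlnet = ControlNetModel.from_pretrained(
    "lllyasviel/control_v11p_sd15_inpaint", torch_dtype=torch.float16
)
pipe = StableDiffusionControlNetInpaintPipeline.from_single_file(
    "./sd-v1-5-inpainting.safetensors",
    controlnet=controlnet,
    load_safety_checker=False,
    torch_dtype=torch.float16,
    config_files=get_config_files(),  # resolve the original SD configs from the local package
)
```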

View File

@@ -0,0 +1,19 @@
from pathlib import Path
from typing import Dict

CURRENT_DIR = Path(__file__).parent.absolute()


def get_config_files() -> Dict[str, Path]:
    """
    - `v1`: Config file for Stable Diffusion v1
    - `v2`: Config file for Stable Diffusion v2
    - `xl`: Config file for Stable Diffusion XL
    - `xl_refiner`: Config file for Stable Diffusion XL Refiner
    """
    return {
        "v1": CURRENT_DIR / "v1-inference.yaml",
        "v2": CURRENT_DIR / "v2-inference-v.yaml",
        "xl": CURRENT_DIR / "sd_xl_base.yaml",
        "xl_refiner": CURRENT_DIR / "sd_xl_refiner.yaml",
    }
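The helper simply maps the four canonical config names to YAML files that now live inside the package. A quick sanity check of what callers get back (a sketch using only the function shown above):

```python
from iopaint.model.original_sd_configs import get_config_files

configs = get_config_files()
print(sorted(configs))  # ['v1', 'v2', 'xl', 'xl_refiner']
for name, path in configs.items():
    # every entry points at a YAML file bundled next to this module
    assert path.is_file(), f"{name} config missing: {path}"
```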

View File

@@ -0,0 +1,93 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2816
        num_classes: sequential
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: [1, 2, 10]
        context_dim: 2048
        spatial_transformer_attn_type: softmax-xformers

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenCLIPEmbedder
            params:
              layer: hidden
              layer_idx: 11

          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              freeze: True
              layer: penultimate
              always_return_pooled: True
              legacy: False

          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: target_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity
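Since the SDXL base config above now ships as a plain file, it can also be inspected directly, for example to confirm the UNet hyper-parameters diffusers will rebuild from it. A small sketch, assuming PyYAML is available:

```python
import yaml

from iopaint.model.original_sd_configs import get_config_files

with open(get_config_files()["xl"]) as f:
    cfg = yaml.safe_load(f)

unet = cfg["model"]["params"]["network_config"]["params"]
print(unet["context_dim"])       # 2048: cross-attention width of the SDXL base UNet
print(unet["adm_in_channels"])   # 2816: size/crop/pooled-text conditioning vector
```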

View File

@@ -0,0 +1,86 @@
model:
  target: sgm.models.diffusion.DiffusionEngine
  params:
    scale_factor: 0.13025
    disable_first_stage_autocast: True

    denoiser_config:
      target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser
      params:
        num_idx: 1000

        scaling_config:
          target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling
        discretization_config:
          target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization

    network_config:
      target: sgm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        adm_in_channels: 2560
        num_classes: sequential
        use_checkpoint: True
        in_channels: 4
        out_channels: 4
        model_channels: 384
        attention_resolutions: [4, 2]
        num_res_blocks: 2
        channel_mult: [1, 2, 4, 4]
        num_head_channels: 64
        use_linear_in_transformer: True
        transformer_depth: 4
        context_dim: [1280, 1280, 1280, 1280]
        spatial_transformer_attn_type: softmax-xformers

    conditioner_config:
      target: sgm.modules.GeneralConditioner
      params:
        emb_models:
          - is_trainable: False
            input_key: txt
            target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2
            params:
              arch: ViT-bigG-14
              version: laion2b_s39b_b160k
              legacy: False
              freeze: True
              layer: penultimate
              always_return_pooled: True

          - is_trainable: False
            input_key: original_size_as_tuple
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: crop_coords_top_left
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

          - is_trainable: False
            input_key: aesthetic_score
            target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND
            params:
              outdim: 256

    first_stage_config:
      target: sgm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          attn_type: vanilla-xformers
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult: [1, 2, 4, 4]
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

View File

@@ -0,0 +1,70 @@
model:
  base_learning_rate: 1.0e-04
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false   # Note: different from the one we trained before
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False

    scheduler_config: # 10000 warmup steps
      target: ldm.lr_scheduler.LambdaLinearScheduler
      params:
        warm_up_steps: [ 10000 ]
        cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases
        f_start: [ 1.e-6 ]
        f_max: [ 1. ]
        f_min: [ 1. ]

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_heads: 8
        use_spatial_transformer: True
        transformer_depth: 1
        context_dim: 768
        use_checkpoint: True
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenCLIPEmbedder

View File

@@ -0,0 +1,68 @@
model:
  base_learning_rate: 1.0e-4
  target: ldm.models.diffusion.ddpm.LatentDiffusion
  params:
    parameterization: "v"
    linear_start: 0.00085
    linear_end: 0.0120
    num_timesteps_cond: 1
    log_every_t: 200
    timesteps: 1000
    first_stage_key: "jpg"
    cond_stage_key: "txt"
    image_size: 64
    channels: 4
    cond_stage_trainable: false
    conditioning_key: crossattn
    monitor: val/loss_simple_ema
    scale_factor: 0.18215
    use_ema: False # we set this to false because this is an inference only config

    unet_config:
      target: ldm.modules.diffusionmodules.openaimodel.UNetModel
      params:
        use_checkpoint: True
        use_fp16: True
        image_size: 32 # unused
        in_channels: 4
        out_channels: 4
        model_channels: 320
        attention_resolutions: [ 4, 2, 1 ]
        num_res_blocks: 2
        channel_mult: [ 1, 2, 4, 4 ]
        num_head_channels: 64 # need to fix for flash-attn
        use_spatial_transformer: True
        use_linear_in_transformer: True
        transformer_depth: 1
        context_dim: 1024
        legacy: False

    first_stage_config:
      target: ldm.models.autoencoder.AutoencoderKL
      params:
        embed_dim: 4
        monitor: val/rec_loss
        ddconfig:
          #attn_type: "vanilla-xformers"
          double_z: true
          z_channels: 4
          resolution: 256
          in_channels: 3
          out_ch: 3
          ch: 128
          ch_mult:
          - 1
          - 2
          - 4
          - 4
          num_res_blocks: 2
          attn_resolutions: []
          dropout: 0.0
        lossconfig:
          target: torch.nn.Identity

    cond_stage_config:
      target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder
      params:
        freeze: True
        layer: "penultimate"

View File

@@ -5,6 +5,7 @@ from loguru import logger
from .base import DiffusionInpaintModel
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
from .original_sd_configs import get_config_files
from .utils import (
handle_from_pretrained_exceptions,
get_torch_dtype,
@@ -51,6 +52,7 @@ class SD(DiffusionInpaintModel):
self.model_id_or_path,
dtype=torch_dtype,
load_safety_checker=not disable_nsfw_checker,
config_files=get_config_files(),
**model_kwargs,
)
else:
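With the configs resolved from disk, loading a single-file SD checkpoint no longer needs to fetch the original yaml config over the network at load time. A minimal sketch; the checkpoint path is an assumption, and the project wraps this call in its own exception-handling helper:

```python
import torch
from diffusers import StableDiffusionInpaintPipeline

from iopaint.model.original_sd_configs import get_config_files

# config_files points diffusers at the bundled YAML files, so the original
# SD config is read locally instead of being downloaded when the model loads.
pipe = StableDiffusionInpaintPipeline.from_single_file(
    "./sd-v1-5-inpainting.safetensors",  # assumed local checkpoint path
    torch_dtype=torch.float16,
    load_safety_checker=False,
    config_files=get_config_files(),
)
```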

View File

@@ -10,6 +10,7 @@ from iopaint.schema import InpaintRequest, ModelType
from .base import DiffusionInpaintModel
from .helper.cpu_text_encoder import CPUTextEncoderWrapper
from .original_sd_configs import get_config_files
from .utils import (
handle_from_pretrained_exceptions,
get_torch_dtype,
@@ -41,6 +42,7 @@ class SDXL(DiffusionInpaintModel):
dtype=torch_dtype,
num_in_channels=num_in_channels,
load_safety_checker=False,
config_files=get_config_files()
)
else:
model_kwargs = {

View File

@@ -232,9 +232,10 @@ def test_runway_sd_1_5_cpu_offload(device, strategy, sampler):
@pytest.mark.parametrize(
"name",
[
"sd-v1-5-inpainting.ckpt",
"sd-v1-5-inpainting.safetensors",
"v1-5-pruned-emaonly.safetensors",
"sd_xl_base_1.0.safetensors",
"sd_xl_base_1.0_inpainting_0.1.safetensors",
],
)
def test_local_file_path(device, sampler, name):

View File

@@ -5,6 +5,10 @@ package_files = Path("iopaint/web_app").glob("**/*")
package_files = [str(it).replace("iopaint/", "") for it in package_files]
package_files += ["model/anytext/ocr_recog/ppocr_keys_v1.txt"]
package_files += ["model/anytext/anytext_sd15.yaml"]
package_files += ["model/original_sd_configs/sd_xl_base.yaml"]
package_files += ["model/original_sd_configs/sd_xl_refiner.yaml"]
package_files += ["model/original_sd_configs/v1-inference.yaml"]
package_files += ["model/original_sd_configs/v2-inference-v.yaml"]
with open("README.md", "r", encoding="utf-8") as fh:
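The setup.py hunk ensures the four YAML configs are packaged into the wheel instead of existing only in the git checkout. Below is a sketch of how such a `package_files` list is typically wired into `setup()` via `package_data`; the actual `setup()` call is not part of this diff, so the keywords are assumptions:

```python
from setuptools import find_packages, setup

# subset of package_files relevant to this commit
package_files = [
    "model/original_sd_configs/sd_xl_base.yaml",
    "model/original_sd_configs/sd_xl_refiner.yaml",
    "model/original_sd_configs/v1-inference.yaml",
    "model/original_sd_configs/v2-inference-v.yaml",
]

setup(
    name="IOPaint",
    packages=find_packages(include=["iopaint", "iopaint.*"]),
    # non-Python files listed here are copied into the installed package
    package_data={"iopaint": package_files},
)
```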

View File

@@ -734,7 +734,7 @@ const DiffusionOptions = () => {
<div className="flex flex-col gap-1">
<LabelTitle
text="Mask blur"
toolTip="How much to blur the mask before processing, in pixels."
toolTip="How much to blur the mask before processing, in pixels. Make the generated inpainting boundaries appear more natural."
/>
<RowContainer>
<Slider