diff --git a/inpaint/__init__.py b/inpaint/__init__.py new file mode 100644 index 0000000..d8e11fe --- /dev/null +++ b/inpaint/__init__.py @@ -0,0 +1,23 @@ +import os + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" +# https://github.com/pytorch/pytorch/issues/27971#issuecomment-1768868068 +os.environ["ONEDNN_PRIMITIVE_CACHE_CAPACITY"] = "1" +os.environ["LRU_CACHE_CAPACITY"] = "1" +# prevent CPU memory leak when run model on GPU +# https://github.com/pytorch/pytorch/issues/98688#issuecomment-1869288431 +# https://github.com/pytorch/pytorch/issues/108334#issuecomment-1752763633 +os.environ["TORCH_CUDNN_V8_API_LRU_CACHE_LIMIT"] = "1" + + +import warnings + +warnings.simplefilter("ignore", UserWarning) + + +def entry_point(): + # To make os.environ["XDG_CACHE_HOME"] = args.model_cache_dir works for diffusers + # https://github.com/huggingface/diffusers/blob/be99201a567c1ccd841dc16fb24e88f7f239c187/src/diffusers/utils/constants.py#L18 + from inpaint.cli import typer_app + + typer_app() diff --git a/inpaint/__main__.py b/inpaint/__main__.py new file mode 100644 index 0000000..57fc6ae --- /dev/null +++ b/inpaint/__main__.py @@ -0,0 +1,4 @@ +from inpaint import entry_point + +if __name__ == "__main__": + entry_point() diff --git a/inpaint/api.py b/inpaint/api.py new file mode 100644 index 0000000..51e1329 --- /dev/null +++ b/inpaint/api.py @@ -0,0 +1,398 @@ +import asyncio +import os +import threading +import time +import traceback +from pathlib import Path +from typing import Optional, Dict, List + +import cv2 +import numpy as np +import socketio +import torch + +try: + torch._C._jit_override_can_fuse_on_cpu(False) + torch._C._jit_override_can_fuse_on_gpu(False) + torch._C._jit_set_texpr_fuser_enabled(False) + torch._C._jit_set_nvfuser_enabled(False) +except: + pass + +import uvicorn +from PIL import Image +from fastapi import APIRouter, FastAPI, Request, UploadFile +from fastapi.encoders import jsonable_encoder +from fastapi.exceptions import HTTPException +from fastapi.middleware.cors import CORSMiddleware +from fastapi.responses import JSONResponse, FileResponse, Response +from fastapi.staticfiles import StaticFiles +from loguru import logger +from socketio import AsyncServer + +from inpaint.file_manager import FileManager +from inpaint.helper import ( + load_img, + decode_base64_to_image, + pil_to_bytes, + numpy_to_bytes, + concat_alpha_channel, + gen_frontend_mask, + adjust_mask, +) +from inpaint.model.utils import torch_gc +from inpaint.model_manager import ModelManager +from inpaint.plugins import build_plugins, RealESRGANUpscaler, InteractiveSeg +from inpaint.plugins.base_plugin import BasePlugin +from inpaint.plugins.remove_bg import RemoveBG +from inpaint.schema import ( + GenInfoResponse, + ApiConfig, + ServerConfigResponse, + SwitchModelRequest, + InpaintRequest, + RunPluginRequest, + SDSampler, + PluginInfo, + AdjustMaskRequest, + RemoveBGModel, + SwitchPluginModelRequest, + ModelInfo, + InteractiveSegModel, + RealESRGANModel, +) + +CURRENT_DIR = Path(__file__).parent.absolute().resolve() +WEB_APP_DIR = CURRENT_DIR / "web_app" + + +def api_middleware(app: FastAPI): + rich_available = False + try: + if os.environ.get("WEBUI_RICH_EXCEPTIONS", None) is not None: + import anyio # importing just so it can be placed on silent list + import starlette # importing just so it can be placed on silent list + from rich.console import Console + + console = Console() + rich_available = True + except Exception: + pass + + def handle_exception(request: Request, e: Exception): + err = { + "error": 
type(e).__name__, + "detail": vars(e).get("detail", ""), + "body": vars(e).get("body", ""), + "errors": str(e), + } + if not isinstance( + e, HTTPException + ): # do not print backtrace on known httpexceptions + message = f"API error: {request.method}: {request.url} {err}" + if rich_available: + print(message) + console.print_exception( + show_locals=True, + max_frames=2, + extra_lines=1, + suppress=[anyio, starlette], + word_wrap=False, + width=min([console.width, 200]), + ) + else: + traceback.print_exc() + return JSONResponse( + status_code=vars(e).get("status_code", 500), content=jsonable_encoder(err) + ) + + @app.middleware("http") + async def exception_handling(request: Request, call_next): + try: + return await call_next(request) + except Exception as e: + return handle_exception(request, e) + + @app.exception_handler(Exception) + async def fastapi_exception_handler(request: Request, e: Exception): + return handle_exception(request, e) + + @app.exception_handler(HTTPException) + async def http_exception_handler(request: Request, e: HTTPException): + return handle_exception(request, e) + + cors_options = { + "allow_methods": ["*"], + "allow_headers": ["*"], + "allow_origins": ["*"], + "allow_credentials": True, + "expose_headers": ["X-Seed"], + } + app.add_middleware(CORSMiddleware, **cors_options) + + +global_sio: AsyncServer = None + + +def diffuser_callback(pipe, step: int, timestep: int, callback_kwargs: Dict = {}): + # self: DiffusionPipeline, step: int, timestep: int, callback_kwargs: Dict + # logger.info(f"diffusion callback: step={step}, timestep={timestep}") + + # We use asyncio loos for task processing. Perhaps in the future, we can add a processing queue similar to InvokeAI, + # but for now let's just start a separate event loop. It shouldn't make a difference for single person use + asyncio.run(global_sio.emit("diffusion_progress", {"step": step})) + return {} + + +class Api: + def __init__(self, app: FastAPI, config: ApiConfig): + self.app = app + self.config = config + self.router = APIRouter() + self.queue_lock = threading.Lock() + api_middleware(self.app) + + self.file_manager = self._build_file_manager() + self.plugins = self._build_plugins() + self.model_manager = self._build_model_manager() + + # fmt: off + self.add_api_route("/api/v1/gen-info", self.api_geninfo, methods=["POST"], response_model=GenInfoResponse) + self.add_api_route("/api/v1/server-config", self.api_server_config, methods=["GET"], + response_model=ServerConfigResponse) + self.add_api_route("/api/v1/model", self.api_current_model, methods=["GET"], response_model=ModelInfo) + self.add_api_route("/api/v1/model", self.api_switch_model, methods=["POST"], response_model=ModelInfo) + self.add_api_route("/api/v1/inputimage", self.api_input_image, methods=["GET"]) + self.add_api_route("/api/v1/inpaint", self.api_inpaint, methods=["POST"]) + self.add_api_route("/api/v1/switch_plugin_model", self.api_switch_plugin_model, methods=["POST"]) + self.add_api_route("/api/v1/run_plugin_gen_mask", self.api_run_plugin_gen_mask, methods=["POST"]) + self.add_api_route("/api/v1/run_plugin_gen_image", self.api_run_plugin_gen_image, methods=["POST"]) + self.add_api_route("/api/v1/samplers", self.api_samplers, methods=["GET"]) + self.add_api_route("/api/v1/adjust_mask", self.api_adjust_mask, methods=["POST"]) + self.add_api_route("/api/v1/save_image", self.api_save_image, methods=["POST"]) + self.app.mount("/", StaticFiles(directory=WEB_APP_DIR, html=True), name="assets") + # fmt: on + + global global_sio + self.sio = 
socketio.AsyncServer(async_mode="asgi", cors_allowed_origins="*") + self.combined_asgi_app = socketio.ASGIApp(self.sio, self.app) + self.app.mount("/ws", self.combined_asgi_app) + global_sio = self.sio + + def add_api_route(self, path: str, endpoint, **kwargs): + return self.app.add_api_route(path, endpoint, **kwargs) + + def api_save_image(self, file: UploadFile): + filename = file.filename + origin_image_bytes = file.file.read() + with open(self.config.output_dir / filename, "wb") as fw: + fw.write(origin_image_bytes) + + def api_current_model(self) -> ModelInfo: + return self.model_manager.current_model + + def api_switch_model(self, req: SwitchModelRequest) -> ModelInfo: + if req.name == self.model_manager.name: + return self.model_manager.current_model + self.model_manager.switch(req.name) + return self.model_manager.current_model + + def api_switch_plugin_model(self, req: SwitchPluginModelRequest): + if req.plugin_name in self.plugins: + self.plugins[req.plugin_name].switch_model(req.model_name) + if req.plugin_name == RemoveBG.name: + self.config.remove_bg_model = req.model_name + if req.plugin_name == RealESRGANUpscaler.name: + self.config.realesrgan_model = req.model_name + if req.plugin_name == InteractiveSeg.name: + self.config.interactive_seg_model = req.model_name + torch_gc() + + def api_server_config(self) -> ServerConfigResponse: + plugins = [] + for it in self.plugins.values(): + plugins.append( + PluginInfo( + name=it.name, + support_gen_image=it.support_gen_image, + support_gen_mask=it.support_gen_mask, + ) + ) + + return ServerConfigResponse( + plugins=plugins, + modelInfos=self.model_manager.scan_models(), + removeBGModel=self.config.remove_bg_model, + removeBGModels=RemoveBGModel.values(), + realesrganModel=self.config.realesrgan_model, + realesrganModels=RealESRGANModel.values(), + interactiveSegModel=self.config.interactive_seg_model, + interactiveSegModels=InteractiveSegModel.values(), + enableFileManager=self.file_manager is not None, + enableAutoSaving=self.config.output_dir is not None, + enableControlnet=self.model_manager.enable_controlnet, + controlnetMethod=self.model_manager.controlnet_method, + disableModelSwitch=False, + isDesktop=False, + samplers=self.api_samplers(), + ) + + def api_input_image(self) -> FileResponse: + if self.config.input and self.config.input.is_file(): + return FileResponse(self.config.input) + raise HTTPException(status_code=404, detail="Input image not found") + + def api_geninfo(self, file: UploadFile) -> GenInfoResponse: + _, _, info = load_img(file.file.read(), return_info=True) + parts = info.get("parameters", "").split("Negative prompt: ") + prompt = parts[0].strip() + negative_prompt = "" + if len(parts) > 1: + negative_prompt = parts[1].split("\n")[0].strip() + return GenInfoResponse(prompt=prompt, negative_prompt=negative_prompt) + + def api_inpaint(self, req: InpaintRequest): + image, alpha_channel, infos = decode_base64_to_image(req.image) + mask, _, _ = decode_base64_to_image(req.mask, gray=True) + + mask = cv2.threshold(mask, 127, 255, cv2.THRESH_BINARY)[1] + if image.shape[:2] != mask.shape[:2]: + raise HTTPException( + 400, + detail=f"Image size({image.shape[:2]}) and mask size({mask.shape[:2]}) not match.", + ) + + if req.paint_by_example_example_image: + paint_by_example_image, _, _ = decode_base64_to_image( + req.paint_by_example_example_image + ) + + start = time.time() + rgb_np_img = self.model_manager(image, mask, req) + logger.info(f"process time: {(time.time() - start) * 1000:.2f}ms") + torch_gc() + + 
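+ # The inpainting model returns a BGR uint8 array; the following lines convert it to RGB and re-attach the original alpha channel before encoding the result as PNG.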
rgb_np_img = cv2.cvtColor(rgb_np_img.astype(np.uint8), cv2.COLOR_BGR2RGB) + rgb_res = concat_alpha_channel(rgb_np_img, alpha_channel) + + ext = "png" + res_img_bytes = pil_to_bytes( + Image.fromarray(rgb_res), + ext=ext, + quality=self.config.quality, + infos=infos, + ) + + asyncio.run(self.sio.emit("diffusion_finish")) + + return Response( + content=res_img_bytes, + media_type=f"image/{ext}", + headers={"X-Seed": str(req.sd_seed)}, + ) + + def api_run_plugin_gen_image(self, req: RunPluginRequest): + ext = "png" + if req.name not in self.plugins: + raise HTTPException(status_code=422, detail="Plugin not found") + if not self.plugins[req.name].support_gen_image: + raise HTTPException( + status_code=422, detail="Plugin does not support output image" + ) + rgb_np_img, alpha_channel, infos = decode_base64_to_image(req.image) + bgr_or_rgba_np_img = self.plugins[req.name].gen_image(rgb_np_img, req) + torch_gc() + + if bgr_or_rgba_np_img.shape[2] == 4: + rgba_np_img = bgr_or_rgba_np_img + else: + rgba_np_img = cv2.cvtColor(bgr_or_rgba_np_img, cv2.COLOR_BGR2RGB) + rgba_np_img = concat_alpha_channel(rgba_np_img, alpha_channel) + + return Response( + content=pil_to_bytes( + Image.fromarray(rgba_np_img), + ext=ext, + quality=self.config.quality, + infos=infos, + ), + media_type=f"image/{ext}", + ) + + def api_run_plugin_gen_mask(self, req: RunPluginRequest): + if req.name not in self.plugins: + raise HTTPException(status_code=422, detail="Plugin not found") + if not self.plugins[req.name].support_gen_mask: + raise HTTPException( + status_code=422, detail="Plugin does not support output image" + ) + rgb_np_img, alpha_channel, infos = decode_base64_to_image(req.image) + bgr_or_gray_mask = self.plugins[req.name].gen_mask(rgb_np_img, req) + torch_gc() + res_mask = gen_frontend_mask(bgr_or_gray_mask) + return Response( + content=numpy_to_bytes(res_mask, "png"), + media_type="image/png", + ) + + def api_samplers(self) -> List[str]: + return [member.value for member in SDSampler.__members__.values()] + + def api_adjust_mask(self, req: AdjustMaskRequest): + mask, _, _ = decode_base64_to_image(req.mask, gray=True) + mask = adjust_mask(mask, req.kernel_size, req.operate) + return Response(content=numpy_to_bytes(mask, "png"), media_type="image/png") + + def launch(self): + self.app.include_router(self.router) + uvicorn.run( + self.combined_asgi_app, + host=self.config.host, + port=self.config.port, + timeout_keep_alive=999999999, + ) + + def _build_file_manager(self) -> Optional[FileManager]: + if self.config.input and self.config.input.is_dir(): + logger.info( + f"Input is directory, initialize file manager {self.config.input}" + ) + + return FileManager( + app=self.app, + input_dir=self.config.input, + mask_dir=self.config.mask_dir, + output_dir=self.config.output_dir, + ) + return None + + def _build_plugins(self) -> Dict[str, BasePlugin]: + return build_plugins( + self.config.enable_interactive_seg, + self.config.interactive_seg_model, + self.config.interactive_seg_device, + self.config.enable_remove_bg, + self.config.remove_bg_model, + self.config.enable_anime_seg, + self.config.enable_realesrgan, + self.config.realesrgan_device, + self.config.realesrgan_model, + self.config.enable_gfpgan, + self.config.gfpgan_device, + self.config.enable_restoreformer, + self.config.restoreformer_device, + self.config.no_half, + ) + + def _build_model_manager(self): + return ModelManager( + name=self.config.model, + device=torch.device(self.config.device), + no_half=self.config.no_half, + low_mem=self.config.low_mem, + 
disable_nsfw=self.config.disable_nsfw_checker, + sd_cpu_textencoder=self.config.cpu_textencoder, + local_files_only=self.config.local_files_only, + cpu_offload=self.config.cpu_offload, + callback=diffuser_callback, + ) diff --git a/inpaint/batch_processing.py b/inpaint/batch_processing.py new file mode 100644 index 0000000..2430010 --- /dev/null +++ b/inpaint/batch_processing.py @@ -0,0 +1,128 @@ +import json +from pathlib import Path +from typing import Dict, Optional + +import cv2 +import numpy as np +from PIL import Image +from loguru import logger +from rich.console import Console +from rich.progress import ( + Progress, + SpinnerColumn, + TimeElapsedColumn, + MofNCompleteColumn, + TextColumn, + BarColumn, + TaskProgressColumn, +) + +from inpaint.helper import pil_to_bytes +from inpaint.model.utils import torch_gc +from inpaint.model_manager import ModelManager +from inpaint.schema import InpaintRequest + + +def glob_images(path: Path) -> Dict[str, Path]: + # png/jpg/jpeg + if path.is_file(): + return {path.stem: path} + elif path.is_dir(): + res = {} + for it in path.glob("*.*"): + if it.suffix.lower() in [".png", ".jpg", ".jpeg"]: + res[it.stem] = it + return res + + +def batch_inpaint( + model: str, + device, + image: Path, + mask: Path, + output: Path, + config: Optional[Path] = None, + concat: bool = False, +): + if image.is_dir() and output.is_file(): + logger.error( + "invalid --output: when image is a directory, output should be a directory" + ) + exit(-1) + output.mkdir(parents=True, exist_ok=True) + + image_paths = glob_images(image) + mask_paths = glob_images(mask) + if len(image_paths) == 0: + logger.error("invalid --image: empty image folder") + exit(-1) + if len(mask_paths) == 0: + logger.error("invalid --mask: empty mask folder") + exit(-1) + + if config is None: + inpaint_request = InpaintRequest() + logger.info(f"Using default config: {inpaint_request}") + else: + with open(config, "r", encoding="utf-8") as f: + inpaint_request = InpaintRequest(**json.load(f)) + logger.info(f"Using config: {inpaint_request}") + + model_manager = ModelManager(name=model, device=device) + first_mask = list(mask_paths.values())[0] + + console = Console() + + with Progress( + SpinnerColumn(), + TextColumn("[progress.description]{task.description}"), + BarColumn(), + TaskProgressColumn(), + MofNCompleteColumn(), + TimeElapsedColumn(), + console=console, + transient=False, + ) as progress: + task = progress.add_task("Batch processing...", total=len(image_paths)) + for stem, image_p in image_paths.items(): + if stem not in mask_paths and mask.is_dir(): + progress.log(f"mask for {image_p} not found") + progress.update(task, advance=1) + continue + mask_p = mask_paths.get(stem, first_mask) + + infos = Image.open(image_p).info + + img = np.array(Image.open(image_p).convert("RGB")) + mask_img = np.array(Image.open(mask_p).convert("L")) + + if mask_img.shape[:2] != img.shape[:2]: + progress.log( + f"resize mask {mask_p.name} to image {image_p.name} size: {img.shape[:2]}" + ) + mask_img = cv2.resize( + mask_img, + (img.shape[1], img.shape[0]), + interpolation=cv2.INTER_NEAREST, + ) + mask_img[mask_img >= 127] = 255 + mask_img[mask_img < 127] = 0 + + # bgr + inpaint_result = model_manager(img, mask_img, inpaint_request) + inpaint_result = cv2.cvtColor(inpaint_result, cv2.COLOR_BGR2RGB) + if concat: + mask_img = cv2.cvtColor(mask_img, cv2.COLOR_GRAY2RGB) + inpaint_result = cv2.hconcat([img, mask_img, inpaint_result]) + + img_bytes = pil_to_bytes(Image.fromarray(inpaint_result), "png", 100, infos) + 
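+ # pil_to_bytes re-embeds the source image's metadata (infos), e.g. the PNG "parameters" text chunk, before the result is written below.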
save_p = output / f"{stem}.png" + with open(save_p, "wb") as fw: + fw.write(img_bytes) + + progress.update(task, advance=1) + torch_gc() + # pid = psutil.Process().pid + # memory_info = psutil.Process(pid).memory_info() + # memory_in_mb = memory_info.rss / (1024 * 1024) + # print(f"Original image size: {img.shape}, current process memory usage: {memory_in_mb}MB") diff --git a/inpaint/benchmark.py b/inpaint/benchmark.py new file mode 100644 index 0000000..9a98a3c --- /dev/null +++ b/inpaint/benchmark.py @@ -0,0 +1,109 @@ +#!/usr/bin/env python3 + +import argparse +import os +import time + +import numpy as np +import nvidia_smi +import psutil +import torch + +from inpaint.model_manager import ModelManager +from inpaint.schema import InpaintRequest, HDStrategy, SDSampler + +try: + torch._C._jit_override_can_fuse_on_cpu(False) + torch._C._jit_override_can_fuse_on_gpu(False) + torch._C._jit_set_texpr_fuser_enabled(False) + torch._C._jit_set_nvfuser_enabled(False) +except: + pass + +NUM_THREADS = str(4) + +os.environ["OMP_NUM_THREADS"] = NUM_THREADS +os.environ["OPENBLAS_NUM_THREADS"] = NUM_THREADS +os.environ["MKL_NUM_THREADS"] = NUM_THREADS +os.environ["VECLIB_MAXIMUM_THREADS"] = NUM_THREADS +os.environ["NUMEXPR_NUM_THREADS"] = NUM_THREADS +if os.environ.get("CACHE_DIR"): + os.environ["TORCH_HOME"] = os.environ["CACHE_DIR"] + + +def run_model(model, size): + # RGB + image = np.random.randint(0, 256, (size[0], size[1], 3)).astype(np.uint8) + mask = np.random.randint(0, 255, size).astype(np.uint8) + + config = InpaintRequest( + ldm_steps=2, + hd_strategy=HDStrategy.ORIGINAL, + hd_strategy_crop_margin=128, + hd_strategy_crop_trigger_size=128, + hd_strategy_resize_limit=128, + prompt="a fox is sitting on a bench", + sd_steps=5, + sd_sampler=SDSampler.ddim, + ) + model(image, mask, config) + + +def benchmark(model, times: int, empty_cache: bool): + sizes = [(512, 512)] + + nvidia_smi.nvmlInit() + device_id = 0 + handle = nvidia_smi.nvmlDeviceGetHandleByIndex(device_id) + + def format(metrics): + return f"{np.mean(metrics):.2f} ± {np.std(metrics):.2f}" + + process = psutil.Process(os.getpid()) + # Report GPU memory and RAM usage metrics for each size + for size in sizes: + torch.cuda.empty_cache() + time_metrics = [] + cpu_metrics = [] + memory_metrics = [] + gpu_memory_metrics = [] + for _ in range(times): + start = time.time() + run_model(model, size) + torch.cuda.synchronize() + + # cpu_metrics.append(process.cpu_percent()) + time_metrics.append((time.time() - start) * 1000) + memory_metrics.append(process.memory_info().rss / 1024 / 1024) + gpu_memory_metrics.append( + nvidia_smi.nvmlDeviceGetMemoryInfo(handle).used / 1024 / 1024 + ) + + print(f"size: {size}".center(80, "-")) + # print(f"cpu: {format(cpu_metrics)}") + print(f"latency: {format(time_metrics)}ms") + print(f"memory: {format(memory_metrics)} MB") + print(f"gpu memory: {format(gpu_memory_metrics)} MB") + + nvidia_smi.nvmlShutdown() + + +def get_args_parser(): + parser = argparse.ArgumentParser() + parser.add_argument("--name") + parser.add_argument("--device", default="cuda", type=str) + parser.add_argument("--times", default=10, type=int) + parser.add_argument("--empty-cache", action="store_true") + return parser.parse_args() + + +if __name__ == "__main__": + args = get_args_parser() + device = torch.device(args.device) + model = ModelManager( + name=args.name, + device=device, + disable_nsfw=True, + sd_cpu_textencoder=True, + ) + benchmark(model, args.times, args.empty_cache) diff --git a/inpaint/cli.py b/inpaint/cli.py new file mode 100644 index 0000000..fb8e94a --- /dev/null +++ b/inpaint/cli.py @@ 
-0,0 +1,232 @@ +import webbrowser +from contextlib import asynccontextmanager +from pathlib import Path +from typing import Optional + +import typer +from fastapi import FastAPI +from loguru import logger +from typer import Option +from typer_config import use_json_config + +from inpaint.const import * +from inpaint.runtime import setup_model_dir, dump_environment_info, check_device +from inpaint.schema import InteractiveSegModel, Device, RealESRGANModel, RemoveBGModel + +typer_app = typer.Typer(pretty_exceptions_show_locals=False, add_completion=False) + + +@typer_app.command(help="Install all plugins dependencies") +def install_plugins_packages(): + from inpaint.installer import install_plugins_package + + install_plugins_package() + + +@typer_app.command(help="Download SD/SDXL normal/inpainting model from HuggingFace") +def download( + model: str = Option( + ..., help="Model id on HuggingFace e.g: runwayml/stable-diffusion-inpainting" + ), + model_dir: Path = Option( + DEFAULT_MODEL_DIR, + help=MODEL_DIR_HELP, + file_okay=False, + callback=setup_model_dir, + ), +): + from inpaint.download import cli_download_model + + cli_download_model(model) + + +@typer_app.command(name="list", help="List downloaded models") +def list_model( + model_dir: Path = Option( + DEFAULT_MODEL_DIR, + help=MODEL_DIR_HELP, + file_okay=False, + callback=setup_model_dir, + ), +): + from inpaint.download import scan_models + + scanned_models = scan_models() + for it in scanned_models: + print(it.name) + + +@typer_app.command(help="Batch processing images") +def run( + model: str = Option("lama"), + device: Device = Option(Device.cpu), + image: Path = Option(..., help="Image folders or file path"), + mask: Path = Option( + ..., + help="Mask folders or file path. " + "If it is a directory, the mask images in the directory should have the same name as the original image." + "If it is a file, all images will use this mask." + "Mask will automatically resize to the same size as the original image.", + ), + output: Path = Option(..., help="Output directory or file path"), + config: Path = Option( + None, help="Config file path. You can use dump command to create a base config." 
+ ), + concat: bool = Option( + False, help="Concatenate the original image, mask and inpainting result into one image" + ), + model_dir: Path = Option( + DEFAULT_MODEL_DIR, + help=MODEL_DIR_HELP, + file_okay=False, + callback=setup_model_dir, + ), +): + from inpaint.download import cli_download_model, scan_models + + scanned_models = scan_models() + if model not in [it.name for it in scanned_models]: + logger.info(f"{model} not found in {model_dir}, trying to download it") + cli_download_model(model) + + from inpaint.batch_processing import batch_inpaint + + batch_inpaint(model, device, image, mask, output, config, concat) + + +@typer_app.command(help="Start IOPaint server") +@use_json_config() +def start( + host: str = Option("127.0.0.1"), + port: int = Option(8080), + inbrowser: bool = Option(False, help=INBROWSER_HELP), + model: str = Option( + DEFAULT_MODEL, + help=f"Erase models: [{', '.join(AVAILABLE_MODELS)}].\n" + f"Diffusion models: [{', '.join(DIFFUSION_MODELS)}] or any SD/SDXL normal/inpainting models on HuggingFace.", + ), + model_dir: Path = Option( + DEFAULT_MODEL_DIR, + help=MODEL_DIR_HELP, + dir_okay=True, + file_okay=False, + callback=setup_model_dir, + ), + low_mem: bool = Option(False, help=LOW_MEM_HELP), + no_half: bool = Option(False, help=NO_HALF_HELP), + cpu_offload: bool = Option(False, help=CPU_OFFLOAD_HELP), + disable_nsfw_checker: bool = Option(False, help=DISABLE_NSFW_HELP), + cpu_textencoder: bool = Option(False, help=CPU_TEXTENCODER_HELP), + local_files_only: bool = Option(False, help=LOCAL_FILES_ONLY_HELP), + device: Device = Option(Device.cpu), + input: Optional[Path] = Option(None, help=INPUT_HELP), + mask_dir: Optional[Path] = Option( + None, help=MASK_DIR_HELP, dir_okay=True, file_okay=False + ), + output_dir: Optional[Path] = Option( + None, help=OUTPUT_DIR_HELP, dir_okay=True, file_okay=False + ), + quality: int = Option(95, help=QUALITY_HELP), + enable_interactive_seg: bool = Option(False, help=INTERACTIVE_SEG_HELP), + interactive_seg_model: InteractiveSegModel = Option( + InteractiveSegModel.vit_b, help=INTERACTIVE_SEG_MODEL_HELP + ), + interactive_seg_device: Device = Option(Device.cpu), + enable_remove_bg: bool = Option(False, help=REMOVE_BG_HELP), + remove_bg_model: RemoveBGModel = Option(RemoveBGModel.briaai_rmbg_1_4), + enable_anime_seg: bool = Option(False, help=ANIMESEG_HELP), + enable_realesrgan: bool = Option(False), + realesrgan_device: Device = Option(Device.cpu), + realesrgan_model: RealESRGANModel = Option(RealESRGANModel.realesr_general_x4v3), + enable_gfpgan: bool = Option(False), + gfpgan_device: Device = Option(Device.cpu), + enable_restoreformer: bool = Option(False), + restoreformer_device: Device = Option(Device.cpu), +): + dump_environment_info() + device = check_device(device) + if input and not input.exists(): + logger.error(f"invalid --input: {input} does not exist") + exit(-1) + if mask_dir and not mask_dir.exists(): + logger.error(f"invalid --mask-dir: {mask_dir} does not exist") + exit(-1) + if input and input.is_dir() and not output_dir: + logger.error("invalid --output-dir: --output-dir must be set when --input is a directory") + exit(-1) + if output_dir: + output_dir = output_dir.expanduser().absolute() + logger.info(f"Image will be saved to {output_dir}") + if not output_dir.exists(): + logger.info(f"Creating output directory {output_dir}") + output_dir.mkdir(parents=True) + if mask_dir: + mask_dir = mask_dir.expanduser().absolute() + + model_dir = model_dir.expanduser().absolute() + + if local_files_only: + os.environ["TRANSFORMERS_OFFLINE"] = 
"1" + os.environ["HF_HUB_OFFLINE"] = "1" + + from inpaint.download import cli_download_model, scan_models + + scanned_models = scan_models() + if model not in [it.name for it in scanned_models]: + logger.info(f"{model} not found in {model_dir}, try to downloading") + cli_download_model(model) + + from inpaint.api import Api + from inpaint.schema import ApiConfig + + @asynccontextmanager + async def lifespan(app: FastAPI): + if inbrowser: + webbrowser.open(f"http://localhost:{port}", new=0, autoraise=True) + yield + + app = FastAPI(lifespan=lifespan) + + api_config = ApiConfig( + host=host, + port=port, + inbrowser=inbrowser, + model=model, + no_half=no_half, + low_mem=low_mem, + cpu_offload=cpu_offload, + disable_nsfw_checker=disable_nsfw_checker, + local_files_only=local_files_only, + cpu_textencoder=cpu_textencoder if device == Device.cuda else False, + device=device, + input=input, + mask_dir=mask_dir, + output_dir=output_dir, + quality=quality, + enable_interactive_seg=enable_interactive_seg, + interactive_seg_model=interactive_seg_model, + interactive_seg_device=interactive_seg_device, + enable_remove_bg=enable_remove_bg, + remove_bg_model=remove_bg_model, + enable_anime_seg=enable_anime_seg, + enable_realesrgan=enable_realesrgan, + realesrgan_device=realesrgan_device, + realesrgan_model=realesrgan_model, + enable_gfpgan=enable_gfpgan, + gfpgan_device=gfpgan_device, + enable_restoreformer=enable_restoreformer, + restoreformer_device=restoreformer_device, + ) + print(api_config.model_dump_json(indent=4)) + api = Api(app, api_config) + api.launch() + + +@typer_app.command(help="Start IOPaint web config page") +def start_web_config( + config_file: Path = Option("config.json"), +): + dump_environment_info() + from inpaint.web_config import main + + main(config_file) diff --git a/inpaint/const.py b/inpaint/const.py new file mode 100644 index 0000000..b18254b --- /dev/null +++ b/inpaint/const.py @@ -0,0 +1,128 @@ +import os +from typing import List + +INSTRUCT_PIX2PIX_NAME = "timbrooks/instruct-pix2pix" +KANDINSKY22_NAME = "kandinsky-community/kandinsky-2-2-decoder-inpaint" +POWERPAINT_NAME = "Sanster/PowerPaint-V1-stable-diffusion-inpainting" +ANYTEXT_NAME = "Sanster/AnyText" + +DIFFUSERS_SD_CLASS_NAME = "StableDiffusionPipeline" +DIFFUSERS_SD_INPAINT_CLASS_NAME = "StableDiffusionInpaintPipeline" +DIFFUSERS_SDXL_CLASS_NAME = "StableDiffusionXLPipeline" +DIFFUSERS_SDXL_INPAINT_CLASS_NAME = "StableDiffusionXLInpaintPipeline" + +MPS_UNSUPPORT_MODELS = [ + "lama", + "ldm", + "zits", + "mat", + "fcf", + "cv2", + "manga", +] + +DEFAULT_MODEL = "lama" +AVAILABLE_MODELS = ["lama", "ldm", "zits", "mat", "fcf", "manga", "cv2", "migan"] +DIFFUSION_MODELS = [ + "runwayml/stable-diffusion-inpainting", + "Uminosachi/realisticVisionV51_v51VAE-inpainting", + "redstonehero/dreamshaper-inpainting", + "Sanster/anything-4.0-inpainting", + "diffusers/stable-diffusion-xl-1.0-inpainting-0.1", + "Fantasy-Studio/Paint-by-Example", + POWERPAINT_NAME, + ANYTEXT_NAME, +] + +NO_HALF_HELP = """ +Using full precision(fp32) model. +If your diffusion model generate result is always black or green, use this argument. +""" + +CPU_OFFLOAD_HELP = """ +Offloads diffusion model's weight to CPU RAM, significantly reducing vRAM usage. +""" + +LOW_MEM_HELP = "Enable attention slicing and vae tiling to save memory." + +DISABLE_NSFW_HELP = """ +Disable NSFW checker for diffusion model. +""" + +CPU_TEXTENCODER_HELP = """ +Run diffusion models text encoder on CPU to reduce vRAM usage. 
+""" + +SD_CONTROLNET_CHOICES: List[str] = [ + "lllyasviel/control_v11p_sd15_canny", + # "lllyasviel/control_v11p_sd15_seg", + "lllyasviel/control_v11p_sd15_openpose", + "lllyasviel/control_v11p_sd15_inpaint", + "lllyasviel/control_v11f1p_sd15_depth", +] + +SD_BRUSHNET_CHOICES: List[str] = [ + "Sanster/brushnet_random_mask", + "Sanster/brushnet_segmentation_mask", +] + +SD2_CONTROLNET_CHOICES = [ + "thibaud/controlnet-sd21-canny-diffusers", + "thibaud/controlnet-sd21-depth-diffusers", + "thibaud/controlnet-sd21-openpose-diffusers", +] + +SDXL_CONTROLNET_CHOICES = [ + "thibaud/controlnet-openpose-sdxl-1.0", + "destitech/controlnet-inpaint-dreamer-sdxl", + "diffusers/controlnet-canny-sdxl-1.0", + "diffusers/controlnet-canny-sdxl-1.0-mid", + "diffusers/controlnet-canny-sdxl-1.0-small", + "diffusers/controlnet-depth-sdxl-1.0", + "diffusers/controlnet-depth-sdxl-1.0-mid", + "diffusers/controlnet-depth-sdxl-1.0-small", +] + +LOCAL_FILES_ONLY_HELP = """ +When loading diffusion models, use local files only and do not connect to the HuggingFace server. +""" + +DEFAULT_MODEL_DIR = os.path.abspath( + os.getenv("XDG_CACHE_HOME", os.path.join(os.path.expanduser("~"), ".cache")) +) + +MODEL_DIR_HELP = f""" +Model download directory (set via the XDG_CACHE_HOME environment variable). By default models are downloaded to {DEFAULT_MODEL_DIR} +""" + +OUTPUT_DIR_HELP = """ +Result images will be saved to the output directory automatically. +""" + +MASK_DIR_HELP = """ +You can view masks in the FileManager +""" + +INPUT_HELP = """ +If the input is an image, it will be loaded by default. +If the input is a directory, you can browse and select images in the file manager. +""" + +GUI_HELP = """ +Launch Lama Cleaner as a desktop app +""" + +QUALITY_HELP = """ +Quality of image encoding, 0-100. Default is 95; higher quality produces larger files. +""" + +INTERACTIVE_SEG_HELP = "Enable interactive segmentation using Segment Anything." +INTERACTIVE_SEG_MODEL_HELP = "Model size: mobile_sam < vit_b < vit_l < vit_h. A bigger model gives better segmentation but runs slower." +REMOVE_BG_HELP = "Enable the remove background plugin. Always runs on CPU." +ANIMESEG_HELP = "Enable the anime segmentation plugin. Always runs on CPU." +REALESRGAN_HELP = "Enable RealESRGAN super resolution." +GFPGAN_HELP = "Enable GFPGAN face restore. To also enhance the background, use it together with --enable-realesrgan." +RESTOREFORMER_HELP = "Enable RestoreFormer face restore. To also enhance the background, use it together with --enable-realesrgan." +GIF_HELP = "Enable GIF plugin. 
Make GIF to compare original and cleaned image" + +INBROWSER_HELP = "Automatically launch IOPaint in a new tab on the default browser" diff --git a/inpaint/download.py b/inpaint/download.py new file mode 100644 index 0000000..c0a099f --- /dev/null +++ b/inpaint/download.py @@ -0,0 +1,313 @@ +import glob +import json +import os +from functools import lru_cache +from typing import List, Optional + +from inpaint.schema import ModelType, ModelInfo +from loguru import logger +from pathlib import Path + +from inpaint.const import ( + DEFAULT_MODEL_DIR, + DIFFUSERS_SD_CLASS_NAME, + DIFFUSERS_SD_INPAINT_CLASS_NAME, + DIFFUSERS_SDXL_CLASS_NAME, + DIFFUSERS_SDXL_INPAINT_CLASS_NAME, + ANYTEXT_NAME, +) +from inpaint.model.original_sd_configs import get_config_files + + +def cli_download_model(model: str): + from inpaint.model import models + from inpaint.model.utils import handle_from_pretrained_exceptions + + if model in models and models[model].is_erase_model: + logger.info(f"Downloading {model}...") + models[model].download() + logger.info("Done.") + elif model == ANYTEXT_NAME: + logger.info(f"Downloading {model}...") + models[model].download() + logger.info("Done.") + else: + logger.info(f"Downloading model from Huggingface: {model}") + from diffusers import DiffusionPipeline + + downloaded_path = handle_from_pretrained_exceptions( + DiffusionPipeline.download, + pretrained_model_name=model, + variant="fp16", + resume_download=True, + ) + logger.info(f"Done. Downloaded to {downloaded_path}") + + +def folder_name_to_show_name(name: str) -> str: + return name.replace("models--", "").replace("--", "/") + + +@lru_cache(maxsize=512) +def get_sd_model_type(model_abs_path: str) -> Optional[ModelType]: + if "inpaint" in Path(model_abs_path).name.lower(): + model_type = ModelType.DIFFUSERS_SD_INPAINT + else: + # load once to check num_in_channels + from diffusers import StableDiffusionInpaintPipeline + + try: + StableDiffusionInpaintPipeline.from_single_file( + model_abs_path, + load_safety_checker=False, + num_in_channels=9, + original_config_file=get_config_files()['v1'] + ) + model_type = ModelType.DIFFUSERS_SD_INPAINT + except ValueError as e: + if "[320, 4, 3, 3]" in str(e): + model_type = ModelType.DIFFUSERS_SD + else: + logger.info(f"Ignore non sdxl file: {model_abs_path}") + return + except Exception as e: + logger.error(f"Failed to load {model_abs_path}: {e}") + return + return model_type + + +@lru_cache() +def get_sdxl_model_type(model_abs_path: str) -> Optional[ModelType]: + if "inpaint" in model_abs_path: + model_type = ModelType.DIFFUSERS_SDXL_INPAINT + else: + # load once to check num_in_channels + from diffusers import StableDiffusionXLInpaintPipeline + + try: + model = StableDiffusionXLInpaintPipeline.from_single_file( + model_abs_path, + load_safety_checker=False, + num_in_channels=9, + original_config_file=get_config_files()['xl'], + ) + if model.unet.config.in_channels == 9: + # https://github.com/huggingface/diffusers/issues/6610 + model_type = ModelType.DIFFUSERS_SDXL_INPAINT + else: + model_type = ModelType.DIFFUSERS_SDXL + except ValueError as e: + if "[320, 4, 3, 3]" in str(e): + model_type = ModelType.DIFFUSERS_SDXL + else: + logger.info(f"Ignore non sdxl file: {model_abs_path}") + return + except Exception as e: + logger.error(f"Failed to load {model_abs_path}: {e}") + return + return model_type + + +def scan_single_file_diffusion_models(cache_dir) -> List[ModelInfo]: + cache_dir = Path(cache_dir) + stable_diffusion_dir = cache_dir / "stable_diffusion" + cache_file = 
stable_diffusion_dir / "iopaint_cache.json" + model_type_cache = {} + if cache_file.exists(): + try: + with open(cache_file, "r", encoding="utf-8") as f: + model_type_cache = json.load(f) + assert isinstance(model_type_cache, dict) + except: + pass + + res = [] + for it in stable_diffusion_dir.glob("*.*"): + if it.suffix not in [".safetensors", ".ckpt"]: + continue + model_abs_path = str(it.absolute()) + model_type = model_type_cache.get(it.name) + if model_type is None: + model_type = get_sd_model_type(model_abs_path) + if model_type is None: + continue + + model_type_cache[it.name] = model_type + res.append( + ModelInfo( + name=it.name, + path=model_abs_path, + model_type=model_type, + is_single_file_diffusers=True, + ) + ) + if stable_diffusion_dir.exists(): + with open(cache_file, "w", encoding="utf-8") as fw: + json.dump(model_type_cache, fw, indent=2, ensure_ascii=False) + + stable_diffusion_xl_dir = cache_dir / "stable_diffusion_xl" + sdxl_cache_file = stable_diffusion_xl_dir / "iopaint_cache.json" + sdxl_model_type_cache = {} + if sdxl_cache_file.exists(): + try: + with open(sdxl_cache_file, "r", encoding="utf-8") as f: + sdxl_model_type_cache = json.load(f) + assert isinstance(sdxl_model_type_cache, dict) + except: + pass + + for it in stable_diffusion_xl_dir.glob("*.*"): + if it.suffix not in [".safetensors", ".ckpt"]: + continue + model_abs_path = str(it.absolute()) + model_type = sdxl_model_type_cache.get(it.name) + if model_type is None: + model_type = get_sdxl_model_type(model_abs_path) + if model_type is None: + continue + + sdxl_model_type_cache[it.name] = model_type + if stable_diffusion_xl_dir.exists(): + with open(sdxl_cache_file, "w", encoding="utf-8") as fw: + json.dump(sdxl_model_type_cache, fw, indent=2, ensure_ascii=False) + + res.append( + ModelInfo( + name=it.name, + path=model_abs_path, + model_type=model_type, + is_single_file_diffusers=True, + ) + ) + return res + + +def scan_inpaint_models(model_dir: Path) -> List[ModelInfo]: + res = [] + from inpaint.model import models + + # logger.info(f"Scanning inpaint models in {model_dir}") + + for name, m in models.items(): + if m.is_erase_model and m.is_downloaded(): + res.append( + ModelInfo( + name=name, + path=name, + model_type=ModelType.INPAINT, + ) + ) + return res + + +def scan_diffusers_models() -> List[ModelInfo]: + from huggingface_hub.constants import HF_HUB_CACHE + + available_models = [] + cache_dir = Path(HF_HUB_CACHE) + # logger.info(f"Scanning diffusers models in {cache_dir}") + diffusers_model_names = [] + model_index_files = glob.glob(os.path.join(cache_dir, "**/*", "model_index.json"), recursive=True) + for it in model_index_files: + it = Path(it) + with open(it, "r", encoding="utf-8") as f: + try: + data = json.load(f) + except: + continue + + _class_name = data["_class_name"] + name = folder_name_to_show_name(it.parent.parent.parent.name) + if name in diffusers_model_names: + continue + if "PowerPaint" in name: + model_type = ModelType.DIFFUSERS_OTHER + elif _class_name == DIFFUSERS_SD_CLASS_NAME: + model_type = ModelType.DIFFUSERS_SD + elif _class_name == DIFFUSERS_SD_INPAINT_CLASS_NAME: + model_type = ModelType.DIFFUSERS_SD_INPAINT + elif _class_name == DIFFUSERS_SDXL_CLASS_NAME: + model_type = ModelType.DIFFUSERS_SDXL + elif _class_name == DIFFUSERS_SDXL_INPAINT_CLASS_NAME: + model_type = ModelType.DIFFUSERS_SDXL_INPAINT + elif _class_name in [ + "StableDiffusionInstructPix2PixPipeline", + "PaintByExamplePipeline", + "KandinskyV22InpaintPipeline", + "AnyText", + ]: + model_type = 
ModelType.DIFFUSERS_OTHER + else: + continue + + diffusers_model_names.append(name) + available_models.append( + ModelInfo( + name=name, + path=name, + model_type=model_type, + ) + ) + return available_models + + +def _scan_converted_diffusers_models(cache_dir) -> List[ModelInfo]: + cache_dir = Path(cache_dir) + available_models = [] + diffusers_model_names = [] + model_index_files = glob.glob(os.path.join(cache_dir, "**/*", "model_index.json"), recursive=True) + for it in model_index_files: + it = Path(it) + with open(it, "r", encoding="utf-8") as f: + try: + data = json.load(f) + except: + logger.error( + f"Failed to load {it}, please try revert from original model or fix model_index.json by hand." + ) + continue + + _class_name = data["_class_name"] + name = folder_name_to_show_name(it.parent.name) + if name in diffusers_model_names: + continue + elif _class_name == DIFFUSERS_SD_CLASS_NAME: + model_type = ModelType.DIFFUSERS_SD + elif _class_name == DIFFUSERS_SD_INPAINT_CLASS_NAME: + model_type = ModelType.DIFFUSERS_SD_INPAINT + elif _class_name == DIFFUSERS_SDXL_CLASS_NAME: + model_type = ModelType.DIFFUSERS_SDXL + elif _class_name == DIFFUSERS_SDXL_INPAINT_CLASS_NAME: + model_type = ModelType.DIFFUSERS_SDXL_INPAINT + else: + continue + + diffusers_model_names.append(name) + available_models.append( + ModelInfo( + name=name, + path=str(it.parent.absolute()), + model_type=model_type, + ) + ) + return available_models + + +def scan_converted_diffusers_models(cache_dir) -> List[ModelInfo]: + cache_dir = Path(cache_dir) + available_models = [] + stable_diffusion_dir = cache_dir / "stable_diffusion" + stable_diffusion_xl_dir = cache_dir / "stable_diffusion_xl" + available_models.extend(_scan_converted_diffusers_models(stable_diffusion_dir)) + available_models.extend(_scan_converted_diffusers_models(stable_diffusion_xl_dir)) + return available_models + + +def scan_models() -> List[ModelInfo]: + model_dir = os.getenv("XDG_CACHE_HOME", DEFAULT_MODEL_DIR) + available_models = [] + available_models.extend(scan_inpaint_models(model_dir)) + available_models.extend(scan_single_file_diffusion_models(model_dir)) + available_models.extend(scan_diffusers_models()) + available_models.extend(scan_converted_diffusers_models(model_dir)) + return available_models diff --git a/inpaint/file_manager/__init__.py b/inpaint/file_manager/__init__.py new file mode 100644 index 0000000..1a24998 --- /dev/null +++ b/inpaint/file_manager/__init__.py @@ -0,0 +1 @@ +from .file_manager import FileManager diff --git a/inpaint/file_manager/file_manager.py b/inpaint/file_manager/file_manager.py new file mode 100644 index 0000000..c24f54f --- /dev/null +++ b/inpaint/file_manager/file_manager.py @@ -0,0 +1,218 @@ +import os +from io import BytesIO +from pathlib import Path +from typing import List + +from PIL import Image, ImageOps, PngImagePlugin +from fastapi import FastAPI, HTTPException +from starlette.responses import FileResponse + +from ..schema import MediasResponse, MediaTab + +LARGE_ENOUGH_NUMBER = 100 +PngImagePlugin.MAX_TEXT_CHUNK = LARGE_ENOUGH_NUMBER * (1024**2) +from .storage_backends import FilesystemStorageBackend +from .utils import aspect_to_string, generate_filename, glob_img + + +class FileManager: + def __init__(self, app: FastAPI, input_dir: Path, mask_dir: Path, output_dir: Path): + self.app = app + self.input_dir: Path = input_dir + self.mask_dir: Path = mask_dir + self.output_dir: Path = output_dir + + self.image_dir_filenames = [] + self.output_dir_filenames = [] + if not 
self.thumbnail_directory.exists(): + self.thumbnail_directory.mkdir(parents=True) + + # fmt: off + self.app.add_api_route("/api/v1/medias", self.api_medias, methods=["GET"], response_model=List[MediasResponse]) + self.app.add_api_route("/api/v1/media_file", self.api_media_file, methods=["GET"]) + self.app.add_api_route("/api/v1/media_thumbnail_file", self.api_media_thumbnail_file, methods=["GET"]) + # fmt: on + + def api_medias(self, tab: MediaTab) -> List[MediasResponse]: + img_dir = self._get_dir(tab) + return self._media_names(img_dir) + + def api_media_file(self, tab: MediaTab, filename: str) -> FileResponse: + file_path = self._get_file(tab, filename) + return FileResponse(file_path, media_type="image/png") + + # tab=${tab}?filename=${filename.name}?width=${width}&height=${height} + def api_media_thumbnail_file( + self, tab: MediaTab, filename: str, width: int, height: int + ) -> FileResponse: + img_dir = self._get_dir(tab) + thumb_filename, (width, height) = self.get_thumbnail( + img_dir, filename, width=width, height=height + ) + thumbnail_filepath = self.thumbnail_directory / thumb_filename + return FileResponse( + thumbnail_filepath, + headers={ + "X-Width": str(width), + "X-Height": str(height), + }, + media_type="image/jpeg", + ) + + def _get_dir(self, tab: MediaTab) -> Path: + if tab == "input": + return self.input_dir + elif tab == "output": + return self.output_dir + elif tab == "mask": + return self.mask_dir + else: + raise HTTPException(status_code=422, detail=f"tab not found: {tab}") + + def _get_file(self, tab: MediaTab, filename: str) -> Path: + file_path = self._get_dir(tab) / filename + if not file_path.exists(): + raise HTTPException(status_code=422, detail=f"file not found: {file_path}") + return file_path + + @property + def thumbnail_directory(self) -> Path: + return self.output_dir / "thumbnails" + + @staticmethod + def _media_names(directory: Path) -> List[MediasResponse]: + names = sorted([it.name for it in glob_img(directory)]) + res = [] + for name in names: + path = os.path.join(directory, name) + img = Image.open(path) + res.append( + MediasResponse( + name=name, + height=img.height, + width=img.width, + ctime=os.path.getctime(path), + mtime=os.path.getmtime(path), + ) + ) + return res + + def get_thumbnail( + self, directory: Path, original_filename: str, width, height, **options + ): + directory = Path(directory) + storage = FilesystemStorageBackend(self.app) + crop = options.get("crop", "fit") + background = options.get("background") + quality = options.get("quality", 90) + + original_path, original_filename = os.path.split(original_filename) + original_filepath = os.path.join(directory, original_path, original_filename) + image = Image.open(BytesIO(storage.read(original_filepath))) + + # keep ratio resize + if not width and not height: + width = 256 + + if width != 0: + height = int(image.height * width / image.width) + else: + width = int(image.width * height / image.height) + + thumbnail_size = (width, height) + + thumbnail_filename = generate_filename( + directory, + original_filename, + aspect_to_string(thumbnail_size), + crop, + background, + quality, + ) + + thumbnail_filepath = os.path.join( + self.thumbnail_directory, original_path, thumbnail_filename + ) + + if storage.exists(thumbnail_filepath): + return thumbnail_filepath, (width, height) + + try: + image.load() + except (IOError, OSError): + self.app.logger.warning("Thumbnail not load image: %s", original_filepath) + return thumbnail_filepath, (width, height) + + # get original image format + 
options["format"] = options.get("format", image.format) + + image = self._create_thumbnail( + image, thumbnail_size, crop, background=background + ) + + raw_data = self.get_raw_data(image, **options) + storage.save(thumbnail_filepath, raw_data) + + return thumbnail_filepath, (width, height) + + def get_raw_data(self, image, **options): + data = { + "format": self._get_format(image, **options), + "quality": options.get("quality", 90), + } + + _file = BytesIO() + image.save(_file, **data) + return _file.getvalue() + + @staticmethod + def colormode(image, colormode="RGB"): + if colormode == "RGB" or colormode == "RGBA": + if image.mode == "RGBA": + return image + if image.mode == "LA": + return image.convert("RGBA") + return image.convert(colormode) + + if colormode == "GRAY": + return image.convert("L") + + return image.convert(colormode) + + @staticmethod + def background(original_image, color=0xFF): + size = (max(original_image.size),) * 2 + image = Image.new("L", size, color) + image.paste( + original_image, + tuple(map(lambda x: (x[0] - x[1]) / 2, zip(size, original_image.size))), + ) + + return image + + def _get_format(self, image, **options): + if options.get("format"): + return options.get("format") + if image.format: + return image.format + + return "JPEG" + + def _create_thumbnail(self, image, size, crop="fit", background=None): + try: + resample = Image.Resampling.LANCZOS + except AttributeError: # pylint: disable=raise-missing-from + resample = Image.ANTIALIAS + + if crop == "fit": + image = ImageOps.fit(image, size, resample) + else: + image = image.copy() + image.thumbnail(size, resample=resample) + + if background is not None: + image = self.background(image) + + image = self.colormode(image) + + return image diff --git a/inpaint/file_manager/storage_backends.py b/inpaint/file_manager/storage_backends.py new file mode 100644 index 0000000..3f453ad --- /dev/null +++ b/inpaint/file_manager/storage_backends.py @@ -0,0 +1,46 @@ +# Copy from https://github.com/silentsokolov/flask-thumbnails/blob/master/flask_thumbnails/storage_backends.py +import errno +import os +from abc import ABC, abstractmethod + + +class BaseStorageBackend(ABC): + def __init__(self, app=None): + self.app = app + + @abstractmethod + def read(self, filepath, mode="rb", **kwargs): + raise NotImplementedError + + @abstractmethod + def exists(self, filepath): + raise NotImplementedError + + @abstractmethod + def save(self, filepath, data): + raise NotImplementedError + + +class FilesystemStorageBackend(BaseStorageBackend): + def read(self, filepath, mode="rb", **kwargs): + with open(filepath, mode) as f: # pylint: disable=unspecified-encoding + return f.read() + + def exists(self, filepath): + return os.path.exists(filepath) + + def save(self, filepath, data): + directory = os.path.dirname(filepath) + + if not os.path.exists(directory): + try: + os.makedirs(directory) + except OSError as e: + if e.errno != errno.EEXIST: + raise + + if not os.path.isdir(directory): + raise IOError("{} is not a directory".format(directory)) + + with open(filepath, "wb") as f: + f.write(data) diff --git a/inpaint/file_manager/utils.py b/inpaint/file_manager/utils.py new file mode 100644 index 0000000..f6890af --- /dev/null +++ b/inpaint/file_manager/utils.py @@ -0,0 +1,65 @@ +# Copy from: https://github.com/silentsokolov/flask-thumbnails/blob/master/flask_thumbnails/utils.py +import hashlib +from pathlib import Path + +from typing import Union + + +def generate_filename(directory: Path, original_filename, *options) -> str: + text = 
str(directory.absolute()) + original_filename + for v in options: + text += "%s" % v + md5_hash = hashlib.md5() + md5_hash.update(text.encode("utf-8")) + return md5_hash.hexdigest() + ".jpg" + + +def parse_size(size): + if isinstance(size, int): + # If the size parameter is a single number, assume square aspect. + return [size, size] + + if isinstance(size, (tuple, list)): + if len(size) == 1: + # If single value tuple/list is provided, exand it to two elements + return size + type(size)(size) + return size + + try: + thumbnail_size = [int(x) for x in size.lower().split("x", 1)] + except ValueError: + raise ValueError( # pylint: disable=raise-missing-from + "Bad thumbnail size format. Valid format is INTxINT." + ) + + if len(thumbnail_size) == 1: + # If the size parameter only contains a single integer, assume square aspect. + thumbnail_size.append(thumbnail_size[0]) + + return thumbnail_size + + +def aspect_to_string(size): + if isinstance(size, str): + return size + + return "x".join(map(str, size)) + + +IMG_SUFFIX = {".jpg", ".jpeg", ".png", ".JPG", ".JPEG", ".PNG"} + + +def glob_img(p: Union[Path, str], recursive: bool = False): + p = Path(p) + if p.is_file() and p.suffix in IMG_SUFFIX: + yield p + else: + if recursive: + files = Path(p).glob("**/*.*") + else: + files = Path(p).glob("*.*") + + for it in files: + if it.suffix not in IMG_SUFFIX: + continue + yield it diff --git a/inpaint/helper.py b/inpaint/helper.py new file mode 100644 index 0000000..c2c0c48 --- /dev/null +++ b/inpaint/helper.py @@ -0,0 +1,408 @@ +import base64 +import imghdr +import io +import os +import sys +from typing import List, Optional, Dict, Tuple + +from urllib.parse import urlparse +import cv2 +from PIL import Image, ImageOps, PngImagePlugin +import numpy as np +import torch +from inpaint.const import MPS_UNSUPPORT_MODELS +from loguru import logger +from torch.hub import download_url_to_file, get_dir +import hashlib + + +def md5sum(filename): + md5 = hashlib.md5() + with open(filename, "rb") as f: + for chunk in iter(lambda: f.read(128 * md5.block_size), b""): + md5.update(chunk) + return md5.hexdigest() + + +def switch_mps_device(model_name, device): + if model_name in MPS_UNSUPPORT_MODELS and str(device) == "mps": + logger.info(f"{model_name} not support mps, switch to cpu") + return torch.device("cpu") + return device + + +def get_cache_path_by_url(url): + parts = urlparse(url) + hub_dir = get_dir() + model_dir = os.path.join(hub_dir, "checkpoints") + if not os.path.isdir(model_dir): + os.makedirs(model_dir) + filename = os.path.basename(parts.path) + cached_file = os.path.join(model_dir, filename) + return cached_file + + +def download_model(url, model_md5: str = None): + if os.path.exists(url): + cached_file = url + else: + cached_file = get_cache_path_by_url(url) + if not os.path.exists(cached_file): + sys.stderr.write('Downloading: "{}" to {}\n'.format(url, cached_file)) + hash_prefix = None + download_url_to_file(url, cached_file, hash_prefix, progress=True) + if model_md5: + _md5 = md5sum(cached_file) + if model_md5 == _md5: + logger.info(f"Download model success, md5: {_md5}") + else: + try: + os.remove(cached_file) + logger.error( + f"Model md5: {_md5}, expected md5: {model_md5}, wrong model deleted. Please restart iopaint." + f"If you still have errors, please try download model manually first https://lama-cleaner-docs.vercel.app/install/download_model_manually.\n" + ) + except: + logger.error( + f"Model md5: {_md5}, expected md5: {model_md5}, please delete {cached_file} and restart iopaint." 
+ ) + exit(-1) + + return cached_file + + +def ceil_modulo(x, mod): + if x % mod == 0: + return x + return (x // mod + 1) * mod + + +def handle_error(model_path, model_md5, e): + _md5 = md5sum(model_path) + if _md5 != model_md5: + try: + os.remove(model_path) + logger.error( + f"Model md5: {_md5}, expected md5: {model_md5}, wrong model deleted. Please restart iopaint." + f"If you still have errors, please try download model manually first https://lama-cleaner-docs.vercel.app/install/download_model_manually.\n" + ) + except: + logger.error( + f"Model md5: {_md5}, expected md5: {model_md5}, please delete {model_path} and restart iopaint." + ) + else: + logger.error( + f"Failed to load model {model_path}," + f"please submit an issue at https://github.com/Sanster/lama-cleaner/issues and include a screenshot of the error:\n{e}" + ) + exit(-1) + + +def load_jit_model(url_or_path, device, model_md5: str): + if os.path.exists(url_or_path): + model_path = url_or_path + else: + model_path = download_model(url_or_path, model_md5) + + logger.info(f"Loading model from: {model_path}") + try: + model = torch.jit.load(model_path, map_location="cpu").to(device) + except Exception as e: + handle_error(model_path, model_md5, e) + model.eval() + return model + + +def load_model(model: torch.nn.Module, url_or_path, device, model_md5): + if os.path.exists(url_or_path): + model_path = url_or_path + else: + model_path = download_model(url_or_path, model_md5) + + try: + logger.info(f"Loading model from: {model_path}") + state_dict = torch.load(model_path, map_location="cpu") + model.load_state_dict(state_dict, strict=True) + model.to(device) + except Exception as e: + handle_error(model_path, model_md5, e) + model.eval() + return model + + +def numpy_to_bytes(image_numpy: np.ndarray, ext: str) -> bytes: + data = cv2.imencode( + f".{ext}", + image_numpy, + [int(cv2.IMWRITE_JPEG_QUALITY), 100, int(cv2.IMWRITE_PNG_COMPRESSION), 0], + )[1] + image_bytes = data.tobytes() + return image_bytes + + +def pil_to_bytes(pil_img, ext: str, quality: int = 95, infos={}) -> bytes: + with io.BytesIO() as output: + kwargs = {k: v for k, v in infos.items() if v is not None} + if ext == "jpg": + ext = "jpeg" + if "png" == ext.lower() and "parameters" in kwargs: + pnginfo_data = PngImagePlugin.PngInfo() + pnginfo_data.add_text("parameters", kwargs["parameters"]) + kwargs["pnginfo"] = pnginfo_data + + pil_img.save(output, format=ext, quality=quality, **kwargs) + image_bytes = output.getvalue() + return image_bytes + + +def load_img(img_bytes, gray: bool = False, return_info: bool = False): + alpha_channel = None + image = Image.open(io.BytesIO(img_bytes)) + + if return_info: + infos = image.info + + try: + image = ImageOps.exif_transpose(image) + except: + pass + + if gray: + image = image.convert("L") + np_img = np.array(image) + else: + if image.mode == "RGBA": + np_img = np.array(image) + alpha_channel = np_img[:, :, -1] + np_img = cv2.cvtColor(np_img, cv2.COLOR_RGBA2RGB) + else: + image = image.convert("RGB") + np_img = np.array(image) + + if return_info: + return np_img, alpha_channel, infos + return np_img, alpha_channel + + +def norm_img(np_img): + if len(np_img.shape) == 2: + np_img = np_img[:, :, np.newaxis] + np_img = np.transpose(np_img, (2, 0, 1)) + np_img = np_img.astype("float32") / 255 + return np_img + + +def resize_max_size( + np_img, size_limit: int, interpolation=cv2.INTER_CUBIC +) -> np.ndarray: + # Resize image's longer size to size_limit if longer size larger than size_limit + h, w = np_img.shape[:2] + if max(h, 
w) > size_limit: + ratio = size_limit / max(h, w) + new_w = int(w * ratio + 0.5) + new_h = int(h * ratio + 0.5) + return cv2.resize(np_img, dsize=(new_w, new_h), interpolation=interpolation) + else: + return np_img + + +def pad_img_to_modulo( + img: np.ndarray, mod: int, square: bool = False, min_size: Optional[int] = None +): + """ + + Args: + img: [H, W, C] + mod: + square: 是否为正方形 + min_size: + + Returns: + + """ + if len(img.shape) == 2: + img = img[:, :, np.newaxis] + height, width = img.shape[:2] + out_height = ceil_modulo(height, mod) + out_width = ceil_modulo(width, mod) + + if min_size is not None: + assert min_size % mod == 0 + out_width = max(min_size, out_width) + out_height = max(min_size, out_height) + + if square: + max_size = max(out_height, out_width) + out_height = max_size + out_width = max_size + + return np.pad( + img, + ((0, out_height - height), (0, out_width - width), (0, 0)), + mode="symmetric", + ) + + +def boxes_from_mask(mask: np.ndarray) -> List[np.ndarray]: + """ + Args: + mask: (h, w, 1) 0~255 + + Returns: + + """ + height, width = mask.shape[:2] + _, thresh = cv2.threshold(mask, 127, 255, 0) + contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + boxes = [] + for cnt in contours: + x, y, w, h = cv2.boundingRect(cnt) + box = np.array([x, y, x + w, y + h]).astype(int) + + box[::2] = np.clip(box[::2], 0, width) + box[1::2] = np.clip(box[1::2], 0, height) + boxes.append(box) + + return boxes + + +def only_keep_largest_contour(mask: np.ndarray) -> List[np.ndarray]: + """ + Args: + mask: (h, w) 0~255 + + Returns: + + """ + _, thresh = cv2.threshold(mask, 127, 255, 0) + contours, _ = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE) + + max_area = 0 + max_index = -1 + for i, cnt in enumerate(contours): + area = cv2.contourArea(cnt) + if area > max_area: + max_area = area + max_index = i + + if max_index != -1: + new_mask = np.zeros_like(mask) + return cv2.drawContours(new_mask, contours, max_index, 255, -1) + else: + return mask + + +def is_mac(): + return sys.platform == "darwin" + + +def get_image_ext(img_bytes): + w = imghdr.what("", img_bytes) + if w is None: + w = "jpeg" + return w + + +def decode_base64_to_image( + encoding: str, gray=False +) -> Tuple[np.array, Optional[np.array], Dict]: + if encoding.startswith("data:image/") or encoding.startswith( + "data:application/octet-stream;base64," + ): + encoding = encoding.split(";")[1].split(",")[1] + image = Image.open(io.BytesIO(base64.b64decode(encoding))) + + alpha_channel = None + try: + image = ImageOps.exif_transpose(image) + except: + pass + # exif_transpose will remove exif rotate info,we must call image.info after exif_transpose + infos = image.info + + if gray: + image = image.convert("L") + np_img = np.array(image) + else: + if image.mode == "RGBA": + np_img = np.array(image) + alpha_channel = np_img[:, :, -1] + np_img = cv2.cvtColor(np_img, cv2.COLOR_RGBA2RGB) + else: + image = image.convert("RGB") + np_img = np.array(image) + + return np_img, alpha_channel, infos + + +def encode_pil_to_base64(image: Image, quality: int, infos: Dict) -> bytes: + img_bytes = pil_to_bytes( + image, + "png", + quality=quality, + infos=infos, + ) + return base64.b64encode(img_bytes) + + +def concat_alpha_channel(rgb_np_img, alpha_channel) -> np.ndarray: + if alpha_channel is not None: + if alpha_channel.shape[:2] != rgb_np_img.shape[:2]: + alpha_channel = cv2.resize( + alpha_channel, dsize=(rgb_np_img.shape[1], rgb_np_img.shape[0]) + ) + rgb_np_img = np.concatenate( + 
(rgb_np_img, alpha_channel[:, :, np.newaxis]), axis=-1 + ) + return rgb_np_img + + +def adjust_mask(mask: np.ndarray, kernel_size: int, operate): + # fronted brush color "ffcc00bb" + # kernel_size = kernel_size*2+1 + mask[mask >= 127] = 255 + mask[mask < 127] = 0 + + if operate == "reverse": + mask = 255 - mask + else: + kernel = cv2.getStructuringElement( + cv2.MORPH_ELLIPSE, (2 * kernel_size + 1, 2 * kernel_size + 1) + ) + if operate == "expand": + mask = cv2.dilate( + mask, + kernel, + iterations=1, + ) + else: + mask = cv2.erode( + mask, + kernel, + iterations=1, + ) + res_mask = np.zeros((mask.shape[0], mask.shape[1], 4), dtype=np.uint8) + res_mask[mask > 128] = [255, 203, 0, int(255 * 0.73)] + res_mask = cv2.cvtColor(res_mask, cv2.COLOR_BGRA2RGBA) + return res_mask + + +def gen_frontend_mask(bgr_or_gray_mask): + if len(bgr_or_gray_mask.shape) == 3 and bgr_or_gray_mask.shape[2] != 1: + bgr_or_gray_mask = cv2.cvtColor(bgr_or_gray_mask, cv2.COLOR_BGR2GRAY) + + # fronted brush color "ffcc00bb" + # TODO: how to set kernel size? + kernel_size = 9 + bgr_or_gray_mask = cv2.dilate( + bgr_or_gray_mask, + np.ones((kernel_size, kernel_size), np.uint8), + iterations=1, + ) + res_mask = np.zeros( + (bgr_or_gray_mask.shape[0], bgr_or_gray_mask.shape[1], 4), dtype=np.uint8 + ) + res_mask[bgr_or_gray_mask > 128] = [255, 203, 0, int(255 * 0.73)] + res_mask = cv2.cvtColor(res_mask, cv2.COLOR_BGRA2RGBA) + return res_mask diff --git a/inpaint/installer.py b/inpaint/installer.py new file mode 100644 index 0000000..01506d9 --- /dev/null +++ b/inpaint/installer.py @@ -0,0 +1,10 @@ +import subprocess +import sys + + +def install(package): + subprocess.check_call([sys.executable, "-m", "pip", "install", package]) + + +def install_plugins_package(): + install("rembg") diff --git a/inpaint/model/__init__.py b/inpaint/model/__init__.py new file mode 100644 index 0000000..799e2ec --- /dev/null +++ b/inpaint/model/__init__.py @@ -0,0 +1,37 @@ +from .anytext.anytext_model import AnyText +from .controlnet import ControlNet +from .fcf import FcF +from .instruct_pix2pix import InstructPix2Pix +from .kandinsky import Kandinsky22 +from .lama import LaMa +from .ldm import LDM +from .manga import Manga +from .mat import MAT +from .mi_gan import MIGAN +from .opencv2 import OpenCV2 +from .paint_by_example import PaintByExample +from .power_paint.power_paint import PowerPaint +from .sd import SD15, SD2, Anything4, RealisticVision14, SD +from .sdxl import SDXL +from .zits import ZITS + +models = { + LaMa.name: LaMa, + LDM.name: LDM, + ZITS.name: ZITS, + MAT.name: MAT, + FcF.name: FcF, + OpenCV2.name: OpenCV2, + Manga.name: Manga, + MIGAN.name: MIGAN, + SD15.name: SD15, + Anything4.name: Anything4, + RealisticVision14.name: RealisticVision14, + SD2.name: SD2, + PaintByExample.name: PaintByExample, + InstructPix2Pix.name: InstructPix2Pix, + Kandinsky22.name: Kandinsky22, + SDXL.name: SDXL, + PowerPaint.name: PowerPaint, + AnyText.name: AnyText, +} diff --git a/inpaint/model/anytext/__init__.py b/inpaint/model/anytext/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/anytext/anytext_model.py b/inpaint/model/anytext/anytext_model.py new file mode 100644 index 0000000..7a687d5 --- /dev/null +++ b/inpaint/model/anytext/anytext_model.py @@ -0,0 +1,73 @@ +import torch +from huggingface_hub import hf_hub_download + +from inpaint.const import ANYTEXT_NAME +from inpaint.model.anytext.anytext_pipeline import AnyTextPipeline +from inpaint.model.base import DiffusionInpaintModel +from inpaint.model.utils 
import get_torch_dtype, is_local_files_only +from inpaint.schema import InpaintRequest + + +class AnyText(DiffusionInpaintModel): + name = ANYTEXT_NAME + pad_mod = 64 + is_erase_model = False + + @staticmethod + def download(local_files_only=False): + hf_hub_download( + repo_id=ANYTEXT_NAME, + filename="model_index.json", + local_files_only=local_files_only, + ) + ckpt_path = hf_hub_download( + repo_id=ANYTEXT_NAME, + filename="pytorch_model.fp16.safetensors", + local_files_only=local_files_only, + ) + font_path = hf_hub_download( + repo_id=ANYTEXT_NAME, + filename="SourceHanSansSC-Medium.otf", + local_files_only=local_files_only, + ) + return ckpt_path, font_path + + def init_model(self, device, **kwargs): + local_files_only = is_local_files_only(**kwargs) + ckpt_path, font_path = self.download(local_files_only) + use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False)) + self.model = AnyTextPipeline( + ckpt_path=ckpt_path, + font_path=font_path, + device=device, + use_fp16=torch_dtype == torch.float16, + ) + self.callback = kwargs.pop("callback", None) + + def forward(self, image, mask, config: InpaintRequest): + """Input image and output image have same size + image: [H, W, C] RGB + mask: [H, W, 1] 255 means area to inpainting + return: BGR IMAGE + """ + height, width = image.shape[:2] + mask = mask.astype("float32") / 255.0 + masked_image = image * (1 - mask) + + # list of rgb ndarray + results, rtn_code, rtn_warning = self.model( + image=image, + masked_image=masked_image, + prompt=config.prompt, + negative_prompt=config.negative_prompt, + num_inference_steps=config.sd_steps, + strength=config.sd_strength, + guidance_scale=config.sd_guidance_scale, + height=height, + width=width, + seed=config.sd_seed, + sort_priority="y", + callback=self.callback + ) + inpainted_rgb_image = results[0][..., ::-1] + return inpainted_rgb_image diff --git a/inpaint/model/anytext/anytext_pipeline.py b/inpaint/model/anytext/anytext_pipeline.py new file mode 100644 index 0000000..8571728 --- /dev/null +++ b/inpaint/model/anytext/anytext_pipeline.py @@ -0,0 +1,403 @@ +""" +AnyText: Multilingual Visual Text Generation And Editing +Paper: https://arxiv.org/abs/2311.03054 +Code: https://github.com/tyxsspa/AnyText +Copyright (c) Alibaba, Inc. and its affiliates. 
+""" +import os +from pathlib import Path + +from inpaint.model.utils import set_seed +from safetensors.torch import load_file + +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" +import torch +import re +import numpy as np +import cv2 +import einops +from PIL import ImageFont +from inpaint.model.anytext.cldm.model import create_model, load_state_dict +from inpaint.model.anytext.cldm.ddim_hacked import DDIMSampler +from inpaint.model.anytext.utils import ( + check_channels, + draw_glyph, + draw_glyph2, +) + + +BBOX_MAX_NUM = 8 +PLACE_HOLDER = "*" +max_chars = 20 + +ANYTEXT_CFG = os.path.join( + os.path.dirname(os.path.abspath(__file__)), "anytext_sd15.yaml" +) + + +def check_limits(tensor): + float16_min = torch.finfo(torch.float16).min + float16_max = torch.finfo(torch.float16).max + + # 检查张量中是否有值小于float16的最小值或大于float16的最大值 + is_below_min = (tensor < float16_min).any() + is_above_max = (tensor > float16_max).any() + + return is_below_min or is_above_max + + +class AnyTextPipeline: + def __init__(self, ckpt_path, font_path, device, use_fp16=True): + self.cfg_path = ANYTEXT_CFG + self.font_path = font_path + self.use_fp16 = use_fp16 + self.device = device + + self.font = ImageFont.truetype(font_path, size=60) + self.model = create_model( + self.cfg_path, + device=self.device, + use_fp16=self.use_fp16, + ) + if self.use_fp16: + self.model = self.model.half() + if Path(ckpt_path).suffix == ".safetensors": + state_dict = load_file(ckpt_path, device="cpu") + else: + state_dict = load_state_dict(ckpt_path, location="cpu") + self.model.load_state_dict(state_dict, strict=False) + self.model = self.model.eval().to(self.device) + self.ddim_sampler = DDIMSampler(self.model, device=self.device) + + def __call__( + self, + prompt: str, + negative_prompt: str, + image: np.ndarray, + masked_image: np.ndarray, + num_inference_steps: int, + strength: float, + guidance_scale: float, + height: int, + width: int, + seed: int, + sort_priority: str = "y", + callback=None, + ): + """ + + Args: + prompt: + negative_prompt: + image: + masked_image: + num_inference_steps: + strength: + guidance_scale: + height: + width: + seed: + sort_priority: x: left-right, y: top-down + + Returns: + result: list of images in numpy.ndarray format + rst_code: 0: normal -1: error 1:warning + rst_info: string of error or warning + + """ + set_seed(seed) + str_warning = "" + + mode = "text-editing" + revise_pos = False + img_count = 1 + ddim_steps = num_inference_steps + w = width + h = height + strength = strength + cfg_scale = guidance_scale + eta = 0.0 + + prompt, texts = self.modify_prompt(prompt) + if prompt is None and texts is None: + return ( + None, + -1, + "You have input Chinese prompt but the translator is not loaded!", + "", + ) + n_lines = len(texts) + if mode in ["text-generation", "gen"]: + edit_image = np.ones((h, w, 3)) * 127.5 # empty mask image + elif mode in ["text-editing", "edit"]: + if masked_image is None or image is None: + return ( + None, + -1, + "Reference image and position image are needed for text editing!", + "", + ) + if isinstance(image, str): + image = cv2.imread(image)[..., ::-1] + assert image is not None, f"Can't read ori_image image from{image}!" 
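+ # ori_image may arrive as a file path (str), a torch.Tensor, or an np.ndarray; the
+ # surrounding isinstance branches normalize the first two to numpy arrays.
+ # A minimal illustrative call mirroring AnyText.forward above (all values here are
+ # assumptions, not a real call site):
+ #   results, rtn_code, rtn_warning = pipeline(
+ #       prompt='a sign that says "OPEN"', negative_prompt="",
+ #       image=rgb_np, masked_image=rgb_np * (1 - mask01),
+ #       num_inference_steps=30, strength=1.0, guidance_scale=9.0,
+ #       height=rgb_np.shape[0], width=rgb_np.shape[1], seed=42, sort_priority="y",
+ #   )
+ #   # rgb_np: HxWx3 uint8 RGB image; mask01: HxWx1 float mask in [0, 1]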
+ elif isinstance(image, torch.Tensor): + image = image.cpu().numpy() + else: + assert isinstance( + image, np.ndarray + ), f"Unknown format of ori_image: {type(image)}" + edit_image = image.clip(1, 255) # for mask reason + edit_image = check_channels(edit_image) + # edit_image = resize_image( + # edit_image, max_length=768 + # ) # make w h multiple of 64, resize if w or h > max_length + h, w = edit_image.shape[:2] # change h, w by input ref_img + # preprocess pos_imgs(if numpy, make sure it's white pos in black bg) + if masked_image is None: + pos_imgs = np.zeros((w, h, 1)) + if isinstance(masked_image, str): + masked_image = cv2.imread(masked_image)[..., ::-1] + assert ( + masked_image is not None + ), f"Can't read draw_pos image from{masked_image}!" + pos_imgs = 255 - masked_image + elif isinstance(masked_image, torch.Tensor): + pos_imgs = masked_image.cpu().numpy() + else: + assert isinstance( + masked_image, np.ndarray + ), f"Unknown format of draw_pos: {type(masked_image)}" + pos_imgs = 255 - masked_image + pos_imgs = pos_imgs[..., 0:1] + pos_imgs = cv2.convertScaleAbs(pos_imgs) + _, pos_imgs = cv2.threshold(pos_imgs, 254, 255, cv2.THRESH_BINARY) + # seprate pos_imgs + pos_imgs = self.separate_pos_imgs(pos_imgs, sort_priority) + if len(pos_imgs) == 0: + pos_imgs = [np.zeros((h, w, 1))] + if len(pos_imgs) < n_lines: + if n_lines == 1 and texts[0] == " ": + pass # text-to-image without text + else: + raise RuntimeError( + f"{n_lines} text line to draw from prompt, not enough mask area({len(pos_imgs)}) on images" + ) + elif len(pos_imgs) > n_lines: + str_warning = f"Warning: found {len(pos_imgs)} positions that > needed {n_lines} from prompt." + # get pre_pos, poly_list, hint that needed for anytext + pre_pos = [] + poly_list = [] + for input_pos in pos_imgs: + if input_pos.mean() != 0: + input_pos = ( + input_pos[..., np.newaxis] + if len(input_pos.shape) == 2 + else input_pos + ) + poly, pos_img = self.find_polygon(input_pos) + pre_pos += [pos_img / 255.0] + poly_list += [poly] + else: + pre_pos += [np.zeros((h, w, 1))] + poly_list += [None] + np_hint = np.sum(pre_pos, axis=0).clip(0, 1) + # prepare info dict + info = {} + info["glyphs"] = [] + info["gly_line"] = [] + info["positions"] = [] + info["n_lines"] = [len(texts)] * img_count + gly_pos_imgs = [] + for i in range(len(texts)): + text = texts[i] + if len(text) > max_chars: + str_warning = ( + f'"{text}" length > max_chars: {max_chars}, will be cut off...' + ) + text = text[:max_chars] + gly_scale = 2 + if pre_pos[i].mean() != 0: + gly_line = draw_glyph(self.font, text) + glyphs = draw_glyph2( + self.font, + text, + poly_list[i], + scale=gly_scale, + width=w, + height=h, + add_space=False, + ) + gly_pos_img = cv2.drawContours( + glyphs * 255, [poly_list[i] * gly_scale], 0, (255, 255, 255), 1 + ) + if revise_pos: + resize_gly = cv2.resize( + glyphs, (pre_pos[i].shape[1], pre_pos[i].shape[0]) + ) + new_pos = cv2.morphologyEx( + (resize_gly * 255).astype(np.uint8), + cv2.MORPH_CLOSE, + kernel=np.ones( + (resize_gly.shape[0] // 10, resize_gly.shape[1] // 10), + dtype=np.uint8, + ), + iterations=1, + ) + new_pos = ( + new_pos[..., np.newaxis] if len(new_pos.shape) == 2 else new_pos + ) + contours, _ = cv2.findContours( + new_pos, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE + ) + if len(contours) != 1: + str_warning = f"Fail to revise position {i} to bounding rect, remain position unchanged..." 
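+ # When exactly one contour is found, the else-branch below tightens pre_pos[i] to the
+ # glyph's minimum-area bounding rectangle (cv2.minAreaRect + cv2.boxPoints) so the
+ # position hint follows the rendered text; otherwise the user-drawn mask is kept as is.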
+ else: + rect = cv2.minAreaRect(contours[0]) + poly = np.int0(cv2.boxPoints(rect)) + pre_pos[i] = ( + cv2.drawContours(new_pos, [poly], -1, 255, -1) / 255.0 + ) + gly_pos_img = cv2.drawContours( + glyphs * 255, [poly * gly_scale], 0, (255, 255, 255), 1 + ) + gly_pos_imgs += [gly_pos_img] # for show + else: + glyphs = np.zeros((h * gly_scale, w * gly_scale, 1)) + gly_line = np.zeros((80, 512, 1)) + gly_pos_imgs += [ + np.zeros((h * gly_scale, w * gly_scale, 1)) + ] # for show + pos = pre_pos[i] + info["glyphs"] += [self.arr2tensor(glyphs, img_count)] + info["gly_line"] += [self.arr2tensor(gly_line, img_count)] + info["positions"] += [self.arr2tensor(pos, img_count)] + # get masked_x + masked_img = ((edit_image.astype(np.float32) / 127.5) - 1.0) * (1 - np_hint) + masked_img = np.transpose(masked_img, (2, 0, 1)) + masked_img = torch.from_numpy(masked_img.copy()).float().to(self.device) + if self.use_fp16: + masked_img = masked_img.half() + encoder_posterior = self.model.encode_first_stage(masked_img[None, ...]) + masked_x = self.model.get_first_stage_encoding(encoder_posterior).detach() + if self.use_fp16: + masked_x = masked_x.half() + info["masked_x"] = torch.cat([masked_x for _ in range(img_count)], dim=0) + + hint = self.arr2tensor(np_hint, img_count) + cond = self.model.get_learned_conditioning( + dict( + c_concat=[hint], + c_crossattn=[[prompt] * img_count], + text_info=info, + ) + ) + un_cond = self.model.get_learned_conditioning( + dict( + c_concat=[hint], + c_crossattn=[[negative_prompt] * img_count], + text_info=info, + ) + ) + shape = (4, h // 8, w // 8) + self.model.control_scales = [strength] * 13 + samples, intermediates = self.ddim_sampler.sample( + ddim_steps, + img_count, + shape, + cond, + verbose=False, + eta=eta, + unconditional_guidance_scale=cfg_scale, + unconditional_conditioning=un_cond, + callback=callback + ) + if self.use_fp16: + samples = samples.half() + x_samples = self.model.decode_first_stage(samples) + x_samples = ( + (einops.rearrange(x_samples, "b c h w -> b h w c") * 127.5 + 127.5) + .cpu() + .numpy() + .clip(0, 255) + .astype(np.uint8) + ) + results = [x_samples[i] for i in range(img_count)] + # if ( + # mode == "edit" and False + # ): # replace backgound in text editing but not ideal yet + # results = [r * np_hint + edit_image * (1 - np_hint) for r in results] + # results = [r.clip(0, 255).astype(np.uint8) for r in results] + # if len(gly_pos_imgs) > 0 and show_debug: + # glyph_bs = np.stack(gly_pos_imgs, axis=2) + # glyph_img = np.sum(glyph_bs, axis=2) * 255 + # glyph_img = glyph_img.clip(0, 255).astype(np.uint8) + # results += [np.repeat(glyph_img, 3, axis=2)] + rst_code = 1 if str_warning else 0 + return results, rst_code, str_warning + + def modify_prompt(self, prompt): + prompt = prompt.replace("“", '"') + prompt = prompt.replace("”", '"') + p = '"(.*?)"' + strs = re.findall(p, prompt) + if len(strs) == 0: + strs = [" "] + else: + for s in strs: + prompt = prompt.replace(f'"{s}"', f" {PLACE_HOLDER} ", 1) + # if self.is_chinese(prompt): + # if self.trans_pipe is None: + # return None, None + # old_prompt = prompt + # prompt = self.trans_pipe(input=prompt + " .")["translation"][:-1] + # print(f"Translate: {old_prompt} --> {prompt}") + return prompt, strs + + # def is_chinese(self, text): + # text = checker._clean_text(text) + # for char in text: + # cp = ord(char) + # if checker._is_chinese_char(cp): + # return True + # return False + + def separate_pos_imgs(self, img, sort_priority, gap=102): + num_labels, labels, stats, centroids = 
cv2.connectedComponentsWithStats(img) + components = [] + for label in range(1, num_labels): + component = np.zeros_like(img) + component[labels == label] = 255 + components.append((component, centroids[label])) + if sort_priority == "y": + fir, sec = 1, 0 # top-down first + elif sort_priority == "x": + fir, sec = 0, 1 # left-right first + components.sort(key=lambda c: (c[1][fir] // gap, c[1][sec] // gap)) + sorted_components = [c[0] for c in components] + return sorted_components + + def find_polygon(self, image, min_rect=False): + contours, hierarchy = cv2.findContours( + image, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE + ) + max_contour = max(contours, key=cv2.contourArea) # get contour with max area + if min_rect: + # get minimum enclosing rectangle + rect = cv2.minAreaRect(max_contour) + poly = np.int0(cv2.boxPoints(rect)) + else: + # get approximate polygon + epsilon = 0.01 * cv2.arcLength(max_contour, True) + poly = cv2.approxPolyDP(max_contour, epsilon, True) + n, _, xy = poly.shape + poly = poly.reshape(n, xy) + cv2.drawContours(image, [poly], -1, 255, -1) + return poly, image + + def arr2tensor(self, arr, bs): + arr = np.transpose(arr, (2, 0, 1)) + _arr = torch.from_numpy(arr.copy()).float().to(self.device) + if self.use_fp16: + _arr = _arr.half() + _arr = torch.stack([_arr for _ in range(bs)], dim=0) + return _arr diff --git a/inpaint/model/anytext/anytext_sd15.yaml b/inpaint/model/anytext/anytext_sd15.yaml new file mode 100644 index 0000000..d727594 --- /dev/null +++ b/inpaint/model/anytext/anytext_sd15.yaml @@ -0,0 +1,99 @@ +model: + target: iopaint.model.anytext.cldm.cldm.ControlLDM + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "img" + cond_stage_key: "caption" + control_key: "hint" + glyph_key: "glyphs" + position_key: "positions" + image_size: 64 + channels: 4 + cond_stage_trainable: true # need be true when embedding_manager is valid + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + only_mid_control: False + loss_alpha: 0 # perceptual loss, 0.003 + loss_beta: 0 # ctc loss + latin_weight: 1.0 # latin text line may need smaller weigth + with_step_weight: true + use_vae_upsample: true + embedding_manager_config: + target: iopaint.model.anytext.cldm.embedding_manager.EmbeddingManager + params: + valid: true # v6 + emb_type: ocr # ocr, vit, conv + glyph_channels: 1 + position_channels: 1 + add_pos: false + placeholder_string: '*' + + control_stage_config: + target: iopaint.model.anytext.cldm.cldm.ControlNet + params: + image_size: 32 # unused + in_channels: 4 + model_channels: 320 + glyph_channels: 1 + position_channels: 1 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + unet_config: + target: iopaint.model.anytext.cldm.cldm.ControlledUnetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: iopaint.model.anytext.ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + 
out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: iopaint.model.anytext.ldm.modules.encoders.modules.FrozenCLIPEmbedderT3 + params: + version: openai/clip-vit-large-patch14 + use_vision: false # v6 diff --git a/inpaint/model/anytext/cldm/__init__.py b/inpaint/model/anytext/cldm/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/anytext/cldm/cldm.py b/inpaint/model/anytext/cldm/cldm.py new file mode 100644 index 0000000..ad9692a --- /dev/null +++ b/inpaint/model/anytext/cldm/cldm.py @@ -0,0 +1,630 @@ +import os +from pathlib import Path + +import einops +import torch +import torch as th +import torch.nn as nn +import copy +from easydict import EasyDict as edict + +from iopaint.model.anytext.ldm.modules.diffusionmodules.util import ( + conv_nd, + linear, + zero_module, + timestep_embedding, +) + +from einops import rearrange, repeat +from iopaint.model.anytext.ldm.modules.attention import SpatialTransformer +from iopaint.model.anytext.ldm.modules.diffusionmodules.openaimodel import UNetModel, TimestepEmbedSequential, ResBlock, Downsample, AttentionBlock +from iopaint.model.anytext.ldm.models.diffusion.ddpm import LatentDiffusion +from iopaint.model.anytext.ldm.util import log_txt_as_img, exists, instantiate_from_config +from iopaint.model.anytext.ldm.models.diffusion.ddim import DDIMSampler +from iopaint.model.anytext.ldm.modules.distributions.distributions import DiagonalGaussianDistribution +from .recognizer import TextRecognizer, create_predictor + +CURRENT_DIR = Path(os.path.dirname(os.path.abspath(__file__))) + + +def count_parameters(model): + return sum(p.numel() for p in model.parameters() if p.requires_grad) + + +class ControlledUnetModel(UNetModel): + def forward(self, x, timesteps=None, context=None, control=None, only_mid_control=False, **kwargs): + hs = [] + with torch.no_grad(): + t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) + if self.use_fp16: + t_emb = t_emb.half() + emb = self.time_embed(t_emb) + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb, context) + hs.append(h) + h = self.middle_block(h, emb, context) + + if control is not None: + h += control.pop() + + for i, module in enumerate(self.output_blocks): + if only_mid_control or control is None: + h = torch.cat([h, hs.pop()], dim=1) + else: + h = torch.cat([h, hs.pop() + control.pop()], dim=1) + h = module(h, emb, context) + + h = h.type(x.dtype) + return self.out(h) + + +class ControlNet(nn.Module): + def __init__( + self, + image_size, + in_channels, + model_channels, + glyph_channels, + position_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + use_checkpoint=False, + use_fp16=False, + num_heads=-1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + use_spatial_transformer=False, # custom transformer support + transformer_depth=1, # custom transformer support + context_dim=None, # custom transformer support + n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model + legacy=True, + disable_self_attentions=None, + num_attention_blocks=None, + disable_middle_self_attn=False, + use_linear_in_transformer=False, + ): + super().__init__() + if use_spatial_transformer: + assert context_dim 
is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...' + + if context_dim is not None: + assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...' + from omegaconf.listconfig import ListConfig + if type(context_dim) == ListConfig: + context_dim = list(context_dim) + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + if num_heads == -1: + assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set' + + if num_head_channels == -1: + assert num_heads != -1, 'Either num_heads or num_head_channels has to be set' + self.dims = dims + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + if isinstance(num_res_blocks, int): + self.num_res_blocks = len(channel_mult) * [num_res_blocks] + else: + if len(num_res_blocks) != len(channel_mult): + raise ValueError("provide num_res_blocks either as an int (globally constant) or " + "as a list/tuple (per-level) with the same length as channel_mult") + self.num_res_blocks = num_res_blocks + if disable_self_attentions is not None: + # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not + assert len(disable_self_attentions) == len(channel_mult) + if num_attention_blocks is not None: + assert len(num_attention_blocks) == len(self.num_res_blocks) + assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks)))) + print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. " + f"This option has LESS priority than attention_resolutions {attention_resolutions}, " + f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, " + f"attention will still not be set.") + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.use_checkpoint = use_checkpoint + self.use_fp16 = use_fp16 + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + self.predict_codebook_ids = n_embed is not None + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + self.input_blocks = nn.ModuleList( + [ + TimestepEmbedSequential( + conv_nd(dims, in_channels, model_channels, 3, padding=1) + ) + ] + ) + self.zero_convs = nn.ModuleList([self.make_zero_conv(model_channels)]) + + self.glyph_block = TimestepEmbedSequential( + conv_nd(dims, glyph_channels, 8, 3, padding=1), + nn.SiLU(), + conv_nd(dims, 8, 8, 3, padding=1), + nn.SiLU(), + conv_nd(dims, 8, 16, 3, padding=1, stride=2), + nn.SiLU(), + conv_nd(dims, 16, 16, 3, padding=1), + nn.SiLU(), + conv_nd(dims, 16, 32, 3, padding=1, stride=2), + nn.SiLU(), + conv_nd(dims, 32, 32, 3, padding=1), + nn.SiLU(), + conv_nd(dims, 32, 96, 3, padding=1, stride=2), + nn.SiLU(), + conv_nd(dims, 96, 96, 3, padding=1), + nn.SiLU(), + conv_nd(dims, 96, 256, 3, padding=1, stride=2), + nn.SiLU(), + ) + + self.position_block = TimestepEmbedSequential( + conv_nd(dims, position_channels, 8, 3, padding=1), + nn.SiLU(), + conv_nd(dims, 8, 8, 3, padding=1), + nn.SiLU(), + conv_nd(dims, 8, 16, 3, padding=1, stride=2), + nn.SiLU(), + conv_nd(dims, 16, 16, 3, padding=1), + nn.SiLU(), + 
conv_nd(dims, 16, 32, 3, padding=1, stride=2), + nn.SiLU(), + conv_nd(dims, 32, 32, 3, padding=1), + nn.SiLU(), + conv_nd(dims, 32, 64, 3, padding=1, stride=2), + nn.SiLU(), + ) + + self.fuse_block = zero_module(conv_nd(dims, 256+64+4, model_channels, 3, padding=1)) + + self._feature_size = model_channels + input_block_chans = [model_channels] + ch = model_channels + ds = 1 + for level, mult in enumerate(channel_mult): + for nr in range(self.num_res_blocks[level]): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=mult * model_channels, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = mult * model_channels + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + if exists(disable_self_attentions): + disabled_sa = disable_self_attentions[level] + else: + disabled_sa = False + + if not exists(num_attention_blocks) or nr < num_attention_blocks[level]: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else SpatialTransformer( + ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, + disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint + ) + ) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self.zero_convs.append(self.make_zero_conv(ch)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) + if resblock_updown + else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch + ) + ) + ) + ch = out_ch + input_block_chans.append(ch) + self.zero_convs.append(self.make_zero_conv(ch)) + ds *= 2 + self._feature_size += ch + + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + # num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else SpatialTransformer( # always uses a self-attn + ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, + disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint + ), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self.middle_block_out = self.make_zero_conv(ch) + self._feature_size += ch + + def make_zero_conv(self, channels): + return TimestepEmbedSequential(zero_module(conv_nd(self.dims, channels, channels, 1, padding=0))) + + def forward(self, x, 
hint, text_info, timesteps, context, **kwargs): + t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) + if self.use_fp16: + t_emb = t_emb.half() + emb = self.time_embed(t_emb) + + # guided_hint from text_info + B, C, H, W = x.shape + glyphs = torch.cat(text_info['glyphs'], dim=1).sum(dim=1, keepdim=True) + positions = torch.cat(text_info['positions'], dim=1).sum(dim=1, keepdim=True) + enc_glyph = self.glyph_block(glyphs, emb, context) + enc_pos = self.position_block(positions, emb, context) + guided_hint = self.fuse_block(torch.cat([enc_glyph, enc_pos, text_info['masked_x']], dim=1)) + + outs = [] + + h = x.type(self.dtype) + for module, zero_conv in zip(self.input_blocks, self.zero_convs): + if guided_hint is not None: + h = module(h, emb, context) + h += guided_hint + guided_hint = None + else: + h = module(h, emb, context) + outs.append(zero_conv(h, emb, context)) + + h = self.middle_block(h, emb, context) + outs.append(self.middle_block_out(h, emb, context)) + + return outs + + +class ControlLDM(LatentDiffusion): + + def __init__(self, control_stage_config, control_key, glyph_key, position_key, only_mid_control, loss_alpha=0, loss_beta=0, with_step_weight=False, use_vae_upsample=False, latin_weight=1.0, embedding_manager_config=None, *args, **kwargs): + self.use_fp16 = kwargs.pop('use_fp16', False) + super().__init__(*args, **kwargs) + self.control_model = instantiate_from_config(control_stage_config) + self.control_key = control_key + self.glyph_key = glyph_key + self.position_key = position_key + self.only_mid_control = only_mid_control + self.control_scales = [1.0] * 13 + self.loss_alpha = loss_alpha + self.loss_beta = loss_beta + self.with_step_weight = with_step_weight + self.use_vae_upsample = use_vae_upsample + self.latin_weight = latin_weight + + if embedding_manager_config is not None and embedding_manager_config.params.valid: + self.embedding_manager = self.instantiate_embedding_manager(embedding_manager_config, self.cond_stage_model) + for param in self.embedding_manager.embedding_parameters(): + param.requires_grad = True + else: + self.embedding_manager = None + if self.loss_alpha > 0 or self.loss_beta > 0 or self.embedding_manager: + if embedding_manager_config.params.emb_type == 'ocr': + self.text_predictor = create_predictor().eval() + args = edict() + args.rec_image_shape = "3, 48, 320" + args.rec_batch_num = 6 + args.rec_char_dict_path = str(CURRENT_DIR.parent / "ocr_recog" / "ppocr_keys_v1.txt") + args.use_fp16 = self.use_fp16 + self.cn_recognizer = TextRecognizer(args, self.text_predictor) + for param in self.text_predictor.parameters(): + param.requires_grad = False + if self.embedding_manager: + self.embedding_manager.recog = self.cn_recognizer + + @torch.no_grad() + def get_input(self, batch, k, bs=None, *args, **kwargs): + if self.embedding_manager is None: # fill in full caption + self.fill_caption(batch) + x, c, mx = super().get_input(batch, self.first_stage_key, mask_k='masked_img', *args, **kwargs) + control = batch[self.control_key] # for log_images and loss_alpha, not real control + if bs is not None: + control = control[:bs] + control = control.to(self.device) + control = einops.rearrange(control, 'b h w c -> b c h w') + control = control.to(memory_format=torch.contiguous_format).float() + + inv_mask = batch['inv_mask'] + if bs is not None: + inv_mask = inv_mask[:bs] + inv_mask = inv_mask.to(self.device) + inv_mask = einops.rearrange(inv_mask, 'b h w c -> b c h w') + inv_mask = 
inv_mask.to(memory_format=torch.contiguous_format).float() + + glyphs = batch[self.glyph_key] + gly_line = batch['gly_line'] + positions = batch[self.position_key] + n_lines = batch['n_lines'] + language = batch['language'] + texts = batch['texts'] + assert len(glyphs) == len(positions) + for i in range(len(glyphs)): + if bs is not None: + glyphs[i] = glyphs[i][:bs] + gly_line[i] = gly_line[i][:bs] + positions[i] = positions[i][:bs] + n_lines = n_lines[:bs] + glyphs[i] = glyphs[i].to(self.device) + gly_line[i] = gly_line[i].to(self.device) + positions[i] = positions[i].to(self.device) + glyphs[i] = einops.rearrange(glyphs[i], 'b h w c -> b c h w') + gly_line[i] = einops.rearrange(gly_line[i], 'b h w c -> b c h w') + positions[i] = einops.rearrange(positions[i], 'b h w c -> b c h w') + glyphs[i] = glyphs[i].to(memory_format=torch.contiguous_format).float() + gly_line[i] = gly_line[i].to(memory_format=torch.contiguous_format).float() + positions[i] = positions[i].to(memory_format=torch.contiguous_format).float() + info = {} + info['glyphs'] = glyphs + info['positions'] = positions + info['n_lines'] = n_lines + info['language'] = language + info['texts'] = texts + info['img'] = batch['img'] # nhwc, (-1,1) + info['masked_x'] = mx + info['gly_line'] = gly_line + info['inv_mask'] = inv_mask + return x, dict(c_crossattn=[c], c_concat=[control], text_info=info) + + def apply_model(self, x_noisy, t, cond, *args, **kwargs): + assert isinstance(cond, dict) + diffusion_model = self.model.diffusion_model + _cond = torch.cat(cond['c_crossattn'], 1) + _hint = torch.cat(cond['c_concat'], 1) + if self.use_fp16: + x_noisy = x_noisy.half() + control = self.control_model(x=x_noisy, timesteps=t, context=_cond, hint=_hint, text_info=cond['text_info']) + control = [c * scale for c, scale in zip(control, self.control_scales)] + eps = diffusion_model(x=x_noisy, timesteps=t, context=_cond, control=control, only_mid_control=self.only_mid_control) + + return eps + + def instantiate_embedding_manager(self, config, embedder): + model = instantiate_from_config(config, embedder=embedder) + return model + + @torch.no_grad() + def get_unconditional_conditioning(self, N): + return self.get_learned_conditioning(dict(c_crossattn=[[""] * N], text_info=None)) + + def get_learned_conditioning(self, c): + if self.cond_stage_forward is None: + if hasattr(self.cond_stage_model, 'encode') and callable(self.cond_stage_model.encode): + if self.embedding_manager is not None and c['text_info'] is not None: + self.embedding_manager.encode_text(c['text_info']) + if isinstance(c, dict): + cond_txt = c['c_crossattn'][0] + else: + cond_txt = c + if self.embedding_manager is not None: + cond_txt = self.cond_stage_model.encode(cond_txt, embedding_manager=self.embedding_manager) + else: + cond_txt = self.cond_stage_model.encode(cond_txt) + if isinstance(c, dict): + c['c_crossattn'][0] = cond_txt + else: + c = cond_txt + if isinstance(c, DiagonalGaussianDistribution): + c = c.mode() + else: + c = self.cond_stage_model(c) + else: + assert hasattr(self.cond_stage_model, self.cond_stage_forward) + c = getattr(self.cond_stage_model, self.cond_stage_forward)(c) + return c + + def fill_caption(self, batch, place_holder='*'): + bs = len(batch['n_lines']) + cond_list = copy.deepcopy(batch[self.cond_stage_key]) + for i in range(bs): + n_lines = batch['n_lines'][i] + if n_lines == 0: + continue + cur_cap = cond_list[i] + for j in range(n_lines): + r_txt = batch['texts'][j][i] + cur_cap = cur_cap.replace(place_holder, f'"{r_txt}"', 1) + cond_list[i] = 
cur_cap + batch[self.cond_stage_key] = cond_list + + @torch.no_grad() + def log_images(self, batch, N=4, n_row=2, sample=False, ddim_steps=50, ddim_eta=0.0, return_keys=None, + quantize_denoised=True, inpaint=True, plot_denoise_rows=False, plot_progressive_rows=True, + plot_diffusion_rows=False, unconditional_guidance_scale=9.0, unconditional_guidance_label=None, + use_ema_scope=True, + **kwargs): + use_ddim = ddim_steps is not None + + log = dict() + z, c = self.get_input(batch, self.first_stage_key, bs=N) + if self.cond_stage_trainable: + with torch.no_grad(): + c = self.get_learned_conditioning(c) + c_crossattn = c["c_crossattn"][0][:N] + c_cat = c["c_concat"][0][:N] + text_info = c["text_info"] + text_info['glyphs'] = [i[:N] for i in text_info['glyphs']] + text_info['gly_line'] = [i[:N] for i in text_info['gly_line']] + text_info['positions'] = [i[:N] for i in text_info['positions']] + text_info['n_lines'] = text_info['n_lines'][:N] + text_info['masked_x'] = text_info['masked_x'][:N] + text_info['img'] = text_info['img'][:N] + + N = min(z.shape[0], N) + n_row = min(z.shape[0], n_row) + log["reconstruction"] = self.decode_first_stage(z) + log["masked_image"] = self.decode_first_stage(text_info['masked_x']) + log["control"] = c_cat * 2.0 - 1.0 + log["img"] = text_info['img'].permute(0, 3, 1, 2) # log source image if needed + # get glyph + glyph_bs = torch.stack(text_info['glyphs']) + glyph_bs = torch.sum(glyph_bs, dim=0) * 2.0 - 1.0 + log["glyph"] = torch.nn.functional.interpolate(glyph_bs, size=(512, 512), mode='bilinear', align_corners=True,) + # fill caption + if not self.embedding_manager: + self.fill_caption(batch) + captions = batch[self.cond_stage_key] + log["conditioning"] = log_txt_as_img((512, 512), captions, size=16) + + if plot_diffusion_rows: + # get diffusion row + diffusion_row = list() + z_start = z[:n_row] + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), '1 -> b', b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(z_start) + z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) + diffusion_row.append(self.decode_first_stage(z_noisy)) + + diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W + diffusion_grid = rearrange(diffusion_row, 'n b c h w -> b n c h w') + diffusion_grid = rearrange(diffusion_grid, 'b n c h w -> (b n) c h w') + diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0]) + log["diffusion_row"] = diffusion_grid + + if sample: + # get denoise row + samples, z_denoise_row = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c], "text_info": text_info}, + batch_size=N, ddim=use_ddim, + ddim_steps=ddim_steps, eta=ddim_eta) + x_samples = self.decode_first_stage(samples) + log["samples"] = x_samples + if plot_denoise_rows: + denoise_grid = self._get_denoise_row_from_list(z_denoise_row) + log["denoise_row"] = denoise_grid + + if unconditional_guidance_scale > 1.0: + uc_cross = self.get_unconditional_conditioning(N) + uc_cat = c_cat # torch.zeros_like(c_cat) + uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross['c_crossattn'][0]], "text_info": text_info} + samples_cfg, tmps = self.sample_log(cond={"c_concat": [c_cat], "c_crossattn": [c_crossattn], "text_info": text_info}, + batch_size=N, ddim=use_ddim, + ddim_steps=ddim_steps, eta=ddim_eta, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=uc_full, + ) + x_samples_cfg = self.decode_first_stage(samples_cfg) + 
log[f"samples_cfg_scale_{unconditional_guidance_scale:.2f}"] = x_samples_cfg + pred_x0 = False # wether log pred_x0 + if pred_x0: + for idx in range(len(tmps['pred_x0'])): + pred_x0 = self.decode_first_stage(tmps['pred_x0'][idx]) + log[f"pred_x0_{tmps['index'][idx]}"] = pred_x0 + + return log + + @torch.no_grad() + def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs): + ddim_sampler = DDIMSampler(self) + b, c, h, w = cond["c_concat"][0].shape + shape = (self.channels, h // 8, w // 8) + samples, intermediates = ddim_sampler.sample(ddim_steps, batch_size, shape, cond, verbose=False, log_every_t=5, **kwargs) + return samples, intermediates + + def configure_optimizers(self): + lr = self.learning_rate + params = list(self.control_model.parameters()) + if self.embedding_manager: + params += list(self.embedding_manager.embedding_parameters()) + if not self.sd_locked: + # params += list(self.model.diffusion_model.input_blocks.parameters()) + # params += list(self.model.diffusion_model.middle_block.parameters()) + params += list(self.model.diffusion_model.output_blocks.parameters()) + params += list(self.model.diffusion_model.out.parameters()) + if self.unlockKV: + nCount = 0 + for name, param in self.model.diffusion_model.named_parameters(): + if 'attn2.to_k' in name or 'attn2.to_v' in name: + params += [param] + nCount += 1 + print(f'Cross attention is unlocked, and {nCount} Wk or Wv are added to potimizers!!!') + + opt = torch.optim.AdamW(params, lr=lr) + return opt + + def low_vram_shift(self, is_diffusing): + if is_diffusing: + self.model = self.model.cuda() + self.control_model = self.control_model.cuda() + self.first_stage_model = self.first_stage_model.cpu() + self.cond_stage_model = self.cond_stage_model.cpu() + else: + self.model = self.model.cpu() + self.control_model = self.control_model.cpu() + self.first_stage_model = self.first_stage_model.cuda() + self.cond_stage_model = self.cond_stage_model.cuda() diff --git a/inpaint/model/anytext/cldm/ddim_hacked.py b/inpaint/model/anytext/cldm/ddim_hacked.py new file mode 100644 index 0000000..b23a883 --- /dev/null +++ b/inpaint/model/anytext/cldm/ddim_hacked.py @@ -0,0 +1,486 @@ +"""SAMPLING ONLY.""" + +import torch +import numpy as np +from tqdm import tqdm + +from iopaint.model.anytext.ldm.modules.diffusionmodules.util import ( + make_ddim_sampling_parameters, + make_ddim_timesteps, + noise_like, + extract_into_tensor, +) + + +class DDIMSampler(object): + def __init__(self, model, device, schedule="linear", **kwargs): + super().__init__() + self.device = device + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.schedule = schedule + + def register_buffer(self, name, attr): + if type(attr) == torch.Tensor: + if attr.device != torch.device(self.device): + attr = attr.to(torch.device(self.device)) + setattr(self, name, attr) + + def make_schedule( + self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0.0, verbose=True + ): + self.ddim_timesteps = make_ddim_timesteps( + ddim_discr_method=ddim_discretize, + num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps, + verbose=verbose, + ) + alphas_cumprod = self.model.alphas_cumprod + assert ( + alphas_cumprod.shape[0] == self.ddpm_num_timesteps + ), "alphas have to be defined for each timestep" + to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.device) + + self.register_buffer("betas", to_torch(self.model.betas)) + self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod)) + self.register_buffer( + 
"alphas_cumprod_prev", to_torch(self.model.alphas_cumprod_prev) + ) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer( + "sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod.cpu())) + ) + self.register_buffer( + "sqrt_one_minus_alphas_cumprod", + to_torch(np.sqrt(1.0 - alphas_cumprod.cpu())), + ) + self.register_buffer( + "log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod.cpu())) + ) + self.register_buffer( + "sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod.cpu())) + ) + self.register_buffer( + "sqrt_recipm1_alphas_cumprod", + to_torch(np.sqrt(1.0 / alphas_cumprod.cpu() - 1)), + ) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters( + alphacums=alphas_cumprod.cpu(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta, + verbose=verbose, + ) + self.register_buffer("ddim_sigmas", ddim_sigmas) + self.register_buffer("ddim_alphas", ddim_alphas) + self.register_buffer("ddim_alphas_prev", ddim_alphas_prev) + self.register_buffer("ddim_sqrt_one_minus_alphas", np.sqrt(1.0 - ddim_alphas)) + sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt( + (1 - self.alphas_cumprod_prev) + / (1 - self.alphas_cumprod) + * (1 - self.alphas_cumprod / self.alphas_cumprod_prev) + ) + self.register_buffer( + "ddim_sigmas_for_original_num_steps", sigmas_for_original_sampling_steps + ) + + @torch.no_grad() + def sample( + self, + S, + batch_size, + shape, + conditioning=None, + callback=None, + normals_sequence=None, + img_callback=None, + quantize_x0=False, + eta=0.0, + mask=None, + x0=None, + temperature=1.0, + noise_dropout=0.0, + score_corrector=None, + corrector_kwargs=None, + verbose=True, + x_T=None, + log_every_t=100, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
+ dynamic_threshold=None, + ucg_schedule=None, + **kwargs, + ): + if conditioning is not None: + if isinstance(conditioning, dict): + ctmp = conditioning[list(conditioning.keys())[0]] + while isinstance(ctmp, list): + ctmp = ctmp[0] + cbs = ctmp.shape[0] + if cbs != batch_size: + print( + f"Warning: Got {cbs} conditionings but batch-size is {batch_size}" + ) + + elif isinstance(conditioning, list): + for ctmp in conditioning: + if ctmp.shape[0] != batch_size: + print( + f"Warning: Got {cbs} conditionings but batch-size is {batch_size}" + ) + + else: + if conditioning.shape[0] != batch_size: + print( + f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}" + ) + + self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + print(f"Data shape for DDIM sampling is {size}, eta {eta}") + + samples, intermediates = self.ddim_sampling( + conditioning, + size, + callback=callback, + img_callback=img_callback, + quantize_denoised=quantize_x0, + mask=mask, + x0=x0, + ddim_use_original_steps=False, + noise_dropout=noise_dropout, + temperature=temperature, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + x_T=x_T, + log_every_t=log_every_t, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold, + ucg_schedule=ucg_schedule, + ) + return samples, intermediates + + @torch.no_grad() + def ddim_sampling( + self, + cond, + shape, + x_T=None, + ddim_use_original_steps=False, + callback=None, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + img_callback=None, + log_every_t=100, + temperature=1.0, + noise_dropout=0.0, + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + dynamic_threshold=None, + ucg_schedule=None, + ): + device = self.model.betas.device + b = shape[0] + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + if timesteps is None: + timesteps = ( + self.ddpm_num_timesteps + if ddim_use_original_steps + else self.ddim_timesteps + ) + elif timesteps is not None and not ddim_use_original_steps: + subset_end = ( + int( + min(timesteps / self.ddim_timesteps.shape[0], 1) + * self.ddim_timesteps.shape[0] + ) + - 1 + ) + timesteps = self.ddim_timesteps[:subset_end] + + intermediates = {"x_inter": [img], "pred_x0": [img]} + time_range = ( + reversed(range(0, timesteps)) + if ddim_use_original_steps + else np.flip(timesteps) + ) + total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0] + print(f"Running DDIM Sampling with {total_steps} timesteps") + + iterator = tqdm(time_range, desc="DDIM Sampler", total=total_steps) + + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((b,), step, device=device, dtype=torch.long) + + if mask is not None: + assert x0 is not None + img_orig = self.model.q_sample( + x0, ts + ) # TODO: deterministic forward pass? 
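+ # Inpainting-style blending: where mask == 1 the latent is re-injected each step from
+ # the forward-noised original q(x0, ts), while the mask == 0 region keeps being
+ # denoised freely; the next line computes img = mask * q(x0, ts) + (1 - mask) * img.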
+ img = img_orig * mask + (1.0 - mask) * img + + if ucg_schedule is not None: + assert len(ucg_schedule) == len(time_range) + unconditional_guidance_scale = ucg_schedule[i] + + outs = self.p_sample_ddim( + img, + cond, + ts, + index=index, + use_original_steps=ddim_use_original_steps, + quantize_denoised=quantize_denoised, + temperature=temperature, + noise_dropout=noise_dropout, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold, + ) + img, pred_x0 = outs + if callback: + callback(None, i, None, None) + if img_callback: + img_callback(pred_x0, i) + + if index % log_every_t == 0 or index == total_steps - 1: + intermediates["x_inter"].append(img) + intermediates["pred_x0"].append(pred_x0) + + return img, intermediates + + @torch.no_grad() + def p_sample_ddim( + self, + x, + c, + t, + index, + repeat_noise=False, + use_original_steps=False, + quantize_denoised=False, + temperature=1.0, + noise_dropout=0.0, + score_corrector=None, + corrector_kwargs=None, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + dynamic_threshold=None, + ): + b, *_, device = *x.shape, x.device + + if unconditional_conditioning is None or unconditional_guidance_scale == 1.0: + model_output = self.model.apply_model(x, t, c) + else: + model_t = self.model.apply_model(x, t, c) + model_uncond = self.model.apply_model(x, t, unconditional_conditioning) + model_output = model_uncond + unconditional_guidance_scale * ( + model_t - model_uncond + ) + + if self.model.parameterization == "v": + e_t = self.model.predict_eps_from_z_and_v(x, t, model_output) + else: + e_t = model_output + + if score_corrector is not None: + assert self.model.parameterization == "eps", "not implemented" + e_t = score_corrector.modify_score( + self.model, e_t, x, t, c, **corrector_kwargs + ) + + alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas + alphas_prev = ( + self.model.alphas_cumprod_prev + if use_original_steps + else self.ddim_alphas_prev + ) + sqrt_one_minus_alphas = ( + self.model.sqrt_one_minus_alphas_cumprod + if use_original_steps + else self.ddim_sqrt_one_minus_alphas + ) + sigmas = ( + self.model.ddim_sigmas_for_original_num_steps + if use_original_steps + else self.ddim_sigmas + ) + # select parameters corresponding to the currently considered timestep + a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) + a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device) + sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) + sqrt_one_minus_at = torch.full( + (b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device + ) + + # current prediction for x_0 + if self.model.parameterization != "v": + pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() + else: + pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output) + + if quantize_denoised: + pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) + + if dynamic_threshold is not None: + raise NotImplementedError() + + # direction pointing to x_t + dir_xt = (1.0 - a_prev - sigma_t**2).sqrt() * e_t + noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature + if noise_dropout > 0.0: + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise + return x_prev, pred_x0 + + @torch.no_grad() + def encode( + self, + x0, + c, + t_enc, + 
use_original_steps=False, + return_intermediates=None, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + callback=None, + ): + timesteps = ( + np.arange(self.ddpm_num_timesteps) + if use_original_steps + else self.ddim_timesteps + ) + num_reference_steps = timesteps.shape[0] + + assert t_enc <= num_reference_steps + num_steps = t_enc + + if use_original_steps: + alphas_next = self.alphas_cumprod[:num_steps] + alphas = self.alphas_cumprod_prev[:num_steps] + else: + alphas_next = self.ddim_alphas[:num_steps] + alphas = torch.tensor(self.ddim_alphas_prev[:num_steps]) + + x_next = x0 + intermediates = [] + inter_steps = [] + for i in tqdm(range(num_steps), desc="Encoding Image"): + t = torch.full( + (x0.shape[0],), timesteps[i], device=self.model.device, dtype=torch.long + ) + if unconditional_guidance_scale == 1.0: + noise_pred = self.model.apply_model(x_next, t, c) + else: + assert unconditional_conditioning is not None + e_t_uncond, noise_pred = torch.chunk( + self.model.apply_model( + torch.cat((x_next, x_next)), + torch.cat((t, t)), + torch.cat((unconditional_conditioning, c)), + ), + 2, + ) + noise_pred = e_t_uncond + unconditional_guidance_scale * ( + noise_pred - e_t_uncond + ) + + xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next + weighted_noise_pred = ( + alphas_next[i].sqrt() + * ((1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt()) + * noise_pred + ) + x_next = xt_weighted + weighted_noise_pred + if ( + return_intermediates + and i % (num_steps // return_intermediates) == 0 + and i < num_steps - 1 + ): + intermediates.append(x_next) + inter_steps.append(i) + elif return_intermediates and i >= num_steps - 2: + intermediates.append(x_next) + inter_steps.append(i) + if callback: + callback(i) + + out = {"x_encoded": x_next, "intermediate_steps": inter_steps} + if return_intermediates: + out.update({"intermediates": intermediates}) + return x_next, out + + @torch.no_grad() + def stochastic_encode(self, x0, t, use_original_steps=False, noise=None): + # fast, but does not allow for exact reconstruction + # t serves as an index to gather the correct alphas + if use_original_steps: + sqrt_alphas_cumprod = self.sqrt_alphas_cumprod + sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod + else: + sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas) + sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas + + if noise is None: + noise = torch.randn_like(x0) + return ( + extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 + + extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise + ) + + @torch.no_grad() + def decode( + self, + x_latent, + cond, + t_start, + unconditional_guidance_scale=1.0, + unconditional_conditioning=None, + use_original_steps=False, + callback=None, + ): + timesteps = ( + np.arange(self.ddpm_num_timesteps) + if use_original_steps + else self.ddim_timesteps + ) + timesteps = timesteps[:t_start] + + time_range = np.flip(timesteps) + total_steps = timesteps.shape[0] + print(f"Running DDIM Sampling with {total_steps} timesteps") + + iterator = tqdm(time_range, desc="Decoding image", total=total_steps) + x_dec = x_latent + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full( + (x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long + ) + x_dec, _ = self.p_sample_ddim( + x_dec, + cond, + ts, + index=index, + use_original_steps=use_original_steps, + unconditional_guidance_scale=unconditional_guidance_scale, + 
unconditional_conditioning=unconditional_conditioning, + ) + if callback: + callback(i) + return x_dec diff --git a/inpaint/model/anytext/cldm/embedding_manager.py b/inpaint/model/anytext/cldm/embedding_manager.py new file mode 100644 index 0000000..6ccf8a9 --- /dev/null +++ b/inpaint/model/anytext/cldm/embedding_manager.py @@ -0,0 +1,165 @@ +''' +Copyright (c) Alibaba, Inc. and its affiliates. +''' +import torch +import torch.nn as nn +import torch.nn.functional as F +from functools import partial +from iopaint.model.anytext.ldm.modules.diffusionmodules.util import conv_nd, linear + + +def get_clip_token_for_string(tokenizer, string): + batch_encoding = tokenizer(string, truncation=True, max_length=77, return_length=True, + return_overflowing_tokens=False, padding="max_length", return_tensors="pt") + tokens = batch_encoding["input_ids"] + assert torch.count_nonzero(tokens - 49407) == 2, f"String '{string}' maps to more than a single token. Please use another string" + return tokens[0, 1] + + +def get_bert_token_for_string(tokenizer, string): + token = tokenizer(string) + assert torch.count_nonzero(token) == 3, f"String '{string}' maps to more than a single token. Please use another string" + token = token[0, 1] + return token + + +def get_clip_vision_emb(encoder, processor, img): + _img = img.repeat(1, 3, 1, 1)*255 + inputs = processor(images=_img, return_tensors="pt") + inputs['pixel_values'] = inputs['pixel_values'].to(img.device) + outputs = encoder(**inputs) + emb = outputs.image_embeds + return emb + + +def get_recog_emb(encoder, img_list): + _img_list = [(img.repeat(1, 3, 1, 1)*255)[0] for img in img_list] + encoder.predictor.eval() + _, preds_neck = encoder.pred_imglist(_img_list, show_debug=False) + return preds_neck + + +def pad_H(x): + _, _, H, W = x.shape + p_top = (W - H) // 2 + p_bot = W - H - p_top + return F.pad(x, (0, 0, p_top, p_bot)) + + +class EncodeNet(nn.Module): + def __init__(self, in_channels, out_channels): + super(EncodeNet, self).__init__() + chan = 16 + n_layer = 4 # downsample + + self.conv1 = conv_nd(2, in_channels, chan, 3, padding=1) + self.conv_list = nn.ModuleList([]) + _c = chan + for i in range(n_layer): + self.conv_list.append(conv_nd(2, _c, _c*2, 3, padding=1, stride=2)) + _c *= 2 + self.conv2 = conv_nd(2, _c, out_channels, 3, padding=1) + self.avgpool = nn.AdaptiveAvgPool2d(1) + self.act = nn.SiLU() + + def forward(self, x): + x = self.act(self.conv1(x)) + for layer in self.conv_list: + x = self.act(layer(x)) + x = self.act(self.conv2(x)) + x = self.avgpool(x) + x = x.view(x.size(0), -1) + return x + + +class EmbeddingManager(nn.Module): + def __init__( + self, + embedder, + valid=True, + glyph_channels=20, + position_channels=1, + placeholder_string='*', + add_pos=False, + emb_type='ocr', + **kwargs + ): + super().__init__() + if hasattr(embedder, 'tokenizer'): # using Stable Diffusion's CLIP encoder + get_token_for_string = partial(get_clip_token_for_string, embedder.tokenizer) + token_dim = 768 + if hasattr(embedder, 'vit'): + assert emb_type == 'vit' + self.get_vision_emb = partial(get_clip_vision_emb, embedder.vit, embedder.processor) + self.get_recog_emb = None + else: # using LDM's BERT encoder + get_token_for_string = partial(get_bert_token_for_string, embedder.tknz_fn) + token_dim = 1280 + self.token_dim = token_dim + self.emb_type = emb_type + + self.add_pos = add_pos + if add_pos: + self.position_encoder = EncodeNet(position_channels, token_dim) + if emb_type == 'ocr': + self.proj = linear(40*64, token_dim) + if emb_type == 'conv': + 
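+            # 'conv' mode: embed the rendered glyph image directly with a small CNN (EncodeNet), instead of OCR features ('ocr') or CLIP vision features ('vit')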
self.glyph_encoder = EncodeNet(glyph_channels, token_dim) + + self.placeholder_token = get_token_for_string(placeholder_string) + + def encode_text(self, text_info): + if self.get_recog_emb is None and self.emb_type == 'ocr': + self.get_recog_emb = partial(get_recog_emb, self.recog) + + gline_list = [] + pos_list = [] + for i in range(len(text_info['n_lines'])): # sample index in a batch + n_lines = text_info['n_lines'][i] + for j in range(n_lines): # line + gline_list += [text_info['gly_line'][j][i:i+1]] + if self.add_pos: + pos_list += [text_info['positions'][j][i:i+1]] + + if len(gline_list) > 0: + if self.emb_type == 'ocr': + recog_emb = self.get_recog_emb(gline_list) + enc_glyph = self.proj(recog_emb.reshape(recog_emb.shape[0], -1)) + elif self.emb_type == 'vit': + enc_glyph = self.get_vision_emb(pad_H(torch.cat(gline_list, dim=0))) + elif self.emb_type == 'conv': + enc_glyph = self.glyph_encoder(pad_H(torch.cat(gline_list, dim=0))) + if self.add_pos: + enc_pos = self.position_encoder(torch.cat(gline_list, dim=0)) + enc_glyph = enc_glyph+enc_pos + + self.text_embs_all = [] + n_idx = 0 + for i in range(len(text_info['n_lines'])): # sample index in a batch + n_lines = text_info['n_lines'][i] + text_embs = [] + for j in range(n_lines): # line + text_embs += [enc_glyph[n_idx:n_idx+1]] + n_idx += 1 + self.text_embs_all += [text_embs] + + def forward( + self, + tokenized_text, + embedded_text, + ): + b, device = tokenized_text.shape[0], tokenized_text.device + for i in range(b): + idx = tokenized_text[i] == self.placeholder_token.to(device) + if sum(idx) > 0: + if i >= len(self.text_embs_all): + print('truncation for log images...') + break + text_emb = torch.cat(self.text_embs_all[i], dim=0) + if sum(idx) != len(text_emb): + print('truncation for long caption...') + embedded_text[i][idx] = text_emb[:sum(idx)] + return embedded_text + + def embedding_parameters(self): + return self.parameters() diff --git a/inpaint/model/anytext/cldm/hack.py b/inpaint/model/anytext/cldm/hack.py new file mode 100644 index 0000000..05afe5f --- /dev/null +++ b/inpaint/model/anytext/cldm/hack.py @@ -0,0 +1,111 @@ +import torch +import einops + +import iopaint.model.anytext.ldm.modules.encoders.modules +import iopaint.model.anytext.ldm.modules.attention + +from transformers import logging +from iopaint.model.anytext.ldm.modules.attention import default + + +def disable_verbosity(): + logging.set_verbosity_error() + print('logging improved.') + return + + +def enable_sliced_attention(): + iopaint.model.anytext.ldm.modules.attention.CrossAttention.forward = _hacked_sliced_attentin_forward + print('Enabled sliced_attention.') + return + + +def hack_everything(clip_skip=0): + disable_verbosity() + iopaint.model.anytext.ldm.modules.encoders.modules.FrozenCLIPEmbedder.forward = _hacked_clip_forward + iopaint.model.anytext.ldm.modules.encoders.modules.FrozenCLIPEmbedder.clip_skip = clip_skip + print('Enabled clip hacks.') + return + + +# Written by Lvmin +def _hacked_clip_forward(self, text): + PAD = self.tokenizer.pad_token_id + EOS = self.tokenizer.eos_token_id + BOS = self.tokenizer.bos_token_id + + def tokenize(t): + return self.tokenizer(t, truncation=False, add_special_tokens=False)["input_ids"] + + def transformer_encode(t): + if self.clip_skip > 1: + rt = self.transformer(input_ids=t, output_hidden_states=True) + return self.transformer.text_model.final_layer_norm(rt.hidden_states[-self.clip_skip]) + else: + return self.transformer(input_ids=t, output_hidden_states=False).last_hidden_state + + def split(x): + 
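+        # split the un-truncated token ids into three 75-token windows; BOS/EOS are added to each window below, giving three full 77-token CLIP contexts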
return x[75 * 0: 75 * 1], x[75 * 1: 75 * 2], x[75 * 2: 75 * 3] + + def pad(x, p, i): + return x[:i] if len(x) >= i else x + [p] * (i - len(x)) + + raw_tokens_list = tokenize(text) + tokens_list = [] + + for raw_tokens in raw_tokens_list: + raw_tokens_123 = split(raw_tokens) + raw_tokens_123 = [[BOS] + raw_tokens_i + [EOS] for raw_tokens_i in raw_tokens_123] + raw_tokens_123 = [pad(raw_tokens_i, PAD, 77) for raw_tokens_i in raw_tokens_123] + tokens_list.append(raw_tokens_123) + + tokens_list = torch.IntTensor(tokens_list).to(self.device) + + feed = einops.rearrange(tokens_list, 'b f i -> (b f) i') + y = transformer_encode(feed) + z = einops.rearrange(y, '(b f) i c -> b (f i) c', f=3) + + return z + + +# Stolen from https://github.com/basujindal/stable-diffusion/blob/main/optimizedSD/splitAttention.py +def _hacked_sliced_attentin_forward(self, x, context=None, mask=None): + h = self.heads + + q = self.to_q(x) + context = default(context, x) + k = self.to_k(context) + v = self.to_v(context) + del context, x + + q, k, v = map(lambda t: einops.rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v)) + + limit = k.shape[0] + att_step = 1 + q_chunks = list(torch.tensor_split(q, limit // att_step, dim=0)) + k_chunks = list(torch.tensor_split(k, limit // att_step, dim=0)) + v_chunks = list(torch.tensor_split(v, limit // att_step, dim=0)) + + q_chunks.reverse() + k_chunks.reverse() + v_chunks.reverse() + sim = torch.zeros(q.shape[0], q.shape[1], v.shape[2], device=q.device) + del k, q, v + for i in range(0, limit, att_step): + q_buffer = q_chunks.pop() + k_buffer = k_chunks.pop() + v_buffer = v_chunks.pop() + sim_buffer = torch.einsum('b i d, b j d -> b i j', q_buffer, k_buffer) * self.scale + + del k_buffer, q_buffer + # attention, what we cannot get enough of, by chunks + + sim_buffer = sim_buffer.softmax(dim=-1) + + sim_buffer = torch.einsum('b i j, b j d -> b i d', sim_buffer, v_buffer) + del v_buffer + sim[i:i + att_step, :, :] = sim_buffer + + del sim_buffer + sim = einops.rearrange(sim, '(b h) n d -> b n (h d)', h=h) + return self.to_out(sim) diff --git a/inpaint/model/anytext/cldm/model.py b/inpaint/model/anytext/cldm/model.py new file mode 100644 index 0000000..688f2ed --- /dev/null +++ b/inpaint/model/anytext/cldm/model.py @@ -0,0 +1,40 @@ +import os +import torch + +from omegaconf import OmegaConf +from iopaint.model.anytext.ldm.util import instantiate_from_config + + +def get_state_dict(d): + return d.get("state_dict", d) + + +def load_state_dict(ckpt_path, location="cpu"): + _, extension = os.path.splitext(ckpt_path) + if extension.lower() == ".safetensors": + import safetensors.torch + + state_dict = safetensors.torch.load_file(ckpt_path, device=location) + else: + state_dict = get_state_dict( + torch.load(ckpt_path, map_location=torch.device(location)) + ) + state_dict = get_state_dict(state_dict) + print(f"Loaded state_dict from [{ckpt_path}]") + return state_dict + + +def create_model(config_path, device, cond_stage_path=None, use_fp16=False): + config = OmegaConf.load(config_path) + # if cond_stage_path: + # config.model.params.cond_stage_config.params.version = ( + # cond_stage_path # use pre-downloaded ckpts, in case blocked + # ) + config.model.params.cond_stage_config.params.device = str(device) + if use_fp16: + config.model.params.use_fp16 = True + config.model.params.control_stage_config.params.use_fp16 = True + config.model.params.unet_config.params.use_fp16 = True + model = instantiate_from_config(config.model).cpu() + print(f"Loaded model config from [{config_path}]") + 
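+    # only the architecture is instantiated here (on CPU); weights are loaded separately, e.g. via load_state_dict() above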
return model diff --git a/inpaint/model/anytext/cldm/recognizer.py b/inpaint/model/anytext/cldm/recognizer.py new file mode 100755 index 0000000..0621512 --- /dev/null +++ b/inpaint/model/anytext/cldm/recognizer.py @@ -0,0 +1,300 @@ +""" +Copyright (c) Alibaba, Inc. and its affiliates. +""" +import os +import cv2 +import numpy as np +import math +import traceback +from easydict import EasyDict as edict +import time +from iopaint.model.anytext.ocr_recog.RecModel import RecModel +import torch +import torch.nn.functional as F + + +def min_bounding_rect(img): + ret, thresh = cv2.threshold(img, 127, 255, 0) + contours, hierarchy = cv2.findContours( + thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE + ) + if len(contours) == 0: + print("Bad contours, using fake bbox...") + return np.array([[0, 0], [100, 0], [100, 100], [0, 100]]) + max_contour = max(contours, key=cv2.contourArea) + rect = cv2.minAreaRect(max_contour) + box = cv2.boxPoints(rect) + box = np.int0(box) + # sort + x_sorted = sorted(box, key=lambda x: x[0]) + left = x_sorted[:2] + right = x_sorted[2:] + left = sorted(left, key=lambda x: x[1]) + (tl, bl) = left + right = sorted(right, key=lambda x: x[1]) + (tr, br) = right + if tl[1] > bl[1]: + (tl, bl) = (bl, tl) + if tr[1] > br[1]: + (tr, br) = (br, tr) + return np.array([tl, tr, br, bl]) + + +def create_predictor(model_dir=None, model_lang="ch", is_onnx=False): + model_file_path = model_dir + if model_file_path is not None and not os.path.exists(model_file_path): + raise ValueError("not find model file path {}".format(model_file_path)) + + if is_onnx: + import onnxruntime as ort + + sess = ort.InferenceSession( + model_file_path, providers=["CPUExecutionProvider"] + ) # 'TensorrtExecutionProvider', 'CUDAExecutionProvider', 'CPUExecutionProvider' + return sess + else: + if model_lang == "ch": + n_class = 6625 + elif model_lang == "en": + n_class = 97 + else: + raise ValueError(f"Unsupported OCR recog model_lang: {model_lang}") + rec_config = edict( + in_channels=3, + backbone=edict( + type="MobileNetV1Enhance", + scale=0.5, + last_conv_stride=[1, 2], + last_pool_type="avg", + ), + neck=edict( + type="SequenceEncoder", + encoder_type="svtr", + dims=64, + depth=2, + hidden_dims=120, + use_guide=True, + ), + head=edict( + type="CTCHead", + fc_decay=0.00001, + out_channels=n_class, + return_feats=True, + ), + ) + + rec_model = RecModel(rec_config) + if model_file_path is not None: + rec_model.load_state_dict(torch.load(model_file_path, map_location="cpu")) + rec_model.eval() + return rec_model.eval() + + +def _check_image_file(path): + img_end = {"jpg", "bmp", "png", "jpeg", "rgb", "tif", "tiff"} + return any([path.lower().endswith(e) for e in img_end]) + + +def get_image_file_list(img_file): + imgs_lists = [] + if img_file is None or not os.path.exists(img_file): + raise Exception("not found any img file in {}".format(img_file)) + if os.path.isfile(img_file) and _check_image_file(img_file): + imgs_lists.append(img_file) + elif os.path.isdir(img_file): + for single_file in os.listdir(img_file): + file_path = os.path.join(img_file, single_file) + if os.path.isfile(file_path) and _check_image_file(file_path): + imgs_lists.append(file_path) + if len(imgs_lists) == 0: + raise Exception("not found any img file in {}".format(img_file)) + imgs_lists = sorted(imgs_lists) + return imgs_lists + + +class TextRecognizer(object): + def __init__(self, args, predictor): + self.rec_image_shape = [int(v) for v in args.rec_image_shape.split(",")] + self.rec_batch_num = args.rec_batch_num + 
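+        # predictor is either a torch.nn.Module or an ONNX InferenceSession; is_onnx (set below) selects the matching inference path in pred_imglist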
self.predictor = predictor + self.chars = self.get_char_dict(args.rec_char_dict_path) + self.char2id = {x: i for i, x in enumerate(self.chars)} + self.is_onnx = not isinstance(self.predictor, torch.nn.Module) + self.use_fp16 = args.use_fp16 + + # img: CHW + def resize_norm_img(self, img, max_wh_ratio): + imgC, imgH, imgW = self.rec_image_shape + assert imgC == img.shape[0] + imgW = int((imgH * max_wh_ratio)) + + h, w = img.shape[1:] + ratio = w / float(h) + if math.ceil(imgH * ratio) > imgW: + resized_w = imgW + else: + resized_w = int(math.ceil(imgH * ratio)) + resized_image = torch.nn.functional.interpolate( + img.unsqueeze(0), + size=(imgH, resized_w), + mode="bilinear", + align_corners=True, + ) + resized_image /= 255.0 + resized_image -= 0.5 + resized_image /= 0.5 + padding_im = torch.zeros((imgC, imgH, imgW), dtype=torch.float32).to(img.device) + padding_im[:, :, 0:resized_w] = resized_image[0] + return padding_im + + # img_list: list of tensors with shape chw 0-255 + def pred_imglist(self, img_list, show_debug=False, is_ori=False): + img_num = len(img_list) + assert img_num > 0 + # Calculate the aspect ratio of all text bars + width_list = [] + for img in img_list: + width_list.append(img.shape[2] / float(img.shape[1])) + # Sorting can speed up the recognition process + indices = torch.from_numpy(np.argsort(np.array(width_list))) + batch_num = self.rec_batch_num + preds_all = [None] * img_num + preds_neck_all = [None] * img_num + for beg_img_no in range(0, img_num, batch_num): + end_img_no = min(img_num, beg_img_no + batch_num) + norm_img_batch = [] + + imgC, imgH, imgW = self.rec_image_shape[:3] + max_wh_ratio = imgW / imgH + for ino in range(beg_img_no, end_img_no): + h, w = img_list[indices[ino]].shape[1:] + if h > w * 1.2: + img = img_list[indices[ino]] + img = torch.transpose(img, 1, 2).flip(dims=[1]) + img_list[indices[ino]] = img + h, w = img.shape[1:] + # wh_ratio = w * 1.0 / h + # max_wh_ratio = max(max_wh_ratio, wh_ratio) # comment to not use different ratio + for ino in range(beg_img_no, end_img_no): + norm_img = self.resize_norm_img(img_list[indices[ino]], max_wh_ratio) + if self.use_fp16: + norm_img = norm_img.half() + norm_img = norm_img.unsqueeze(0) + norm_img_batch.append(norm_img) + norm_img_batch = torch.cat(norm_img_batch, dim=0) + if show_debug: + for i in range(len(norm_img_batch)): + _img = norm_img_batch[i].permute(1, 2, 0).detach().cpu().numpy() + _img = (_img + 0.5) * 255 + _img = _img[:, :, ::-1] + file_name = f"{indices[beg_img_no + i]}" + file_name = file_name + "_ori" if is_ori else file_name + cv2.imwrite(file_name + ".jpg", _img) + if self.is_onnx: + input_dict = {} + input_dict[self.predictor.get_inputs()[0].name] = ( + norm_img_batch.detach().cpu().numpy() + ) + outputs = self.predictor.run(None, input_dict) + preds = {} + preds["ctc"] = torch.from_numpy(outputs[0]) + preds["ctc_neck"] = [torch.zeros(1)] * img_num + else: + preds = self.predictor(norm_img_batch) + for rno in range(preds["ctc"].shape[0]): + preds_all[indices[beg_img_no + rno]] = preds["ctc"][rno] + preds_neck_all[indices[beg_img_no + rno]] = preds["ctc_neck"][rno] + + return torch.stack(preds_all, dim=0), torch.stack(preds_neck_all, dim=0) + + def get_char_dict(self, character_dict_path): + character_str = [] + with open(character_dict_path, "rb") as fin: + lines = fin.readlines() + for line in lines: + line = line.decode("utf-8").strip("\n").strip("\r\n") + character_str.append(line) + dict_character = list(character_str) + dict_character = ["sos"] + dict_character + [" "] # eos is 
space + return dict_character + + def get_text(self, order): + char_list = [self.chars[text_id] for text_id in order] + return "".join(char_list) + + def decode(self, mat): + text_index = mat.detach().cpu().numpy().argmax(axis=1) + ignored_tokens = [0] + selection = np.ones(len(text_index), dtype=bool) + selection[1:] = text_index[1:] != text_index[:-1] + for ignored_token in ignored_tokens: + selection &= text_index != ignored_token + return text_index[selection], np.where(selection)[0] + + def get_ctcloss(self, preds, gt_text, weight): + if not isinstance(weight, torch.Tensor): + weight = torch.tensor(weight).to(preds.device) + ctc_loss = torch.nn.CTCLoss(reduction="none") + log_probs = preds.log_softmax(dim=2).permute(1, 0, 2) # NTC-->TNC + targets = [] + target_lengths = [] + for t in gt_text: + targets += [self.char2id.get(i, len(self.chars) - 1) for i in t] + target_lengths += [len(t)] + targets = torch.tensor(targets).to(preds.device) + target_lengths = torch.tensor(target_lengths).to(preds.device) + input_lengths = torch.tensor([log_probs.shape[0]] * (log_probs.shape[1])).to( + preds.device + ) + loss = ctc_loss(log_probs, targets, input_lengths, target_lengths) + loss = loss / input_lengths * weight + return loss + + +def main(): + rec_model_dir = "./ocr_weights/ppv3_rec.pth" + predictor = create_predictor(rec_model_dir) + args = edict() + args.rec_image_shape = "3, 48, 320" + args.rec_char_dict_path = "./ocr_weights/ppocr_keys_v1.txt" + args.rec_batch_num = 6 + text_recognizer = TextRecognizer(args, predictor) + image_dir = "./test_imgs_cn" + gt_text = ["韩国小馆"] * 14 + + image_file_list = get_image_file_list(image_dir) + valid_image_file_list = [] + img_list = [] + + for image_file in image_file_list: + img = cv2.imread(image_file) + if img is None: + print("error in loading image:{}".format(image_file)) + continue + valid_image_file_list.append(image_file) + img_list.append(torch.from_numpy(img).permute(2, 0, 1).float()) + try: + tic = time.time() + times = [] + for i in range(10): + preds, _ = text_recognizer.pred_imglist(img_list) # get text + preds_all = preds.softmax(dim=2) + times += [(time.time() - tic) * 1000.0] + tic = time.time() + print(times) + print(np.mean(times[1:]) / len(preds_all)) + weight = np.ones(len(gt_text)) + loss = text_recognizer.get_ctcloss(preds, gt_text, weight) + for i in range(len(valid_image_file_list)): + pred = preds_all[i] + order, idx = text_recognizer.decode(pred) + text = text_recognizer.get_text(order) + print( + f'{valid_image_file_list[i]}: pred/gt="{text}"/"{gt_text[i]}", loss={loss[i]:.2f}' + ) + except Exception as E: + print(traceback.format_exc(), E) + + +if __name__ == "__main__": + main() diff --git a/inpaint/model/anytext/ldm/__init__.py b/inpaint/model/anytext/ldm/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/anytext/ldm/models/__init__.py b/inpaint/model/anytext/ldm/models/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/anytext/ldm/models/autoencoder.py b/inpaint/model/anytext/ldm/models/autoencoder.py new file mode 100644 index 0000000..20d52e9 --- /dev/null +++ b/inpaint/model/anytext/ldm/models/autoencoder.py @@ -0,0 +1,218 @@ +import torch +import torch.nn.functional as F +from contextlib import contextmanager + +from iopaint.model.anytext.ldm.modules.diffusionmodules.model import Encoder, Decoder +from iopaint.model.anytext.ldm.modules.distributions.distributions import DiagonalGaussianDistribution + +from iopaint.model.anytext.ldm.util import 
instantiate_from_config +from iopaint.model.anytext.ldm.modules.ema import LitEma + + +class AutoencoderKL(torch.nn.Module): + def __init__(self, + ddconfig, + lossconfig, + embed_dim, + ckpt_path=None, + ignore_keys=[], + image_key="image", + colorize_nlabels=None, + monitor=None, + ema_decay=None, + learn_logvar=False + ): + super().__init__() + self.learn_logvar = learn_logvar + self.image_key = image_key + self.encoder = Encoder(**ddconfig) + self.decoder = Decoder(**ddconfig) + self.loss = instantiate_from_config(lossconfig) + assert ddconfig["double_z"] + self.quant_conv = torch.nn.Conv2d(2*ddconfig["z_channels"], 2*embed_dim, 1) + self.post_quant_conv = torch.nn.Conv2d(embed_dim, ddconfig["z_channels"], 1) + self.embed_dim = embed_dim + if colorize_nlabels is not None: + assert type(colorize_nlabels)==int + self.register_buffer("colorize", torch.randn(3, colorize_nlabels, 1, 1)) + if monitor is not None: + self.monitor = monitor + + self.use_ema = ema_decay is not None + if self.use_ema: + self.ema_decay = ema_decay + assert 0. < ema_decay < 1. + self.model_ema = LitEma(self, decay=ema_decay) + print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.") + + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys=ignore_keys) + + def init_from_ckpt(self, path, ignore_keys=list()): + sd = torch.load(path, map_location="cpu")["state_dict"] + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if k.startswith(ik): + print("Deleting key {} from state_dict.".format(k)) + del sd[k] + self.load_state_dict(sd, strict=False) + print(f"Restored from {path}") + + @contextmanager + def ema_scope(self, context=None): + if self.use_ema: + self.model_ema.store(self.parameters()) + self.model_ema.copy_to(self) + if context is not None: + print(f"{context}: Switched to EMA weights") + try: + yield None + finally: + if self.use_ema: + self.model_ema.restore(self.parameters()) + if context is not None: + print(f"{context}: Restored training weights") + + def on_train_batch_end(self, *args, **kwargs): + if self.use_ema: + self.model_ema(self) + + def encode(self, x): + h = self.encoder(x) + moments = self.quant_conv(h) + posterior = DiagonalGaussianDistribution(moments) + return posterior + + def decode(self, z): + z = self.post_quant_conv(z) + dec = self.decoder(z) + return dec + + def forward(self, input, sample_posterior=True): + posterior = self.encode(input) + if sample_posterior: + z = posterior.sample() + else: + z = posterior.mode() + dec = self.decode(z) + return dec, posterior + + def get_input(self, batch, k): + x = batch[k] + if len(x.shape) == 3: + x = x[..., None] + x = x.permute(0, 3, 1, 2).to(memory_format=torch.contiguous_format).float() + return x + + def training_step(self, batch, batch_idx, optimizer_idx): + inputs = self.get_input(batch, self.image_key) + reconstructions, posterior = self(inputs) + + if optimizer_idx == 0: + # train encoder+decoder+logvar + aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step, + last_layer=self.get_last_layer(), split="train") + self.log("aeloss", aeloss, prog_bar=True, logger=True, on_step=True, on_epoch=True) + self.log_dict(log_dict_ae, prog_bar=False, logger=True, on_step=True, on_epoch=False) + return aeloss + + if optimizer_idx == 1: + # train the discriminator + discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, optimizer_idx, self.global_step, + last_layer=self.get_last_layer(), split="train") + + self.log("discloss", discloss, 
prog_bar=True, logger=True, on_step=True, on_epoch=True) + self.log_dict(log_dict_disc, prog_bar=False, logger=True, on_step=True, on_epoch=False) + return discloss + + def validation_step(self, batch, batch_idx): + log_dict = self._validation_step(batch, batch_idx) + with self.ema_scope(): + log_dict_ema = self._validation_step(batch, batch_idx, postfix="_ema") + return log_dict + + def _validation_step(self, batch, batch_idx, postfix=""): + inputs = self.get_input(batch, self.image_key) + reconstructions, posterior = self(inputs) + aeloss, log_dict_ae = self.loss(inputs, reconstructions, posterior, 0, self.global_step, + last_layer=self.get_last_layer(), split="val"+postfix) + + discloss, log_dict_disc = self.loss(inputs, reconstructions, posterior, 1, self.global_step, + last_layer=self.get_last_layer(), split="val"+postfix) + + self.log(f"val{postfix}/rec_loss", log_dict_ae[f"val{postfix}/rec_loss"]) + self.log_dict(log_dict_ae) + self.log_dict(log_dict_disc) + return self.log_dict + + def configure_optimizers(self): + lr = self.learning_rate + ae_params_list = list(self.encoder.parameters()) + list(self.decoder.parameters()) + list( + self.quant_conv.parameters()) + list(self.post_quant_conv.parameters()) + if self.learn_logvar: + print(f"{self.__class__.__name__}: Learning logvar") + ae_params_list.append(self.loss.logvar) + opt_ae = torch.optim.Adam(ae_params_list, + lr=lr, betas=(0.5, 0.9)) + opt_disc = torch.optim.Adam(self.loss.discriminator.parameters(), + lr=lr, betas=(0.5, 0.9)) + return [opt_ae, opt_disc], [] + + def get_last_layer(self): + return self.decoder.conv_out.weight + + @torch.no_grad() + def log_images(self, batch, only_inputs=False, log_ema=False, **kwargs): + log = dict() + x = self.get_input(batch, self.image_key) + x = x.to(self.device) + if not only_inputs: + xrec, posterior = self(x) + if x.shape[1] > 3: + # colorize with random projection + assert xrec.shape[1] > 3 + x = self.to_rgb(x) + xrec = self.to_rgb(xrec) + log["samples"] = self.decode(torch.randn_like(posterior.sample())) + log["reconstructions"] = xrec + if log_ema or self.use_ema: + with self.ema_scope(): + xrec_ema, posterior_ema = self(x) + if x.shape[1] > 3: + # colorize with random projection + assert xrec_ema.shape[1] > 3 + xrec_ema = self.to_rgb(xrec_ema) + log["samples_ema"] = self.decode(torch.randn_like(posterior_ema.sample())) + log["reconstructions_ema"] = xrec_ema + log["inputs"] = x + return log + + def to_rgb(self, x): + assert self.image_key == "segmentation" + if not hasattr(self, "colorize"): + self.register_buffer("colorize", torch.randn(3, x.shape[1], 1, 1).to(x)) + x = F.conv2d(x, weight=self.colorize) + x = 2.*(x-x.min())/(x.max()-x.min()) - 1. 
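+        # rescale the random projection to [-1, 1]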
+ return x + + +class IdentityFirstStage(torch.nn.Module): + def __init__(self, *args, vq_interface=False, **kwargs): + self.vq_interface = vq_interface + super().__init__() + + def encode(self, x, *args, **kwargs): + return x + + def decode(self, x, *args, **kwargs): + return x + + def quantize(self, x, *args, **kwargs): + if self.vq_interface: + return x, None, [None, None, None] + return x + + def forward(self, x, *args, **kwargs): + return x + diff --git a/inpaint/model/anytext/ldm/models/diffusion/__init__.py b/inpaint/model/anytext/ldm/models/diffusion/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/anytext/ldm/models/diffusion/ddim.py b/inpaint/model/anytext/ldm/models/diffusion/ddim.py new file mode 100644 index 0000000..f8bbaff --- /dev/null +++ b/inpaint/model/anytext/ldm/models/diffusion/ddim.py @@ -0,0 +1,354 @@ +"""SAMPLING ONLY.""" + +import torch +import numpy as np +from tqdm import tqdm + +from iopaint.model.anytext.ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, extract_into_tensor + + +class DDIMSampler(object): + def __init__(self, model, schedule="linear", **kwargs): + super().__init__() + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.schedule = schedule + + def register_buffer(self, name, attr): + if type(attr) == torch.Tensor: + if attr.device != torch.device("cuda"): + attr = attr.to(torch.device("cuda")) + setattr(self, name, attr) + + def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True): + self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose) + alphas_cumprod = self.model.alphas_cumprod + assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' + to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device) + + self.register_buffer('betas', to_torch(self.model.betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu()))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) + self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta,verbose=verbose) + self.register_buffer('ddim_sigmas', ddim_sigmas) + self.register_buffer('ddim_alphas', ddim_alphas) + self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) + self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. 
- ddim_alphas)) + sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt( + (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * ( + 1 - self.alphas_cumprod / self.alphas_cumprod_prev)) + self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps) + + @torch.no_grad() + def sample(self, + S, + batch_size, + shape, + conditioning=None, + callback=None, + normals_sequence=None, + img_callback=None, + quantize_x0=False, + eta=0., + mask=None, + x0=None, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + verbose=True, + x_T=None, + log_every_t=100, + unconditional_guidance_scale=1., + unconditional_conditioning=None, # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... + dynamic_threshold=None, + ucg_schedule=None, + **kwargs + ): + if conditioning is not None: + if isinstance(conditioning, dict): + ctmp = conditioning[list(conditioning.keys())[0]] + while isinstance(ctmp, list): ctmp = ctmp[0] + cbs = ctmp.shape[0] + # cbs = len(ctmp[0]) + if cbs != batch_size: + print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") + + elif isinstance(conditioning, list): + for ctmp in conditioning: + if ctmp.shape[0] != batch_size: + print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") + + else: + if conditioning.shape[0] != batch_size: + print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") + + self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + print(f'Data shape for DDIM sampling is {size}, eta {eta}') + + samples, intermediates = self.ddim_sampling(conditioning, size, + callback=callback, + img_callback=img_callback, + quantize_denoised=quantize_x0, + mask=mask, x0=x0, + ddim_use_original_steps=False, + noise_dropout=noise_dropout, + temperature=temperature, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + x_T=x_T, + log_every_t=log_every_t, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold, + ucg_schedule=ucg_schedule + ) + return samples, intermediates + + @torch.no_grad() + def ddim_sampling(self, cond, shape, + x_T=None, ddim_use_original_steps=False, + callback=None, timesteps=None, quantize_denoised=False, + mask=None, x0=None, img_callback=None, log_every_t=100, + temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, + unconditional_guidance_scale=1., unconditional_conditioning=None, dynamic_threshold=None, + ucg_schedule=None): + device = self.model.betas.device + b = shape[0] + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + if timesteps is None: + timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps + elif timesteps is not None and not ddim_use_original_steps: + subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1 + timesteps = self.ddim_timesteps[:subset_end] + + intermediates = {'x_inter': [img], 'pred_x0': [img], "index": [10000]} + time_range = reversed(range(0, timesteps)) if ddim_use_original_steps else np.flip(timesteps) + total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0] + print(f"Running DDIM Sampling with {total_steps} timesteps") + + iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps) + + for i, 
step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((b,), step, device=device, dtype=torch.long) + + if mask is not None: + assert x0 is not None + img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass? + img = img_orig * mask + (1. - mask) * img + + if ucg_schedule is not None: + assert len(ucg_schedule) == len(time_range) + unconditional_guidance_scale = ucg_schedule[i] + + outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps, + quantize_denoised=quantize_denoised, temperature=temperature, + noise_dropout=noise_dropout, score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold) + img, pred_x0 = outs + if callback: + callback(i) + if img_callback: + img_callback(pred_x0, i) + + if index % log_every_t == 0 or index == total_steps - 1: + intermediates['x_inter'].append(img) + intermediates['pred_x0'].append(pred_x0) + intermediates['index'].append(index) + + return img, intermediates + + @torch.no_grad() + def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False, + temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, + unconditional_guidance_scale=1., unconditional_conditioning=None, + dynamic_threshold=None): + b, *_, device = *x.shape, x.device + + if unconditional_conditioning is None or unconditional_guidance_scale == 1.: + model_output = self.model.apply_model(x, t, c) + else: + x_in = torch.cat([x] * 2) + t_in = torch.cat([t] * 2) + if isinstance(c, dict): + assert isinstance(unconditional_conditioning, dict) + c_in = dict() + for k in c: + if isinstance(c[k], list): + c_in[k] = [torch.cat([ + unconditional_conditioning[k][i], + c[k][i]]) for i in range(len(c[k]))] + elif isinstance(c[k], dict): + c_in[k] = dict() + for key in c[k]: + if isinstance(c[k][key], list): + if not isinstance(c[k][key][0], torch.Tensor): + continue + c_in[k][key] = [torch.cat([ + unconditional_conditioning[k][key][i], + c[k][key][i]]) for i in range(len(c[k][key]))] + else: + c_in[k][key] = torch.cat([ + unconditional_conditioning[k][key], + c[k][key]]) + + else: + c_in[k] = torch.cat([ + unconditional_conditioning[k], + c[k]]) + elif isinstance(c, list): + c_in = list() + assert isinstance(unconditional_conditioning, list) + for i in range(len(c)): + c_in.append(torch.cat([unconditional_conditioning[i], c[i]])) + else: + c_in = torch.cat([unconditional_conditioning, c]) + model_uncond, model_t = self.model.apply_model(x_in, t_in, c_in).chunk(2) + model_output = model_uncond + unconditional_guidance_scale * (model_t - model_uncond) + + if self.model.parameterization == "v": + e_t = self.model.predict_eps_from_z_and_v(x, t, model_output) + else: + e_t = model_output + + if score_corrector is not None: + assert self.model.parameterization == "eps", 'not implemented' + e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs) + + alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas + alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev + sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas + sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas + # select parameters corresponding 
to the currently considered timestep + a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) + a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device) + sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) + sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device) + + # current prediction for x_0 + if self.model.parameterization != "v": + pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() + else: + pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output) + + if quantize_denoised: + pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) + + if dynamic_threshold is not None: + raise NotImplementedError() + + # direction pointing to x_t + dir_xt = (1. - a_prev - sigma_t**2).sqrt() * e_t + noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature + if noise_dropout > 0.: + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise + return x_prev, pred_x0 + + @torch.no_grad() + def encode(self, x0, c, t_enc, use_original_steps=False, return_intermediates=None, + unconditional_guidance_scale=1.0, unconditional_conditioning=None, callback=None): + num_reference_steps = self.ddpm_num_timesteps if use_original_steps else self.ddim_timesteps.shape[0] + + assert t_enc <= num_reference_steps + num_steps = t_enc + + if use_original_steps: + alphas_next = self.alphas_cumprod[:num_steps] + alphas = self.alphas_cumprod_prev[:num_steps] + else: + alphas_next = self.ddim_alphas[:num_steps] + alphas = torch.tensor(self.ddim_alphas_prev[:num_steps]) + + x_next = x0 + intermediates = [] + inter_steps = [] + for i in tqdm(range(num_steps), desc='Encoding Image'): + t = torch.full((x0.shape[0],), i, device=self.model.device, dtype=torch.long) + if unconditional_guidance_scale == 1.: + noise_pred = self.model.apply_model(x_next, t, c) + else: + assert unconditional_conditioning is not None + e_t_uncond, noise_pred = torch.chunk( + self.model.apply_model(torch.cat((x_next, x_next)), torch.cat((t, t)), + torch.cat((unconditional_conditioning, c))), 2) + noise_pred = e_t_uncond + unconditional_guidance_scale * (noise_pred - e_t_uncond) + + xt_weighted = (alphas_next[i] / alphas[i]).sqrt() * x_next + weighted_noise_pred = alphas_next[i].sqrt() * ( + (1 / alphas_next[i] - 1).sqrt() - (1 / alphas[i] - 1).sqrt()) * noise_pred + x_next = xt_weighted + weighted_noise_pred + if return_intermediates and i % ( + num_steps // return_intermediates) == 0 and i < num_steps - 1: + intermediates.append(x_next) + inter_steps.append(i) + elif return_intermediates and i >= num_steps - 2: + intermediates.append(x_next) + inter_steps.append(i) + if callback: callback(i) + + out = {'x_encoded': x_next, 'intermediate_steps': inter_steps} + if return_intermediates: + out.update({'intermediates': intermediates}) + return x_next, out + + @torch.no_grad() + def stochastic_encode(self, x0, t, use_original_steps=False, noise=None): + # fast, but does not allow for exact reconstruction + # t serves as an index to gather the correct alphas + if use_original_steps: + sqrt_alphas_cumprod = self.sqrt_alphas_cumprod + sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod + else: + sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas) + sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas + + if noise is None: + noise = torch.randn_like(x0) + return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 + + extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * 
noise) + + @torch.no_grad() + def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None, + use_original_steps=False, callback=None): + + timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps + timesteps = timesteps[:t_start] + + time_range = np.flip(timesteps) + total_steps = timesteps.shape[0] + print(f"Running DDIM Sampling with {total_steps} timesteps") + + iterator = tqdm(time_range, desc='Decoding image', total=total_steps) + x_dec = x_latent + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long) + x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning) + if callback: callback(i) + return x_dec \ No newline at end of file diff --git a/inpaint/model/anytext/ldm/models/diffusion/ddpm.py b/inpaint/model/anytext/ldm/models/diffusion/ddpm.py new file mode 100644 index 0000000..9f48918 --- /dev/null +++ b/inpaint/model/anytext/ldm/models/diffusion/ddpm.py @@ -0,0 +1,2380 @@ +""" +Part of the implementation is borrowed and modified from ControlNet, publicly available at https://github.com/lllyasviel/ControlNet/blob/main/ldm/models/diffusion/ddpm.py +""" + +import torch +import torch.nn as nn +import numpy as np +from torch.optim.lr_scheduler import LambdaLR +from einops import rearrange, repeat +from contextlib import contextmanager, nullcontext +from functools import partial +import itertools +from tqdm import tqdm +from torchvision.utils import make_grid +from omegaconf import ListConfig + +from iopaint.model.anytext.ldm.util import ( + log_txt_as_img, + exists, + default, + ismap, + isimage, + mean_flat, + count_params, + instantiate_from_config, +) +from iopaint.model.anytext.ldm.modules.ema import LitEma +from iopaint.model.anytext.ldm.modules.distributions.distributions import ( + normal_kl, + DiagonalGaussianDistribution, +) +from iopaint.model.anytext.ldm.models.autoencoder import IdentityFirstStage, AutoencoderKL +from iopaint.model.anytext.ldm.modules.diffusionmodules.util import ( + make_beta_schedule, + extract_into_tensor, + noise_like, +) +from iopaint.model.anytext.ldm.models.diffusion.ddim import DDIMSampler +import cv2 + + +__conditioning_keys__ = {"concat": "c_concat", "crossattn": "c_crossattn", "adm": "y"} + +PRINT_DEBUG = False + + +def print_grad(grad): + # print('Gradient:', grad) + # print(grad.shape) + a = grad.max() + b = grad.min() + # print(f'mean={grad.mean():.4f}, max={a:.4f}, min={b:.4f}') + s = 255.0 / (a - b) + c = 255 * (-b / (a - b)) + grad = grad * s + c + # print(f'mean={grad.mean():.4f}, max={grad.max():.4f}, min={grad.min():.4f}') + img = grad[0].permute(1, 2, 0).detach().cpu().numpy() + if img.shape[0] == 512: + cv2.imwrite("grad-img.jpg", img) + elif img.shape[0] == 64: + cv2.imwrite("grad-latent.jpg", img) + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self + + +def uniform_on_device(r1, r2, shape, device): + return (r1 - r2) * torch.rand(*shape, device=device) + r2 + + +class DDPM(torch.nn.Module): + # classic DDPM with Gaussian diffusion, in image space + def __init__( + self, + unet_config, + timesteps=1000, + beta_schedule="linear", + loss_type="l2", + ckpt_path=None, + ignore_keys=[], + 
load_only_unet=False, + monitor="val/loss", + use_ema=True, + first_stage_key="image", + image_size=256, + channels=3, + log_every_t=100, + clip_denoised=True, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3, + given_betas=None, + original_elbo_weight=0.0, + v_posterior=0.0, # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta + l_simple_weight=1.0, + conditioning_key=None, + parameterization="eps", # all assuming fixed variance schedules + scheduler_config=None, + use_positional_encodings=False, + learn_logvar=False, + logvar_init=0.0, + make_it_fit=False, + ucg_training=None, + reset_ema=False, + reset_num_ema_updates=False, + ): + super().__init__() + assert parameterization in [ + "eps", + "x0", + "v", + ], 'currently only supporting "eps" and "x0" and "v"' + self.parameterization = parameterization + print( + f"{self.__class__.__name__}: Running in {self.parameterization}-prediction mode" + ) + self.cond_stage_model = None + self.clip_denoised = clip_denoised + self.log_every_t = log_every_t + self.first_stage_key = first_stage_key + self.image_size = image_size # try conv? + self.channels = channels + self.use_positional_encodings = use_positional_encodings + self.model = DiffusionWrapper(unet_config, conditioning_key) + count_params(self.model, verbose=True) + self.use_ema = use_ema + if self.use_ema: + self.model_ema = LitEma(self.model) + print(f"Keeping EMAs of {len(list(self.model_ema.buffers()))}.") + + self.use_scheduler = scheduler_config is not None + if self.use_scheduler: + self.scheduler_config = scheduler_config + + self.v_posterior = v_posterior + self.original_elbo_weight = original_elbo_weight + self.l_simple_weight = l_simple_weight + + if monitor is not None: + self.monitor = monitor + self.make_it_fit = make_it_fit + if reset_ema: + assert exists(ckpt_path) + if ckpt_path is not None: + self.init_from_ckpt( + ckpt_path, ignore_keys=ignore_keys, only_model=load_only_unet + ) + if reset_ema: + assert self.use_ema + print( + f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint." 
+ ) + self.model_ema = LitEma(self.model) + if reset_num_ema_updates: + print( + " +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ " + ) + assert self.use_ema + self.model_ema.reset_num_updates() + + self.register_schedule( + given_betas=given_betas, + beta_schedule=beta_schedule, + timesteps=timesteps, + linear_start=linear_start, + linear_end=linear_end, + cosine_s=cosine_s, + ) + + self.loss_type = loss_type + + self.learn_logvar = learn_logvar + logvar = torch.full(fill_value=logvar_init, size=(self.num_timesteps,)) + if self.learn_logvar: + self.logvar = nn.Parameter(self.logvar, requires_grad=True) + else: + self.register_buffer("logvar", logvar) + + self.ucg_training = ucg_training or dict() + if self.ucg_training: + self.ucg_prng = np.random.RandomState() + + def register_schedule( + self, + given_betas=None, + beta_schedule="linear", + timesteps=1000, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3, + ): + if exists(given_betas): + betas = given_betas + else: + betas = make_beta_schedule( + beta_schedule, + timesteps, + linear_start=linear_start, + linear_end=linear_end, + cosine_s=cosine_s, + ) + alphas = 1.0 - betas + alphas_cumprod = np.cumprod(alphas, axis=0) + # np.save('1.npy', alphas_cumprod) + alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1]) + + (timesteps,) = betas.shape + self.num_timesteps = int(timesteps) + self.linear_start = linear_start + self.linear_end = linear_end + assert ( + alphas_cumprod.shape[0] == self.num_timesteps + ), "alphas have to be defined for each timestep" + + to_torch = partial(torch.tensor, dtype=torch.float32) + + self.register_buffer("betas", to_torch(betas)) + self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod)) + self.register_buffer("alphas_cumprod_prev", to_torch(alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer("sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod))) + self.register_buffer( + "sqrt_one_minus_alphas_cumprod", to_torch(np.sqrt(1.0 - alphas_cumprod)) + ) + self.register_buffer( + "log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod)) + ) + self.register_buffer( + "sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod)) + ) + self.register_buffer( + "sqrt_recipm1_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod - 1)) + ) + + # calculations for posterior q(x_{t-1} | x_t, x_0) + posterior_variance = (1 - self.v_posterior) * betas * ( + 1.0 - alphas_cumprod_prev + ) / (1.0 - alphas_cumprod) + self.v_posterior * betas + # above: equal to 1. / (1. / (1. 
- alpha_cumprod_tm1) + alpha_t / beta_t) + self.register_buffer("posterior_variance", to_torch(posterior_variance)) + # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain + self.register_buffer( + "posterior_log_variance_clipped", + to_torch(np.log(np.maximum(posterior_variance, 1e-20))), + ) + self.register_buffer( + "posterior_mean_coef1", + to_torch(betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)), + ) + self.register_buffer( + "posterior_mean_coef2", + to_torch( + (1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod) + ), + ) + + if self.parameterization == "eps": + lvlb_weights = self.betas**2 / ( + 2 + * self.posterior_variance + * to_torch(alphas) + * (1 - self.alphas_cumprod) + ) + elif self.parameterization == "x0": + lvlb_weights = ( + 0.5 + * np.sqrt(torch.Tensor(alphas_cumprod)) + / (2.0 * 1 - torch.Tensor(alphas_cumprod)) + ) + elif self.parameterization == "v": + lvlb_weights = torch.ones_like( + self.betas**2 + / ( + 2 + * self.posterior_variance + * to_torch(alphas) + * (1 - self.alphas_cumprod) + ) + ) + else: + raise NotImplementedError("mu not supported") + lvlb_weights[0] = lvlb_weights[1] + self.register_buffer("lvlb_weights", lvlb_weights, persistent=False) + assert not torch.isnan(self.lvlb_weights).all() + + @contextmanager + def ema_scope(self, context=None): + if self.use_ema: + self.model_ema.store(self.model.parameters()) + self.model_ema.copy_to(self.model) + if context is not None: + print(f"{context}: Switched to EMA weights") + try: + yield None + finally: + if self.use_ema: + self.model_ema.restore(self.model.parameters()) + if context is not None: + print(f"{context}: Restored training weights") + + @torch.no_grad() + def init_from_ckpt(self, path, ignore_keys=list(), only_model=False): + sd = torch.load(path, map_location="cpu") + if "state_dict" in list(sd.keys()): + sd = sd["state_dict"] + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if k.startswith(ik): + print("Deleting key {} from state_dict.".format(k)) + del sd[k] + if self.make_it_fit: + n_params = len( + [ + name + for name, _ in itertools.chain( + self.named_parameters(), self.named_buffers() + ) + ] + ) + for name, param in tqdm( + itertools.chain(self.named_parameters(), self.named_buffers()), + desc="Fitting old weights to new weights", + total=n_params, + ): + if not name in sd: + continue + old_shape = sd[name].shape + new_shape = param.shape + assert len(old_shape) == len(new_shape) + if len(new_shape) > 2: + # we only modify first two axes + assert new_shape[2:] == old_shape[2:] + # assumes first axis corresponds to output dim + if not new_shape == old_shape: + new_param = param.clone() + old_param = sd[name] + if len(new_shape) == 1: + for i in range(new_param.shape[0]): + new_param[i] = old_param[i % old_shape[0]] + elif len(new_shape) >= 2: + for i in range(new_param.shape[0]): + for j in range(new_param.shape[1]): + new_param[i, j] = old_param[ + i % old_shape[0], j % old_shape[1] + ] + + n_used_old = torch.ones(old_shape[1]) + for j in range(new_param.shape[1]): + n_used_old[j % old_shape[1]] += 1 + n_used_new = torch.zeros(new_shape[1]) + for j in range(new_param.shape[1]): + n_used_new[j] = n_used_old[j % old_shape[1]] + + n_used_new = n_used_new[None, :] + while len(n_used_new.shape) < len(new_shape): + n_used_new = n_used_new.unsqueeze(-1) + new_param /= n_used_new + + sd[name] = new_param + + missing, unexpected = ( + self.load_state_dict(sd, strict=False) + if not 
only_model + else self.model.load_state_dict(sd, strict=False) + ) + print( + f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if len(missing) > 0: + print(f"Missing Keys:\n {missing}") + if len(unexpected) > 0: + print(f"\nUnexpected Keys:\n {unexpected}") + + def q_mean_variance(self, x_start, t): + """ + Get the distribution q(x_t | x_0). + :param x_start: the [N x C x ...] tensor of noiseless inputs. + :param t: the number of diffusion steps (minus 1). Here, 0 means one step. + :return: A tuple (mean, variance, log_variance), all of x_start's shape. + """ + mean = extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + variance = extract_into_tensor(1.0 - self.alphas_cumprod, t, x_start.shape) + log_variance = extract_into_tensor( + self.log_one_minus_alphas_cumprod, t, x_start.shape + ) + return mean, variance, log_variance + + def predict_start_from_noise(self, x_t, t, noise): + return ( + extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t + - extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) + * noise + ) + + def predict_start_from_z_and_v(self, x_t, t, v): + # self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod))) + # self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod))) + return ( + extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * x_t + - extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) * v + ) + + def predict_eps_from_z_and_v(self, x_t, t, v): + return ( + extract_into_tensor(self.sqrt_alphas_cumprod, t, x_t.shape) * v + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_t.shape) + * x_t + ) + + def q_posterior(self, x_start, x_t, t): + posterior_mean = ( + extract_into_tensor(self.posterior_mean_coef1, t, x_t.shape) * x_start + + extract_into_tensor(self.posterior_mean_coef2, t, x_t.shape) * x_t + ) + posterior_variance = extract_into_tensor(self.posterior_variance, t, x_t.shape) + posterior_log_variance_clipped = extract_into_tensor( + self.posterior_log_variance_clipped, t, x_t.shape + ) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance(self, x, t, clip_denoised: bool): + model_out = self.model(x, t) + if self.parameterization == "eps": + x_recon = self.predict_start_from_noise(x, t=t, noise=model_out) + elif self.parameterization == "x0": + x_recon = model_out + if clip_denoised: + x_recon.clamp_(-1.0, 1.0) + + model_mean, posterior_variance, posterior_log_variance = self.q_posterior( + x_start=x_recon, x_t=x, t=t + ) + return model_mean, posterior_variance, posterior_log_variance + + @torch.no_grad() + def p_sample(self, x, t, clip_denoised=True, repeat_noise=False): + b, *_, device = *x.shape, x.device + model_mean, _, model_log_variance = self.p_mean_variance( + x=x, t=t, clip_denoised=clip_denoised + ) + noise = noise_like(x.shape, device, repeat_noise) + # no noise when t == 0 + nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1))) + return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise + + @torch.no_grad() + def p_sample_loop(self, shape, return_intermediates=False): + device = self.betas.device + b = shape[0] + img = torch.randn(shape, device=device) + intermediates = [img] + for i in tqdm( + reversed(range(0, self.num_timesteps)), + desc="Sampling t", + total=self.num_timesteps, + ): + img = self.p_sample( + img, + torch.full((b,), i, device=device, 
dtype=torch.long), + clip_denoised=self.clip_denoised, + ) + if i % self.log_every_t == 0 or i == self.num_timesteps - 1: + intermediates.append(img) + if return_intermediates: + return img, intermediates + return img + + @torch.no_grad() + def sample(self, batch_size=16, return_intermediates=False): + image_size = self.image_size + channels = self.channels + return self.p_sample_loop( + (batch_size, channels, image_size, image_size), + return_intermediates=return_intermediates, + ) + + def q_sample(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + return ( + extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) + * noise + ) + + def get_v(self, x, noise, t): + return ( + extract_into_tensor(self.sqrt_alphas_cumprod, t, x.shape) * noise + - extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x.shape) * x + ) + + def get_loss(self, pred, target, mean=True): + if self.loss_type == "l1": + loss = (target - pred).abs() + if mean: + loss = loss.mean() + elif self.loss_type == "l2": + if mean: + loss = torch.nn.functional.mse_loss(target, pred) + else: + loss = torch.nn.functional.mse_loss(target, pred, reduction="none") + else: + raise NotImplementedError("unknown loss type '{loss_type}'") + + return loss + + def p_losses(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) + model_out = self.model(x_noisy, t) + + loss_dict = {} + if self.parameterization == "eps": + target = noise + elif self.parameterization == "x0": + target = x_start + elif self.parameterization == "v": + target = self.get_v(x_start, noise, t) + else: + raise NotImplementedError( + f"Parameterization {self.parameterization} not yet supported" + ) + + loss = self.get_loss(model_out, target, mean=False).mean(dim=[1, 2, 3]) + + log_prefix = "train" if self.training else "val" + + loss_dict.update({f"{log_prefix}/loss_simple": loss.mean()}) + loss_simple = loss.mean() * self.l_simple_weight + + loss_vlb = (self.lvlb_weights[t] * loss).mean() + loss_dict.update({f"{log_prefix}/loss_vlb": loss_vlb}) + + loss = loss_simple + self.original_elbo_weight * loss_vlb + + loss_dict.update({f"{log_prefix}/loss": loss}) + + return loss, loss_dict + + def forward(self, x, *args, **kwargs): + # b, c, h, w, device, img_size, = *x.shape, x.device, self.image_size + # assert h == img_size and w == img_size, f'height and width of image must be {img_size}' + t = torch.randint( + 0, self.num_timesteps, (x.shape[0],), device=self.device + ).long() + return self.p_losses(x, t, *args, **kwargs) + + def get_input(self, batch, k): + x = batch[k] + if len(x.shape) == 3: + x = x[..., None] + x = rearrange(x, "b h w c -> b c h w") + x = x.to(memory_format=torch.contiguous_format).float() + return x + + def shared_step(self, batch): + x = self.get_input(batch, self.first_stage_key) + loss, loss_dict = self(x) + return loss, loss_dict + + def training_step(self, batch, batch_idx): + for k in self.ucg_training: + p = self.ucg_training[k]["p"] + val = self.ucg_training[k]["val"] + if val is None: + val = "" + for i in range(len(batch[k])): + if self.ucg_prng.choice(2, p=[1 - p, p]): + batch[k][i] = val + + loss, loss_dict = self.shared_step(batch) + + self.log_dict( + loss_dict, prog_bar=True, logger=True, on_step=True, on_epoch=True + ) + + self.log( + "global_step", + self.global_step, + prog_bar=True, + 
logger=True, + on_step=True, + on_epoch=False, + ) + + if self.use_scheduler: + lr = self.optimizers().param_groups[0]["lr"] + self.log( + "lr_abs", lr, prog_bar=True, logger=True, on_step=True, on_epoch=False + ) + + return loss + + @torch.no_grad() + def validation_step(self, batch, batch_idx): + _, loss_dict_no_ema = self.shared_step(batch) + with self.ema_scope(): + _, loss_dict_ema = self.shared_step(batch) + loss_dict_ema = {key + "_ema": loss_dict_ema[key] for key in loss_dict_ema} + self.log_dict( + loss_dict_no_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True + ) + self.log_dict( + loss_dict_ema, prog_bar=False, logger=True, on_step=False, on_epoch=True + ) + + def on_train_batch_end(self, *args, **kwargs): + if self.use_ema: + self.model_ema(self.model) + + def _get_rows_from_list(self, samples): + n_imgs_per_row = len(samples) + denoise_grid = rearrange(samples, "n b c h w -> b n c h w") + denoise_grid = rearrange(denoise_grid, "b n c h w -> (b n) c h w") + denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row) + return denoise_grid + + @torch.no_grad() + def log_images(self, batch, N=8, n_row=2, sample=True, return_keys=None, **kwargs): + log = dict() + x = self.get_input(batch, self.first_stage_key) + N = min(x.shape[0], N) + n_row = min(x.shape[0], n_row) + x = x.to(self.device)[:N] + log["inputs"] = x + + # get diffusion row + diffusion_row = list() + x_start = x[:n_row] + + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), "1 -> b", b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(x_start) + x_noisy = self.q_sample(x_start=x_start, t=t, noise=noise) + diffusion_row.append(x_noisy) + + log["diffusion_row"] = self._get_rows_from_list(diffusion_row) + + if sample: + # get denoise row + with self.ema_scope("Plotting"): + samples, denoise_row = self.sample( + batch_size=N, return_intermediates=True + ) + + log["samples"] = samples + log["denoise_row"] = self._get_rows_from_list(denoise_row) + + if return_keys: + if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0: + return log + else: + return {key: log[key] for key in return_keys} + return log + + def configure_optimizers(self): + lr = self.learning_rate + params = list(self.model.parameters()) + if self.learn_logvar: + params = params + [self.logvar] + opt = torch.optim.AdamW(params, lr=lr) + return opt + + +class LatentDiffusion(DDPM): + """main class""" + + def __init__( + self, + first_stage_config, + cond_stage_config, + num_timesteps_cond=None, + cond_stage_key="image", + cond_stage_trainable=False, + concat_mode=True, + cond_stage_forward=None, + conditioning_key=None, + scale_factor=1.0, + scale_by_std=False, + force_null_conditioning=False, + *args, + **kwargs, + ): + self.force_null_conditioning = force_null_conditioning + self.num_timesteps_cond = default(num_timesteps_cond, 1) + self.scale_by_std = scale_by_std + assert self.num_timesteps_cond <= kwargs["timesteps"] + # for backwards compatibility after implementation of DiffusionWrapper + if conditioning_key is None: + conditioning_key = "concat" if concat_mode else "crossattn" + if ( + cond_stage_config == "__is_unconditional__" + and not self.force_null_conditioning + ): + conditioning_key = None + ckpt_path = kwargs.pop("ckpt_path", None) + reset_ema = kwargs.pop("reset_ema", False) + reset_num_ema_updates = kwargs.pop("reset_num_ema_updates", False) + ignore_keys = kwargs.pop("ignore_keys", []) + 
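+ # Note: ckpt_path / reset_ema / reset_num_ema_updates / ignore_keys are popped above so they never reach DDPM.__init__; the checkpoint itself is restored via init_from_ckpt further down, once the first-stage and cond-stage models exist.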
super().__init__(conditioning_key=conditioning_key, *args, **kwargs) + self.concat_mode = concat_mode + self.cond_stage_trainable = cond_stage_trainable + self.cond_stage_key = cond_stage_key + try: + self.num_downs = len(first_stage_config.params.ddconfig.ch_mult) - 1 + except: + self.num_downs = 0 + if not scale_by_std: + self.scale_factor = scale_factor + else: + self.register_buffer("scale_factor", torch.tensor(scale_factor)) + self.instantiate_first_stage(first_stage_config) + self.instantiate_cond_stage(cond_stage_config) + self.cond_stage_forward = cond_stage_forward + self.clip_denoised = False + self.bbox_tokenizer = None + + self.restarted_from_ckpt = False + if ckpt_path is not None: + self.init_from_ckpt(ckpt_path, ignore_keys) + self.restarted_from_ckpt = True + if reset_ema: + assert self.use_ema + print( + f"Resetting ema to pure model weights. This is useful when restoring from an ema-only checkpoint." + ) + self.model_ema = LitEma(self.model) + if reset_num_ema_updates: + print( + " +++++++++++ WARNING: RESETTING NUM_EMA UPDATES TO ZERO +++++++++++ " + ) + assert self.use_ema + self.model_ema.reset_num_updates() + + def make_cond_schedule( + self, + ): + self.cond_ids = torch.full( + size=(self.num_timesteps,), + fill_value=self.num_timesteps - 1, + dtype=torch.long, + ) + ids = torch.round( + torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond) + ).long() + self.cond_ids[: self.num_timesteps_cond] = ids + + @torch.no_grad() + def on_train_batch_start(self, batch, batch_idx, dataloader_idx): + # only for very first batch + if ( + self.scale_by_std + and self.current_epoch == 0 + and self.global_step == 0 + and batch_idx == 0 + and not self.restarted_from_ckpt + ): + assert ( + self.scale_factor == 1.0 + ), "rather not use custom rescaling and std-rescaling simultaneously" + # set rescale weight to 1./std of encodings + print("### USING STD-RESCALING ###") + x = super().get_input(batch, self.first_stage_key) + x = x.to(self.device) + encoder_posterior = self.encode_first_stage(x) + z = self.get_first_stage_encoding(encoder_posterior).detach() + del self.scale_factor + self.register_buffer("scale_factor", 1.0 / z.flatten().std()) + print(f"setting self.scale_factor to {self.scale_factor}") + print("### USING STD-RESCALING ###") + + def register_schedule( + self, + given_betas=None, + beta_schedule="linear", + timesteps=1000, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3, + ): + super().register_schedule( + given_betas, beta_schedule, timesteps, linear_start, linear_end, cosine_s + ) + + self.shorten_cond_schedule = self.num_timesteps_cond > 1 + if self.shorten_cond_schedule: + self.make_cond_schedule() + + def instantiate_first_stage(self, config): + model = instantiate_from_config(config) + self.first_stage_model = model.eval() + self.first_stage_model.train = disabled_train + for param in self.first_stage_model.parameters(): + param.requires_grad = False + + def instantiate_cond_stage(self, config): + if not self.cond_stage_trainable: + if config == "__is_first_stage__": + print("Using first stage also as cond stage.") + self.cond_stage_model = self.first_stage_model + elif config == "__is_unconditional__": + print(f"Training {self.__class__.__name__} as an unconditional model.") + self.cond_stage_model = None + # self.be_unconditional = True + else: + model = instantiate_from_config(config) + self.cond_stage_model = model.eval() + self.cond_stage_model.train = disabled_train + for param in self.cond_stage_model.parameters(): + param.requires_grad = 
False + else: + assert config != "__is_first_stage__" + assert config != "__is_unconditional__" + model = instantiate_from_config(config) + self.cond_stage_model = model + + def _get_denoise_row_from_list( + self, samples, desc="", force_no_decoder_quantization=False + ): + denoise_row = [] + for zd in tqdm(samples, desc=desc): + denoise_row.append( + self.decode_first_stage( + zd.to(self.device), force_not_quantize=force_no_decoder_quantization + ) + ) + n_imgs_per_row = len(denoise_row) + denoise_row = torch.stack(denoise_row) # n_log_step, n_row, C, H, W + denoise_grid = rearrange(denoise_row, "n b c h w -> b n c h w") + denoise_grid = rearrange(denoise_grid, "b n c h w -> (b n) c h w") + denoise_grid = make_grid(denoise_grid, nrow=n_imgs_per_row) + return denoise_grid + + def get_first_stage_encoding(self, encoder_posterior): + if isinstance(encoder_posterior, DiagonalGaussianDistribution): + z = encoder_posterior.sample() + elif isinstance(encoder_posterior, torch.Tensor): + z = encoder_posterior + else: + raise NotImplementedError( + f"encoder_posterior of type '{type(encoder_posterior)}' not yet implemented" + ) + return self.scale_factor * z + + def get_learned_conditioning(self, c): + if self.cond_stage_forward is None: + if hasattr(self.cond_stage_model, "encode") and callable( + self.cond_stage_model.encode + ): + c = self.cond_stage_model.encode(c) + if isinstance(c, DiagonalGaussianDistribution): + c = c.mode() + else: + c = self.cond_stage_model(c) + else: + assert hasattr(self.cond_stage_model, self.cond_stage_forward) + c = getattr(self.cond_stage_model, self.cond_stage_forward)(c) + return c + + def meshgrid(self, h, w): + y = torch.arange(0, h).view(h, 1, 1).repeat(1, w, 1) + x = torch.arange(0, w).view(1, w, 1).repeat(h, 1, 1) + + arr = torch.cat([y, x], dim=-1) + return arr + + def delta_border(self, h, w): + """ + :param h: height + :param w: width + :return: normalized distance to image border, + wtith min distance = 0 at border and max dist = 0.5 at image center + """ + lower_right_corner = torch.tensor([h - 1, w - 1]).view(1, 1, 2) + arr = self.meshgrid(h, w) / lower_right_corner + dist_left_up = torch.min(arr, dim=-1, keepdims=True)[0] + dist_right_down = torch.min(1 - arr, dim=-1, keepdims=True)[0] + edge_dist = torch.min( + torch.cat([dist_left_up, dist_right_down], dim=-1), dim=-1 + )[0] + return edge_dist + + def get_weighting(self, h, w, Ly, Lx, device): + weighting = self.delta_border(h, w) + weighting = torch.clip( + weighting, + self.split_input_params["clip_min_weight"], + self.split_input_params["clip_max_weight"], + ) + weighting = weighting.view(1, h * w, 1).repeat(1, 1, Ly * Lx).to(device) + + if self.split_input_params["tie_braker"]: + L_weighting = self.delta_border(Ly, Lx) + L_weighting = torch.clip( + L_weighting, + self.split_input_params["clip_min_tie_weight"], + self.split_input_params["clip_max_tie_weight"], + ) + + L_weighting = L_weighting.view(1, 1, Ly * Lx).to(device) + weighting = weighting * L_weighting + return weighting + + def get_fold_unfold( + self, x, kernel_size, stride, uf=1, df=1 + ): # todo load once not every time, shorten code + """ + :param x: img of size (bs, c, h, w) + :return: n img crops of size (n, bs, c, kernel_size[0], kernel_size[1]) + """ + bs, nc, h, w = x.shape + + # number of crops in image + Ly = (h - kernel_size[0]) // stride[0] + 1 + Lx = (w - kernel_size[1]) // stride[1] + 1 + + if uf == 1 and df == 1: + fold_params = dict( + kernel_size=kernel_size, dilation=1, padding=0, stride=stride + ) + unfold = 
torch.nn.Unfold(**fold_params) + + fold = torch.nn.Fold(output_size=x.shape[2:], **fold_params) + + weighting = self.get_weighting( + kernel_size[0], kernel_size[1], Ly, Lx, x.device + ).to(x.dtype) + normalization = fold(weighting).view(1, 1, h, w) # normalizes the overlap + weighting = weighting.view((1, 1, kernel_size[0], kernel_size[1], Ly * Lx)) + + elif uf > 1 and df == 1: + fold_params = dict( + kernel_size=kernel_size, dilation=1, padding=0, stride=stride + ) + unfold = torch.nn.Unfold(**fold_params) + + fold_params2 = dict( + kernel_size=(kernel_size[0] * uf, kernel_size[0] * uf), + dilation=1, + padding=0, + stride=(stride[0] * uf, stride[1] * uf), + ) + fold = torch.nn.Fold( + output_size=(x.shape[2] * uf, x.shape[3] * uf), **fold_params2 + ) + + weighting = self.get_weighting( + kernel_size[0] * uf, kernel_size[1] * uf, Ly, Lx, x.device + ).to(x.dtype) + normalization = fold(weighting).view( + 1, 1, h * uf, w * uf + ) # normalizes the overlap + weighting = weighting.view( + (1, 1, kernel_size[0] * uf, kernel_size[1] * uf, Ly * Lx) + ) + + elif df > 1 and uf == 1: + fold_params = dict( + kernel_size=kernel_size, dilation=1, padding=0, stride=stride + ) + unfold = torch.nn.Unfold(**fold_params) + + fold_params2 = dict( + kernel_size=(kernel_size[0] // df, kernel_size[0] // df), + dilation=1, + padding=0, + stride=(stride[0] // df, stride[1] // df), + ) + fold = torch.nn.Fold( + output_size=(x.shape[2] // df, x.shape[3] // df), **fold_params2 + ) + + weighting = self.get_weighting( + kernel_size[0] // df, kernel_size[1] // df, Ly, Lx, x.device + ).to(x.dtype) + normalization = fold(weighting).view( + 1, 1, h // df, w // df + ) # normalizes the overlap + weighting = weighting.view( + (1, 1, kernel_size[0] // df, kernel_size[1] // df, Ly * Lx) + ) + + else: + raise NotImplementedError + + return fold, unfold, normalization, weighting + + @torch.no_grad() + def get_input( + self, + batch, + k, + return_first_stage_outputs=False, + force_c_encode=False, + cond_key=None, + return_original_cond=False, + bs=None, + return_x=False, + mask_k=None, + ): + x = super().get_input(batch, k) + if bs is not None: + x = x[:bs] + x = x.to(self.device) + encoder_posterior = self.encode_first_stage(x) + z = self.get_first_stage_encoding(encoder_posterior).detach() + + if mask_k is not None: + mx = super().get_input(batch, mask_k) + if bs is not None: + mx = mx[:bs] + mx = mx.to(self.device) + encoder_posterior = self.encode_first_stage(mx) + mx = self.get_first_stage_encoding(encoder_posterior).detach() + + if self.model.conditioning_key is not None and not self.force_null_conditioning: + if cond_key is None: + cond_key = self.cond_stage_key + if cond_key != self.first_stage_key: + if cond_key in ["caption", "coordinates_bbox", "txt"]: + xc = batch[cond_key] + elif cond_key in ["class_label", "cls"]: + xc = batch + else: + xc = super().get_input(batch, cond_key).to(self.device) + else: + xc = x + if not self.cond_stage_trainable or force_c_encode: + if isinstance(xc, dict) or isinstance(xc, list): + c = self.get_learned_conditioning(xc) + else: + c = self.get_learned_conditioning(xc.to(self.device)) + else: + c = xc + if bs is not None: + c = c[:bs] + + if self.use_positional_encodings: + pos_x, pos_y = self.compute_latent_shifts(batch) + ckey = __conditioning_keys__[self.model.conditioning_key] + c = {ckey: c, "pos_x": pos_x, "pos_y": pos_y} + + else: + c = None + xc = None + if self.use_positional_encodings: + pos_x, pos_y = self.compute_latent_shifts(batch) + c = {"pos_x": pos_x, "pos_y": pos_y} + 
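+ # The returned list always starts with [latent z, conditioning c]; x, xrec, xc and the mask latent are appended below depending on the requested flags.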
out = [z, c] + if return_first_stage_outputs: + xrec = self.decode_first_stage(z) + out.extend([x, xrec]) + if return_x: + out.extend([x]) + if return_original_cond: + out.append(xc) + if mask_k: + out.append(mx) + return out + + @torch.no_grad() + def decode_first_stage(self, z, predict_cids=False, force_not_quantize=False): + if predict_cids: + if z.dim() == 4: + z = torch.argmax(z.exp(), dim=1).long() + z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None) + z = rearrange(z, "b h w c -> b c h w").contiguous() + + z = 1.0 / self.scale_factor * z + return self.first_stage_model.decode(z) + + def decode_first_stage_grad(self, z, predict_cids=False, force_not_quantize=False): + if predict_cids: + if z.dim() == 4: + z = torch.argmax(z.exp(), dim=1).long() + z = self.first_stage_model.quantize.get_codebook_entry(z, shape=None) + z = rearrange(z, "b h w c -> b c h w").contiguous() + + z = 1.0 / self.scale_factor * z + return self.first_stage_model.decode(z) + + @torch.no_grad() + def encode_first_stage(self, x): + return self.first_stage_model.encode(x) + + def shared_step(self, batch, **kwargs): + x, c = self.get_input(batch, self.first_stage_key) + loss = self(x, c) + return loss + + def forward(self, x, c, *args, **kwargs): + t = torch.randint( + 0, self.num_timesteps, (x.shape[0],), device=self.device + ).long() + # t = torch.randint(500, 501, (x.shape[0],), device=self.device).long() + if self.model.conditioning_key is not None: + assert c is not None + if self.cond_stage_trainable: + c = self.get_learned_conditioning(c) + if self.shorten_cond_schedule: # TODO: drop this option + tc = self.cond_ids[t].to(self.device) + c = self.q_sample(x_start=c, t=tc, noise=torch.randn_like(c.float())) + return self.p_losses(x, c, t, *args, **kwargs) + + def apply_model(self, x_noisy, t, cond, return_ids=False): + if isinstance(cond, dict): + # hybrid case, cond is expected to be a dict + pass + else: + if not isinstance(cond, list): + cond = [cond] + key = ( + "c_concat" if self.model.conditioning_key == "concat" else "c_crossattn" + ) + cond = {key: cond} + + x_recon = self.model(x_noisy, t, **cond) + + if isinstance(x_recon, tuple) and not return_ids: + return x_recon[0] + else: + return x_recon + + def _predict_eps_from_xstart(self, x_t, t, pred_xstart): + return ( + extract_into_tensor(self.sqrt_recip_alphas_cumprod, t, x_t.shape) * x_t + - pred_xstart + ) / extract_into_tensor(self.sqrt_recipm1_alphas_cumprod, t, x_t.shape) + + def _prior_bpd(self, x_start): + """ + Get the prior KL term for the variational lower-bound, measured in + bits-per-dim. + This term can't be optimized, as it only depends on the encoder. + :param x_start: the [N x C x ...] tensor of inputs. + :return: a batch of [N] KL values (in bits), one per batch element. 
+ """ + batch_size = x_start.shape[0] + t = torch.tensor([self.num_timesteps - 1] * batch_size, device=x_start.device) + qt_mean, _, qt_log_variance = self.q_mean_variance(x_start, t) + kl_prior = normal_kl( + mean1=qt_mean, logvar1=qt_log_variance, mean2=0.0, logvar2=0.0 + ) + return mean_flat(kl_prior) / np.log(2.0) + + def p_mean_variance( + self, + x, + c, + t, + clip_denoised: bool, + return_codebook_ids=False, + quantize_denoised=False, + return_x0=False, + score_corrector=None, + corrector_kwargs=None, + ): + t_in = t + model_out = self.apply_model(x, t_in, c, return_ids=return_codebook_ids) + + if score_corrector is not None: + assert self.parameterization == "eps" + model_out = score_corrector.modify_score( + self, model_out, x, t, c, **corrector_kwargs + ) + + if return_codebook_ids: + model_out, logits = model_out + + if self.parameterization == "eps": + x_recon = self.predict_start_from_noise(x, t=t, noise=model_out) + elif self.parameterization == "x0": + x_recon = model_out + else: + raise NotImplementedError() + + if clip_denoised: + x_recon.clamp_(-1.0, 1.0) + if quantize_denoised: + x_recon, _, [_, _, indices] = self.first_stage_model.quantize(x_recon) + model_mean, posterior_variance, posterior_log_variance = self.q_posterior( + x_start=x_recon, x_t=x, t=t + ) + if return_codebook_ids: + return model_mean, posterior_variance, posterior_log_variance, logits + elif return_x0: + return model_mean, posterior_variance, posterior_log_variance, x_recon + else: + return model_mean, posterior_variance, posterior_log_variance + + @torch.no_grad() + def p_sample( + self, + x, + c, + t, + clip_denoised=False, + repeat_noise=False, + return_codebook_ids=False, + quantize_denoised=False, + return_x0=False, + temperature=1.0, + noise_dropout=0.0, + score_corrector=None, + corrector_kwargs=None, + ): + b, *_, device = *x.shape, x.device + outputs = self.p_mean_variance( + x=x, + c=c, + t=t, + clip_denoised=clip_denoised, + return_codebook_ids=return_codebook_ids, + quantize_denoised=quantize_denoised, + return_x0=return_x0, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + ) + if return_codebook_ids: + raise DeprecationWarning("Support dropped.") + model_mean, _, model_log_variance, logits = outputs + elif return_x0: + model_mean, _, model_log_variance, x0 = outputs + else: + model_mean, _, model_log_variance = outputs + + noise = noise_like(x.shape, device, repeat_noise) * temperature + if noise_dropout > 0.0: + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + # no noise when t == 0 + nonzero_mask = (1 - (t == 0).float()).reshape(b, *((1,) * (len(x.shape) - 1))) + + if return_codebook_ids: + return model_mean + nonzero_mask * ( + 0.5 * model_log_variance + ).exp() * noise, logits.argmax(dim=1) + if return_x0: + return ( + model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise, + x0, + ) + else: + return model_mean + nonzero_mask * (0.5 * model_log_variance).exp() * noise + + @torch.no_grad() + def progressive_denoising( + self, + cond, + shape, + verbose=True, + callback=None, + quantize_denoised=False, + img_callback=None, + mask=None, + x0=None, + temperature=1.0, + noise_dropout=0.0, + score_corrector=None, + corrector_kwargs=None, + batch_size=None, + x_T=None, + start_T=None, + log_every_t=None, + ): + if not log_every_t: + log_every_t = self.log_every_t + timesteps = self.num_timesteps + if batch_size is not None: + b = batch_size if batch_size is not None else shape[0] + shape = [batch_size] + list(shape) + else: + b = batch_size 
= shape[0] + if x_T is None: + img = torch.randn(shape, device=self.device) + else: + img = x_T + intermediates = [] + if cond is not None: + if isinstance(cond, dict): + cond = { + key: cond[key][:batch_size] + if not isinstance(cond[key], list) + else list(map(lambda x: x[:batch_size], cond[key])) + for key in cond + } + else: + cond = ( + [c[:batch_size] for c in cond] + if isinstance(cond, list) + else cond[:batch_size] + ) + + if start_T is not None: + timesteps = min(timesteps, start_T) + iterator = ( + tqdm( + reversed(range(0, timesteps)), + desc="Progressive Generation", + total=timesteps, + ) + if verbose + else reversed(range(0, timesteps)) + ) + if type(temperature) == float: + temperature = [temperature] * timesteps + + for i in iterator: + ts = torch.full((b,), i, device=self.device, dtype=torch.long) + if self.shorten_cond_schedule: + assert self.model.conditioning_key != "hybrid" + tc = self.cond_ids[ts].to(cond.device) + cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond)) + + img, x0_partial = self.p_sample( + img, + cond, + ts, + clip_denoised=self.clip_denoised, + quantize_denoised=quantize_denoised, + return_x0=True, + temperature=temperature[i], + noise_dropout=noise_dropout, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + ) + if mask is not None: + assert x0 is not None + img_orig = self.q_sample(x0, ts) + img = img_orig * mask + (1.0 - mask) * img + + if i % log_every_t == 0 or i == timesteps - 1: + intermediates.append(x0_partial) + if callback: + callback(i) + if img_callback: + img_callback(img, i) + return img, intermediates + + @torch.no_grad() + def p_sample_loop( + self, + cond, + shape, + return_intermediates=False, + x_T=None, + verbose=True, + callback=None, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + img_callback=None, + start_T=None, + log_every_t=None, + ): + if not log_every_t: + log_every_t = self.log_every_t + device = self.betas.device + b = shape[0] + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + intermediates = [img] + if timesteps is None: + timesteps = self.num_timesteps + + if start_T is not None: + timesteps = min(timesteps, start_T) + iterator = ( + tqdm(reversed(range(0, timesteps)), desc="Sampling t", total=timesteps) + if verbose + else reversed(range(0, timesteps)) + ) + + if mask is not None: + assert x0 is not None + assert x0.shape[2:3] == mask.shape[2:3] # spatial size has to match + + for i in iterator: + ts = torch.full((b,), i, device=device, dtype=torch.long) + if self.shorten_cond_schedule: + assert self.model.conditioning_key != "hybrid" + tc = self.cond_ids[ts].to(cond.device) + cond = self.q_sample(x_start=cond, t=tc, noise=torch.randn_like(cond)) + + img = self.p_sample( + img, + cond, + ts, + clip_denoised=self.clip_denoised, + quantize_denoised=quantize_denoised, + ) + if mask is not None: + img_orig = self.q_sample(x0, ts) + img = img_orig * mask + (1.0 - mask) * img + + if i % log_every_t == 0 or i == timesteps - 1: + intermediates.append(img) + if callback: + callback(i) + if img_callback: + img_callback(img, i) + + if return_intermediates: + return img, intermediates + return img + + @torch.no_grad() + def sample( + self, + cond, + batch_size=16, + return_intermediates=False, + x_T=None, + verbose=True, + timesteps=None, + quantize_denoised=False, + mask=None, + x0=None, + shape=None, + **kwargs, + ): + if shape is None: + shape = (batch_size, self.channels, self.image_size, self.image_size) + if cond is not None: 
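+ # Trim every conditioning entry down to the requested batch size (handles dict, list and plain tensor conditionings).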
+ if isinstance(cond, dict): + cond = { + key: cond[key][:batch_size] + if not isinstance(cond[key], list) + else list(map(lambda x: x[:batch_size], cond[key])) + for key in cond + } + else: + cond = ( + [c[:batch_size] for c in cond] + if isinstance(cond, list) + else cond[:batch_size] + ) + return self.p_sample_loop( + cond, + shape, + return_intermediates=return_intermediates, + x_T=x_T, + verbose=verbose, + timesteps=timesteps, + quantize_denoised=quantize_denoised, + mask=mask, + x0=x0, + ) + + @torch.no_grad() + def sample_log(self, cond, batch_size, ddim, ddim_steps, **kwargs): + if ddim: + ddim_sampler = DDIMSampler(self) + shape = (self.channels, self.image_size, self.image_size) + samples, intermediates = ddim_sampler.sample( + ddim_steps, batch_size, shape, cond, verbose=False, **kwargs + ) + + else: + samples, intermediates = self.sample( + cond=cond, batch_size=batch_size, return_intermediates=True, **kwargs + ) + + return samples, intermediates + + @torch.no_grad() + def get_unconditional_conditioning(self, batch_size, null_label=None): + if null_label is not None: + xc = null_label + if isinstance(xc, ListConfig): + xc = list(xc) + if isinstance(xc, dict) or isinstance(xc, list): + c = self.get_learned_conditioning(xc) + else: + if hasattr(xc, "to"): + xc = xc.to(self.device) + c = self.get_learned_conditioning(xc) + else: + if self.cond_stage_key in ["class_label", "cls"]: + xc = self.cond_stage_model.get_unconditional_conditioning( + batch_size, device=self.device + ) + return self.get_learned_conditioning(xc) + else: + raise NotImplementedError("todo") + if isinstance(c, list): # in case the encoder gives us a list + for i in range(len(c)): + c[i] = repeat(c[i], "1 ... -> b ...", b=batch_size).to(self.device) + else: + c = repeat(c, "1 ... 
-> b ...", b=batch_size).to(self.device) + return c + + @torch.no_grad() + def log_images( + self, + batch, + N=8, + n_row=4, + sample=True, + ddim_steps=50, + ddim_eta=0.0, + return_keys=None, + quantize_denoised=True, + inpaint=True, + plot_denoise_rows=False, + plot_progressive_rows=True, + plot_diffusion_rows=True, + unconditional_guidance_scale=1.0, + unconditional_guidance_label=None, + use_ema_scope=True, + **kwargs, + ): + ema_scope = self.ema_scope if use_ema_scope else nullcontext + use_ddim = ddim_steps is not None + + log = dict() + z, c, x, xrec, xc = self.get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=N, + ) + N = min(x.shape[0], N) + n_row = min(x.shape[0], n_row) + log["inputs"] = x + log["reconstruction"] = xrec + if self.model.conditioning_key is not None: + if hasattr(self.cond_stage_model, "decode"): + xc = self.cond_stage_model.decode(c) + log["conditioning"] = xc + elif self.cond_stage_key in ["caption", "txt"]: + xc = log_txt_as_img( + (x.shape[2], x.shape[3]), + batch[self.cond_stage_key], + size=x.shape[2] // 25, + ) + log["conditioning"] = xc + elif self.cond_stage_key in ["class_label", "cls"]: + try: + xc = log_txt_as_img( + (x.shape[2], x.shape[3]), + batch["human_label"], + size=x.shape[2] // 25, + ) + log["conditioning"] = xc + except KeyError: + # probably no "human_label" in batch + pass + elif isimage(xc): + log["conditioning"] = xc + if ismap(xc): + log["original_conditioning"] = self.to_rgb(xc) + + if plot_diffusion_rows: + # get diffusion row + diffusion_row = list() + z_start = z[:n_row] + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), "1 -> b", b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(z_start) + z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) + diffusion_row.append(self.decode_first_stage(z_noisy)) + + diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W + diffusion_grid = rearrange(diffusion_row, "n b c h w -> b n c h w") + diffusion_grid = rearrange(diffusion_grid, "b n c h w -> (b n) c h w") + diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0]) + log["diffusion_row"] = diffusion_grid + + if sample: + # get denoise row + with ema_scope("Sampling"): + samples, z_denoise_row = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + ) + # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True) + x_samples = self.decode_first_stage(samples) + log["samples"] = x_samples + if plot_denoise_rows: + denoise_grid = self._get_denoise_row_from_list(z_denoise_row) + log["denoise_row"] = denoise_grid + + if ( + quantize_denoised + and not isinstance(self.first_stage_model, AutoencoderKL) + and not isinstance(self.first_stage_model, IdentityFirstStage) + ): + # also display when quantizing x0 while sampling + with ema_scope("Plotting Quantized Denoised"): + samples, z_denoise_row = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + quantize_denoised=True, + ) + # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True, + # quantize_denoised=True) + x_samples = self.decode_first_stage(samples.to(self.device)) + log["samples_x0_quantized"] = x_samples + + if unconditional_guidance_scale > 1.0: + uc = self.get_unconditional_conditioning(N, 
unconditional_guidance_label) + if self.model.conditioning_key == "crossattn-adm": + uc = {"c_crossattn": [uc], "c_adm": c["c_adm"]} + with ema_scope("Sampling with classifier-free guidance"): + samples_cfg, _ = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=uc, + ) + x_samples_cfg = self.decode_first_stage(samples_cfg) + log[ + f"samples_cfg_scale_{unconditional_guidance_scale:.2f}" + ] = x_samples_cfg + + if inpaint: + # make a simple center square + b, h, w = z.shape[0], z.shape[2], z.shape[3] + mask = torch.ones(N, h, w).to(self.device) + # zeros will be filled in + mask[:, h // 4 : 3 * h // 4, w // 4 : 3 * w // 4] = 0.0 + mask = mask[:, None, ...] + with ema_scope("Plotting Inpaint"): + samples, _ = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + eta=ddim_eta, + ddim_steps=ddim_steps, + x0=z[:N], + mask=mask, + ) + x_samples = self.decode_first_stage(samples.to(self.device)) + log["samples_inpainting"] = x_samples + log["mask"] = mask + + # outpaint + mask = 1.0 - mask + with ema_scope("Plotting Outpaint"): + samples, _ = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + eta=ddim_eta, + ddim_steps=ddim_steps, + x0=z[:N], + mask=mask, + ) + x_samples = self.decode_first_stage(samples.to(self.device)) + log["samples_outpainting"] = x_samples + + if plot_progressive_rows: + with ema_scope("Plotting Progressives"): + img, progressives = self.progressive_denoising( + c, + shape=(self.channels, self.image_size, self.image_size), + batch_size=N, + ) + prog_row = self._get_denoise_row_from_list( + progressives, desc="Progressive Generation" + ) + log["progressive_row"] = prog_row + + if return_keys: + if np.intersect1d(list(log.keys()), return_keys).shape[0] == 0: + return log + else: + return {key: log[key] for key in return_keys} + return log + + def configure_optimizers(self): + lr = self.learning_rate + params = list(self.model.parameters()) + if self.cond_stage_trainable: + print(f"{self.__class__.__name__}: Also optimizing conditioner params!") + params = params + list(self.cond_stage_model.parameters()) + if self.learn_logvar: + print("Diffusion model optimizing logvar") + params.append(self.logvar) + opt = torch.optim.AdamW(params, lr=lr) + if self.use_scheduler: + assert "target" in self.scheduler_config + scheduler = instantiate_from_config(self.scheduler_config) + + print("Setting up LambdaLR scheduler...") + scheduler = [ + { + "scheduler": LambdaLR(opt, lr_lambda=scheduler.schedule), + "interval": "step", + "frequency": 1, + } + ] + return [opt], scheduler + return opt + + @torch.no_grad() + def to_rgb(self, x): + x = x.float() + if not hasattr(self, "colorize"): + self.colorize = torch.randn(3, x.shape[1], 1, 1).to(x) + x = nn.functional.conv2d(x, weight=self.colorize) + x = 2.0 * (x - x.min()) / (x.max() - x.min()) - 1.0 + return x + + +class DiffusionWrapper(torch.nn.Module): + def __init__(self, diff_model_config, conditioning_key): + super().__init__() + self.sequential_cross_attn = diff_model_config.pop( + "sequential_crossattn", False + ) + self.diffusion_model = instantiate_from_config(diff_model_config) + self.conditioning_key = conditioning_key + assert self.conditioning_key in [ + None, + "concat", + "crossattn", + "hybrid", + "adm", + "hybrid-adm", + "crossattn-adm", + ] + + def forward( + self, x, t, c_concat: list = None, c_crossattn: list = None, c_adm=None + ): + if self.conditioning_key is None: 
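+ # Unconditional case: the diffusion UNet only sees the noisy input and the timestep.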
+ out = self.diffusion_model(x, t) + elif self.conditioning_key == "concat": + xc = torch.cat([x] + c_concat, dim=1) + out = self.diffusion_model(xc, t) + elif self.conditioning_key == "crossattn": + if not self.sequential_cross_attn: + cc = torch.cat(c_crossattn, 1) + else: + cc = c_crossattn + out = self.diffusion_model(x, t, context=cc) + elif self.conditioning_key == "hybrid": + xc = torch.cat([x] + c_concat, dim=1) + cc = torch.cat(c_crossattn, 1) + out = self.diffusion_model(xc, t, context=cc) + elif self.conditioning_key == "hybrid-adm": + assert c_adm is not None + xc = torch.cat([x] + c_concat, dim=1) + cc = torch.cat(c_crossattn, 1) + out = self.diffusion_model(xc, t, context=cc, y=c_adm) + elif self.conditioning_key == "crossattn-adm": + assert c_adm is not None + cc = torch.cat(c_crossattn, 1) + out = self.diffusion_model(x, t, context=cc, y=c_adm) + elif self.conditioning_key == "adm": + cc = c_crossattn[0] + out = self.diffusion_model(x, t, y=cc) + else: + raise NotImplementedError() + + return out + + +class LatentUpscaleDiffusion(LatentDiffusion): + def __init__( + self, + *args, + low_scale_config, + low_scale_key="LR", + noise_level_key=None, + **kwargs, + ): + super().__init__(*args, **kwargs) + # assumes that neither the cond_stage nor the low_scale_model contain trainable params + assert not self.cond_stage_trainable + self.instantiate_low_stage(low_scale_config) + self.low_scale_key = low_scale_key + self.noise_level_key = noise_level_key + + def instantiate_low_stage(self, config): + model = instantiate_from_config(config) + self.low_scale_model = model.eval() + self.low_scale_model.train = disabled_train + for param in self.low_scale_model.parameters(): + param.requires_grad = False + + @torch.no_grad() + def get_input(self, batch, k, cond_key=None, bs=None, log_mode=False): + if not log_mode: + z, c = super().get_input(batch, k, force_c_encode=True, bs=bs) + else: + z, c, x, xrec, xc = super().get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=bs, + ) + x_low = batch[self.low_scale_key][:bs] + x_low = rearrange(x_low, "b h w c -> b c h w") + x_low = x_low.to(memory_format=torch.contiguous_format).float() + zx, noise_level = self.low_scale_model(x_low) + if self.noise_level_key is not None: + # get noise level from batch instead, e.g. 
when extracting a custom noise level for bsr + raise NotImplementedError("TODO") + + all_conds = {"c_concat": [zx], "c_crossattn": [c], "c_adm": noise_level} + if log_mode: + # TODO: maybe disable if too expensive + x_low_rec = self.low_scale_model.decode(zx) + return z, all_conds, x, xrec, xc, x_low, x_low_rec, noise_level + return z, all_conds + + @torch.no_grad() + def log_images( + self, + batch, + N=8, + n_row=4, + sample=True, + ddim_steps=200, + ddim_eta=1.0, + return_keys=None, + plot_denoise_rows=False, + plot_progressive_rows=True, + plot_diffusion_rows=True, + unconditional_guidance_scale=1.0, + unconditional_guidance_label=None, + use_ema_scope=True, + **kwargs, + ): + ema_scope = self.ema_scope if use_ema_scope else nullcontext + use_ddim = ddim_steps is not None + + log = dict() + z, c, x, xrec, xc, x_low, x_low_rec, noise_level = self.get_input( + batch, self.first_stage_key, bs=N, log_mode=True + ) + N = min(x.shape[0], N) + n_row = min(x.shape[0], n_row) + log["inputs"] = x + log["reconstruction"] = xrec + log["x_lr"] = x_low + log[ + f"x_lr_rec_@noise_levels{'-'.join(map(lambda x: str(x), list(noise_level.cpu().numpy())))}" + ] = x_low_rec + if self.model.conditioning_key is not None: + if hasattr(self.cond_stage_model, "decode"): + xc = self.cond_stage_model.decode(c) + log["conditioning"] = xc + elif self.cond_stage_key in ["caption", "txt"]: + xc = log_txt_as_img( + (x.shape[2], x.shape[3]), + batch[self.cond_stage_key], + size=x.shape[2] // 25, + ) + log["conditioning"] = xc + elif self.cond_stage_key in ["class_label", "cls"]: + xc = log_txt_as_img( + (x.shape[2], x.shape[3]), + batch["human_label"], + size=x.shape[2] // 25, + ) + log["conditioning"] = xc + elif isimage(xc): + log["conditioning"] = xc + if ismap(xc): + log["original_conditioning"] = self.to_rgb(xc) + + if plot_diffusion_rows: + # get diffusion row + diffusion_row = list() + z_start = z[:n_row] + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), "1 -> b", b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(z_start) + z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) + diffusion_row.append(self.decode_first_stage(z_noisy)) + + diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W + diffusion_grid = rearrange(diffusion_row, "n b c h w -> b n c h w") + diffusion_grid = rearrange(diffusion_grid, "b n c h w -> (b n) c h w") + diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0]) + log["diffusion_row"] = diffusion_grid + + if sample: + # get denoise row + with ema_scope("Sampling"): + samples, z_denoise_row = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + ) + # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True) + x_samples = self.decode_first_stage(samples) + log["samples"] = x_samples + if plot_denoise_rows: + denoise_grid = self._get_denoise_row_from_list(z_denoise_row) + log["denoise_row"] = denoise_grid + + if unconditional_guidance_scale > 1.0: + uc_tmp = self.get_unconditional_conditioning( + N, unconditional_guidance_label + ) + # TODO explore better "unconditional" choices for the other keys + # maybe guide away from empty text label and highest noise level and maximally degraded zx? + uc = dict() + for k in c: + if k == "c_crossattn": + assert isinstance(c[k], list) and len(c[k]) == 1 + uc[k] = [uc_tmp] + elif k == "c_adm": # todo: only run with text-based guidance? 
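+ # Keep the conditional noise-level embedding unchanged in the unconditional branch; only the cross-attention (text) conditioning is swapped out.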
+ assert isinstance(c[k], torch.Tensor) + # uc[k] = torch.ones_like(c[k]) * self.low_scale_model.max_noise_level + uc[k] = c[k] + elif isinstance(c[k], list): + uc[k] = [c[k][i] for i in range(len(c[k]))] + else: + uc[k] = c[k] + + with ema_scope("Sampling with classifier-free guidance"): + samples_cfg, _ = self.sample_log( + cond=c, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=uc, + ) + x_samples_cfg = self.decode_first_stage(samples_cfg) + log[ + f"samples_cfg_scale_{unconditional_guidance_scale:.2f}" + ] = x_samples_cfg + + if plot_progressive_rows: + with ema_scope("Plotting Progressives"): + img, progressives = self.progressive_denoising( + c, + shape=(self.channels, self.image_size, self.image_size), + batch_size=N, + ) + prog_row = self._get_denoise_row_from_list( + progressives, desc="Progressive Generation" + ) + log["progressive_row"] = prog_row + + return log + + +class LatentFinetuneDiffusion(LatentDiffusion): + """ + Basis for different finetunas, such as inpainting or depth2image + To disable finetuning mode, set finetune_keys to None + """ + + def __init__( + self, + concat_keys: tuple, + finetune_keys=( + "model.diffusion_model.input_blocks.0.0.weight", + "model_ema.diffusion_modelinput_blocks00weight", + ), + keep_finetune_dims=4, + # if model was trained without concat mode before and we would like to keep these channels + c_concat_log_start=None, # to log reconstruction of c_concat codes + c_concat_log_end=None, + *args, + **kwargs, + ): + ckpt_path = kwargs.pop("ckpt_path", None) + ignore_keys = kwargs.pop("ignore_keys", list()) + super().__init__(*args, **kwargs) + self.finetune_keys = finetune_keys + self.concat_keys = concat_keys + self.keep_dims = keep_finetune_dims + self.c_concat_log_start = c_concat_log_start + self.c_concat_log_end = c_concat_log_end + if exists(self.finetune_keys): + assert exists(ckpt_path), "can only finetune from a given checkpoint" + if exists(ckpt_path): + self.init_from_ckpt(ckpt_path, ignore_keys) + + def init_from_ckpt(self, path, ignore_keys=list(), only_model=False): + sd = torch.load(path, map_location="cpu") + if "state_dict" in list(sd.keys()): + sd = sd["state_dict"] + keys = list(sd.keys()) + for k in keys: + for ik in ignore_keys: + if k.startswith(ik): + print("Deleting key {} from state_dict.".format(k)) + del sd[k] + + # make it explicit, finetune by including extra input channels + if exists(self.finetune_keys) and k in self.finetune_keys: + new_entry = None + for name, param in self.named_parameters(): + if name in self.finetune_keys: + print( + f"modifying key '{name}' and keeping its original {self.keep_dims} (channels) dimensions only" + ) + new_entry = torch.zeros_like(param) # zero init + assert exists(new_entry), "did not find matching parameter to modify" + new_entry[:, : self.keep_dims, ...] 
= sd[k] + sd[k] = new_entry + + missing, unexpected = ( + self.load_state_dict(sd, strict=False) + if not only_model + else self.model.load_state_dict(sd, strict=False) + ) + print( + f"Restored from {path} with {len(missing)} missing and {len(unexpected)} unexpected keys" + ) + if len(missing) > 0: + print(f"Missing Keys: {missing}") + if len(unexpected) > 0: + print(f"Unexpected Keys: {unexpected}") + + @torch.no_grad() + def log_images( + self, + batch, + N=8, + n_row=4, + sample=True, + ddim_steps=200, + ddim_eta=1.0, + return_keys=None, + quantize_denoised=True, + inpaint=True, + plot_denoise_rows=False, + plot_progressive_rows=True, + plot_diffusion_rows=True, + unconditional_guidance_scale=1.0, + unconditional_guidance_label=None, + use_ema_scope=True, + **kwargs, + ): + ema_scope = self.ema_scope if use_ema_scope else nullcontext + use_ddim = ddim_steps is not None + + log = dict() + z, c, x, xrec, xc = self.get_input( + batch, self.first_stage_key, bs=N, return_first_stage_outputs=True + ) + c_cat, c = c["c_concat"][0], c["c_crossattn"][0] + N = min(x.shape[0], N) + n_row = min(x.shape[0], n_row) + log["inputs"] = x + log["reconstruction"] = xrec + if self.model.conditioning_key is not None: + if hasattr(self.cond_stage_model, "decode"): + xc = self.cond_stage_model.decode(c) + log["conditioning"] = xc + elif self.cond_stage_key in ["caption", "txt"]: + xc = log_txt_as_img( + (x.shape[2], x.shape[3]), + batch[self.cond_stage_key], + size=x.shape[2] // 25, + ) + log["conditioning"] = xc + elif self.cond_stage_key in ["class_label", "cls"]: + xc = log_txt_as_img( + (x.shape[2], x.shape[3]), + batch["human_label"], + size=x.shape[2] // 25, + ) + log["conditioning"] = xc + elif isimage(xc): + log["conditioning"] = xc + if ismap(xc): + log["original_conditioning"] = self.to_rgb(xc) + + if not (self.c_concat_log_start is None and self.c_concat_log_end is None): + log["c_concat_decoded"] = self.decode_first_stage( + c_cat[:, self.c_concat_log_start : self.c_concat_log_end] + ) + + if plot_diffusion_rows: + # get diffusion row + diffusion_row = list() + z_start = z[:n_row] + for t in range(self.num_timesteps): + if t % self.log_every_t == 0 or t == self.num_timesteps - 1: + t = repeat(torch.tensor([t]), "1 -> b", b=n_row) + t = t.to(self.device).long() + noise = torch.randn_like(z_start) + z_noisy = self.q_sample(x_start=z_start, t=t, noise=noise) + diffusion_row.append(self.decode_first_stage(z_noisy)) + + diffusion_row = torch.stack(diffusion_row) # n_log_step, n_row, C, H, W + diffusion_grid = rearrange(diffusion_row, "n b c h w -> b n c h w") + diffusion_grid = rearrange(diffusion_grid, "b n c h w -> (b n) c h w") + diffusion_grid = make_grid(diffusion_grid, nrow=diffusion_row.shape[0]) + log["diffusion_row"] = diffusion_grid + + if sample: + # get denoise row + with ema_scope("Sampling"): + samples, z_denoise_row = self.sample_log( + cond={"c_concat": [c_cat], "c_crossattn": [c]}, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + ) + # samples, z_denoise_row = self.sample(cond=c, batch_size=N, return_intermediates=True) + x_samples = self.decode_first_stage(samples) + log["samples"] = x_samples + if plot_denoise_rows: + denoise_grid = self._get_denoise_row_from_list(z_denoise_row) + log["denoise_row"] = denoise_grid + + if unconditional_guidance_scale > 1.0: + uc_cross = self.get_unconditional_conditioning( + N, unconditional_guidance_label + ) + uc_cat = c_cat + uc_full = {"c_concat": [uc_cat], "c_crossattn": [uc_cross]} + with ema_scope("Sampling with 
classifier-free guidance"): + samples_cfg, _ = self.sample_log( + cond={"c_concat": [c_cat], "c_crossattn": [c]}, + batch_size=N, + ddim=use_ddim, + ddim_steps=ddim_steps, + eta=ddim_eta, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=uc_full, + ) + x_samples_cfg = self.decode_first_stage(samples_cfg) + log[ + f"samples_cfg_scale_{unconditional_guidance_scale:.2f}" + ] = x_samples_cfg + + return log + + +class LatentInpaintDiffusion(LatentFinetuneDiffusion): + """ + can either run as pure inpainting model (only concat mode) or with mixed conditionings, + e.g. mask as concat and text via cross-attn. + To disable finetuning mode, set finetune_keys to None + """ + + def __init__( + self, + concat_keys=("mask", "masked_image"), + masked_image_key="masked_image", + *args, + **kwargs, + ): + super().__init__(concat_keys, *args, **kwargs) + self.masked_image_key = masked_image_key + assert self.masked_image_key in concat_keys + + @torch.no_grad() + def get_input( + self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False + ): + # note: restricted to non-trainable encoders currently + assert ( + not self.cond_stage_trainable + ), "trainable cond stages not yet supported for inpainting" + z, c, x, xrec, xc = super().get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=bs, + ) + + assert exists(self.concat_keys) + c_cat = list() + for ck in self.concat_keys: + cc = ( + rearrange(batch[ck], "b h w c -> b c h w") + .to(memory_format=torch.contiguous_format) + .float() + ) + if bs is not None: + cc = cc[:bs] + cc = cc.to(self.device) + bchw = z.shape + if ck != self.masked_image_key: + cc = torch.nn.functional.interpolate(cc, size=bchw[-2:]) + else: + cc = self.get_first_stage_encoding(self.encode_first_stage(cc)) + c_cat.append(cc) + c_cat = torch.cat(c_cat, dim=1) + all_conds = {"c_concat": [c_cat], "c_crossattn": [c]} + if return_first_stage_outputs: + return z, all_conds, x, xrec, xc + return z, all_conds + + @torch.no_grad() + def log_images(self, *args, **kwargs): + log = super(LatentInpaintDiffusion, self).log_images(*args, **kwargs) + log["masked_image"] = ( + rearrange(args[0]["masked_image"], "b h w c -> b c h w") + .to(memory_format=torch.contiguous_format) + .float() + ) + return log + + +class LatentDepth2ImageDiffusion(LatentFinetuneDiffusion): + """ + condition on monocular depth estimation + """ + + def __init__(self, depth_stage_config, concat_keys=("midas_in",), *args, **kwargs): + super().__init__(concat_keys=concat_keys, *args, **kwargs) + self.depth_model = instantiate_from_config(depth_stage_config) + self.depth_stage_key = concat_keys[0] + + @torch.no_grad() + def get_input( + self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False + ): + # note: restricted to non-trainable encoders currently + assert ( + not self.cond_stage_trainable + ), "trainable cond stages not yet supported for depth2img" + z, c, x, xrec, xc = super().get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=bs, + ) + + assert exists(self.concat_keys) + assert len(self.concat_keys) == 1 + c_cat = list() + for ck in self.concat_keys: + cc = batch[ck] + if bs is not None: + cc = cc[:bs] + cc = cc.to(self.device) + cc = self.depth_model(cc) + cc = torch.nn.functional.interpolate( + cc, + size=z.shape[2:], + mode="bicubic", + align_corners=False, + ) + + depth_min, depth_max = 
torch.amin( + cc, dim=[1, 2, 3], keepdim=True + ), torch.amax(cc, dim=[1, 2, 3], keepdim=True) + cc = 2.0 * (cc - depth_min) / (depth_max - depth_min + 0.001) - 1.0 + c_cat.append(cc) + c_cat = torch.cat(c_cat, dim=1) + all_conds = {"c_concat": [c_cat], "c_crossattn": [c]} + if return_first_stage_outputs: + return z, all_conds, x, xrec, xc + return z, all_conds + + @torch.no_grad() + def log_images(self, *args, **kwargs): + log = super().log_images(*args, **kwargs) + depth = self.depth_model(args[0][self.depth_stage_key]) + depth_min, depth_max = torch.amin( + depth, dim=[1, 2, 3], keepdim=True + ), torch.amax(depth, dim=[1, 2, 3], keepdim=True) + log["depth"] = 2.0 * (depth - depth_min) / (depth_max - depth_min) - 1.0 + return log + + +class LatentUpscaleFinetuneDiffusion(LatentFinetuneDiffusion): + """ + condition on low-res image (and optionally on some spatial noise augmentation) + """ + + def __init__( + self, + concat_keys=("lr",), + reshuffle_patch_size=None, + low_scale_config=None, + low_scale_key=None, + *args, + **kwargs, + ): + super().__init__(concat_keys=concat_keys, *args, **kwargs) + self.reshuffle_patch_size = reshuffle_patch_size + self.low_scale_model = None + if low_scale_config is not None: + print("Initializing a low-scale model") + assert exists(low_scale_key) + self.instantiate_low_stage(low_scale_config) + self.low_scale_key = low_scale_key + + def instantiate_low_stage(self, config): + model = instantiate_from_config(config) + self.low_scale_model = model.eval() + self.low_scale_model.train = disabled_train + for param in self.low_scale_model.parameters(): + param.requires_grad = False + + @torch.no_grad() + def get_input( + self, batch, k, cond_key=None, bs=None, return_first_stage_outputs=False + ): + # note: restricted to non-trainable encoders currently + assert ( + not self.cond_stage_trainable + ), "trainable cond stages not yet supported for upscaling-ft" + z, c, x, xrec, xc = super().get_input( + batch, + self.first_stage_key, + return_first_stage_outputs=True, + force_c_encode=True, + return_original_cond=True, + bs=bs, + ) + + assert exists(self.concat_keys) + assert len(self.concat_keys) == 1 + # optionally make spatial noise_level here + c_cat = list() + noise_level = None + for ck in self.concat_keys: + cc = batch[ck] + cc = rearrange(cc, "b h w c -> b c h w") + if exists(self.reshuffle_patch_size): + assert isinstance(self.reshuffle_patch_size, int) + cc = rearrange( + cc, + "b c (p1 h) (p2 w) -> b (p1 p2 c) h w", + p1=self.reshuffle_patch_size, + p2=self.reshuffle_patch_size, + ) + if bs is not None: + cc = cc[:bs] + cc = cc.to(self.device) + if exists(self.low_scale_model) and ck == self.low_scale_key: + cc, noise_level = self.low_scale_model(cc) + c_cat.append(cc) + c_cat = torch.cat(c_cat, dim=1) + if exists(noise_level): + all_conds = {"c_concat": [c_cat], "c_crossattn": [c], "c_adm": noise_level} + else: + all_conds = {"c_concat": [c_cat], "c_crossattn": [c]} + if return_first_stage_outputs: + return z, all_conds, x, xrec, xc + return z, all_conds + + @torch.no_grad() + def log_images(self, *args, **kwargs): + log = super().log_images(*args, **kwargs) + log["lr"] = rearrange(args[0]["lr"], "b h w c -> b c h w") + return log diff --git a/inpaint/model/anytext/ldm/models/diffusion/dpm_solver/__init__.py b/inpaint/model/anytext/ldm/models/diffusion/dpm_solver/__init__.py new file mode 100644 index 0000000..7427f38 --- /dev/null +++ b/inpaint/model/anytext/ldm/models/diffusion/dpm_solver/__init__.py @@ -0,0 +1 @@ +from .sampler import 
DPMSolverSampler
\ No newline at end of file
diff --git a/inpaint/model/anytext/ldm/models/diffusion/dpm_solver/dpm_solver.py b/inpaint/model/anytext/ldm/models/diffusion/dpm_solver/dpm_solver.py
new file mode 100644
index 0000000..095e5ba
--- /dev/null
+++ b/inpaint/model/anytext/ldm/models/diffusion/dpm_solver/dpm_solver.py
@@ -0,0 +1,1154 @@
+import torch
+import torch.nn.functional as F
+import math
+from tqdm import tqdm
+
+
+class NoiseScheduleVP:
+    def __init__(
+        self,
+        schedule='discrete',
+        betas=None,
+        alphas_cumprod=None,
+        continuous_beta_0=0.1,
+        continuous_beta_1=20.,
+    ):
+        """Create a wrapper class for the forward SDE (VP type).
+        ***
+        Update: We support discrete-time diffusion models by implementing a piecewise linear interpolation for log_alpha_t.
+        We recommend using schedule='discrete' for discrete-time diffusion models, especially for high-resolution images.
+        ***
+        The forward SDE ensures that the conditional distribution q_{t|0}(x_t | x_0) = N ( alpha_t * x_0, sigma_t^2 * I ).
+        We further define lambda_t = log(alpha_t) - log(sigma_t), which is the half-logSNR (described in the DPM-Solver paper).
+        Therefore, we implement the functions for computing alpha_t, sigma_t and lambda_t. For t in [0, T], we have:
+            log_alpha_t = self.marginal_log_mean_coeff(t)
+            sigma_t = self.marginal_std(t)
+            lambda_t = self.marginal_lambda(t)
+        Moreover, as lambda(t) is an invertible function, we also support its inverse function:
+            t = self.inverse_lambda(lambda_t)
+        ===============================================================
+        We support both discrete-time DPMs (trained on n = 0, 1, ..., N-1) and continuous-time DPMs (trained on t in [t_0, T]).
+        1. For discrete-time DPMs:
+            For discrete-time DPMs trained on n = 0, 1, ..., N-1, we convert the discrete steps to continuous time steps by:
+                t_i = (i + 1) / N
+            e.g. for N = 1000, we have t_0 = 1e-3 and T = t_{N-1} = 1.
+            We solve the corresponding diffusion ODE from time T = 1 to time t_0 = 1e-3.
+            Args:
+                betas: A `torch.Tensor`. The beta array for the discrete-time DPM. (See the original DDPM paper for details)
+                alphas_cumprod: A `torch.Tensor`. The cumprod alphas for the discrete-time DPM. (See the original DDPM paper for details)
+            Note that we always have alphas_cumprod = cumprod(1 - betas). Therefore, we only need to set one of `betas` and `alphas_cumprod`.
+            **Important**: Please pay special attention to the args for `alphas_cumprod`:
+                The `alphas_cumprod` is the \hat{alpha_n} array in the notation of DDPM. Specifically, DDPMs assume that
+                    q_{t_n | 0}(x_{t_n} | x_0) = N ( \sqrt{\hat{alpha_n}} * x_0, (1 - \hat{alpha_n}) * I ).
+                Therefore, the notation \hat{alpha_n} is different from the notation alpha_t in DPM-Solver. In fact, we have
+                    alpha_{t_n} = \sqrt{\hat{alpha_n}},
+                and
+                    log(alpha_{t_n}) = 0.5 * log(\hat{alpha_n}).
+        2. For continuous-time DPMs:
+            We support two types of VPSDEs: linear (DDPM) and cosine (improved-DDPM). The hyperparameters for the noise
+            schedule are the default settings in DDPM and improved-DDPM:
+            Args:
+                beta_min: A `float` number. The smallest beta for the linear schedule.
+                beta_max: A `float` number. The largest beta for the linear schedule.
+                cosine_s: A `float` number. The hyperparameter in the cosine schedule.
+                cosine_beta_max: A `float` number. The hyperparameter in the cosine schedule.
+                T: A `float` number. The ending time of the forward process.
+        ===============================================================
+        Args:
+            schedule: A `str`. The noise schedule of the forward SDE.
'discrete' for discrete-time DPMs, + 'linear' or 'cosine' for continuous-time DPMs. + Returns: + A wrapper object of the forward SDE (VP type). + + =============================================================== + Example: + # For discrete-time DPMs, given betas (the beta array for n = 0, 1, ..., N - 1): + >>> ns = NoiseScheduleVP('discrete', betas=betas) + # For discrete-time DPMs, given alphas_cumprod (the \hat{alpha_n} array for n = 0, 1, ..., N - 1): + >>> ns = NoiseScheduleVP('discrete', alphas_cumprod=alphas_cumprod) + # For continuous-time DPMs (VPSDE), linear schedule: + >>> ns = NoiseScheduleVP('linear', continuous_beta_0=0.1, continuous_beta_1=20.) + """ + + if schedule not in ['discrete', 'linear', 'cosine']: + raise ValueError( + "Unsupported noise schedule {}. The schedule needs to be 'discrete' or 'linear' or 'cosine'".format( + schedule)) + + self.schedule = schedule + if schedule == 'discrete': + if betas is not None: + log_alphas = 0.5 * torch.log(1 - betas).cumsum(dim=0) + else: + assert alphas_cumprod is not None + log_alphas = 0.5 * torch.log(alphas_cumprod) + self.total_N = len(log_alphas) + self.T = 1. + self.t_array = torch.linspace(0., 1., self.total_N + 1)[1:].reshape((1, -1)) + self.log_alpha_array = log_alphas.reshape((1, -1,)) + else: + self.total_N = 1000 + self.beta_0 = continuous_beta_0 + self.beta_1 = continuous_beta_1 + self.cosine_s = 0.008 + self.cosine_beta_max = 999. + self.cosine_t_max = math.atan(self.cosine_beta_max * (1. + self.cosine_s) / math.pi) * 2. * ( + 1. + self.cosine_s) / math.pi - self.cosine_s + self.cosine_log_alpha_0 = math.log(math.cos(self.cosine_s / (1. + self.cosine_s) * math.pi / 2.)) + self.schedule = schedule + if schedule == 'cosine': + # For the cosine schedule, T = 1 will have numerical issues. So we manually set the ending time T. + # Note that T = 0.9946 may be not the optimal setting. However, we find it works well. + self.T = 0.9946 + else: + self.T = 1. + + def marginal_log_mean_coeff(self, t): + """ + Compute log(alpha_t) of a given continuous-time label t in [0, T]. + """ + if self.schedule == 'discrete': + return interpolate_fn(t.reshape((-1, 1)), self.t_array.to(t.device), + self.log_alpha_array.to(t.device)).reshape((-1)) + elif self.schedule == 'linear': + return -0.25 * t ** 2 * (self.beta_1 - self.beta_0) - 0.5 * t * self.beta_0 + elif self.schedule == 'cosine': + log_alpha_fn = lambda s: torch.log(torch.cos((s + self.cosine_s) / (1. + self.cosine_s) * math.pi / 2.)) + log_alpha_t = log_alpha_fn(t) - self.cosine_log_alpha_0 + return log_alpha_t + + def marginal_alpha(self, t): + """ + Compute alpha_t of a given continuous-time label t in [0, T]. + """ + return torch.exp(self.marginal_log_mean_coeff(t)) + + def marginal_std(self, t): + """ + Compute sigma_t of a given continuous-time label t in [0, T]. + """ + return torch.sqrt(1. - torch.exp(2. * self.marginal_log_mean_coeff(t))) + + def marginal_lambda(self, t): + """ + Compute lambda_t = log(alpha_t) - log(sigma_t) of a given continuous-time label t in [0, T]. + """ + log_mean_coeff = self.marginal_log_mean_coeff(t) + log_std = 0.5 * torch.log(1. - torch.exp(2. * log_mean_coeff)) + return log_mean_coeff - log_std + + def inverse_lambda(self, lamb): + """ + Compute the continuous-time label t in [0, T] of a given half-logSNR lambda_t. + """ + if self.schedule == 'linear': + tmp = 2. * (self.beta_1 - self.beta_0) * torch.logaddexp(-2. 
* lamb, torch.zeros((1,)).to(lamb)) + Delta = self.beta_0 ** 2 + tmp + return tmp / (torch.sqrt(Delta) + self.beta_0) / (self.beta_1 - self.beta_0) + elif self.schedule == 'discrete': + log_alpha = -0.5 * torch.logaddexp(torch.zeros((1,)).to(lamb.device), -2. * lamb) + t = interpolate_fn(log_alpha.reshape((-1, 1)), torch.flip(self.log_alpha_array.to(lamb.device), [1]), + torch.flip(self.t_array.to(lamb.device), [1])) + return t.reshape((-1,)) + else: + log_alpha = -0.5 * torch.logaddexp(-2. * lamb, torch.zeros((1,)).to(lamb)) + t_fn = lambda log_alpha_t: torch.arccos(torch.exp(log_alpha_t + self.cosine_log_alpha_0)) * 2. * ( + 1. + self.cosine_s) / math.pi - self.cosine_s + t = t_fn(log_alpha) + return t + + +def model_wrapper( + model, + noise_schedule, + model_type="noise", + model_kwargs={}, + guidance_type="uncond", + condition=None, + unconditional_condition=None, + guidance_scale=1., + classifier_fn=None, + classifier_kwargs={}, +): + """Create a wrapper function for the noise prediction model. + DPM-Solver needs to solve the continuous-time diffusion ODEs. For DPMs trained on discrete-time labels, we need to + firstly wrap the model function to a noise prediction model that accepts the continuous time as the input. + We support four types of the diffusion model by setting `model_type`: + 1. "noise": noise prediction model. (Trained by predicting noise). + 2. "x_start": data prediction model. (Trained by predicting the data x_0 at time 0). + 3. "v": velocity prediction model. (Trained by predicting the velocity). + The "v" prediction is derivation detailed in Appendix D of [1], and is used in Imagen-Video [2]. + [1] Salimans, Tim, and Jonathan Ho. "Progressive distillation for fast sampling of diffusion models." + arXiv preprint arXiv:2202.00512 (2022). + [2] Ho, Jonathan, et al. "Imagen Video: High Definition Video Generation with Diffusion Models." + arXiv preprint arXiv:2210.02303 (2022). + + 4. "score": marginal score function. (Trained by denoising score matching). + Note that the score function and the noise prediction model follows a simple relationship: + ``` + noise(x_t, t) = -sigma_t * score(x_t, t) + ``` + We support three types of guided sampling by DPMs by setting `guidance_type`: + 1. "uncond": unconditional sampling by DPMs. + The input `model` has the following format: + `` + model(x, t_input, **model_kwargs) -> noise | x_start | v | score + `` + 2. "classifier": classifier guidance sampling [3] by DPMs and another classifier. + The input `model` has the following format: + `` + model(x, t_input, **model_kwargs) -> noise | x_start | v | score + `` + The input `classifier_fn` has the following format: + `` + classifier_fn(x, t_input, cond, **classifier_kwargs) -> logits(x, t_input, cond) + `` + [3] P. Dhariwal and A. Q. Nichol, "Diffusion models beat GANs on image synthesis," + in Advances in Neural Information Processing Systems, vol. 34, 2021, pp. 8780-8794. + 3. "classifier-free": classifier-free guidance sampling by conditional DPMs. + The input `model` has the following format: + `` + model(x, t_input, cond, **model_kwargs) -> noise | x_start | v | score + `` + And if cond == `unconditional_condition`, the model output is the unconditional DPM output. + [4] Ho, Jonathan, and Tim Salimans. "Classifier-free diffusion guidance." + arXiv preprint arXiv:2207.12598 (2022). + + The `t_input` is the time label of the model, which may be discrete-time labels (i.e. 0 to 999) + or continuous-time labels (i.e. epsilon to T). 
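A minimal sanity-check sketch for the schedule wrapper above, assuming an arbitrary linear DDPM-style beta range (the exact values are placeholders): it confirms that `inverse_lambda` inverts `marginal_lambda` and that `alpha_t**2 + sigma_t**2 == 1` for the VP schedule.

import torch

betas = torch.linspace(1e-4, 2e-2, 1000)      # assumed linear DDPM-style betas
ns = NoiseScheduleVP('discrete', betas=betas)

t = torch.tensor([0.25, 0.5, 0.75])
lam = ns.marginal_lambda(t)                   # half-logSNR lambda_t
t_back = ns.inverse_lambda(lam)
assert torch.allclose(t, t_back, atol=1e-3)   # lambda_t is invertible

# VP property: alpha_t^2 + sigma_t^2 == 1
assert torch.allclose(ns.marginal_alpha(t) ** 2 + ns.marginal_std(t) ** 2,
                      torch.ones_like(t), atol=1e-5)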
+ We wrap the model function to accept only `x` and `t_continuous` as inputs, and outputs the predicted noise: + `` + def model_fn(x, t_continuous) -> noise: + t_input = get_model_input_time(t_continuous) + return noise_pred(model, x, t_input, **model_kwargs) + `` + where `t_continuous` is the continuous time labels (i.e. epsilon to T). And we use `model_fn` for DPM-Solver. + =============================================================== + Args: + model: A diffusion model with the corresponding format described above. + noise_schedule: A noise schedule object, such as NoiseScheduleVP. + model_type: A `str`. The parameterization type of the diffusion model. + "noise" or "x_start" or "v" or "score". + model_kwargs: A `dict`. A dict for the other inputs of the model function. + guidance_type: A `str`. The type of the guidance for sampling. + "uncond" or "classifier" or "classifier-free". + condition: A pytorch tensor. The condition for the guided sampling. + Only used for "classifier" or "classifier-free" guidance type. + unconditional_condition: A pytorch tensor. The condition for the unconditional sampling. + Only used for "classifier-free" guidance type. + guidance_scale: A `float`. The scale for the guided sampling. + classifier_fn: A classifier function. Only used for the classifier guidance. + classifier_kwargs: A `dict`. A dict for the other inputs of the classifier function. + Returns: + A noise prediction model that accepts the noised data and the continuous time as the inputs. + """ + + def get_model_input_time(t_continuous): + """ + Convert the continuous-time `t_continuous` (in [epsilon, T]) to the model input time. + For discrete-time DPMs, we convert `t_continuous` in [1 / N, 1] to `t_input` in [0, 1000 * (N - 1) / N]. + For continuous-time DPMs, we just use `t_continuous`. + """ + if noise_schedule.schedule == 'discrete': + return (t_continuous - 1. / noise_schedule.total_N) * 1000. + else: + return t_continuous + + def noise_pred_fn(x, t_continuous, cond=None): + if t_continuous.reshape((-1,)).shape[0] == 1: + t_continuous = t_continuous.expand((x.shape[0])) + t_input = get_model_input_time(t_continuous) + if cond is None: + output = model(x, t_input, **model_kwargs) + else: + output = model(x, t_input, cond, **model_kwargs) + if model_type == "noise": + return output + elif model_type == "x_start": + alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous) + dims = x.dim() + return (x - expand_dims(alpha_t, dims) * output) / expand_dims(sigma_t, dims) + elif model_type == "v": + alpha_t, sigma_t = noise_schedule.marginal_alpha(t_continuous), noise_schedule.marginal_std(t_continuous) + dims = x.dim() + return expand_dims(alpha_t, dims) * output + expand_dims(sigma_t, dims) * x + elif model_type == "score": + sigma_t = noise_schedule.marginal_std(t_continuous) + dims = x.dim() + return -expand_dims(sigma_t, dims) * output + + def cond_grad_fn(x, t_input): + """ + Compute the gradient of the classifier, i.e. nabla_{x} log p_t(cond | x_t). + """ + with torch.enable_grad(): + x_in = x.detach().requires_grad_(True) + log_prob = classifier_fn(x_in, t_input, condition, **classifier_kwargs) + return torch.autograd.grad(log_prob.sum(), x_in)[0] + + def model_fn(x, t_continuous): + """ + The noise predicition model function that is used for DPM-Solver. 
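The `model_type` branches in `noise_pred_fn` reduce every parameterization to a noise prediction. A small illustrative check of the "v" case, under the same arbitrary schedule as before, verifies the identity `eps == alpha_t * v + sigma_t * x_t` that this conversion relies on.

import torch

ns = NoiseScheduleVP('discrete', betas=torch.linspace(1e-4, 2e-2, 1000))
t = torch.tensor([0.5])
alpha_t, sigma_t = ns.marginal_alpha(t), ns.marginal_std(t)

x0 = torch.randn(1, 3, 8, 8)                  # clean sample
eps = torch.randn_like(x0)                    # noise
x_t = alpha_t * x0 + sigma_t * eps            # forward-diffused sample
v = alpha_t * eps - sigma_t * x0              # "v" target
# the "v" branch recovers eps via alpha_t * v + sigma_t * x_t
assert torch.allclose(alpha_t * v + sigma_t * x_t, eps, atol=1e-5)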
+ """ + if t_continuous.reshape((-1,)).shape[0] == 1: + t_continuous = t_continuous.expand((x.shape[0])) + if guidance_type == "uncond": + return noise_pred_fn(x, t_continuous) + elif guidance_type == "classifier": + assert classifier_fn is not None + t_input = get_model_input_time(t_continuous) + cond_grad = cond_grad_fn(x, t_input) + sigma_t = noise_schedule.marginal_std(t_continuous) + noise = noise_pred_fn(x, t_continuous) + return noise - guidance_scale * expand_dims(sigma_t, dims=cond_grad.dim()) * cond_grad + elif guidance_type == "classifier-free": + if guidance_scale == 1. or unconditional_condition is None: + return noise_pred_fn(x, t_continuous, cond=condition) + else: + x_in = torch.cat([x] * 2) + t_in = torch.cat([t_continuous] * 2) + c_in = torch.cat([unconditional_condition, condition]) + noise_uncond, noise = noise_pred_fn(x_in, t_in, cond=c_in).chunk(2) + return noise_uncond + guidance_scale * (noise - noise_uncond) + + assert model_type in ["noise", "x_start", "v"] + assert guidance_type in ["uncond", "classifier", "classifier-free"] + return model_fn + + +class DPM_Solver: + def __init__(self, model_fn, noise_schedule, predict_x0=False, thresholding=False, max_val=1.): + """Construct a DPM-Solver. + We support both the noise prediction model ("predicting epsilon") and the data prediction model ("predicting x0"). + If `predict_x0` is False, we use the solver for the noise prediction model (DPM-Solver). + If `predict_x0` is True, we use the solver for the data prediction model (DPM-Solver++). + In such case, we further support the "dynamic thresholding" in [1] when `thresholding` is True. + The "dynamic thresholding" can greatly improve the sample quality for pixel-space DPMs with large guidance scales. + Args: + model_fn: A noise prediction model function which accepts the continuous-time input (t in [epsilon, T]): + `` + def model_fn(x, t_continuous): + return noise + `` + noise_schedule: A noise schedule object, such as NoiseScheduleVP. + predict_x0: A `bool`. If true, use the data prediction model; else, use the noise prediction model. + thresholding: A `bool`. Valid when `predict_x0` is True. Whether to use the "dynamic thresholding" in [1]. + max_val: A `float`. Valid when both `predict_x0` and `thresholding` are True. The max value for thresholding. + + [1] Chitwan Saharia, William Chan, Saurabh Saxena, Lala Li, Jay Whang, Emily Denton, Seyed Kamyar Seyed Ghasemipour, Burcu Karagol Ayan, S Sara Mahdavi, Rapha Gontijo Lopes, et al. Photorealistic text-to-image diffusion models with deep language understanding. arXiv preprint arXiv:2205.11487, 2022b. + """ + self.model = model_fn + self.noise_schedule = noise_schedule + self.predict_x0 = predict_x0 + self.thresholding = thresholding + self.max_val = max_val + + def noise_prediction_fn(self, x, t): + """ + Return the noise prediction model. + """ + return self.model(x, t) + + def data_prediction_fn(self, x, t): + """ + Return the data prediction model (with thresholding). + """ + noise = self.noise_prediction_fn(x, t) + dims = x.dim() + alpha_t, sigma_t = self.noise_schedule.marginal_alpha(t), self.noise_schedule.marginal_std(t) + x0 = (x - expand_dims(sigma_t, dims) * noise) / expand_dims(alpha_t, dims) + if self.thresholding: + p = 0.995 # A hyperparameter in the paper of "Imagen" [1]. 
+ s = torch.quantile(torch.abs(x0).reshape((x0.shape[0], -1)), p, dim=1) + s = expand_dims(torch.maximum(s, self.max_val * torch.ones_like(s).to(s.device)), dims) + x0 = torch.clamp(x0, -s, s) / s + return x0 + + def model_fn(self, x, t): + """ + Convert the model to the noise prediction model or the data prediction model. + """ + if self.predict_x0: + return self.data_prediction_fn(x, t) + else: + return self.noise_prediction_fn(x, t) + + def get_time_steps(self, skip_type, t_T, t_0, N, device): + """Compute the intermediate time steps for sampling. + Args: + skip_type: A `str`. The type for the spacing of the time steps. We support three types: + - 'logSNR': uniform logSNR for the time steps. + - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.) + - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.) + t_T: A `float`. The starting time of the sampling (default is T). + t_0: A `float`. The ending time of the sampling (default is epsilon). + N: A `int`. The total number of the spacing of the time steps. + device: A torch device. + Returns: + A pytorch tensor of the time steps, with the shape (N + 1,). + """ + if skip_type == 'logSNR': + lambda_T = self.noise_schedule.marginal_lambda(torch.tensor(t_T).to(device)) + lambda_0 = self.noise_schedule.marginal_lambda(torch.tensor(t_0).to(device)) + logSNR_steps = torch.linspace(lambda_T.cpu().item(), lambda_0.cpu().item(), N + 1).to(device) + return self.noise_schedule.inverse_lambda(logSNR_steps) + elif skip_type == 'time_uniform': + return torch.linspace(t_T, t_0, N + 1).to(device) + elif skip_type == 'time_quadratic': + t_order = 2 + t = torch.linspace(t_T ** (1. / t_order), t_0 ** (1. / t_order), N + 1).pow(t_order).to(device) + return t + else: + raise ValueError( + "Unsupported skip_type {}, need to be 'logSNR' or 'time_uniform' or 'time_quadratic'".format(skip_type)) + + def get_orders_and_timesteps_for_singlestep_solver(self, steps, order, skip_type, t_T, t_0, device): + """ + Get the order of each step for sampling by the singlestep DPM-Solver. + We combine both DPM-Solver-1,2,3 to use all the function evaluations, which is named as "DPM-Solver-fast". + Given a fixed number of function evaluations by `steps`, the sampling procedure by DPM-Solver-fast is: + - If order == 1: + We take `steps` of DPM-Solver-1 (i.e. DDIM). + - If order == 2: + - Denote K = (steps // 2). We take K or (K + 1) intermediate time steps for sampling. + - If steps % 2 == 0, we use K steps of DPM-Solver-2. + - If steps % 2 == 1, we use K steps of DPM-Solver-2 and 1 step of DPM-Solver-1. + - If order == 3: + - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling. + - If steps % 3 == 0, we use (K - 2) steps of DPM-Solver-3, and 1 step of DPM-Solver-2 and 1 step of DPM-Solver-1. + - If steps % 3 == 1, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-1. + - If steps % 3 == 2, we use (K - 1) steps of DPM-Solver-3 and 1 step of DPM-Solver-2. + ============================================ + Args: + order: A `int`. The max order for the solver (2 or 3). + steps: A `int`. The total number of function evaluations (NFE). + skip_type: A `str`. The type for the spacing of the time steps. We support three types: + - 'logSNR': uniform logSNR for the time steps. + - 'time_uniform': uniform time for the time steps. (**Recommended for high-resolutional data**.) + - 'time_quadratic': quadratic time for the time steps. (Used in DDIM for low-resolutional data.) 
+ t_T: A `float`. The starting time of the sampling (default is T). + t_0: A `float`. The ending time of the sampling (default is epsilon). + device: A torch device. + Returns: + orders: A list of the solver order of each step. + """ + if order == 3: + K = steps // 3 + 1 + if steps % 3 == 0: + orders = [3, ] * (K - 2) + [2, 1] + elif steps % 3 == 1: + orders = [3, ] * (K - 1) + [1] + else: + orders = [3, ] * (K - 1) + [2] + elif order == 2: + if steps % 2 == 0: + K = steps // 2 + orders = [2, ] * K + else: + K = steps // 2 + 1 + orders = [2, ] * (K - 1) + [1] + elif order == 1: + K = 1 + orders = [1, ] * steps + else: + raise ValueError("'order' must be '1' or '2' or '3'.") + if skip_type == 'logSNR': + # To reproduce the results in DPM-Solver paper + timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, K, device) + else: + timesteps_outer = self.get_time_steps(skip_type, t_T, t_0, steps, device)[ + torch.cumsum(torch.tensor([0, ] + orders)).to(device)] + return timesteps_outer, orders + + def denoise_to_zero_fn(self, x, s): + """ + Denoise at the final step, which is equivalent to solve the ODE from lambda_s to infty by first-order discretization. + """ + return self.data_prediction_fn(x, s) + + def dpm_solver_first_update(self, x, s, t, model_s=None, return_intermediate=False): + """ + DPM-Solver-1 (equivalent to DDIM) from time `s` to time `t`. + Args: + x: A pytorch tensor. The initial value at time `s`. + s: A pytorch tensor. The starting time, with the shape (x.shape[0],). + t: A pytorch tensor. The ending time, with the shape (x.shape[0],). + model_s: A pytorch tensor. The model function evaluated at time `s`. + If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it. + return_intermediate: A `bool`. If true, also return the model value at time `s`. + Returns: + x_t: A pytorch tensor. The approximated solution at time `t`. + """ + ns = self.noise_schedule + dims = x.dim() + lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t) + h = lambda_t - lambda_s + log_alpha_s, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff(t) + sigma_s, sigma_t = ns.marginal_std(s), ns.marginal_std(t) + alpha_t = torch.exp(log_alpha_t) + + if self.predict_x0: + phi_1 = torch.expm1(-h) + if model_s is None: + model_s = self.model_fn(x, s) + x_t = ( + expand_dims(sigma_t / sigma_s, dims) * x + - expand_dims(alpha_t * phi_1, dims) * model_s + ) + if return_intermediate: + return x_t, {'model_s': model_s} + else: + return x_t + else: + phi_1 = torch.expm1(h) + if model_s is None: + model_s = self.model_fn(x, s) + x_t = ( + expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x + - expand_dims(sigma_t * phi_1, dims) * model_s + ) + if return_intermediate: + return x_t, {'model_s': model_s} + else: + return x_t + + def singlestep_dpm_solver_second_update(self, x, s, t, r1=0.5, model_s=None, return_intermediate=False, + solver_type='dpm_solver'): + """ + Singlestep solver DPM-Solver-2 from time `s` to time `t`. + Args: + x: A pytorch tensor. The initial value at time `s`. + s: A pytorch tensor. The starting time, with the shape (x.shape[0],). + t: A pytorch tensor. The ending time, with the shape (x.shape[0],). + r1: A `float`. The hyperparameter of the second-order solver. + model_s: A pytorch tensor. The model function evaluated at time `s`. + If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it. + return_intermediate: A `bool`. 
If true, also return the model value at time `s` and `s1` (the intermediate time). + solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. + The type slightly impacts the performance. We recommend to use 'dpm_solver' type. + Returns: + x_t: A pytorch tensor. The approximated solution at time `t`. + """ + if solver_type not in ['dpm_solver', 'taylor']: + raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type)) + if r1 is None: + r1 = 0.5 + ns = self.noise_schedule + dims = x.dim() + lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t) + h = lambda_t - lambda_s + lambda_s1 = lambda_s + r1 * h + s1 = ns.inverse_lambda(lambda_s1) + log_alpha_s, log_alpha_s1, log_alpha_t = ns.marginal_log_mean_coeff(s), ns.marginal_log_mean_coeff( + s1), ns.marginal_log_mean_coeff(t) + sigma_s, sigma_s1, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std(t) + alpha_s1, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_t) + + if self.predict_x0: + phi_11 = torch.expm1(-r1 * h) + phi_1 = torch.expm1(-h) + + if model_s is None: + model_s = self.model_fn(x, s) + x_s1 = ( + expand_dims(sigma_s1 / sigma_s, dims) * x + - expand_dims(alpha_s1 * phi_11, dims) * model_s + ) + model_s1 = self.model_fn(x_s1, s1) + if solver_type == 'dpm_solver': + x_t = ( + expand_dims(sigma_t / sigma_s, dims) * x + - expand_dims(alpha_t * phi_1, dims) * model_s + - (0.5 / r1) * expand_dims(alpha_t * phi_1, dims) * (model_s1 - model_s) + ) + elif solver_type == 'taylor': + x_t = ( + expand_dims(sigma_t / sigma_s, dims) * x + - expand_dims(alpha_t * phi_1, dims) * model_s + + (1. / r1) * expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * ( + model_s1 - model_s) + ) + else: + phi_11 = torch.expm1(r1 * h) + phi_1 = torch.expm1(h) + + if model_s is None: + model_s = self.model_fn(x, s) + x_s1 = ( + expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x + - expand_dims(sigma_s1 * phi_11, dims) * model_s + ) + model_s1 = self.model_fn(x_s1, s1) + if solver_type == 'dpm_solver': + x_t = ( + expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x + - expand_dims(sigma_t * phi_1, dims) * model_s + - (0.5 / r1) * expand_dims(sigma_t * phi_1, dims) * (model_s1 - model_s) + ) + elif solver_type == 'taylor': + x_t = ( + expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x + - expand_dims(sigma_t * phi_1, dims) * model_s + - (1. / r1) * expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * (model_s1 - model_s) + ) + if return_intermediate: + return x_t, {'model_s': model_s, 'model_s1': model_s1} + else: + return x_t + + def singlestep_dpm_solver_third_update(self, x, s, t, r1=1. / 3., r2=2. / 3., model_s=None, model_s1=None, + return_intermediate=False, solver_type='dpm_solver'): + """ + Singlestep solver DPM-Solver-3 from time `s` to time `t`. + Args: + x: A pytorch tensor. The initial value at time `s`. + s: A pytorch tensor. The starting time, with the shape (x.shape[0],). + t: A pytorch tensor. The ending time, with the shape (x.shape[0],). + r1: A `float`. The hyperparameter of the third-order solver. + r2: A `float`. The hyperparameter of the third-order solver. + model_s: A pytorch tensor. The model function evaluated at time `s`. + If `model_s` is None, we evaluate the model by `x` and `s`; otherwise we directly use it. + model_s1: A pytorch tensor. The model function evaluated at time `s1` (the intermediate time given by `r1`). 
+ If `model_s1` is None, we evaluate the model at `s1`; otherwise we directly use it. + return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times). + solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. + The type slightly impacts the performance. We recommend to use 'dpm_solver' type. + Returns: + x_t: A pytorch tensor. The approximated solution at time `t`. + """ + if solver_type not in ['dpm_solver', 'taylor']: + raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type)) + if r1 is None: + r1 = 1. / 3. + if r2 is None: + r2 = 2. / 3. + ns = self.noise_schedule + dims = x.dim() + lambda_s, lambda_t = ns.marginal_lambda(s), ns.marginal_lambda(t) + h = lambda_t - lambda_s + lambda_s1 = lambda_s + r1 * h + lambda_s2 = lambda_s + r2 * h + s1 = ns.inverse_lambda(lambda_s1) + s2 = ns.inverse_lambda(lambda_s2) + log_alpha_s, log_alpha_s1, log_alpha_s2, log_alpha_t = ns.marginal_log_mean_coeff( + s), ns.marginal_log_mean_coeff(s1), ns.marginal_log_mean_coeff(s2), ns.marginal_log_mean_coeff(t) + sigma_s, sigma_s1, sigma_s2, sigma_t = ns.marginal_std(s), ns.marginal_std(s1), ns.marginal_std( + s2), ns.marginal_std(t) + alpha_s1, alpha_s2, alpha_t = torch.exp(log_alpha_s1), torch.exp(log_alpha_s2), torch.exp(log_alpha_t) + + if self.predict_x0: + phi_11 = torch.expm1(-r1 * h) + phi_12 = torch.expm1(-r2 * h) + phi_1 = torch.expm1(-h) + phi_22 = torch.expm1(-r2 * h) / (r2 * h) + 1. + phi_2 = phi_1 / h + 1. + phi_3 = phi_2 / h - 0.5 + + if model_s is None: + model_s = self.model_fn(x, s) + if model_s1 is None: + x_s1 = ( + expand_dims(sigma_s1 / sigma_s, dims) * x + - expand_dims(alpha_s1 * phi_11, dims) * model_s + ) + model_s1 = self.model_fn(x_s1, s1) + x_s2 = ( + expand_dims(sigma_s2 / sigma_s, dims) * x + - expand_dims(alpha_s2 * phi_12, dims) * model_s + + r2 / r1 * expand_dims(alpha_s2 * phi_22, dims) * (model_s1 - model_s) + ) + model_s2 = self.model_fn(x_s2, s2) + if solver_type == 'dpm_solver': + x_t = ( + expand_dims(sigma_t / sigma_s, dims) * x + - expand_dims(alpha_t * phi_1, dims) * model_s + + (1. / r2) * expand_dims(alpha_t * phi_2, dims) * (model_s2 - model_s) + ) + elif solver_type == 'taylor': + D1_0 = (1. / r1) * (model_s1 - model_s) + D1_1 = (1. / r2) * (model_s2 - model_s) + D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1) + D2 = 2. * (D1_1 - D1_0) / (r2 - r1) + x_t = ( + expand_dims(sigma_t / sigma_s, dims) * x + - expand_dims(alpha_t * phi_1, dims) * model_s + + expand_dims(alpha_t * phi_2, dims) * D1 + - expand_dims(alpha_t * phi_3, dims) * D2 + ) + else: + phi_11 = torch.expm1(r1 * h) + phi_12 = torch.expm1(r2 * h) + phi_1 = torch.expm1(h) + phi_22 = torch.expm1(r2 * h) / (r2 * h) - 1. + phi_2 = phi_1 / h - 1. + phi_3 = phi_2 / h - 0.5 + + if model_s is None: + model_s = self.model_fn(x, s) + if model_s1 is None: + x_s1 = ( + expand_dims(torch.exp(log_alpha_s1 - log_alpha_s), dims) * x + - expand_dims(sigma_s1 * phi_11, dims) * model_s + ) + model_s1 = self.model_fn(x_s1, s1) + x_s2 = ( + expand_dims(torch.exp(log_alpha_s2 - log_alpha_s), dims) * x + - expand_dims(sigma_s2 * phi_12, dims) * model_s + - r2 / r1 * expand_dims(sigma_s2 * phi_22, dims) * (model_s1 - model_s) + ) + model_s2 = self.model_fn(x_s2, s2) + if solver_type == 'dpm_solver': + x_t = ( + expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x + - expand_dims(sigma_t * phi_1, dims) * model_s + - (1. 
/ r2) * expand_dims(sigma_t * phi_2, dims) * (model_s2 - model_s) + ) + elif solver_type == 'taylor': + D1_0 = (1. / r1) * (model_s1 - model_s) + D1_1 = (1. / r2) * (model_s2 - model_s) + D1 = (r2 * D1_0 - r1 * D1_1) / (r2 - r1) + D2 = 2. * (D1_1 - D1_0) / (r2 - r1) + x_t = ( + expand_dims(torch.exp(log_alpha_t - log_alpha_s), dims) * x + - expand_dims(sigma_t * phi_1, dims) * model_s + - expand_dims(sigma_t * phi_2, dims) * D1 + - expand_dims(sigma_t * phi_3, dims) * D2 + ) + + if return_intermediate: + return x_t, {'model_s': model_s, 'model_s1': model_s1, 'model_s2': model_s2} + else: + return x_t + + def multistep_dpm_solver_second_update(self, x, model_prev_list, t_prev_list, t, solver_type="dpm_solver"): + """ + Multistep solver DPM-Solver-2 from time `t_prev_list[-1]` to time `t`. + Args: + x: A pytorch tensor. The initial value at time `s`. + model_prev_list: A list of pytorch tensor. The previous computed model values. + t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (x.shape[0],) + t: A pytorch tensor. The ending time, with the shape (x.shape[0],). + solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. + The type slightly impacts the performance. We recommend to use 'dpm_solver' type. + Returns: + x_t: A pytorch tensor. The approximated solution at time `t`. + """ + if solver_type not in ['dpm_solver', 'taylor']: + raise ValueError("'solver_type' must be either 'dpm_solver' or 'taylor', got {}".format(solver_type)) + ns = self.noise_schedule + dims = x.dim() + model_prev_1, model_prev_0 = model_prev_list + t_prev_1, t_prev_0 = t_prev_list + lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_1), ns.marginal_lambda( + t_prev_0), ns.marginal_lambda(t) + log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t) + sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t) + alpha_t = torch.exp(log_alpha_t) + + h_0 = lambda_prev_0 - lambda_prev_1 + h = lambda_t - lambda_prev_0 + r0 = h_0 / h + D1_0 = expand_dims(1. / r0, dims) * (model_prev_0 - model_prev_1) + if self.predict_x0: + if solver_type == 'dpm_solver': + x_t = ( + expand_dims(sigma_t / sigma_prev_0, dims) * x + - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0 + - 0.5 * expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * D1_0 + ) + elif solver_type == 'taylor': + x_t = ( + expand_dims(sigma_t / sigma_prev_0, dims) * x + - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0 + + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1_0 + ) + else: + if solver_type == 'dpm_solver': + x_t = ( + expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x + - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0 + - 0.5 * expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * D1_0 + ) + elif solver_type == 'taylor': + x_t = ( + expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x + - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0 + - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1_0 + ) + return x_t + + def multistep_dpm_solver_third_update(self, x, model_prev_list, t_prev_list, t, solver_type='dpm_solver'): + """ + Multistep solver DPM-Solver-3 from time `t_prev_list[-1]` to time `t`. + Args: + x: A pytorch tensor. The initial value at time `s`. + model_prev_list: A list of pytorch tensor. The previous computed model values. + t_prev_list: A list of pytorch tensor. 
The previous times, each time has the shape (x.shape[0],) + t: A pytorch tensor. The ending time, with the shape (x.shape[0],). + solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. + The type slightly impacts the performance. We recommend to use 'dpm_solver' type. + Returns: + x_t: A pytorch tensor. The approximated solution at time `t`. + """ + ns = self.noise_schedule + dims = x.dim() + model_prev_2, model_prev_1, model_prev_0 = model_prev_list + t_prev_2, t_prev_1, t_prev_0 = t_prev_list + lambda_prev_2, lambda_prev_1, lambda_prev_0, lambda_t = ns.marginal_lambda(t_prev_2), ns.marginal_lambda( + t_prev_1), ns.marginal_lambda(t_prev_0), ns.marginal_lambda(t) + log_alpha_prev_0, log_alpha_t = ns.marginal_log_mean_coeff(t_prev_0), ns.marginal_log_mean_coeff(t) + sigma_prev_0, sigma_t = ns.marginal_std(t_prev_0), ns.marginal_std(t) + alpha_t = torch.exp(log_alpha_t) + + h_1 = lambda_prev_1 - lambda_prev_2 + h_0 = lambda_prev_0 - lambda_prev_1 + h = lambda_t - lambda_prev_0 + r0, r1 = h_0 / h, h_1 / h + D1_0 = expand_dims(1. / r0, dims) * (model_prev_0 - model_prev_1) + D1_1 = expand_dims(1. / r1, dims) * (model_prev_1 - model_prev_2) + D1 = D1_0 + expand_dims(r0 / (r0 + r1), dims) * (D1_0 - D1_1) + D2 = expand_dims(1. / (r0 + r1), dims) * (D1_0 - D1_1) + if self.predict_x0: + x_t = ( + expand_dims(sigma_t / sigma_prev_0, dims) * x + - expand_dims(alpha_t * (torch.exp(-h) - 1.), dims) * model_prev_0 + + expand_dims(alpha_t * ((torch.exp(-h) - 1.) / h + 1.), dims) * D1 + - expand_dims(alpha_t * ((torch.exp(-h) - 1. + h) / h ** 2 - 0.5), dims) * D2 + ) + else: + x_t = ( + expand_dims(torch.exp(log_alpha_t - log_alpha_prev_0), dims) * x + - expand_dims(sigma_t * (torch.exp(h) - 1.), dims) * model_prev_0 + - expand_dims(sigma_t * ((torch.exp(h) - 1.) / h - 1.), dims) * D1 + - expand_dims(sigma_t * ((torch.exp(h) - 1. - h) / h ** 2 - 0.5), dims) * D2 + ) + return x_t + + def singlestep_dpm_solver_update(self, x, s, t, order, return_intermediate=False, solver_type='dpm_solver', r1=None, + r2=None): + """ + Singlestep DPM-Solver with the order `order` from time `s` to time `t`. + Args: + x: A pytorch tensor. The initial value at time `s`. + s: A pytorch tensor. The starting time, with the shape (x.shape[0],). + t: A pytorch tensor. The ending time, with the shape (x.shape[0],). + order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3. + return_intermediate: A `bool`. If true, also return the model value at time `s`, `s1` and `s2` (the intermediate times). + solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. + The type slightly impacts the performance. We recommend to use 'dpm_solver' type. + r1: A `float`. The hyperparameter of the second-order or third-order solver. + r2: A `float`. The hyperparameter of the third-order solver. + Returns: + x_t: A pytorch tensor. The approximated solution at time `t`. 
+ """ + if order == 1: + return self.dpm_solver_first_update(x, s, t, return_intermediate=return_intermediate) + elif order == 2: + return self.singlestep_dpm_solver_second_update(x, s, t, return_intermediate=return_intermediate, + solver_type=solver_type, r1=r1) + elif order == 3: + return self.singlestep_dpm_solver_third_update(x, s, t, return_intermediate=return_intermediate, + solver_type=solver_type, r1=r1, r2=r2) + else: + raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) + + def multistep_dpm_solver_update(self, x, model_prev_list, t_prev_list, t, order, solver_type='dpm_solver'): + """ + Multistep DPM-Solver with the order `order` from time `t_prev_list[-1]` to time `t`. + Args: + x: A pytorch tensor. The initial value at time `s`. + model_prev_list: A list of pytorch tensor. The previous computed model values. + t_prev_list: A list of pytorch tensor. The previous times, each time has the shape (x.shape[0],) + t: A pytorch tensor. The ending time, with the shape (x.shape[0],). + order: A `int`. The order of DPM-Solver. We only support order == 1 or 2 or 3. + solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. + The type slightly impacts the performance. We recommend to use 'dpm_solver' type. + Returns: + x_t: A pytorch tensor. The approximated solution at time `t`. + """ + if order == 1: + return self.dpm_solver_first_update(x, t_prev_list[-1], t, model_s=model_prev_list[-1]) + elif order == 2: + return self.multistep_dpm_solver_second_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type) + elif order == 3: + return self.multistep_dpm_solver_third_update(x, model_prev_list, t_prev_list, t, solver_type=solver_type) + else: + raise ValueError("Solver order must be 1 or 2 or 3, got {}".format(order)) + + def dpm_solver_adaptive(self, x, order, t_T, t_0, h_init=0.05, atol=0.0078, rtol=0.05, theta=0.9, t_err=1e-5, + solver_type='dpm_solver'): + """ + The adaptive step size solver based on singlestep DPM-Solver. + Args: + x: A pytorch tensor. The initial value at time `t_T`. + order: A `int`. The (higher) order of the solver. We only support order == 2 or 3. + t_T: A `float`. The starting time of the sampling (default is T). + t_0: A `float`. The ending time of the sampling (default is epsilon). + h_init: A `float`. The initial step size (for logSNR). + atol: A `float`. The absolute tolerance of the solver. For image data, the default setting is 0.0078, followed [1]. + rtol: A `float`. The relative tolerance of the solver. The default setting is 0.05. + theta: A `float`. The safety hyperparameter for adapting the step size. The default setting is 0.9, followed [1]. + t_err: A `float`. The tolerance for the time. We solve the diffusion ODE until the absolute error between the + current time and `t_0` is less than `t_err`. The default setting is 1e-5. + solver_type: either 'dpm_solver' or 'taylor'. The type for the high-order solvers. + The type slightly impacts the performance. We recommend to use 'dpm_solver' type. + Returns: + x_0: A pytorch tensor. The approximated solution at time `t_0`. + [1] A. Jolicoeur-Martineau, K. Li, R. Piché-Taillefer, T. Kachman, and I. Mitliagkas, "Gotta go fast when generating data with score-based models," arXiv preprint arXiv:2105.14080, 2021. 
+ """ + ns = self.noise_schedule + s = t_T * torch.ones((x.shape[0],)).to(x) + lambda_s = ns.marginal_lambda(s) + lambda_0 = ns.marginal_lambda(t_0 * torch.ones_like(s).to(x)) + h = h_init * torch.ones_like(s).to(x) + x_prev = x + nfe = 0 + if order == 2: + r1 = 0.5 + lower_update = lambda x, s, t: self.dpm_solver_first_update(x, s, t, return_intermediate=True) + higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, + solver_type=solver_type, + **kwargs) + elif order == 3: + r1, r2 = 1. / 3., 2. / 3. + lower_update = lambda x, s, t: self.singlestep_dpm_solver_second_update(x, s, t, r1=r1, + return_intermediate=True, + solver_type=solver_type) + higher_update = lambda x, s, t, **kwargs: self.singlestep_dpm_solver_third_update(x, s, t, r1=r1, r2=r2, + solver_type=solver_type, + **kwargs) + else: + raise ValueError("For adaptive step size solver, order must be 2 or 3, got {}".format(order)) + while torch.abs((s - t_0)).mean() > t_err: + t = ns.inverse_lambda(lambda_s + h) + x_lower, lower_noise_kwargs = lower_update(x, s, t) + x_higher = higher_update(x, s, t, **lower_noise_kwargs) + delta = torch.max(torch.ones_like(x).to(x) * atol, rtol * torch.max(torch.abs(x_lower), torch.abs(x_prev))) + norm_fn = lambda v: torch.sqrt(torch.square(v.reshape((v.shape[0], -1))).mean(dim=-1, keepdim=True)) + E = norm_fn((x_higher - x_lower) / delta).max() + if torch.all(E <= 1.): + x = x_higher + s = t + x_prev = x_lower + lambda_s = ns.marginal_lambda(s) + h = torch.min(theta * h * torch.float_power(E, -1. / order).float(), lambda_0 - lambda_s) + nfe += order + print('adaptive solver nfe', nfe) + return x + + def sample(self, x, steps=20, t_start=None, t_end=None, order=3, skip_type='time_uniform', + method='singlestep', lower_order_final=True, denoise_to_zero=False, solver_type='dpm_solver', + atol=0.0078, rtol=0.05, + ): + """ + Compute the sample at time `t_end` by DPM-Solver, given the initial `x` at time `t_start`. + ===================================================== + We support the following algorithms for both noise prediction model and data prediction model: + - 'singlestep': + Singlestep DPM-Solver (i.e. "DPM-Solver-fast" in the paper), which combines different orders of singlestep DPM-Solver. + We combine all the singlestep solvers with order <= `order` to use up all the function evaluations (steps). + The total number of function evaluations (NFE) == `steps`. + Given a fixed NFE == `steps`, the sampling procedure is: + - If `order` == 1: + - Denote K = steps. We use K steps of DPM-Solver-1 (i.e. DDIM). + - If `order` == 2: + - Denote K = (steps // 2) + (steps % 2). We take K intermediate time steps for sampling. + - If steps % 2 == 0, we use K steps of singlestep DPM-Solver-2. + - If steps % 2 == 1, we use (K - 1) steps of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1. + - If `order` == 3: + - Denote K = (steps // 3 + 1). We take K intermediate time steps for sampling. + - If steps % 3 == 0, we use (K - 2) steps of singlestep DPM-Solver-3, and 1 step of singlestep DPM-Solver-2 and 1 step of DPM-Solver-1. + - If steps % 3 == 1, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of DPM-Solver-1. + - If steps % 3 == 2, we use (K - 1) steps of singlestep DPM-Solver-3 and 1 step of singlestep DPM-Solver-2. + - 'multistep': + Multistep DPM-Solver with the order of `order`. The total number of function evaluations (NFE) == `steps`. + We initialize the first `order` values by lower order multistep solvers. 
+ Given a fixed NFE == `steps`, the sampling procedure is: + Denote K = steps. + - If `order` == 1: + - We use K steps of DPM-Solver-1 (i.e. DDIM). + - If `order` == 2: + - We firstly use 1 step of DPM-Solver-1, then use (K - 1) step of multistep DPM-Solver-2. + - If `order` == 3: + - We firstly use 1 step of DPM-Solver-1, then 1 step of multistep DPM-Solver-2, then (K - 2) step of multistep DPM-Solver-3. + - 'singlestep_fixed': + Fixed order singlestep DPM-Solver (i.e. DPM-Solver-1 or singlestep DPM-Solver-2 or singlestep DPM-Solver-3). + We use singlestep DPM-Solver-`order` for `order`=1 or 2 or 3, with total [`steps` // `order`] * `order` NFE. + - 'adaptive': + Adaptive step size DPM-Solver (i.e. "DPM-Solver-12" and "DPM-Solver-23" in the paper). + We ignore `steps` and use adaptive step size DPM-Solver with a higher order of `order`. + You can adjust the absolute tolerance `atol` and the relative tolerance `rtol` to balance the computatation costs + (NFE) and the sample quality. + - If `order` == 2, we use DPM-Solver-12 which combines DPM-Solver-1 and singlestep DPM-Solver-2. + - If `order` == 3, we use DPM-Solver-23 which combines singlestep DPM-Solver-2 and singlestep DPM-Solver-3. + ===================================================== + Some advices for choosing the algorithm: + - For **unconditional sampling** or **guided sampling with small guidance scale** by DPMs: + Use singlestep DPM-Solver ("DPM-Solver-fast" in the paper) with `order = 3`. + e.g. + >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, predict_x0=False) + >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=3, + skip_type='time_uniform', method='singlestep') + - For **guided sampling with large guidance scale** by DPMs: + Use multistep DPM-Solver with `predict_x0 = True` and `order = 2`. + e.g. + >>> dpm_solver = DPM_Solver(model_fn, noise_schedule, predict_x0=True) + >>> x_sample = dpm_solver.sample(x, steps=steps, t_start=t_start, t_end=t_end, order=2, + skip_type='time_uniform', method='multistep') + We support three types of `skip_type`: + - 'logSNR': uniform logSNR for the time steps. **Recommended for low-resolutional images** + - 'time_uniform': uniform time for the time steps. **Recommended for high-resolutional images**. + - 'time_quadratic': quadratic time for the time steps. + ===================================================== + Args: + x: A pytorch tensor. The initial value at time `t_start` + e.g. if `t_start` == T, then `x` is a sample from the standard normal distribution. + steps: A `int`. The total number of function evaluations (NFE). + t_start: A `float`. The starting time of the sampling. + If `T` is None, we use self.noise_schedule.T (default is 1.0). + t_end: A `float`. The ending time of the sampling. + If `t_end` is None, we use 1. / self.noise_schedule.total_N. + e.g. if total_N == 1000, we have `t_end` == 1e-3. + For discrete-time DPMs: + - We recommend `t_end` == 1. / self.noise_schedule.total_N. + For continuous-time DPMs: + - We recommend `t_end` == 1e-3 when `steps` <= 15; and `t_end` == 1e-4 when `steps` > 15. + order: A `int`. The order of DPM-Solver. + skip_type: A `str`. The type for the spacing of the time steps. 'time_uniform' or 'logSNR' or 'time_quadratic'. + method: A `str`. The method for sampling. 'singlestep' or 'multistep' or 'singlestep_fixed' or 'adaptive'. + denoise_to_zero: A `bool`. Whether to denoise to time 0 at the final step. + Default is `False`. If `denoise_to_zero` is `True`, the total NFE is (`steps` + 1). 
+ This trick is firstly proposed by DDPM (https://arxiv.org/abs/2006.11239) and + score_sde (https://arxiv.org/abs/2011.13456). Such trick can improve the FID + for diffusion models sampling by diffusion SDEs for low-resolutional images + (such as CIFAR-10). However, we observed that such trick does not matter for + high-resolutional images. As it needs an additional NFE, we do not recommend + it for high-resolutional images. + lower_order_final: A `bool`. Whether to use lower order solvers at the final steps. + Only valid for `method=multistep` and `steps < 15`. We empirically find that + this trick is a key to stabilizing the sampling by DPM-Solver with very few steps + (especially for steps <= 10). So we recommend to set it to be `True`. + solver_type: A `str`. The taylor expansion type for the solver. `dpm_solver` or `taylor`. We recommend `dpm_solver`. + atol: A `float`. The absolute tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'. + rtol: A `float`. The relative tolerance of the adaptive step size solver. Valid when `method` == 'adaptive'. + Returns: + x_end: A pytorch tensor. The approximated solution at time `t_end`. + """ + t_0 = 1. / self.noise_schedule.total_N if t_end is None else t_end + t_T = self.noise_schedule.T if t_start is None else t_start + device = x.device + if method == 'adaptive': + with torch.no_grad(): + x = self.dpm_solver_adaptive(x, order=order, t_T=t_T, t_0=t_0, atol=atol, rtol=rtol, + solver_type=solver_type) + elif method == 'multistep': + assert steps >= order + timesteps = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=steps, device=device) + assert timesteps.shape[0] - 1 == steps + with torch.no_grad(): + vec_t = timesteps[0].expand((x.shape[0])) + model_prev_list = [self.model_fn(x, vec_t)] + t_prev_list = [vec_t] + # Init the first `order` values by lower order multistep DPM-Solver. + for init_order in tqdm(range(1, order), desc="DPM init order"): + vec_t = timesteps[init_order].expand(x.shape[0]) + x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, init_order, + solver_type=solver_type) + model_prev_list.append(self.model_fn(x, vec_t)) + t_prev_list.append(vec_t) + # Compute the remaining values by `order`-th order multistep DPM-Solver. + for step in tqdm(range(order, steps + 1), desc="DPM multistep"): + vec_t = timesteps[step].expand(x.shape[0]) + if lower_order_final and steps < 15: + step_order = min(order, steps + 1 - step) + else: + step_order = order + x = self.multistep_dpm_solver_update(x, model_prev_list, t_prev_list, vec_t, step_order, + solver_type=solver_type) + for i in range(order - 1): + t_prev_list[i] = t_prev_list[i + 1] + model_prev_list[i] = model_prev_list[i + 1] + t_prev_list[-1] = vec_t + # We do not need to evaluate the final model value. 
+ if step < steps: + model_prev_list[-1] = self.model_fn(x, vec_t) + elif method in ['singlestep', 'singlestep_fixed']: + if method == 'singlestep': + timesteps_outer, orders = self.get_orders_and_timesteps_for_singlestep_solver(steps=steps, order=order, + skip_type=skip_type, + t_T=t_T, t_0=t_0, + device=device) + elif method == 'singlestep_fixed': + K = steps // order + orders = [order, ] * K + timesteps_outer = self.get_time_steps(skip_type=skip_type, t_T=t_T, t_0=t_0, N=K, device=device) + for i, order in enumerate(orders): + t_T_inner, t_0_inner = timesteps_outer[i], timesteps_outer[i + 1] + timesteps_inner = self.get_time_steps(skip_type=skip_type, t_T=t_T_inner.item(), t_0=t_0_inner.item(), + N=order, device=device) + lambda_inner = self.noise_schedule.marginal_lambda(timesteps_inner) + vec_s, vec_t = t_T_inner.tile(x.shape[0]), t_0_inner.tile(x.shape[0]) + h = lambda_inner[-1] - lambda_inner[0] + r1 = None if order <= 1 else (lambda_inner[1] - lambda_inner[0]) / h + r2 = None if order <= 2 else (lambda_inner[2] - lambda_inner[0]) / h + x = self.singlestep_dpm_solver_update(x, vec_s, vec_t, order, solver_type=solver_type, r1=r1, r2=r2) + if denoise_to_zero: + x = self.denoise_to_zero_fn(x, torch.ones((x.shape[0],)).to(device) * t_0) + return x + + +############################################################# +# other utility functions +############################################################# + +def interpolate_fn(x, xp, yp): + """ + A piecewise linear function y = f(x), using xp and yp as keypoints. + We implement f(x) in a differentiable way (i.e. applicable for autograd). + The function f(x) is well-defined for all x-axis. (For x beyond the bounds of xp, we use the outmost points of xp to define the linear function.) + Args: + x: PyTorch tensor with shape [N, C], where N is the batch size, C is the number of channels (we use C = 1 for DPM-Solver). + xp: PyTorch tensor with shape [C, K], where K is the number of keypoints. + yp: PyTorch tensor with shape [C, K]. + Returns: + The function values f(x), with shape [N, C]. + """ + N, K = x.shape[0], xp.shape[1] + all_x = torch.cat([x.unsqueeze(2), xp.unsqueeze(0).repeat((N, 1, 1))], dim=2) + sorted_all_x, x_indices = torch.sort(all_x, dim=2) + x_idx = torch.argmin(x_indices, dim=2) + cand_start_idx = x_idx - 1 + start_idx = torch.where( + torch.eq(x_idx, 0), + torch.tensor(1, device=x.device), + torch.where( + torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx, + ), + ) + end_idx = torch.where(torch.eq(start_idx, cand_start_idx), start_idx + 2, start_idx + 1) + start_x = torch.gather(sorted_all_x, dim=2, index=start_idx.unsqueeze(2)).squeeze(2) + end_x = torch.gather(sorted_all_x, dim=2, index=end_idx.unsqueeze(2)).squeeze(2) + start_idx2 = torch.where( + torch.eq(x_idx, 0), + torch.tensor(0, device=x.device), + torch.where( + torch.eq(x_idx, K), torch.tensor(K - 2, device=x.device), cand_start_idx, + ), + ) + y_positions_expanded = yp.unsqueeze(0).expand(N, -1, -1) + start_y = torch.gather(y_positions_expanded, dim=2, index=start_idx2.unsqueeze(2)).squeeze(2) + end_y = torch.gather(y_positions_expanded, dim=2, index=(start_idx2 + 1).unsqueeze(2)).squeeze(2) + cand = start_y + (x - start_x) * (end_y - start_y) / (end_x - start_x) + return cand + + +def expand_dims(v, dims): + """ + Expand the tensor `v` to the dim `dims`. + Args: + `v`: a PyTorch tensor with shape [N]. + `dim`: a `int`. + Returns: + a PyTorch tensor with shape [N, 1, 1, ..., 1] and the total dimension is `dims`. 
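An end-to-end sketch of `DPM_Solver.sample` with a toy noise predictor; `toy_model_fn` is an assumption standing in for a wrapped diffusion model such as the result of `model_wrapper`, and the example only demonstrates the call signature and shape handling, not sample quality.

import torch

ns = NoiseScheduleVP('discrete', betas=torch.linspace(1e-4, 2e-2, 1000))

def toy_model_fn(x, t_continuous):
    # Placeholder noise predictor; a real use would pass model_wrapper(...) here.
    return 0.1 * x

solver = DPM_Solver(toy_model_fn, ns, predict_x0=True, thresholding=False)
x_T = torch.randn(4, 3, 32, 32)                      # start from pure noise at t = T
x_0 = solver.sample(x_T, steps=20, order=2, skip_type="time_uniform",
                    method="multistep", lower_order_final=True)
print(x_0.shape)                                     # torch.Size([4, 3, 32, 32])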
+ """ + return v[(...,) + (None,) * (dims - 1)] \ No newline at end of file diff --git a/inpaint/model/anytext/ldm/models/diffusion/dpm_solver/sampler.py b/inpaint/model/anytext/ldm/models/diffusion/dpm_solver/sampler.py new file mode 100644 index 0000000..7d137b8 --- /dev/null +++ b/inpaint/model/anytext/ldm/models/diffusion/dpm_solver/sampler.py @@ -0,0 +1,87 @@ +"""SAMPLING ONLY.""" +import torch + +from .dpm_solver import NoiseScheduleVP, model_wrapper, DPM_Solver + + +MODEL_TYPES = { + "eps": "noise", + "v": "v" +} + + +class DPMSolverSampler(object): + def __init__(self, model, **kwargs): + super().__init__() + self.model = model + to_torch = lambda x: x.clone().detach().to(torch.float32).to(model.device) + self.register_buffer('alphas_cumprod', to_torch(model.alphas_cumprod)) + + def register_buffer(self, name, attr): + if type(attr) == torch.Tensor: + if attr.device != torch.device("cuda"): + attr = attr.to(torch.device("cuda")) + setattr(self, name, attr) + + @torch.no_grad() + def sample(self, + S, + batch_size, + shape, + conditioning=None, + callback=None, + normals_sequence=None, + img_callback=None, + quantize_x0=False, + eta=0., + mask=None, + x0=None, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + verbose=True, + x_T=None, + log_every_t=100, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... + **kwargs + ): + if conditioning is not None: + if isinstance(conditioning, dict): + cbs = conditioning[list(conditioning.keys())[0]].shape[0] + if cbs != batch_size: + print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") + else: + if conditioning.shape[0] != batch_size: + print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") + + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + + print(f'Data shape for DPM-Solver sampling is {size}, sampling steps {S}') + + device = self.model.betas.device + if x_T is None: + img = torch.randn(size, device=device) + else: + img = x_T + + ns = NoiseScheduleVP('discrete', alphas_cumprod=self.alphas_cumprod) + + model_fn = model_wrapper( + lambda x, t, c: self.model.apply_model(x, t, c), + ns, + model_type=MODEL_TYPES[self.model.parameterization], + guidance_type="classifier-free", + condition=conditioning, + unconditional_condition=unconditional_conditioning, + guidance_scale=unconditional_guidance_scale, + ) + + dpm_solver = DPM_Solver(model_fn, ns, predict_x0=True, thresholding=False) + x = dpm_solver.sample(img, steps=S, skip_type="time_uniform", method="multistep", order=2, lower_order_final=True) + + return x.to(device), None \ No newline at end of file diff --git a/inpaint/model/anytext/ldm/models/diffusion/plms.py b/inpaint/model/anytext/ldm/models/diffusion/plms.py new file mode 100644 index 0000000..5f35d55 --- /dev/null +++ b/inpaint/model/anytext/ldm/models/diffusion/plms.py @@ -0,0 +1,244 @@ +"""SAMPLING ONLY.""" + +import torch +import numpy as np +from tqdm import tqdm +from functools import partial + +from iopaint.model.anytext.ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like +from iopaint.model.anytext.ldm.models.diffusion.sampling_util import norm_thresholding + + +class PLMSSampler(object): + def __init__(self, model, schedule="linear", **kwargs): + super().__init__() + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.schedule = schedule 
+ + def register_buffer(self, name, attr): + if type(attr) == torch.Tensor: + if attr.device != torch.device("cuda"): + attr = attr.to(torch.device("cuda")) + setattr(self, name, attr) + + def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True): + if ddim_eta != 0: + raise ValueError('ddim_eta must be 0 for PLMS') + self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps,verbose=verbose) + alphas_cumprod = self.model.alphas_cumprod + assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' + to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device) + + self.register_buffer('betas', to_torch(self.model.betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu()))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) + self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta,verbose=verbose) + self.register_buffer('ddim_sigmas', ddim_sigmas) + self.register_buffer('ddim_alphas', ddim_alphas) + self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) + self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas)) + sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt( + (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * ( + 1 - self.alphas_cumprod / self.alphas_cumprod_prev)) + self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps) + + @torch.no_grad() + def sample(self, + S, + batch_size, + shape, + conditioning=None, + callback=None, + normals_sequence=None, + img_callback=None, + quantize_x0=False, + eta=0., + mask=None, + x0=None, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + verbose=True, + x_T=None, + log_every_t=100, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
+ dynamic_threshold=None, + **kwargs + ): + if conditioning is not None: + if isinstance(conditioning, dict): + cbs = conditioning[list(conditioning.keys())[0]].shape[0] + if cbs != batch_size: + print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") + else: + if conditioning.shape[0] != batch_size: + print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") + + self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose) + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + print(f'Data shape for PLMS sampling is {size}') + + samples, intermediates = self.plms_sampling(conditioning, size, + callback=callback, + img_callback=img_callback, + quantize_denoised=quantize_x0, + mask=mask, x0=x0, + ddim_use_original_steps=False, + noise_dropout=noise_dropout, + temperature=temperature, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + x_T=x_T, + log_every_t=log_every_t, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + dynamic_threshold=dynamic_threshold, + ) + return samples, intermediates + + @torch.no_grad() + def plms_sampling(self, cond, shape, + x_T=None, ddim_use_original_steps=False, + callback=None, timesteps=None, quantize_denoised=False, + mask=None, x0=None, img_callback=None, log_every_t=100, + temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, + unconditional_guidance_scale=1., unconditional_conditioning=None, + dynamic_threshold=None): + device = self.model.betas.device + b = shape[0] + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + if timesteps is None: + timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps + elif timesteps is not None and not ddim_use_original_steps: + subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1 + timesteps = self.ddim_timesteps[:subset_end] + + intermediates = {'x_inter': [img], 'pred_x0': [img]} + time_range = list(reversed(range(0,timesteps))) if ddim_use_original_steps else np.flip(timesteps) + total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0] + print(f"Running PLMS Sampling with {total_steps} timesteps") + + iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps) + old_eps = [] + + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((b,), step, device=device, dtype=torch.long) + ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long) + + if mask is not None: + assert x0 is not None + img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass? + img = img_orig * mask + (1. 
- mask) * img + + outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps, + quantize_denoised=quantize_denoised, temperature=temperature, + noise_dropout=noise_dropout, score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + old_eps=old_eps, t_next=ts_next, + dynamic_threshold=dynamic_threshold) + img, pred_x0, e_t = outs + old_eps.append(e_t) + if len(old_eps) >= 4: + old_eps.pop(0) + if callback: callback(i) + if img_callback: img_callback(pred_x0, i) + + if index % log_every_t == 0 or index == total_steps - 1: + intermediates['x_inter'].append(img) + intermediates['pred_x0'].append(pred_x0) + + return img, intermediates + + @torch.no_grad() + def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False, + temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, + unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None, + dynamic_threshold=None): + b, *_, device = *x.shape, x.device + + def get_model_output(x, t): + if unconditional_conditioning is None or unconditional_guidance_scale == 1.: + e_t = self.model.apply_model(x, t, c) + else: + x_in = torch.cat([x] * 2) + t_in = torch.cat([t] * 2) + c_in = torch.cat([unconditional_conditioning, c]) + e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2) + e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond) + + if score_corrector is not None: + assert self.model.parameterization == "eps" + e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs) + + return e_t + + alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas + alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev + sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas + sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas + + def get_x_prev_and_pred_x0(e_t, index): + # select parameters corresponding to the currently considered timestep + a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) + a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device) + sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) + sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index],device=device) + + # current prediction for x_0 + pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() + if quantize_denoised: + pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) + if dynamic_threshold is not None: + pred_x0 = norm_thresholding(pred_x0, dynamic_threshold) + # direction pointing to x_t + dir_xt = (1. 
- a_prev - sigma_t**2).sqrt() * e_t + noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature + if noise_dropout > 0.: + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise + return x_prev, pred_x0 + + e_t = get_model_output(x, t) + if len(old_eps) == 0: + # Pseudo Improved Euler (2nd order) + x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index) + e_t_next = get_model_output(x_prev, t_next) + e_t_prime = (e_t + e_t_next) / 2 + elif len(old_eps) == 1: + # 2nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (3 * e_t - old_eps[-1]) / 2 + elif len(old_eps) == 2: + # 3nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12 + elif len(old_eps) >= 3: + # 4nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24 + + x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index) + + return x_prev, pred_x0, e_t diff --git a/inpaint/model/anytext/ldm/models/diffusion/sampling_util.py b/inpaint/model/anytext/ldm/models/diffusion/sampling_util.py new file mode 100644 index 0000000..7eff02b --- /dev/null +++ b/inpaint/model/anytext/ldm/models/diffusion/sampling_util.py @@ -0,0 +1,22 @@ +import torch +import numpy as np + + +def append_dims(x, target_dims): + """Appends dimensions to the end of a tensor until it has target_dims dimensions. + From https://github.com/crowsonkb/k-diffusion/blob/master/k_diffusion/utils.py""" + dims_to_append = target_dims - x.ndim + if dims_to_append < 0: + raise ValueError(f'input has {x.ndim} dims but target_dims is {target_dims}, which is less') + return x[(...,) + (None,) * dims_to_append] + + +def norm_thresholding(x0, value): + s = append_dims(x0.pow(2).flatten(1).mean(1).sqrt().clamp(min=value), x0.ndim) + return x0 * (value / s) + + +def spatial_norm_thresholding(x0, value): + # b c h w + s = x0.pow(2).mean(1, keepdim=True).sqrt().clamp(min=value) + return x0 * (value / s) \ No newline at end of file diff --git a/inpaint/model/anytext/ldm/modules/__init__.py b/inpaint/model/anytext/ldm/modules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/anytext/ldm/modules/attention.py b/inpaint/model/anytext/ldm/modules/attention.py new file mode 100644 index 0000000..df92aa7 --- /dev/null +++ b/inpaint/model/anytext/ldm/modules/attention.py @@ -0,0 +1,360 @@ +from inspect import isfunction +import math +import torch +import torch.nn.functional as F +from torch import nn, einsum +from einops import rearrange, repeat +from typing import Optional, Any + +from iopaint.model.anytext.ldm.modules.diffusionmodules.util import checkpoint + + +# CrossAttn precision handling +import os + +_ATTN_PRECISION = os.environ.get("ATTN_PRECISION", "fp32") + + +def exists(val): + return val is not None + + +def uniq(arr): + return {el: True for el in arr}.keys() + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def max_neg_value(t): + return -torch.finfo(t.dtype).max + + +def init_(tensor): + dim = tensor.shape[-1] + std = 1 / math.sqrt(dim) + tensor.uniform_(-std, std) + return tensor + + +# feedforward +class GEGLU(nn.Module): + def __init__(self, dim_in, dim_out): + super().__init__() + self.proj = nn.Linear(dim_in, dim_out * 2) + + def forward(self, x): + x, gate = self.proj(x).chunk(2, dim=-1) + return x * F.gelu(gate) + + +class FeedForward(nn.Module): + def __init__(self, dim, 
dim_out=None, mult=4, glu=False, dropout=0.0): + super().__init__() + inner_dim = int(dim * mult) + dim_out = default(dim_out, dim) + project_in = ( + nn.Sequential(nn.Linear(dim, inner_dim), nn.GELU()) + if not glu + else GEGLU(dim, inner_dim) + ) + + self.net = nn.Sequential( + project_in, nn.Dropout(dropout), nn.Linear(inner_dim, dim_out) + ) + + def forward(self, x): + return self.net(x) + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def Normalize(in_channels): + return torch.nn.GroupNorm( + num_groups=32, num_channels=in_channels, eps=1e-6, affine=True + ) + + +class SpatialSelfAttention(nn.Module): + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + self.k = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + self.proj_out = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b, c, h, w = q.shape + q = rearrange(q, "b c h w -> b (h w) c") + k = rearrange(k, "b c h w -> b c (h w)") + w_ = torch.einsum("bij,bjk->bik", q, k) + + w_ = w_ * (int(c) ** (-0.5)) + w_ = torch.nn.functional.softmax(w_, dim=2) + + # attend to values + v = rearrange(v, "b c h w -> b c (h w)") + w_ = rearrange(w_, "b i j -> b j i") + h_ = torch.einsum("bij,bjk->bik", v, w_) + h_ = rearrange(h_, "b c (h w) -> b c h w", h=h) + h_ = self.proj_out(h_) + + return x + h_ + + +class CrossAttention(nn.Module): + def __init__(self, query_dim, context_dim=None, heads=8, dim_head=64, dropout=0.0): + super().__init__() + inner_dim = dim_head * heads + context_dim = default(context_dim, query_dim) + + self.scale = dim_head**-0.5 + self.heads = heads + + self.to_q = nn.Linear(query_dim, inner_dim, bias=False) + self.to_k = nn.Linear(context_dim, inner_dim, bias=False) + self.to_v = nn.Linear(context_dim, inner_dim, bias=False) + + self.to_out = nn.Sequential( + nn.Linear(inner_dim, query_dim), nn.Dropout(dropout) + ) + + def forward(self, x, context=None, mask=None): + h = self.heads + + q = self.to_q(x) + context = default(context, x) + k = self.to_k(context) + v = self.to_v(context) + + q, k, v = map(lambda t: rearrange(t, "b n (h d) -> (b h) n d", h=h), (q, k, v)) + + # force cast to fp32 to avoid overflowing + if _ATTN_PRECISION == "fp32": + with torch.autocast(enabled=False, device_type="cuda"): + q, k = q.float(), k.float() + sim = einsum("b i d, b j d -> b i j", q, k) * self.scale + else: + sim = einsum("b i d, b j d -> b i j", q, k) * self.scale + + del q, k + + if exists(mask): + mask = rearrange(mask, "b ... 
-> b (...)") + max_neg_value = -torch.finfo(sim.dtype).max + mask = repeat(mask, "b j -> (b h) () j", h=h) + sim.masked_fill_(~mask, max_neg_value) + + # attention, what we cannot get enough of + sim = sim.softmax(dim=-1) + + out = einsum("b i j, b j d -> b i d", sim, v) + out = rearrange(out, "(b h) n d -> b n (h d)", h=h) + return self.to_out(out) + + +class SDPACrossAttention(CrossAttention): + def forward(self, x, context=None, mask=None): + batch_size, sequence_length, inner_dim = x.shape + + if mask is not None: + mask = self.prepare_attention_mask(mask, sequence_length, batch_size) + mask = mask.view(batch_size, self.heads, -1, mask.shape[-1]) + + h = self.heads + q_in = self.to_q(x) + context = default(context, x) + + k_in = self.to_k(context) + v_in = self.to_v(context) + + head_dim = inner_dim // h + q = q_in.view(batch_size, -1, h, head_dim).transpose(1, 2) + k = k_in.view(batch_size, -1, h, head_dim).transpose(1, 2) + v = v_in.view(batch_size, -1, h, head_dim).transpose(1, 2) + + del q_in, k_in, v_in + + dtype = q.dtype + if _ATTN_PRECISION == "fp32": + q, k, v = q.float(), k.float(), v.float() + + # the output of sdp = (batch, num_heads, seq_len, head_dim) + hidden_states = torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=mask, dropout_p=0.0, is_causal=False + ) + + hidden_states = hidden_states.transpose(1, 2).reshape( + batch_size, -1, h * head_dim + ) + hidden_states = hidden_states.to(dtype) + + # linear proj + hidden_states = self.to_out[0](hidden_states) + # dropout + hidden_states = self.to_out[1](hidden_states) + return hidden_states + + +class BasicTransformerBlock(nn.Module): + def __init__( + self, + dim, + n_heads, + d_head, + dropout=0.0, + context_dim=None, + gated_ff=True, + checkpoint=True, + disable_self_attn=False, + ): + super().__init__() + + if hasattr(torch.nn.functional, "scaled_dot_product_attention"): + attn_cls = SDPACrossAttention + else: + attn_cls = CrossAttention + + self.disable_self_attn = disable_self_attn + self.attn1 = attn_cls( + query_dim=dim, + heads=n_heads, + dim_head=d_head, + dropout=dropout, + context_dim=context_dim if self.disable_self_attn else None, + ) # is a self-attention if not self.disable_self_attn + self.ff = FeedForward(dim, dropout=dropout, glu=gated_ff) + self.attn2 = attn_cls( + query_dim=dim, + context_dim=context_dim, + heads=n_heads, + dim_head=d_head, + dropout=dropout, + ) # is self-attn if context is none + self.norm1 = nn.LayerNorm(dim) + self.norm2 = nn.LayerNorm(dim) + self.norm3 = nn.LayerNorm(dim) + self.checkpoint = checkpoint + + def forward(self, x, context=None): + return checkpoint( + self._forward, (x, context), self.parameters(), self.checkpoint + ) + + def _forward(self, x, context=None): + x = ( + self.attn1( + self.norm1(x), context=context if self.disable_self_attn else None + ) + + x + ) + x = self.attn2(self.norm2(x), context=context) + x + x = self.ff(self.norm3(x)) + x + return x + + +class SpatialTransformer(nn.Module): + """ + Transformer block for image-like data. + First, project the input (aka embedding) + and reshape to b, t, d. + Then apply standard transformer action. 
+ Finally, reshape to image + NEW: use_linear for more efficiency instead of the 1x1 convs + """ + + def __init__( + self, + in_channels, + n_heads, + d_head, + depth=1, + dropout=0.0, + context_dim=None, + disable_self_attn=False, + use_linear=False, + use_checkpoint=True, + ): + super().__init__() + if exists(context_dim) and not isinstance(context_dim, list): + context_dim = [context_dim] + self.in_channels = in_channels + inner_dim = n_heads * d_head + self.norm = Normalize(in_channels) + if not use_linear: + self.proj_in = nn.Conv2d( + in_channels, inner_dim, kernel_size=1, stride=1, padding=0 + ) + else: + self.proj_in = nn.Linear(in_channels, inner_dim) + + self.transformer_blocks = nn.ModuleList( + [ + BasicTransformerBlock( + inner_dim, + n_heads, + d_head, + dropout=dropout, + context_dim=context_dim[d], + disable_self_attn=disable_self_attn, + checkpoint=use_checkpoint, + ) + for d in range(depth) + ] + ) + if not use_linear: + self.proj_out = zero_module( + nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0) + ) + else: + self.proj_out = zero_module(nn.Linear(in_channels, inner_dim)) + self.use_linear = use_linear + + def forward(self, x, context=None): + # note: if no context is given, cross-attention defaults to self-attention + if not isinstance(context, list): + context = [context] + b, c, h, w = x.shape + x_in = x + x = self.norm(x) + if not self.use_linear: + x = self.proj_in(x) + x = rearrange(x, "b c h w -> b (h w) c").contiguous() + if self.use_linear: + x = self.proj_in(x) + for i, block in enumerate(self.transformer_blocks): + x = block(x, context=context[i]) + if self.use_linear: + x = self.proj_out(x) + x = rearrange(x, "b (h w) c -> b c h w", h=h, w=w).contiguous() + if not self.use_linear: + x = self.proj_out(x) + return x + x_in diff --git a/inpaint/model/anytext/ldm/modules/diffusionmodules/__init__.py b/inpaint/model/anytext/ldm/modules/diffusionmodules/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/anytext/ldm/modules/diffusionmodules/model.py b/inpaint/model/anytext/ldm/modules/diffusionmodules/model.py new file mode 100644 index 0000000..3472824 --- /dev/null +++ b/inpaint/model/anytext/ldm/modules/diffusionmodules/model.py @@ -0,0 +1,973 @@ +# pytorch_diffusion + derived encoder decoder +import math + +import numpy as np +import torch +import torch.nn as nn + + +def get_timestep_embedding(timesteps, embedding_dim): + """ + This matches the implementation in Denoising Diffusion Probabilistic Models: + From Fairseq. + Build sinusoidal embeddings. + This matches the implementation in tensor2tensor, but differs slightly + from the description in Section 3.5 of "Attention Is All You Need". 
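    For example, with timesteps of shape (N,) and embedding_dim=128, the result
    is an (N, 128) tensor whose first 64 columns are sin(t * freq_i) and whose
    last 64 columns are cos(t * freq_i), for geometrically spaced frequencies freq_i.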
+ """ + assert len(timesteps.shape) == 1 + + half_dim = embedding_dim // 2 + emb = math.log(10000) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=torch.float32) * -emb) + emb = emb.to(device=timesteps.device) + emb = timesteps.float()[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1, 0, 0)) + return emb + + +def nonlinearity(x): + # swish + return x * torch.sigmoid(x) + + +def Normalize(in_channels, num_groups=32): + return torch.nn.GroupNorm( + num_groups=num_groups, num_channels=in_channels, eps=1e-6, affine=True + ) + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, x): + x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=2, padding=0 + ) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class ResnetBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + + self.norm1 = Normalize(in_channels) + self.conv1 = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1 + ) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = Normalize(out_channels) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d( + out_channels, out_channels, kernel_size=3, stride=1, padding=1 + ) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1 + ) + else: + self.nin_shortcut = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=1, stride=1, padding=0 + ) + + def forward(self, x, temb): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class AttnBlock(nn.Module): + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + self.k = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + 
self.proj_out = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b, c, h, w = q.shape + q = q.reshape(b, c, h * w) + q = q.permute(0, 2, 1) # b,hw,c + k = k.reshape(b, c, h * w) # b,c,hw + w_ = torch.bmm(q, k) # b,hw,hw w[b,i,j]=sum_c q[b,i,c]k[b,c,j] + w_ = w_ * (int(c) ** (-0.5)) + w_ = torch.nn.functional.softmax(w_, dim=2) + + # attend to values + v = v.reshape(b, c, h * w) + w_ = w_.permute(0, 2, 1) # b,hw,hw (first hw of k, second of q) + h_ = torch.bmm(v, w_) # b, c,hw (hw of q) h_[b,c,j] = sum_i v[b,c,i] w_[b,i,j] + h_ = h_.reshape(b, c, h, w) + + h_ = self.proj_out(h_) + + return x + h_ + + +class AttnBlock2_0(nn.Module): + def __init__(self, in_channels): + super().__init__() + self.in_channels = in_channels + + self.norm = Normalize(in_channels) + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + self.k = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + self.proj_out = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + + def forward(self, x): + h_ = x + h_ = self.norm(h_) + # output: [1, 512, 64, 64] + q = self.q(h_) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b, c, h, w = q.shape + + # q = q.reshape(b, c, h * w).transpose() + # q = q.permute(0, 2, 1) # b,hw,c + # k = k.reshape(b, c, h * w) # b,c,hw + q = q.transpose(1, 2) + k = k.transpose(1, 2) + v = v.transpose(1, 2) + # (batch, num_heads, seq_len, head_dim) + hidden_states = torch.nn.functional.scaled_dot_product_attention( + q, k, v, attn_mask=None, dropout_p=0.0, is_causal=False + ) + hidden_states = hidden_states.transpose(1, 2) + hidden_states = hidden_states.to(q.dtype) + + h_ = self.proj_out(hidden_states) + + return x + h_ + + +def make_attn(in_channels, attn_type="vanilla", attn_kwargs=None): + assert attn_type in [ + "vanilla", + "vanilla-xformers", + "memory-efficient-cross-attn", + "linear", + "none", + ], f"attn_type {attn_type} unknown" + assert attn_kwargs is None + if hasattr(torch.nn.functional, "scaled_dot_product_attention"): + # print(f"Using torch.nn.functional.scaled_dot_product_attention") + return AttnBlock2_0(in_channels) + return AttnBlock(in_channels) + + +class Model(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + use_timestep=True, + use_linear_attn=False, + attn_type="vanilla", + ): + super().__init__() + if use_linear_attn: + attn_type = "linear" + self.ch = ch + self.temb_ch = self.ch * 4 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + self.use_timestep = use_timestep + if self.use_timestep: + # timestep embedding + self.temb = nn.Module() + self.temb.dense = nn.ModuleList( + [ + torch.nn.Linear(self.ch, self.temb_ch), + torch.nn.Linear(self.temb_ch, self.temb_ch), + ] + ) + + # downsampling + self.conv_in = torch.nn.Conv2d( + in_channels, self.ch, kernel_size=3, stride=1, padding=1 + ) + + curr_res = resolution + in_ch_mult = (1,) + tuple(ch_mult) + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = 
nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + skip_in = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + if i_block == self.num_res_blocks: + skip_in = ch * in_ch_mult[i_level] + block.append( + ResnetBlock( + in_channels=block_in + skip_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_ch, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, x, t=None, context=None): + # assert x.shape[2] == x.shape[3] == self.resolution + if context is not None: + # assume aligned context, cat along channel axis + x = torch.cat((x, context), dim=1) + if self.use_timestep: + # timestep embedding + assert t is not None + temb = get_timestep_embedding(t, self.ch) + temb = self.temb.dense[0](temb) + temb = nonlinearity(temb) + temb = self.temb.dense[1](temb) + else: + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block]( + torch.cat([h, hs.pop()], dim=1), temb + ) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + def get_last_layer(self): + return self.conv_out.weight + + +class Encoder(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + 
resolution, + z_channels, + double_z=True, + use_linear_attn=False, + attn_type="vanilla", + **ignore_kwargs, + ): + super().__init__() + if use_linear_attn: + attn_type = "linear" + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + + # downsampling + self.conv_in = torch.nn.Conv2d( + in_channels, self.ch, kernel_size=3, stride=1, padding=1 + ) + + curr_res = resolution + in_ch_mult = (1,) + tuple(ch_mult) + self.in_ch_mult = in_ch_mult + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + padding=1, + ) + + def forward(self, x): + # timestep embedding + temb = None + + # downsampling + hs = [self.conv_in(x)] + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](hs[-1], temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + hs.append(h) + if i_level != self.num_resolutions - 1: + hs.append(self.down[i_level].downsample(hs[-1])) + + # middle + h = hs[-1] + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class Decoder(nn.Module): + def __init__( + self, + *, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + in_channels, + resolution, + z_channels, + give_pre_end=False, + tanh_out=False, + use_linear_attn=False, + attn_type="vanilla", + **ignorekwargs, + ): + super().__init__() + if use_linear_attn: + attn_type = "linear" + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.tanh_out = tanh_out + + # compute in_ch_mult, block_in and curr_res at lowest res + in_ch_mult = (1,) + tuple(ch_mult) + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2 ** (self.num_resolutions - 1) + self.z_shape = (1, z_channels, curr_res, curr_res) + print( + "Working with z of shape {} = {} dimensions.".format( + self.z_shape, np.prod(self.z_shape) + ) + ) + + # z to block_in + self.conv_in = torch.nn.Conv2d( + 
z_channels, block_in, kernel_size=3, stride=1, padding=1 + ) + + # middle + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + self.mid.attn_1 = make_attn(block_in, attn_type=attn_type) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(make_attn(block_in, attn_type=attn_type)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_ch, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, z): + # assert z.shape[1:] == self.z_shape[1:] + self.last_z_shape = z.shape + + # timestep embedding + temb = None + + # z to block_in + h = self.conv_in(z) + + # middle + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h, temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + if self.give_pre_end: + return h + + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + if self.tanh_out: + h = torch.tanh(h) + return h + + +class SimpleDecoder(nn.Module): + def __init__(self, in_channels, out_channels, *args, **kwargs): + super().__init__() + self.model = nn.ModuleList( + [ + nn.Conv2d(in_channels, in_channels, 1), + ResnetBlock( + in_channels=in_channels, + out_channels=2 * in_channels, + temb_channels=0, + dropout=0.0, + ), + ResnetBlock( + in_channels=2 * in_channels, + out_channels=4 * in_channels, + temb_channels=0, + dropout=0.0, + ), + ResnetBlock( + in_channels=4 * in_channels, + out_channels=2 * in_channels, + temb_channels=0, + dropout=0.0, + ), + nn.Conv2d(2 * in_channels, in_channels, 1), + Upsample(in_channels, with_conv=True), + ] + ) + # end + self.norm_out = Normalize(in_channels) + self.conv_out = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, x): + for i, layer in enumerate(self.model): + if i in [1, 2, 3]: + x = layer(x, None) + else: + x = layer(x) + + h = self.norm_out(x) + h = nonlinearity(h) + x = self.conv_out(h) + return x + + +class UpsampleDecoder(nn.Module): + def __init__( + self, + in_channels, + out_channels, + ch, + num_res_blocks, + resolution, + ch_mult=(2, 2), + dropout=0.0, + ): + super().__init__() + # upsampling + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + block_in = in_channels + curr_res = resolution // 2 ** (self.num_resolutions - 1) + self.res_blocks = nn.ModuleList() + self.upsample_blocks = nn.ModuleList() + for i_level in range(self.num_resolutions): + res_block = [] 
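            # Each resolution level stacks num_res_blocks + 1 ResnetBlocks; every
            # level except the last is followed by a learned Upsample that doubles
            # the spatial resolution, mirroring the Encoder's Downsample path.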
+ block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + res_block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + ) + ) + block_in = block_out + self.res_blocks.append(nn.ModuleList(res_block)) + if i_level != self.num_resolutions - 1: + self.upsample_blocks.append(Upsample(block_in, True)) + curr_res = curr_res * 2 + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_channels, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, x): + # upsampling + h = x + for k, i_level in enumerate(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.res_blocks[i_level][i_block](h, None) + if i_level != self.num_resolutions - 1: + h = self.upsample_blocks[k](h) + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class LatentRescaler(nn.Module): + def __init__(self, factor, in_channels, mid_channels, out_channels, depth=2): + super().__init__() + # residual block, interpolate, residual block + self.factor = factor + self.conv_in = nn.Conv2d( + in_channels, mid_channels, kernel_size=3, stride=1, padding=1 + ) + self.res_block1 = nn.ModuleList( + [ + ResnetBlock( + in_channels=mid_channels, + out_channels=mid_channels, + temb_channels=0, + dropout=0.0, + ) + for _ in range(depth) + ] + ) + self.attn = AttnBlock(mid_channels) + self.res_block2 = nn.ModuleList( + [ + ResnetBlock( + in_channels=mid_channels, + out_channels=mid_channels, + temb_channels=0, + dropout=0.0, + ) + for _ in range(depth) + ] + ) + + self.conv_out = nn.Conv2d( + mid_channels, + out_channels, + kernel_size=1, + ) + + def forward(self, x): + x = self.conv_in(x) + for block in self.res_block1: + x = block(x, None) + x = torch.nn.functional.interpolate( + x, + size=( + int(round(x.shape[2] * self.factor)), + int(round(x.shape[3] * self.factor)), + ), + ) + x = self.attn(x) + for block in self.res_block2: + x = block(x, None) + x = self.conv_out(x) + return x + + +class MergedRescaleEncoder(nn.Module): + def __init__( + self, + in_channels, + ch, + resolution, + out_ch, + num_res_blocks, + attn_resolutions, + dropout=0.0, + resamp_with_conv=True, + ch_mult=(1, 2, 4, 8), + rescale_factor=1.0, + rescale_module_depth=1, + ): + super().__init__() + intermediate_chn = ch * ch_mult[-1] + self.encoder = Encoder( + in_channels=in_channels, + num_res_blocks=num_res_blocks, + ch=ch, + ch_mult=ch_mult, + z_channels=intermediate_chn, + double_z=False, + resolution=resolution, + attn_resolutions=attn_resolutions, + dropout=dropout, + resamp_with_conv=resamp_with_conv, + out_ch=None, + ) + self.rescaler = LatentRescaler( + factor=rescale_factor, + in_channels=intermediate_chn, + mid_channels=intermediate_chn, + out_channels=out_ch, + depth=rescale_module_depth, + ) + + def forward(self, x): + x = self.encoder(x) + x = self.rescaler(x) + return x + + +class MergedRescaleDecoder(nn.Module): + def __init__( + self, + z_channels, + out_ch, + resolution, + num_res_blocks, + attn_resolutions, + ch, + ch_mult=(1, 2, 4, 8), + dropout=0.0, + resamp_with_conv=True, + rescale_factor=1.0, + rescale_module_depth=1, + ): + super().__init__() + tmp_chn = z_channels * ch_mult[-1] + self.decoder = Decoder( + out_ch=out_ch, + z_channels=tmp_chn, + attn_resolutions=attn_resolutions, + dropout=dropout, + resamp_with_conv=resamp_with_conv, + in_channels=None, + num_res_blocks=num_res_blocks, + ch_mult=ch_mult, + resolution=resolution, + ch=ch, + 
) + self.rescaler = LatentRescaler( + factor=rescale_factor, + in_channels=z_channels, + mid_channels=tmp_chn, + out_channels=tmp_chn, + depth=rescale_module_depth, + ) + + def forward(self, x): + x = self.rescaler(x) + x = self.decoder(x) + return x + + +class Upsampler(nn.Module): + def __init__(self, in_size, out_size, in_channels, out_channels, ch_mult=2): + super().__init__() + assert out_size >= in_size + num_blocks = int(np.log2(out_size // in_size)) + 1 + factor_up = 1.0 + (out_size % in_size) + print( + f"Building {self.__class__.__name__} with in_size: {in_size} --> out_size {out_size} and factor {factor_up}" + ) + self.rescaler = LatentRescaler( + factor=factor_up, + in_channels=in_channels, + mid_channels=2 * in_channels, + out_channels=in_channels, + ) + self.decoder = Decoder( + out_ch=out_channels, + resolution=out_size, + z_channels=in_channels, + num_res_blocks=2, + attn_resolutions=[], + in_channels=None, + ch=in_channels, + ch_mult=[ch_mult for _ in range(num_blocks)], + ) + + def forward(self, x): + x = self.rescaler(x) + x = self.decoder(x) + return x + + +class Resize(nn.Module): + def __init__(self, in_channels=None, learned=False, mode="bilinear"): + super().__init__() + self.with_conv = learned + self.mode = mode + if self.with_conv: + print( + f"Note: {self.__class__.__name} uses learned downsampling and will ignore the fixed {mode} mode" + ) + raise NotImplementedError() + assert in_channels is not None + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=4, stride=2, padding=1 + ) + + def forward(self, x, scale_factor=1.0): + if scale_factor == 1.0: + return x + else: + x = torch.nn.functional.interpolate( + x, mode=self.mode, align_corners=False, scale_factor=scale_factor + ) + return x diff --git a/inpaint/model/anytext/ldm/modules/diffusionmodules/openaimodel.py b/inpaint/model/anytext/ldm/modules/diffusionmodules/openaimodel.py new file mode 100644 index 0000000..fd3d6be --- /dev/null +++ b/inpaint/model/anytext/ldm/modules/diffusionmodules/openaimodel.py @@ -0,0 +1,786 @@ +from abc import abstractmethod +import math + +import numpy as np +import torch as th +import torch.nn as nn +import torch.nn.functional as F + +from iopaint.model.anytext.ldm.modules.diffusionmodules.util import ( + checkpoint, + conv_nd, + linear, + avg_pool_nd, + zero_module, + normalization, + timestep_embedding, +) +from iopaint.model.anytext.ldm.modules.attention import SpatialTransformer +from iopaint.model.anytext.ldm.util import exists + + +# dummy replace +def convert_module_to_f16(x): + pass + +def convert_module_to_f32(x): + pass + + +## go +class AttentionPool2d(nn.Module): + """ + Adapted from CLIP: https://github.com/openai/CLIP/blob/main/clip/model.py + """ + + def __init__( + self, + spacial_dim: int, + embed_dim: int, + num_heads_channels: int, + output_dim: int = None, + ): + super().__init__() + self.positional_embedding = nn.Parameter(th.randn(embed_dim, spacial_dim ** 2 + 1) / embed_dim ** 0.5) + self.qkv_proj = conv_nd(1, embed_dim, 3 * embed_dim, 1) + self.c_proj = conv_nd(1, embed_dim, output_dim or embed_dim, 1) + self.num_heads = embed_dim // num_heads_channels + self.attention = QKVAttention(self.num_heads) + + def forward(self, x): + b, c, *_spatial = x.shape + x = x.reshape(b, c, -1) # NC(HW) + x = th.cat([x.mean(dim=-1, keepdim=True), x], dim=-1) # NC(HW+1) + x = x + self.positional_embedding[None, :, :].to(x.dtype) # NC(HW+1) + x = self.qkv_proj(x) + x = self.attention(x) + x = 
self.c_proj(x) + return x[:, :, 0] + + +class TimestepBlock(nn.Module): + """ + Any module where forward() takes timestep embeddings as a second argument. + """ + + @abstractmethod + def forward(self, x, emb): + """ + Apply the module to `x` given `emb` timestep embeddings. + """ + + +class TimestepEmbedSequential(nn.Sequential, TimestepBlock): + """ + A sequential module that passes timestep embeddings to the children that + support it as an extra input. + """ + + def forward(self, x, emb, context=None): + for layer in self: + if isinstance(layer, TimestepBlock): + x = layer(x, emb) + elif isinstance(layer, SpatialTransformer): + x = layer(x, context) + else: + x = layer(x) + return x + + +class Upsample(nn.Module): + """ + An upsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + upsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None, padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + if use_conv: + self.conv = conv_nd(dims, self.channels, self.out_channels, 3, padding=padding) + + def forward(self, x): + assert x.shape[1] == self.channels + if self.dims == 3: + x = F.interpolate( + x, (x.shape[2], x.shape[3] * 2, x.shape[4] * 2), mode="nearest" + ) + else: + x = F.interpolate(x, scale_factor=2, mode="nearest") + if self.use_conv: + x = self.conv(x) + return x + +class TransposedUpsample(nn.Module): + 'Learned 2x upsampling without padding' + def __init__(self, channels, out_channels=None, ks=5): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + + self.up = nn.ConvTranspose2d(self.channels,self.out_channels,kernel_size=ks,stride=2) + + def forward(self,x): + return self.up(x) + + +class Downsample(nn.Module): + """ + A downsampling layer with an optional convolution. + :param channels: channels in the inputs and outputs. + :param use_conv: a bool determining if a convolution is applied. + :param dims: determines if the signal is 1D, 2D, or 3D. If 3D, then + downsampling occurs in the inner-two dimensions. + """ + + def __init__(self, channels, use_conv, dims=2, out_channels=None,padding=1): + super().__init__() + self.channels = channels + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.dims = dims + stride = 2 if dims != 3 else (1, 2, 2) + if use_conv: + self.op = conv_nd( + dims, self.channels, self.out_channels, 3, stride=stride, padding=padding + ) + else: + assert self.channels == self.out_channels + self.op = avg_pool_nd(dims, kernel_size=stride, stride=stride) + + def forward(self, x): + assert x.shape[1] == self.channels + return self.op(x) + + +class ResBlock(TimestepBlock): + """ + A residual block that can optionally change the number of channels. + :param channels: the number of input channels. + :param emb_channels: the number of timestep embedding channels. + :param dropout: the rate of dropout. + :param out_channels: if specified, the number of out channels. + :param use_conv: if True and out_channels is specified, use a spatial + convolution instead of a smaller 1x1 convolution to change the + channels in the skip connection. + :param dims: determines if the signal is 1D, 2D, or 3D. 
+ :param use_checkpoint: if True, use gradient checkpointing on this module. + :param up: if True, use this block for upsampling. + :param down: if True, use this block for downsampling. + """ + + def __init__( + self, + channels, + emb_channels, + dropout, + out_channels=None, + use_conv=False, + use_scale_shift_norm=False, + dims=2, + use_checkpoint=False, + up=False, + down=False, + ): + super().__init__() + self.channels = channels + self.emb_channels = emb_channels + self.dropout = dropout + self.out_channels = out_channels or channels + self.use_conv = use_conv + self.use_checkpoint = use_checkpoint + self.use_scale_shift_norm = use_scale_shift_norm + + self.in_layers = nn.Sequential( + normalization(channels), + nn.SiLU(), + conv_nd(dims, channels, self.out_channels, 3, padding=1), + ) + + self.updown = up or down + + if up: + self.h_upd = Upsample(channels, False, dims) + self.x_upd = Upsample(channels, False, dims) + elif down: + self.h_upd = Downsample(channels, False, dims) + self.x_upd = Downsample(channels, False, dims) + else: + self.h_upd = self.x_upd = nn.Identity() + + self.emb_layers = nn.Sequential( + nn.SiLU(), + linear( + emb_channels, + 2 * self.out_channels if use_scale_shift_norm else self.out_channels, + ), + ) + self.out_layers = nn.Sequential( + normalization(self.out_channels), + nn.SiLU(), + nn.Dropout(p=dropout), + zero_module( + conv_nd(dims, self.out_channels, self.out_channels, 3, padding=1) + ), + ) + + if self.out_channels == channels: + self.skip_connection = nn.Identity() + elif use_conv: + self.skip_connection = conv_nd( + dims, channels, self.out_channels, 3, padding=1 + ) + else: + self.skip_connection = conv_nd(dims, channels, self.out_channels, 1) + + def forward(self, x, emb): + """ + Apply the block to a Tensor, conditioned on a timestep embedding. + :param x: an [N x C x ...] Tensor of features. + :param emb: an [N x emb_channels] Tensor of timestep embeddings. + :return: an [N x C x ...] Tensor of outputs. + """ + return checkpoint( + self._forward, (x, emb), self.parameters(), self.use_checkpoint + ) + + + def _forward(self, x, emb): + if self.updown: + in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1] + h = in_rest(x) + h = self.h_upd(h) + x = self.x_upd(x) + h = in_conv(h) + else: + h = self.in_layers(x) + emb_out = self.emb_layers(emb).type(h.dtype) + while len(emb_out.shape) < len(h.shape): + emb_out = emb_out[..., None] + if self.use_scale_shift_norm: + out_norm, out_rest = self.out_layers[0], self.out_layers[1:] + scale, shift = th.chunk(emb_out, 2, dim=1) + h = out_norm(h) * (1 + scale) + shift + h = out_rest(h) + else: + h = h + emb_out + h = self.out_layers(h) + return self.skip_connection(x) + h + + +class AttentionBlock(nn.Module): + """ + An attention block that allows spatial positions to attend to each other. + Originally ported from here, but adapted to the N-d case. + https://github.com/hojonathanho/diffusion/blob/1e0dceb3b3495bbe19116a5e1b3596cd0706c543/diffusion_tf/models/unet.py#L66. 
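    In this UNet it is only instantiated when use_spatial_transformer is False;
    otherwise attention at the selected resolutions is provided by
    SpatialTransformer blocks instead.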
+ """ + + def __init__( + self, + channels, + num_heads=1, + num_head_channels=-1, + use_checkpoint=False, + use_new_attention_order=False, + ): + super().__init__() + self.channels = channels + if num_head_channels == -1: + self.num_heads = num_heads + else: + assert ( + channels % num_head_channels == 0 + ), f"q,k,v channels {channels} is not divisible by num_head_channels {num_head_channels}" + self.num_heads = channels // num_head_channels + self.use_checkpoint = use_checkpoint + self.norm = normalization(channels) + self.qkv = conv_nd(1, channels, channels * 3, 1) + if use_new_attention_order: + # split qkv before split heads + self.attention = QKVAttention(self.num_heads) + else: + # split heads before split qkv + self.attention = QKVAttentionLegacy(self.num_heads) + + self.proj_out = zero_module(conv_nd(1, channels, channels, 1)) + + def forward(self, x): + return checkpoint(self._forward, (x,), self.parameters(), True) # TODO: check checkpoint usage, is True # TODO: fix the .half call!!! + #return pt_checkpoint(self._forward, x) # pytorch + + def _forward(self, x): + b, c, *spatial = x.shape + x = x.reshape(b, c, -1) + qkv = self.qkv(self.norm(x)) + h = self.attention(qkv) + h = self.proj_out(h) + return (x + h).reshape(b, c, *spatial) + + +def count_flops_attn(model, _x, y): + """ + A counter for the `thop` package to count the operations in an + attention operation. + Meant to be used like: + macs, params = thop.profile( + model, + inputs=(inputs, timestamps), + custom_ops={QKVAttention: QKVAttention.count_flops}, + ) + """ + b, c, *spatial = y[0].shape + num_spatial = int(np.prod(spatial)) + # We perform two matmuls with the same number of ops. + # The first computes the weight matrix, the second computes + # the combination of the value vectors. + matmul_ops = 2 * b * (num_spatial ** 2) * c + model.total_ops += th.DoubleTensor([matmul_ops]) + + +class QKVAttentionLegacy(nn.Module): + """ + A module which performs QKV attention. Matches legacy QKVAttention + input/ouput heads shaping + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + :param qkv: an [N x (H * 3 * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. + """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.reshape(bs * self.n_heads, ch * 3, length).split(ch, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + "bct,bcs->bts", q * scale, k * scale + ) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum("bts,bcs->bct", weight, v) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class QKVAttention(nn.Module): + """ + A module which performs QKV attention and splits in a different order. + """ + + def __init__(self, n_heads): + super().__init__() + self.n_heads = n_heads + + def forward(self, qkv): + """ + Apply QKV attention. + :param qkv: an [N x (3 * H * C) x T] tensor of Qs, Ks, and Vs. + :return: an [N x (H * C) x T] tensor after attention. 
+ """ + bs, width, length = qkv.shape + assert width % (3 * self.n_heads) == 0 + ch = width // (3 * self.n_heads) + q, k, v = qkv.chunk(3, dim=1) + scale = 1 / math.sqrt(math.sqrt(ch)) + weight = th.einsum( + "bct,bcs->bts", + (q * scale).view(bs * self.n_heads, ch, length), + (k * scale).view(bs * self.n_heads, ch, length), + ) # More stable with f16 than dividing afterwards + weight = th.softmax(weight.float(), dim=-1).type(weight.dtype) + a = th.einsum("bts,bcs->bct", weight, v.reshape(bs * self.n_heads, ch, length)) + return a.reshape(bs, -1, length) + + @staticmethod + def count_flops(model, _x, y): + return count_flops_attn(model, _x, y) + + +class UNetModel(nn.Module): + """ + The full UNet model with attention and timestep embedding. + :param in_channels: channels in the input Tensor. + :param model_channels: base channel count for the model. + :param out_channels: channels in the output Tensor. + :param num_res_blocks: number of residual blocks per downsample. + :param attention_resolutions: a collection of downsample rates at which + attention will take place. May be a set, list, or tuple. + For example, if this contains 4, then at 4x downsampling, attention + will be used. + :param dropout: the dropout probability. + :param channel_mult: channel multiplier for each level of the UNet. + :param conv_resample: if True, use learned convolutions for upsampling and + downsampling. + :param dims: determines if the signal is 1D, 2D, or 3D. + :param num_classes: if specified (as an int), then this model will be + class-conditional with `num_classes` classes. + :param use_checkpoint: use gradient checkpointing to reduce memory usage. + :param num_heads: the number of attention heads in each attention layer. + :param num_heads_channels: if specified, ignore num_heads and instead use + a fixed channel width per attention head. + :param num_heads_upsample: works with num_heads to set a different number + of heads for upsampling. Deprecated. + :param use_scale_shift_norm: use a FiLM-like conditioning mechanism. + :param resblock_updown: use residual blocks for up/downsampling. + :param use_new_attention_order: use a different attention pattern for potentially + increased efficiency. + """ + + def __init__( + self, + image_size, + in_channels, + model_channels, + out_channels, + num_res_blocks, + attention_resolutions, + dropout=0, + channel_mult=(1, 2, 4, 8), + conv_resample=True, + dims=2, + num_classes=None, + use_checkpoint=False, + use_fp16=False, + num_heads=-1, + num_head_channels=-1, + num_heads_upsample=-1, + use_scale_shift_norm=False, + resblock_updown=False, + use_new_attention_order=False, + use_spatial_transformer=False, # custom transformer support + transformer_depth=1, # custom transformer support + context_dim=None, # custom transformer support + n_embed=None, # custom support for prediction of discrete ids into codebook of first stage vq model + legacy=True, + disable_self_attentions=None, + num_attention_blocks=None, + disable_middle_self_attn=False, + use_linear_in_transformer=False, + ): + super().__init__() + if use_spatial_transformer: + assert context_dim is not None, 'Fool!! You forgot to include the dimension of your cross-attention conditioning...' + + if context_dim is not None: + assert use_spatial_transformer, 'Fool!! You forgot to use the spatial transformer for your cross-attention conditioning...' 
+ from omegaconf.listconfig import ListConfig + if type(context_dim) == ListConfig: + context_dim = list(context_dim) + + if num_heads_upsample == -1: + num_heads_upsample = num_heads + + if num_heads == -1: + assert num_head_channels != -1, 'Either num_heads or num_head_channels has to be set' + + if num_head_channels == -1: + assert num_heads != -1, 'Either num_heads or num_head_channels has to be set' + + self.image_size = image_size + self.in_channels = in_channels + self.model_channels = model_channels + self.out_channels = out_channels + if isinstance(num_res_blocks, int): + self.num_res_blocks = len(channel_mult) * [num_res_blocks] + else: + if len(num_res_blocks) != len(channel_mult): + raise ValueError("provide num_res_blocks either as an int (globally constant) or " + "as a list/tuple (per-level) with the same length as channel_mult") + self.num_res_blocks = num_res_blocks + if disable_self_attentions is not None: + # should be a list of booleans, indicating whether to disable self-attention in TransformerBlocks or not + assert len(disable_self_attentions) == len(channel_mult) + if num_attention_blocks is not None: + assert len(num_attention_blocks) == len(self.num_res_blocks) + assert all(map(lambda i: self.num_res_blocks[i] >= num_attention_blocks[i], range(len(num_attention_blocks)))) + print(f"Constructor of UNetModel received num_attention_blocks={num_attention_blocks}. " + f"This option has LESS priority than attention_resolutions {attention_resolutions}, " + f"i.e., in cases where num_attention_blocks[i] > 0 but 2**i not in attention_resolutions, " + f"attention will still not be set.") + self.use_fp16 = use_fp16 + self.attention_resolutions = attention_resolutions + self.dropout = dropout + self.channel_mult = channel_mult + self.conv_resample = conv_resample + self.num_classes = num_classes + self.use_checkpoint = use_checkpoint + self.dtype = th.float16 if use_fp16 else th.float32 + self.num_heads = num_heads + self.num_head_channels = num_head_channels + self.num_heads_upsample = num_heads_upsample + self.predict_codebook_ids = n_embed is not None + + time_embed_dim = model_channels * 4 + self.time_embed = nn.Sequential( + linear(model_channels, time_embed_dim), + nn.SiLU(), + linear(time_embed_dim, time_embed_dim), + ) + + if self.num_classes is not None: + if isinstance(self.num_classes, int): + self.label_emb = nn.Embedding(num_classes, time_embed_dim) + elif self.num_classes == "continuous": + print("setting up linear c_adm embedding layer") + self.label_emb = nn.Linear(1, time_embed_dim) + else: + raise ValueError() + + self.input_blocks = nn.ModuleList( + [ + TimestepEmbedSequential( + conv_nd(dims, in_channels, model_channels, 3, padding=1) + ) + ] + ) + self._feature_size = model_channels + input_block_chans = [model_channels] + ch = model_channels + ds = 1 + for level, mult in enumerate(channel_mult): + for nr in range(self.num_res_blocks[level]): + layers = [ + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=mult * model_channels, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = mult * model_channels + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + #num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + if exists(disable_self_attentions): + disabled_sa = disable_self_attentions[level] + else: + disabled_sa = False + + 
if not exists(num_attention_blocks) or nr < num_attention_blocks[level]: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else SpatialTransformer( + ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, + disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint + ) + ) + self.input_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + input_block_chans.append(ch) + if level != len(channel_mult) - 1: + out_ch = ch + self.input_blocks.append( + TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + down=True, + ) + if resblock_updown + else Downsample( + ch, conv_resample, dims=dims, out_channels=out_ch + ) + ) + ) + ch = out_ch + input_block_chans.append(ch) + ds *= 2 + self._feature_size += ch + + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + #num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + self.middle_block = TimestepEmbedSequential( + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else SpatialTransformer( # always uses a self-attn + ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, + disable_self_attn=disable_middle_self_attn, use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint + ), + ResBlock( + ch, + time_embed_dim, + dropout, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ), + ) + self._feature_size += ch + + self.output_blocks = nn.ModuleList([]) + for level, mult in list(enumerate(channel_mult))[::-1]: + for i in range(self.num_res_blocks[level] + 1): + ich = input_block_chans.pop() + layers = [ + ResBlock( + ch + ich, + time_embed_dim, + dropout, + out_channels=model_channels * mult, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + ) + ] + ch = model_channels * mult + if ds in attention_resolutions: + if num_head_channels == -1: + dim_head = ch // num_heads + else: + num_heads = ch // num_head_channels + dim_head = num_head_channels + if legacy: + #num_heads = 1 + dim_head = ch // num_heads if use_spatial_transformer else num_head_channels + if exists(disable_self_attentions): + disabled_sa = disable_self_attentions[level] + else: + disabled_sa = False + + if not exists(num_attention_blocks) or i < num_attention_blocks[level]: + layers.append( + AttentionBlock( + ch, + use_checkpoint=use_checkpoint, + num_heads=num_heads_upsample, + num_head_channels=dim_head, + use_new_attention_order=use_new_attention_order, + ) if not use_spatial_transformer else SpatialTransformer( + ch, num_heads, dim_head, depth=transformer_depth, context_dim=context_dim, + disable_self_attn=disabled_sa, use_linear=use_linear_in_transformer, + use_checkpoint=use_checkpoint + ) + ) + if level and i == self.num_res_blocks[level]: + out_ch = ch + layers.append( + ResBlock( 
+ ch, + time_embed_dim, + dropout, + out_channels=out_ch, + dims=dims, + use_checkpoint=use_checkpoint, + use_scale_shift_norm=use_scale_shift_norm, + up=True, + ) + if resblock_updown + else Upsample(ch, conv_resample, dims=dims, out_channels=out_ch) + ) + ds //= 2 + self.output_blocks.append(TimestepEmbedSequential(*layers)) + self._feature_size += ch + + self.out = nn.Sequential( + normalization(ch), + nn.SiLU(), + zero_module(conv_nd(dims, model_channels, out_channels, 3, padding=1)), + ) + if self.predict_codebook_ids: + self.id_predictor = nn.Sequential( + normalization(ch), + conv_nd(dims, model_channels, n_embed, 1), + #nn.LogSoftmax(dim=1) # change to cross_entropy and produce non-normalized logits + ) + + def convert_to_fp16(self): + """ + Convert the torso of the model to float16. + """ + self.input_blocks.apply(convert_module_to_f16) + self.middle_block.apply(convert_module_to_f16) + self.output_blocks.apply(convert_module_to_f16) + + def convert_to_fp32(self): + """ + Convert the torso of the model to float32. + """ + self.input_blocks.apply(convert_module_to_f32) + self.middle_block.apply(convert_module_to_f32) + self.output_blocks.apply(convert_module_to_f32) + + def forward(self, x, timesteps=None, context=None, y=None,**kwargs): + """ + Apply the model to an input batch. + :param x: an [N x C x ...] Tensor of inputs. + :param timesteps: a 1-D batch of timesteps. + :param context: conditioning plugged in via crossattn + :param y: an [N] Tensor of labels, if class-conditional. + :return: an [N x C x ...] Tensor of outputs. + """ + assert (y is not None) == ( + self.num_classes is not None + ), "must specify y if and only if the model is class-conditional" + hs = [] + t_emb = timestep_embedding(timesteps, self.model_channels, repeat_only=False) + emb = self.time_embed(t_emb) + + if self.num_classes is not None: + assert y.shape[0] == x.shape[0] + emb = emb + self.label_emb(y) + + h = x.type(self.dtype) + for module in self.input_blocks: + h = module(h, emb, context) + hs.append(h) + h = self.middle_block(h, emb, context) + for module in self.output_blocks: + h = th.cat([h, hs.pop()], dim=1) + h = module(h, emb, context) + h = h.type(x.dtype) + if self.predict_codebook_ids: + return self.id_predictor(h) + else: + return self.out(h) diff --git a/inpaint/model/anytext/ldm/modules/diffusionmodules/upscaling.py b/inpaint/model/anytext/ldm/modules/diffusionmodules/upscaling.py new file mode 100644 index 0000000..5f92630 --- /dev/null +++ b/inpaint/model/anytext/ldm/modules/diffusionmodules/upscaling.py @@ -0,0 +1,81 @@ +import torch +import torch.nn as nn +import numpy as np +from functools import partial + +from iopaint.model.anytext.ldm.modules.diffusionmodules.util import extract_into_tensor, make_beta_schedule +from iopaint.model.anytext.ldm.util import default + + +class AbstractLowScaleModel(nn.Module): + # for concatenating a downsampled image to the latent representation + def __init__(self, noise_schedule_config=None): + super(AbstractLowScaleModel, self).__init__() + if noise_schedule_config is not None: + self.register_schedule(**noise_schedule_config) + + def register_schedule(self, beta_schedule="linear", timesteps=1000, + linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): + betas = make_beta_schedule(beta_schedule, timesteps, linear_start=linear_start, linear_end=linear_end, + cosine_s=cosine_s) + alphas = 1. 
- betas + alphas_cumprod = np.cumprod(alphas, axis=0) + alphas_cumprod_prev = np.append(1., alphas_cumprod[:-1]) + + timesteps, = betas.shape + self.num_timesteps = int(timesteps) + self.linear_start = linear_start + self.linear_end = linear_end + assert alphas_cumprod.shape[0] == self.num_timesteps, 'alphas have to be defined for each timestep' + + to_torch = partial(torch.tensor, dtype=torch.float32) + + self.register_buffer('betas', to_torch(betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', to_torch(alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod))) + self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod))) + self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod - 1))) + + def q_sample(self, x_start, t, noise=None): + noise = default(noise, lambda: torch.randn_like(x_start)) + return (extract_into_tensor(self.sqrt_alphas_cumprod, t, x_start.shape) * x_start + + extract_into_tensor(self.sqrt_one_minus_alphas_cumprod, t, x_start.shape) * noise) + + def forward(self, x): + return x, None + + def decode(self, x): + return x + + +class SimpleImageConcat(AbstractLowScaleModel): + # no noise level conditioning + def __init__(self): + super(SimpleImageConcat, self).__init__(noise_schedule_config=None) + self.max_noise_level = 0 + + def forward(self, x): + # fix to constant noise level + return x, torch.zeros(x.shape[0], device=x.device).long() + + +class ImageConcatWithNoiseAugmentation(AbstractLowScaleModel): + def __init__(self, noise_schedule_config, max_noise_level=1000, to_cuda=False): + super().__init__(noise_schedule_config=noise_schedule_config) + self.max_noise_level = max_noise_level + + def forward(self, x, noise_level=None): + if noise_level is None: + noise_level = torch.randint(0, self.max_noise_level, (x.shape[0],), device=x.device).long() + else: + assert isinstance(noise_level, torch.Tensor) + z = self.q_sample(x, noise_level) + return z, noise_level + + + diff --git a/inpaint/model/anytext/ldm/modules/diffusionmodules/util.py b/inpaint/model/anytext/ldm/modules/diffusionmodules/util.py new file mode 100644 index 0000000..da29c72 --- /dev/null +++ b/inpaint/model/anytext/ldm/modules/diffusionmodules/util.py @@ -0,0 +1,271 @@ +# adopted from +# https://github.com/openai/improved-diffusion/blob/main/improved_diffusion/gaussian_diffusion.py +# and +# https://github.com/lucidrains/denoising-diffusion-pytorch/blob/7706bdfc6f527f58d33f84b7b522e61e6e3164b3/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py +# and +# https://github.com/openai/guided-diffusion/blob/0ba878e517b276c45d1195eb29f6f5f72659a05b/guided_diffusion/nn.py +# +# thanks! 
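+# Illustrative usage of the schedule helpers below (a rough sketch, not taken
+# from the upstream sources): training uses the dense beta schedule, while the
+# DDIM sampler subsamples it and converts the cumulative alphas at the chosen
+# steps into (sigma_t, alpha_t, alpha_{t-1}) triples:
+#   betas = make_beta_schedule("linear", 1000)            # per-step betas (numpy)
+#   alphas_cumprod = np.cumprod(1.0 - betas, axis=0)      # \bar{alpha}_t
+#   ddim_steps = make_ddim_timesteps("uniform", 50, 1000) # 50 of the 1000 steps
+#   # make_ddim_sampling_parameters(alphas_cumprod as a torch tensor, ddim_steps, eta)
+#   # then yields the sigmas / alphas / alphas_prev used by the DDIM update.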
+ + +import os +import math +import torch +import torch.nn as nn +import numpy as np +from einops import repeat + +from iopaint.model.anytext.ldm.util import instantiate_from_config + + +def make_beta_schedule(schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3): + if schedule == "linear": + betas = ( + torch.linspace(linear_start ** 0.5, linear_end ** 0.5, n_timestep, dtype=torch.float64) ** 2 + ) + + elif schedule == "cosine": + timesteps = ( + torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s + ) + alphas = timesteps / (1 + cosine_s) * np.pi / 2 + alphas = torch.cos(alphas).pow(2) + alphas = alphas / alphas[0] + betas = 1 - alphas[1:] / alphas[:-1] + betas = np.clip(betas, a_min=0, a_max=0.999) + + elif schedule == "sqrt_linear": + betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) + elif schedule == "sqrt": + betas = torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) ** 0.5 + else: + raise ValueError(f"schedule '{schedule}' unknown.") + return betas.numpy() + + +def make_ddim_timesteps(ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True): + if ddim_discr_method == 'uniform': + c = num_ddpm_timesteps // num_ddim_timesteps + ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c))) + elif ddim_discr_method == 'quad': + ddim_timesteps = ((np.linspace(0, np.sqrt(num_ddpm_timesteps * .8), num_ddim_timesteps)) ** 2).astype(int) + else: + raise NotImplementedError(f'There is no ddim discretization method called "{ddim_discr_method}"') + + # assert ddim_timesteps.shape[0] == num_ddim_timesteps + # add one to get the final alpha values right (the ones from first scale to data during sampling) + steps_out = ddim_timesteps + 1 + if verbose: + print(f'Selected timesteps for ddim sampler: {steps_out}') + return steps_out + + +def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True): + # select alphas for computing the variance schedule + alphas = alphacums[ddim_timesteps] + alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist()) + + # according the the formula provided in https://arxiv.org/abs/2010.02502 + sigmas = eta * np.sqrt((1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev)) + if verbose: + print(f'Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}') + print(f'For the chosen value of eta, which is {eta}, ' + f'this results in the following sigma_t schedule for ddim sampler {sigmas}') + return sigmas.to(torch.float32), alphas.to(torch.float32), alphas_prev.astype(np.float32) + + +def betas_for_alpha_bar(num_diffusion_timesteps, alpha_bar, max_beta=0.999): + """ + Create a beta schedule that discretizes the given alpha_t_bar function, + which defines the cumulative product of (1-beta) over time from t = [0,1]. + :param num_diffusion_timesteps: the number of betas to produce. + :param alpha_bar: a lambda that takes an argument t from 0 to 1 and + produces the cumulative product of (1-beta) up to that + part of the diffusion process. + :param max_beta: the maximum beta to use; use values lower than 1 to + prevent singularities. 
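+    A common choice of alpha_bar is the squared-cosine schedule from the
+    improved-DDPM paper, e.g.
+        lambda t: math.cos((t + 0.008) / 1.008 * math.pi / 2) ** 2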
+ """ + betas = [] + for i in range(num_diffusion_timesteps): + t1 = i / num_diffusion_timesteps + t2 = (i + 1) / num_diffusion_timesteps + betas.append(min(1 - alpha_bar(t2) / alpha_bar(t1), max_beta)) + return np.array(betas) + + +def extract_into_tensor(a, t, x_shape): + b, *_ = t.shape + out = a.gather(-1, t) + return out.reshape(b, *((1,) * (len(x_shape) - 1))) + + +def checkpoint(func, inputs, params, flag): + """ + Evaluate a function without caching intermediate activations, allowing for + reduced memory at the expense of extra compute in the backward pass. + :param func: the function to evaluate. + :param inputs: the argument sequence to pass to `func`. + :param params: a sequence of parameters `func` depends on but does not + explicitly take as arguments. + :param flag: if False, disable gradient checkpointing. + """ + if flag: + args = tuple(inputs) + tuple(params) + return CheckpointFunction.apply(func, len(inputs), *args) + else: + return func(*inputs) + + +class CheckpointFunction(torch.autograd.Function): + @staticmethod + def forward(ctx, run_function, length, *args): + ctx.run_function = run_function + ctx.input_tensors = list(args[:length]) + ctx.input_params = list(args[length:]) + ctx.gpu_autocast_kwargs = {"enabled": torch.is_autocast_enabled(), + "dtype": torch.get_autocast_gpu_dtype(), + "cache_enabled": torch.is_autocast_cache_enabled()} + with torch.no_grad(): + output_tensors = ctx.run_function(*ctx.input_tensors) + return output_tensors + + @staticmethod + def backward(ctx, *output_grads): + ctx.input_tensors = [x.detach().requires_grad_(True) for x in ctx.input_tensors] + with torch.enable_grad(), \ + torch.cuda.amp.autocast(**ctx.gpu_autocast_kwargs): + # Fixes a bug where the first op in run_function modifies the + # Tensor storage in place, which is not allowed for detach()'d + # Tensors. + shallow_copies = [x.view_as(x) for x in ctx.input_tensors] + output_tensors = ctx.run_function(*shallow_copies) + input_grads = torch.autograd.grad( + output_tensors, + ctx.input_tensors + ctx.input_params, + output_grads, + allow_unused=True, + ) + del ctx.input_tensors + del ctx.input_params + del output_tensors + return (None, None) + input_grads + + +def timestep_embedding(timesteps, dim, max_period=10000, repeat_only=False): + """ + Create sinusoidal timestep embeddings. + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. + """ + if not repeat_only: + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half + ).to(device=timesteps.device) + args = timesteps[:, None].float() * freqs[None] + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + else: + embedding = repeat(timesteps, 'b -> b d', d=dim) + return embedding + + +def zero_module(module): + """ + Zero out the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().zero_() + return module + + +def scale_module(module, scale): + """ + Scale the parameters of a module and return it. + """ + for p in module.parameters(): + p.detach().mul_(scale) + return module + + +def mean_flat(tensor): + """ + Take the mean over all non-batch dimensions. 
+ """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def normalization(channels): + """ + Make a standard normalization layer. + :param channels: number of input channels. + :return: an nn.Module for normalization. + """ + return GroupNorm32(32, channels) + + +# PyTorch 1.7 has SiLU, but we support PyTorch 1.5. +class SiLU(nn.Module): + def forward(self, x): + return x * torch.sigmoid(x) + + +class GroupNorm32(nn.GroupNorm): + def forward(self, x): + # return super().forward(x.float()).type(x.dtype) + return super().forward(x).type(x.dtype) + +def conv_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D convolution module. + """ + if dims == 1: + return nn.Conv1d(*args, **kwargs) + elif dims == 2: + return nn.Conv2d(*args, **kwargs) + elif dims == 3: + return nn.Conv3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +def linear(*args, **kwargs): + """ + Create a linear module. + """ + return nn.Linear(*args, **kwargs) + + +def avg_pool_nd(dims, *args, **kwargs): + """ + Create a 1D, 2D, or 3D average pooling module. + """ + if dims == 1: + return nn.AvgPool1d(*args, **kwargs) + elif dims == 2: + return nn.AvgPool2d(*args, **kwargs) + elif dims == 3: + return nn.AvgPool3d(*args, **kwargs) + raise ValueError(f"unsupported dimensions: {dims}") + + +class HybridConditioner(nn.Module): + + def __init__(self, c_concat_config, c_crossattn_config): + super().__init__() + self.concat_conditioner = instantiate_from_config(c_concat_config) + self.crossattn_conditioner = instantiate_from_config(c_crossattn_config) + + def forward(self, c_concat, c_crossattn): + c_concat = self.concat_conditioner(c_concat) + c_crossattn = self.crossattn_conditioner(c_crossattn) + return {'c_concat': [c_concat], 'c_crossattn': [c_crossattn]} + + +def noise_like(shape, device, repeat=False): + repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat(shape[0], *((1,) * (len(shape) - 1))) + noise = lambda: torch.randn(shape, device=device) + return repeat_noise() if repeat else noise() \ No newline at end of file diff --git a/inpaint/model/anytext/ldm/modules/distributions/__init__.py b/inpaint/model/anytext/ldm/modules/distributions/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/anytext/ldm/modules/distributions/distributions.py b/inpaint/model/anytext/ldm/modules/distributions/distributions.py new file mode 100644 index 0000000..f2b8ef9 --- /dev/null +++ b/inpaint/model/anytext/ldm/modules/distributions/distributions.py @@ -0,0 +1,92 @@ +import torch +import numpy as np + + +class AbstractDistribution: + def sample(self): + raise NotImplementedError() + + def mode(self): + raise NotImplementedError() + + +class DiracDistribution(AbstractDistribution): + def __init__(self, value): + self.value = value + + def sample(self): + return self.value + + def mode(self): + return self.value + + +class DiagonalGaussianDistribution(object): + def __init__(self, parameters, deterministic=False): + self.parameters = parameters + self.mean, self.logvar = torch.chunk(parameters, 2, dim=1) + self.logvar = torch.clamp(self.logvar, -30.0, 20.0) + self.deterministic = deterministic + self.std = torch.exp(0.5 * self.logvar) + self.var = torch.exp(self.logvar) + if self.deterministic: + self.var = self.std = torch.zeros_like(self.mean).to(device=self.parameters.device) + + def sample(self): + x = self.mean + self.std * torch.randn(self.mean.shape).to(device=self.parameters.device) + return x + + def kl(self, other=None): + if 
self.deterministic: + return torch.Tensor([0.]) + else: + if other is None: + return 0.5 * torch.sum(torch.pow(self.mean, 2) + + self.var - 1.0 - self.logvar, + dim=[1, 2, 3]) + else: + return 0.5 * torch.sum( + torch.pow(self.mean - other.mean, 2) / other.var + + self.var / other.var - 1.0 - self.logvar + other.logvar, + dim=[1, 2, 3]) + + def nll(self, sample, dims=[1,2,3]): + if self.deterministic: + return torch.Tensor([0.]) + logtwopi = np.log(2.0 * np.pi) + return 0.5 * torch.sum( + logtwopi + self.logvar + torch.pow(sample - self.mean, 2) / self.var, + dim=dims) + + def mode(self): + return self.mean + + +def normal_kl(mean1, logvar1, mean2, logvar2): + """ + source: https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/losses.py#L12 + Compute the KL divergence between two gaussians. + Shapes are automatically broadcasted, so batches can be compared to + scalars, among other use cases. + """ + tensor = None + for obj in (mean1, logvar1, mean2, logvar2): + if isinstance(obj, torch.Tensor): + tensor = obj + break + assert tensor is not None, "at least one argument must be a Tensor" + + # Force variances to be Tensors. Broadcasting helps convert scalars to + # Tensors, but it does not work for torch.exp(). + logvar1, logvar2 = [ + x if isinstance(x, torch.Tensor) else torch.tensor(x).to(tensor) + for x in (logvar1, logvar2) + ] + + return 0.5 * ( + -1.0 + + logvar2 + - logvar1 + + torch.exp(logvar1 - logvar2) + + ((mean1 - mean2) ** 2) * torch.exp(-logvar2) + ) diff --git a/inpaint/model/anytext/ldm/modules/ema.py b/inpaint/model/anytext/ldm/modules/ema.py new file mode 100644 index 0000000..bded250 --- /dev/null +++ b/inpaint/model/anytext/ldm/modules/ema.py @@ -0,0 +1,80 @@ +import torch +from torch import nn + + +class LitEma(nn.Module): + def __init__(self, model, decay=0.9999, use_num_upates=True): + super().__init__() + if decay < 0.0 or decay > 1.0: + raise ValueError('Decay must be between 0 and 1') + + self.m_name2s_name = {} + self.register_buffer('decay', torch.tensor(decay, dtype=torch.float32)) + self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int) if use_num_upates + else torch.tensor(-1, dtype=torch.int)) + + for name, p in model.named_parameters(): + if p.requires_grad: + # remove as '.'-character is not allowed in buffers + s_name = name.replace('.', '') + self.m_name2s_name.update({name: s_name}) + self.register_buffer(s_name, p.clone().detach().data) + + self.collected_params = [] + + def reset_num_updates(self): + del self.num_updates + self.register_buffer('num_updates', torch.tensor(0, dtype=torch.int)) + + def forward(self, model): + decay = self.decay + + if self.num_updates >= 0: + self.num_updates += 1 + decay = min(self.decay, (1 + self.num_updates) / (10 + self.num_updates)) + + one_minus_decay = 1.0 - decay + + with torch.no_grad(): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + + for key in m_param: + if m_param[key].requires_grad: + sname = self.m_name2s_name[key] + shadow_params[sname] = shadow_params[sname].type_as(m_param[key]) + shadow_params[sname].sub_(one_minus_decay * (shadow_params[sname] - m_param[key])) + else: + assert not key in self.m_name2s_name + + def copy_to(self, model): + m_param = dict(model.named_parameters()) + shadow_params = dict(self.named_buffers()) + for key in m_param: + if m_param[key].requires_grad: + m_param[key].data.copy_(shadow_params[self.m_name2s_name[key]].data) + else: + assert not key in self.m_name2s_name 
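+    # Typical usage (illustrative sketch): call ema(model) after each optimizer
+    # step to update the shadow weights, and around validation do
+    #   ema.store(model.parameters()); ema.copy_to(model)
+    #   ... evaluate ...
+    #   ema.restore(model.parameters())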
+ + def store(self, parameters): + """ + Save the current parameters for restoring later. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + temporarily stored. + """ + self.collected_params = [param.clone() for param in parameters] + + def restore(self, parameters): + """ + Restore the parameters stored with the `store` method. + Useful to validate the model with EMA parameters without affecting the + original optimization process. Store the parameters before the + `copy_to` method. After validation (or model saving), use this to + restore the former parameters. + Args: + parameters: Iterable of `torch.nn.Parameter`; the parameters to be + updated with the stored parameters. + """ + for c_param, param in zip(self.collected_params, parameters): + param.data.copy_(c_param.data) diff --git a/inpaint/model/anytext/ldm/modules/encoders/__init__.py b/inpaint/model/anytext/ldm/modules/encoders/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/anytext/ldm/modules/encoders/modules.py b/inpaint/model/anytext/ldm/modules/encoders/modules.py new file mode 100644 index 0000000..ceac395 --- /dev/null +++ b/inpaint/model/anytext/ldm/modules/encoders/modules.py @@ -0,0 +1,411 @@ +import torch +import torch.nn as nn +from torch.utils.checkpoint import checkpoint + +from transformers import ( + T5Tokenizer, + T5EncoderModel, + CLIPTokenizer, + CLIPTextModel, + AutoProcessor, + CLIPVisionModelWithProjection, +) + +from iopaint.model.anytext.ldm.util import count_params + + +def _expand_mask(mask, dtype, tgt_len=None): + """ + Expands attention_mask from `[bsz, seq_len]` to `[bsz, 1, tgt_seq_len, src_seq_len]`. + """ + bsz, src_len = mask.size() + tgt_len = tgt_len if tgt_len is not None else src_len + + expanded_mask = mask[:, None, None, :].expand(bsz, 1, tgt_len, src_len).to(dtype) + + inverted_mask = 1.0 - expanded_mask + + return inverted_mask.masked_fill( + inverted_mask.to(torch.bool), torch.finfo(dtype).min + ) + + +def _build_causal_attention_mask(bsz, seq_len, dtype): + # lazily create causal attention mask, with full attention between the vision tokens + # pytorch uses additive attention mask; fill with -inf + mask = torch.empty(bsz, seq_len, seq_len, dtype=dtype) + mask.fill_(torch.tensor(torch.finfo(dtype).min)) + mask.triu_(1) # zero out the lower diagonal + mask = mask.unsqueeze(1) # expand mask + return mask + + +class AbstractEncoder(nn.Module): + def __init__(self): + super().__init__() + + def encode(self, *args, **kwargs): + raise NotImplementedError + + +class IdentityEncoder(AbstractEncoder): + def encode(self, x): + return x + + +class ClassEmbedder(nn.Module): + def __init__(self, embed_dim, n_classes=1000, key="class", ucg_rate=0.1): + super().__init__() + self.key = key + self.embedding = nn.Embedding(n_classes, embed_dim) + self.n_classes = n_classes + self.ucg_rate = ucg_rate + + def forward(self, batch, key=None, disable_dropout=False): + if key is None: + key = self.key + # this is for use in crossattn + c = batch[key][:, None] + if self.ucg_rate > 0.0 and not disable_dropout: + mask = 1.0 - torch.bernoulli(torch.ones_like(c) * self.ucg_rate) + c = mask * c + (1 - mask) * torch.ones_like(c) * (self.n_classes - 1) + c = c.long() + c = self.embedding(c) + return c + + def get_unconditional_conditioning(self, bs, device="cuda"): + uc_class = ( + self.n_classes - 1 + ) # 1000 classes --> 0 ... 
999, one extra class for ucg (class 1000) + uc = torch.ones((bs,), device=device) * uc_class + uc = {self.key: uc} + return uc + + +def disabled_train(self, mode=True): + """Overwrite model.train with this function to make sure train/eval mode + does not change anymore.""" + return self + + +class FrozenT5Embedder(AbstractEncoder): + """Uses the T5 transformer encoder for text""" + + def __init__( + self, version="google/t5-v1_1-large", device="cuda", max_length=77, freeze=True + ): # others are google/t5-v1_1-xl and google/t5-v1_1-xxl + super().__init__() + self.tokenizer = T5Tokenizer.from_pretrained(version) + self.transformer = T5EncoderModel.from_pretrained(version) + self.device = device + self.max_length = max_length # TODO: typical value? + if freeze: + self.freeze() + + def freeze(self): + self.transformer = self.transformer.eval() + # self.train = disabled_train + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + return_length=True, + return_overflowing_tokens=False, + padding="max_length", + return_tensors="pt", + ) + tokens = batch_encoding["input_ids"].to(self.device) + outputs = self.transformer(input_ids=tokens) + + z = outputs.last_hidden_state + return z + + def encode(self, text): + return self(text) + + +class FrozenCLIPEmbedder(AbstractEncoder): + """Uses the CLIP transformer encoder for text (from huggingface)""" + + LAYERS = ["last", "pooled", "hidden"] + + def __init__( + self, + version="openai/clip-vit-large-patch14", + device="cuda", + max_length=77, + freeze=True, + layer="last", + layer_idx=None, + ): # clip-vit-base-patch32 + super().__init__() + assert layer in self.LAYERS + self.tokenizer = CLIPTokenizer.from_pretrained(version) + self.transformer = CLIPTextModel.from_pretrained(version) + self.device = device + self.max_length = max_length + if freeze: + self.freeze() + self.layer = layer + self.layer_idx = layer_idx + if layer == "hidden": + assert layer_idx is not None + assert 0 <= abs(layer_idx) <= 12 + + def freeze(self): + self.transformer = self.transformer.eval() + # self.train = disabled_train + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + return_length=True, + return_overflowing_tokens=False, + padding="max_length", + return_tensors="pt", + ) + tokens = batch_encoding["input_ids"].to(self.device) + outputs = self.transformer( + input_ids=tokens, output_hidden_states=self.layer == "hidden" + ) + if self.layer == "last": + z = outputs.last_hidden_state + elif self.layer == "pooled": + z = outputs.pooler_output[:, None, :] + else: + z = outputs.hidden_states[self.layer_idx] + return z + + def encode(self, text): + return self(text) + + +class FrozenCLIPT5Encoder(AbstractEncoder): + def __init__( + self, + clip_version="openai/clip-vit-large-patch14", + t5_version="google/t5-v1_1-xl", + device="cuda", + clip_max_length=77, + t5_max_length=77, + ): + super().__init__() + self.clip_encoder = FrozenCLIPEmbedder( + clip_version, device, max_length=clip_max_length + ) + self.t5_encoder = FrozenT5Embedder(t5_version, device, max_length=t5_max_length) + print( + f"{self.clip_encoder.__class__.__name__} has {count_params(self.clip_encoder)*1.e-6:.2f} M parameters, " + f"{self.t5_encoder.__class__.__name__} comes with {count_params(self.t5_encoder)*1.e-6:.2f} M params." 
+ ) + + def encode(self, text): + return self(text) + + def forward(self, text): + clip_z = self.clip_encoder.encode(text) + t5_z = self.t5_encoder.encode(text) + return [clip_z, t5_z] + + +class FrozenCLIPEmbedderT3(AbstractEncoder): + """Uses the CLIP transformer encoder for text (from Hugging Face)""" + + def __init__( + self, + version="openai/clip-vit-large-patch14", + device="cuda", + max_length=77, + freeze=True, + use_vision=False, + ): + super().__init__() + self.tokenizer = CLIPTokenizer.from_pretrained(version) + self.transformer = CLIPTextModel.from_pretrained(version) + if use_vision: + self.vit = CLIPVisionModelWithProjection.from_pretrained(version) + self.processor = AutoProcessor.from_pretrained(version) + self.device = device + self.max_length = max_length + if freeze: + self.freeze() + + def embedding_forward( + self, + input_ids=None, + position_ids=None, + inputs_embeds=None, + embedding_manager=None, + ): + seq_length = ( + input_ids.shape[-1] + if input_ids is not None + else inputs_embeds.shape[-2] + ) + if position_ids is None: + position_ids = self.position_ids[:, :seq_length] + if inputs_embeds is None: + inputs_embeds = self.token_embedding(input_ids) + if embedding_manager is not None: + inputs_embeds = embedding_manager(input_ids, inputs_embeds) + position_embeddings = self.position_embedding(position_ids) + embeddings = inputs_embeds + position_embeddings + return embeddings + + self.transformer.text_model.embeddings.forward = embedding_forward.__get__( + self.transformer.text_model.embeddings + ) + + def encoder_forward( + self, + inputs_embeds, + attention_mask=None, + causal_attention_mask=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + ): + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + encoder_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + hidden_states = inputs_embeds + for idx, encoder_layer in enumerate(self.layers): + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + layer_outputs = encoder_layer( + hidden_states, + attention_mask, + causal_attention_mask, + output_attentions=output_attentions, + ) + hidden_states = layer_outputs[0] + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + if output_hidden_states: + encoder_states = encoder_states + (hidden_states,) + return hidden_states + + self.transformer.text_model.encoder.forward = encoder_forward.__get__( + self.transformer.text_model.encoder + ) + + def text_encoder_forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + embedding_manager=None, + ): + output_attentions = ( + output_attentions + if output_attentions is not None + else self.config.output_attentions + ) + output_hidden_states = ( + output_hidden_states + if output_hidden_states is not None + else self.config.output_hidden_states + ) + return_dict = ( + return_dict if return_dict is not None else self.config.use_return_dict + ) + if input_ids is None: + raise ValueError("You have to specify either input_ids") + input_shape = input_ids.size() + input_ids = input_ids.view(-1, input_shape[-1]) + 
hidden_states = self.embeddings( + input_ids=input_ids, + position_ids=position_ids, + embedding_manager=embedding_manager, + ) + bsz, seq_len = input_shape + # CLIP's text model uses causal mask, prepare it here. + # https://github.com/openai/CLIP/blob/cfcffb90e69f37bf2ff1e988237a0fbe41f33c04/clip/model.py#L324 + causal_attention_mask = _build_causal_attention_mask( + bsz, seq_len, hidden_states.dtype + ).to(hidden_states.device) + # expand attention_mask + if attention_mask is not None: + # [bsz, seq_len] -> [bsz, 1, tgt_seq_len, src_seq_len] + attention_mask = _expand_mask(attention_mask, hidden_states.dtype) + last_hidden_state = self.encoder( + inputs_embeds=hidden_states, + attention_mask=attention_mask, + causal_attention_mask=causal_attention_mask, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + ) + last_hidden_state = self.final_layer_norm(last_hidden_state) + return last_hidden_state + + self.transformer.text_model.forward = text_encoder_forward.__get__( + self.transformer.text_model + ) + + def transformer_forward( + self, + input_ids=None, + attention_mask=None, + position_ids=None, + output_attentions=None, + output_hidden_states=None, + return_dict=None, + embedding_manager=None, + ): + return self.text_model( + input_ids=input_ids, + attention_mask=attention_mask, + position_ids=position_ids, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + embedding_manager=embedding_manager, + ) + + self.transformer.forward = transformer_forward.__get__(self.transformer) + + def freeze(self): + self.transformer = self.transformer.eval() + for param in self.parameters(): + param.requires_grad = False + + def forward(self, text, **kwargs): + batch_encoding = self.tokenizer( + text, + truncation=True, + max_length=self.max_length, + return_length=True, + return_overflowing_tokens=False, + padding="max_length", + return_tensors="pt", + ) + tokens = batch_encoding["input_ids"].to(self.device) + z = self.transformer(input_ids=tokens, **kwargs) + return z + + def encode(self, text, **kwargs): + return self(text, **kwargs) diff --git a/inpaint/model/anytext/ldm/util.py b/inpaint/model/anytext/ldm/util.py new file mode 100644 index 0000000..d456a86 --- /dev/null +++ b/inpaint/model/anytext/ldm/util.py @@ -0,0 +1,197 @@ +import importlib + +import torch +from torch import optim +import numpy as np + +from inspect import isfunction +from PIL import Image, ImageDraw, ImageFont + + +def log_txt_as_img(wh, xc, size=10): + # wh a tuple of (width, height) + # xc a list of captions to plot + b = len(xc) + txts = list() + for bi in range(b): + txt = Image.new("RGB", wh, color="white") + draw = ImageDraw.Draw(txt) + font = ImageFont.truetype('font/Arial_Unicode.ttf', size=size) + nc = int(32 * (wh[0] / 256)) + lines = "\n".join(xc[bi][start:start + nc] for start in range(0, len(xc[bi]), nc)) + + try: + draw.text((0, 0), lines, fill="black", font=font) + except UnicodeEncodeError: + print("Cant encode string for logging. 
Skipping.") + + txt = np.array(txt).transpose(2, 0, 1) / 127.5 - 1.0 + txts.append(txt) + txts = np.stack(txts) + txts = torch.tensor(txts) + return txts + + +def ismap(x): + if not isinstance(x, torch.Tensor): + return False + return (len(x.shape) == 4) and (x.shape[1] > 3) + + +def isimage(x): + if not isinstance(x,torch.Tensor): + return False + return (len(x.shape) == 4) and (x.shape[1] == 3 or x.shape[1] == 1) + + +def exists(x): + return x is not None + + +def default(val, d): + if exists(val): + return val + return d() if isfunction(d) else d + + +def mean_flat(tensor): + """ + https://github.com/openai/guided-diffusion/blob/27c20a8fab9cb472df5d6bdd6c8d11c8f430b924/guided_diffusion/nn.py#L86 + Take the mean over all non-batch dimensions. + """ + return tensor.mean(dim=list(range(1, len(tensor.shape)))) + + +def count_params(model, verbose=False): + total_params = sum(p.numel() for p in model.parameters()) + if verbose: + print(f"{model.__class__.__name__} has {total_params*1.e-6:.2f} M params.") + return total_params + + +def instantiate_from_config(config, **kwargs): + if "target" not in config: + if config == '__is_first_stage__': + return None + elif config == "__is_unconditional__": + return None + raise KeyError("Expected key `target` to instantiate.") + return get_obj_from_str(config["target"])(**config.get("params", dict()), **kwargs) + + +def get_obj_from_str(string, reload=False): + module, cls = string.rsplit(".", 1) + if reload: + module_imp = importlib.import_module(module) + importlib.reload(module_imp) + return getattr(importlib.import_module(module, package=None), cls) + + +class AdamWwithEMAandWings(optim.Optimizer): + # credit to https://gist.github.com/crowsonkb/65f7265353f403714fce3b2595e0b298 + def __init__(self, params, lr=1.e-3, betas=(0.9, 0.999), eps=1.e-8, # TODO: check hyperparameters before using + weight_decay=1.e-2, amsgrad=False, ema_decay=0.9999, # ema decay to match previous code + ema_power=1., param_names=()): + """AdamW that saves EMA versions of the parameters.""" + if not 0.0 <= lr: + raise ValueError("Invalid learning rate: {}".format(lr)) + if not 0.0 <= eps: + raise ValueError("Invalid epsilon value: {}".format(eps)) + if not 0.0 <= betas[0] < 1.0: + raise ValueError("Invalid beta parameter at index 0: {}".format(betas[0])) + if not 0.0 <= betas[1] < 1.0: + raise ValueError("Invalid beta parameter at index 1: {}".format(betas[1])) + if not 0.0 <= weight_decay: + raise ValueError("Invalid weight_decay value: {}".format(weight_decay)) + if not 0.0 <= ema_decay <= 1.0: + raise ValueError("Invalid ema_decay value: {}".format(ema_decay)) + defaults = dict(lr=lr, betas=betas, eps=eps, + weight_decay=weight_decay, amsgrad=amsgrad, ema_decay=ema_decay, + ema_power=ema_power, param_names=param_names) + super().__init__(params, defaults) + + def __setstate__(self, state): + super().__setstate__(state) + for group in self.param_groups: + group.setdefault('amsgrad', False) + + @torch.no_grad() + def step(self, closure=None): + """Performs a single optimization step. + Args: + closure (callable, optional): A closure that reevaluates the model + and returns the loss. 
+ """ + loss = None + if closure is not None: + with torch.enable_grad(): + loss = closure() + + for group in self.param_groups: + params_with_grad = [] + grads = [] + exp_avgs = [] + exp_avg_sqs = [] + ema_params_with_grad = [] + state_sums = [] + max_exp_avg_sqs = [] + state_steps = [] + amsgrad = group['amsgrad'] + beta1, beta2 = group['betas'] + ema_decay = group['ema_decay'] + ema_power = group['ema_power'] + + for p in group['params']: + if p.grad is None: + continue + params_with_grad.append(p) + if p.grad.is_sparse: + raise RuntimeError('AdamW does not support sparse gradients') + grads.append(p.grad) + + state = self.state[p] + + # State initialization + if len(state) == 0: + state['step'] = 0 + # Exponential moving average of gradient values + state['exp_avg'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of squared gradient values + state['exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + if amsgrad: + # Maintains max of all exp. moving avg. of sq. grad. values + state['max_exp_avg_sq'] = torch.zeros_like(p, memory_format=torch.preserve_format) + # Exponential moving average of parameter values + state['param_exp_avg'] = p.detach().float().clone() + + exp_avgs.append(state['exp_avg']) + exp_avg_sqs.append(state['exp_avg_sq']) + ema_params_with_grad.append(state['param_exp_avg']) + + if amsgrad: + max_exp_avg_sqs.append(state['max_exp_avg_sq']) + + # update the steps for each param group update + state['step'] += 1 + # record the step after step update + state_steps.append(state['step']) + + optim._functional.adamw(params_with_grad, + grads, + exp_avgs, + exp_avg_sqs, + max_exp_avg_sqs, + state_steps, + amsgrad=amsgrad, + beta1=beta1, + beta2=beta2, + lr=group['lr'], + weight_decay=group['weight_decay'], + eps=group['eps'], + maximize=False) + + cur_ema_decay = min(ema_decay, 1 - state['step'] ** -ema_power) + for param, ema_param in zip(params_with_grad, ema_params_with_grad): + ema_param.mul_(cur_ema_decay).add_(param.float(), alpha=1 - cur_ema_decay) + + return loss \ No newline at end of file diff --git a/inpaint/model/anytext/main.py b/inpaint/model/anytext/main.py new file mode 100644 index 0000000..f7b2d2e --- /dev/null +++ b/inpaint/model/anytext/main.py @@ -0,0 +1,45 @@ +import cv2 +import os + +from anytext_pipeline import AnyTextPipeline +from utils import save_images + +seed = 66273235 +# seed_everything(seed) + +pipe = AnyTextPipeline( + ckpt_path="/Users/cwq/code/github/IOPaint/iopaint/model/anytext/anytext_v1.1_fp16.ckpt", + font_path="/Users/cwq/code/github/AnyText/anytext/font/SourceHanSansSC-Medium.otf", + use_fp16=False, + device="mps", +) + +img_save_folder = "SaveImages" +rgb_image = cv2.imread( + "/Users/cwq/code/github/AnyText/anytext/example_images/ref7.jpg" +)[..., ::-1] + +masked_image = cv2.imread( + "/Users/cwq/code/github/AnyText/anytext/example_images/edit7.png" +)[..., ::-1] + +rgb_image = cv2.resize(rgb_image, (512, 512)) +masked_image = cv2.resize(masked_image, (512, 512)) + +# results: list of rgb ndarray +results, rtn_code, rtn_warning = pipe( + prompt='A cake with colorful characters that reads "EVERYDAY", best quality, extremely detailed,4k, HD, supper legible text, clear text edges, clear strokes, neat writing, no watermarks', + negative_prompt="low-res, bad anatomy, extra digit, fewer digits, cropped, worst quality, low quality, watermark, unreadable text, messy words, distorted text, disorganized writing, advertising picture", + image=rgb_image, + masked_image=masked_image, 
+ num_inference_steps=20, + strength=1.0, + guidance_scale=9.0, + height=rgb_image.shape[0], + width=rgb_image.shape[1], + seed=seed, + sort_priority="y", +) +if rtn_code >= 0: + save_images(results, img_save_folder) + print(f"Done, result images are saved in: {img_save_folder}") diff --git a/inpaint/model/anytext/ocr_recog/RNN.py b/inpaint/model/anytext/ocr_recog/RNN.py new file mode 100755 index 0000000..cf16855 --- /dev/null +++ b/inpaint/model/anytext/ocr_recog/RNN.py @@ -0,0 +1,210 @@ +from torch import nn +import torch +from .RecSVTR import Block + +class Swish(nn.Module): + def __int__(self): + super(Swish, self).__int__() + + def forward(self,x): + return x*torch.sigmoid(x) + +class Im2Im(nn.Module): + def __init__(self, in_channels, **kwargs): + super().__init__() + self.out_channels = in_channels + + def forward(self, x): + return x + +class Im2Seq(nn.Module): + def __init__(self, in_channels, **kwargs): + super().__init__() + self.out_channels = in_channels + + def forward(self, x): + B, C, H, W = x.shape + # assert H == 1 + x = x.reshape(B, C, H * W) + x = x.permute((0, 2, 1)) + return x + +class EncoderWithRNN(nn.Module): + def __init__(self, in_channels,**kwargs): + super(EncoderWithRNN, self).__init__() + hidden_size = kwargs.get('hidden_size', 256) + self.out_channels = hidden_size * 2 + self.lstm = nn.LSTM(in_channels, hidden_size, bidirectional=True, num_layers=2,batch_first=True) + + def forward(self, x): + self.lstm.flatten_parameters() + x, _ = self.lstm(x) + return x + +class SequenceEncoder(nn.Module): + def __init__(self, in_channels, encoder_type='rnn', **kwargs): + super(SequenceEncoder, self).__init__() + self.encoder_reshape = Im2Seq(in_channels) + self.out_channels = self.encoder_reshape.out_channels + self.encoder_type = encoder_type + if encoder_type == 'reshape': + self.only_reshape = True + else: + support_encoder_dict = { + 'reshape': Im2Seq, + 'rnn': EncoderWithRNN, + 'svtr': EncoderWithSVTR + } + assert encoder_type in support_encoder_dict, '{} must in {}'.format( + encoder_type, support_encoder_dict.keys()) + + self.encoder = support_encoder_dict[encoder_type]( + self.encoder_reshape.out_channels,**kwargs) + self.out_channels = self.encoder.out_channels + self.only_reshape = False + + def forward(self, x): + if self.encoder_type != 'svtr': + x = self.encoder_reshape(x) + if not self.only_reshape: + x = self.encoder(x) + return x + else: + x = self.encoder(x) + x = self.encoder_reshape(x) + return x + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + bias_attr=False, + groups=1, + act=nn.GELU): + super().__init__() + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + # weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()), + bias=bias_attr) + self.norm = nn.BatchNorm2d(out_channels) + self.act = Swish() + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + out = self.act(out) + return out + + +class EncoderWithSVTR(nn.Module): + def __init__( + self, + in_channels, + dims=64, # XS + depth=2, + hidden_dims=120, + use_guide=False, + num_heads=8, + qkv_bias=True, + mlp_ratio=2.0, + drop_rate=0.1, + attn_drop_rate=0.1, + drop_path=0., + qk_scale=None): + super(EncoderWithSVTR, self).__init__() + self.depth = depth + self.use_guide = use_guide + self.conv1 = ConvBNLayer( + in_channels, in_channels // 8, padding=1, act='swish') + 
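+        # conv1/conv2 squeeze the CNN feature map down to hidden_dims before the
+        # globally-mixing SVTR blocks; conv3/conv4/conv1x1 below project back and
+        # fuse the result with the shortcut h in forward().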
self.conv2 = ConvBNLayer( + in_channels // 8, hidden_dims, kernel_size=1, act='swish') + + self.svtr_block = nn.ModuleList([ + Block( + dim=hidden_dims, + num_heads=num_heads, + mixer='Global', + HW=None, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer='swish', + attn_drop=attn_drop_rate, + drop_path=drop_path, + norm_layer='nn.LayerNorm', + epsilon=1e-05, + prenorm=False) for i in range(depth) + ]) + self.norm = nn.LayerNorm(hidden_dims, eps=1e-6) + self.conv3 = ConvBNLayer( + hidden_dims, in_channels, kernel_size=1, act='swish') + # last conv-nxn, the input is concat of input tensor and conv3 output tensor + self.conv4 = ConvBNLayer( + 2 * in_channels, in_channels // 8, padding=1, act='swish') + + self.conv1x1 = ConvBNLayer( + in_channels // 8, dims, kernel_size=1, act='swish') + self.out_channels = dims + self.apply(self._init_weights) + + def _init_weights(self, m): + # weight initialization + if isinstance(m, nn.Conv2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.BatchNorm2d): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + elif isinstance(m, nn.Linear): + nn.init.normal_(m.weight, 0, 0.01) + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.ConvTranspose2d): + nn.init.kaiming_normal_(m.weight, mode='fan_out') + if m.bias is not None: + nn.init.zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + nn.init.ones_(m.weight) + nn.init.zeros_(m.bias) + + def forward(self, x): + # for use guide + if self.use_guide: + z = x.clone() + z.stop_gradient = True + else: + z = x + # for short cut + h = z + # reduce dim + z = self.conv1(z) + z = self.conv2(z) + # SVTR global block + B, C, H, W = z.shape + z = z.flatten(2).permute(0, 2, 1) + + for blk in self.svtr_block: + z = blk(z) + + z = self.norm(z) + # last stage + z = z.reshape([-1, H, W, C]).permute(0, 3, 1, 2) + z = self.conv3(z) + z = torch.cat((h, z), dim=1) + z = self.conv1x1(self.conv4(z)) + + return z + +if __name__=="__main__": + svtrRNN = EncoderWithSVTR(56) + print(svtrRNN) \ No newline at end of file diff --git a/inpaint/model/anytext/ocr_recog/RecCTCHead.py b/inpaint/model/anytext/ocr_recog/RecCTCHead.py new file mode 100755 index 0000000..867ede9 --- /dev/null +++ b/inpaint/model/anytext/ocr_recog/RecCTCHead.py @@ -0,0 +1,48 @@ +from torch import nn + + +class CTCHead(nn.Module): + def __init__(self, + in_channels, + out_channels=6625, + fc_decay=0.0004, + mid_channels=None, + return_feats=False, + **kwargs): + super(CTCHead, self).__init__() + if mid_channels is None: + self.fc = nn.Linear( + in_channels, + out_channels, + bias=True,) + else: + self.fc1 = nn.Linear( + in_channels, + mid_channels, + bias=True, + ) + self.fc2 = nn.Linear( + mid_channels, + out_channels, + bias=True, + ) + + self.out_channels = out_channels + self.mid_channels = mid_channels + self.return_feats = return_feats + + def forward(self, x, labels=None): + if self.mid_channels is None: + predicts = self.fc(x) + else: + x = self.fc1(x) + predicts = self.fc2(x) + + if self.return_feats: + result = dict() + result['ctc'] = predicts + result['ctc_neck'] = x + else: + result = predicts + + return result diff --git a/inpaint/model/anytext/ocr_recog/RecModel.py b/inpaint/model/anytext/ocr_recog/RecModel.py new file mode 100755 index 0000000..c2313bf --- /dev/null +++ b/inpaint/model/anytext/ocr_recog/RecModel.py @@ -0,0 +1,45 @@ +from torch import nn +from .RNN import SequenceEncoder, Im2Seq, Im2Im +from 
.RecMv1_enhance import MobileNetV1Enhance + +from .RecCTCHead import CTCHead + +backbone_dict = {"MobileNetV1Enhance":MobileNetV1Enhance} +neck_dict = {'SequenceEncoder': SequenceEncoder, 'Im2Seq': Im2Seq,'None':Im2Im} +head_dict = {'CTCHead':CTCHead} + + +class RecModel(nn.Module): + def __init__(self, config): + super().__init__() + assert 'in_channels' in config, 'in_channels must in model config' + backbone_type = config.backbone.pop('type') + assert backbone_type in backbone_dict, f'backbone.type must in {backbone_dict}' + self.backbone = backbone_dict[backbone_type](config.in_channels, **config.backbone) + + neck_type = config.neck.pop('type') + assert neck_type in neck_dict, f'neck.type must in {neck_dict}' + self.neck = neck_dict[neck_type](self.backbone.out_channels, **config.neck) + + head_type = config.head.pop('type') + assert head_type in head_dict, f'head.type must in {head_dict}' + self.head = head_dict[head_type](self.neck.out_channels, **config.head) + + self.name = f'RecModel_{backbone_type}_{neck_type}_{head_type}' + + def load_3rd_state_dict(self, _3rd_name, _state): + self.backbone.load_3rd_state_dict(_3rd_name, _state) + self.neck.load_3rd_state_dict(_3rd_name, _state) + self.head.load_3rd_state_dict(_3rd_name, _state) + + def forward(self, x): + x = self.backbone(x) + x = self.neck(x) + x = self.head(x) + return x + + def encode(self, x): + x = self.backbone(x) + x = self.neck(x) + x = self.head.ctc_encoder(x) + return x diff --git a/inpaint/model/anytext/ocr_recog/RecMv1_enhance.py b/inpaint/model/anytext/ocr_recog/RecMv1_enhance.py new file mode 100644 index 0000000..7529b4a --- /dev/null +++ b/inpaint/model/anytext/ocr_recog/RecMv1_enhance.py @@ -0,0 +1,232 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F +from .common import Activation + + +class ConvBNLayer(nn.Module): + def __init__(self, + num_channels, + filter_size, + num_filters, + stride, + padding, + channels=None, + num_groups=1, + act='hard_swish'): + super(ConvBNLayer, self).__init__() + self.act = act + self._conv = nn.Conv2d( + in_channels=num_channels, + out_channels=num_filters, + kernel_size=filter_size, + stride=stride, + padding=padding, + groups=num_groups, + bias=False) + + self._batch_norm = nn.BatchNorm2d( + num_filters, + ) + if self.act is not None: + self._act = Activation(act_type=act, inplace=True) + + def forward(self, inputs): + y = self._conv(inputs) + y = self._batch_norm(y) + if self.act is not None: + y = self._act(y) + return y + + +class DepthwiseSeparable(nn.Module): + def __init__(self, + num_channels, + num_filters1, + num_filters2, + num_groups, + stride, + scale, + dw_size=3, + padding=1, + use_se=False): + super(DepthwiseSeparable, self).__init__() + self.use_se = use_se + self._depthwise_conv = ConvBNLayer( + num_channels=num_channels, + num_filters=int(num_filters1 * scale), + filter_size=dw_size, + stride=stride, + padding=padding, + num_groups=int(num_groups * scale)) + if use_se: + self._se = SEModule(int(num_filters1 * scale)) + self._pointwise_conv = ConvBNLayer( + num_channels=int(num_filters1 * scale), + filter_size=1, + num_filters=int(num_filters2 * scale), + stride=1, + padding=0) + + def forward(self, inputs): + y = self._depthwise_conv(inputs) + if self.use_se: + y = self._se(y) + y = self._pointwise_conv(y) + return y + + +class MobileNetV1Enhance(nn.Module): + def __init__(self, + in_channels=3, + scale=0.5, + last_conv_stride=1, + last_pool_type='max', + **kwargs): + super().__init__() + self.scale = scale + self.block_list = [] + + 
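+        # The stack below follows MobileNetV1: depthwise-separable blocks whose
+        # (2, 1) strides halve the feature-map height while keeping the width,
+        # preserving horizontal resolution along the recognition sequence.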
self.conv1 = ConvBNLayer( + num_channels=in_channels, + filter_size=3, + channels=3, + num_filters=int(32 * scale), + stride=2, + padding=1) + + conv2_1 = DepthwiseSeparable( + num_channels=int(32 * scale), + num_filters1=32, + num_filters2=64, + num_groups=32, + stride=1, + scale=scale) + self.block_list.append(conv2_1) + + conv2_2 = DepthwiseSeparable( + num_channels=int(64 * scale), + num_filters1=64, + num_filters2=128, + num_groups=64, + stride=1, + scale=scale) + self.block_list.append(conv2_2) + + conv3_1 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=128, + num_groups=128, + stride=1, + scale=scale) + self.block_list.append(conv3_1) + + conv3_2 = DepthwiseSeparable( + num_channels=int(128 * scale), + num_filters1=128, + num_filters2=256, + num_groups=128, + stride=(2, 1), + scale=scale) + self.block_list.append(conv3_2) + + conv4_1 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=256, + num_groups=256, + stride=1, + scale=scale) + self.block_list.append(conv4_1) + + conv4_2 = DepthwiseSeparable( + num_channels=int(256 * scale), + num_filters1=256, + num_filters2=512, + num_groups=256, + stride=(2, 1), + scale=scale) + self.block_list.append(conv4_2) + + for _ in range(5): + conv5 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=512, + num_groups=512, + stride=1, + dw_size=5, + padding=2, + scale=scale, + use_se=False) + self.block_list.append(conv5) + + conv5_6 = DepthwiseSeparable( + num_channels=int(512 * scale), + num_filters1=512, + num_filters2=1024, + num_groups=512, + stride=(2, 1), + dw_size=5, + padding=2, + scale=scale, + use_se=True) + self.block_list.append(conv5_6) + + conv6 = DepthwiseSeparable( + num_channels=int(1024 * scale), + num_filters1=1024, + num_filters2=1024, + num_groups=1024, + stride=last_conv_stride, + dw_size=5, + padding=2, + use_se=True, + scale=scale) + self.block_list.append(conv6) + + self.block_list = nn.Sequential(*self.block_list) + if last_pool_type == 'avg': + self.pool = nn.AvgPool2d(kernel_size=2, stride=2, padding=0) + else: + self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0) + self.out_channels = int(1024 * scale) + + def forward(self, inputs): + y = self.conv1(inputs) + y = self.block_list(y) + y = self.pool(y) + return y + +def hardsigmoid(x): + return F.relu6(x + 3., inplace=True) / 6. 
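+# SEModule below is a standard squeeze-and-excitation block: global average
+# pooling, a 1x1-conv bottleneck (reduction=4) with ReLU, and a hard-sigmoid
+# gate that rescales the input channels.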
+ +class SEModule(nn.Module): + def __init__(self, channel, reduction=4): + super(SEModule, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.conv1 = nn.Conv2d( + in_channels=channel, + out_channels=channel // reduction, + kernel_size=1, + stride=1, + padding=0, + bias=True) + self.conv2 = nn.Conv2d( + in_channels=channel // reduction, + out_channels=channel, + kernel_size=1, + stride=1, + padding=0, + bias=True) + + def forward(self, inputs): + outputs = self.avg_pool(inputs) + outputs = self.conv1(outputs) + outputs = F.relu(outputs) + outputs = self.conv2(outputs) + outputs = hardsigmoid(outputs) + x = torch.mul(inputs, outputs) + + return x diff --git a/inpaint/model/anytext/ocr_recog/RecSVTR.py b/inpaint/model/anytext/ocr_recog/RecSVTR.py new file mode 100644 index 0000000..484b3df --- /dev/null +++ b/inpaint/model/anytext/ocr_recog/RecSVTR.py @@ -0,0 +1,591 @@ +import torch +import torch.nn as nn +import numpy as np +from torch.nn.init import trunc_normal_, zeros_, ones_ +from torch.nn import functional + + +def drop_path(x, drop_prob=0., training=False): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... + """ + if drop_prob == 0. or not training: + return x + keep_prob = torch.tensor(1 - drop_prob) + shape = (x.size()[0], ) + (1, ) * (x.ndim - 1) + random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype) + random_tensor = torch.floor(random_tensor) # binarize + output = x.divide(keep_prob) * random_tensor + return output + + +class Swish(nn.Module): + def __int__(self): + super(Swish, self).__int__() + + def forward(self,x): + return x*torch.sigmoid(x) + + +class ConvBNLayer(nn.Module): + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + stride=1, + padding=0, + bias_attr=False, + groups=1, + act=nn.GELU): + super().__init__() + self.conv = nn.Conv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + stride=stride, + padding=padding, + groups=groups, + # weight_attr=paddle.ParamAttr(initializer=nn.initializer.KaimingUniform()), + bias=bias_attr) + self.norm = nn.BatchNorm2d(out_channels) + self.act = act() + + def forward(self, inputs): + out = self.conv(inputs) + out = self.norm(out) + out = self.act(out) + return out + + +class DropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). 
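+    When drop_prob is 0 or the module is in eval mode this is an identity op;
+    otherwise whole samples are randomly zeroed and the survivors are rescaled
+    by 1 / keep_prob.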
+ """ + + def __init__(self, drop_prob=None): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training) + + +class Identity(nn.Module): + def __init__(self): + super(Identity, self).__init__() + + def forward(self, input): + return input + + +class Mlp(nn.Module): + def __init__(self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = nn.Linear(in_features, hidden_features) + if isinstance(act_layer, str): + self.act = Swish() + else: + self.act = act_layer() + self.fc2 = nn.Linear(hidden_features, out_features) + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class ConvMixer(nn.Module): + def __init__( + self, + dim, + num_heads=8, + HW=(8, 25), + local_k=(3, 3), ): + super().__init__() + self.HW = HW + self.dim = dim + self.local_mixer = nn.Conv2d( + dim, + dim, + local_k, + 1, (local_k[0] // 2, local_k[1] // 2), + groups=num_heads, + # weight_attr=ParamAttr(initializer=KaimingNormal()) + ) + + def forward(self, x): + h = self.HW[0] + w = self.HW[1] + x = x.transpose([0, 2, 1]).reshape([0, self.dim, h, w]) + x = self.local_mixer(x) + x = x.flatten(2).transpose([0, 2, 1]) + return x + + +class Attention(nn.Module): + def __init__(self, + dim, + num_heads=8, + mixer='Global', + HW=(8, 25), + local_k=(7, 11), + qkv_bias=False, + qk_scale=None, + attn_drop=0., + proj_drop=0.): + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.attn_drop = nn.Dropout(attn_drop) + self.proj = nn.Linear(dim, dim) + self.proj_drop = nn.Dropout(proj_drop) + self.HW = HW + if HW is not None: + H = HW[0] + W = HW[1] + self.N = H * W + self.C = dim + if mixer == 'Local' and HW is not None: + hk = local_k[0] + wk = local_k[1] + mask = torch.ones([H * W, H + hk - 1, W + wk - 1]) + for h in range(0, H): + for w in range(0, W): + mask[h * W + w, h:h + hk, w:w + wk] = 0. 
+ mask_paddle = mask[:, hk // 2:H + hk // 2, wk // 2:W + wk // + 2].flatten(1) + mask_inf = torch.full([H * W, H * W],fill_value=float('-inf')) + mask = torch.where(mask_paddle < 1, mask_paddle, mask_inf) + self.mask = mask[None,None,:] + # self.mask = mask.unsqueeze([0, 1]) + self.mixer = mixer + + def forward(self, x): + if self.HW is not None: + N = self.N + C = self.C + else: + _, N, C = x.shape + qkv = self.qkv(x).reshape((-1, N, 3, self.num_heads, C //self.num_heads)).permute((2, 0, 3, 1, 4)) + q, k, v = qkv[0] * self.scale, qkv[1], qkv[2] + + attn = (q.matmul(k.permute((0, 1, 3, 2)))) + if self.mixer == 'Local': + attn += self.mask + attn = functional.softmax(attn, dim=-1) + attn = self.attn_drop(attn) + + x = (attn.matmul(v)).permute((0, 2, 1, 3)).reshape((-1, N, C)) + x = self.proj(x) + x = self.proj_drop(x) + return x + + +class Block(nn.Module): + def __init__(self, + dim, + num_heads, + mixer='Global', + local_mixer=(7, 11), + HW=(8, 25), + mlp_ratio=4., + qkv_bias=False, + qk_scale=None, + drop=0., + attn_drop=0., + drop_path=0., + act_layer=nn.GELU, + norm_layer='nn.LayerNorm', + epsilon=1e-6, + prenorm=True): + super().__init__() + if isinstance(norm_layer, str): + self.norm1 = eval(norm_layer)(dim, eps=epsilon) + else: + self.norm1 = norm_layer(dim) + if mixer == 'Global' or mixer == 'Local': + + self.mixer = Attention( + dim, + num_heads=num_heads, + mixer=mixer, + HW=HW, + local_k=local_mixer, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop) + elif mixer == 'Conv': + self.mixer = ConvMixer( + dim, num_heads=num_heads, HW=HW, local_k=local_mixer) + else: + raise TypeError("The mixer must be one of [Global, Local, Conv]") + + self.drop_path = DropPath(drop_path) if drop_path > 0. else Identity() + if isinstance(norm_layer, str): + self.norm2 = eval(norm_layer)(dim, eps=epsilon) + else: + self.norm2 = norm_layer(dim) + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp_ratio = mlp_ratio + self.mlp = Mlp(in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop) + self.prenorm = prenorm + + def forward(self, x): + if self.prenorm: + x = self.norm1(x + self.drop_path(self.mixer(x))) + x = self.norm2(x + self.drop_path(self.mlp(x))) + else: + x = x + self.drop_path(self.mixer(self.norm1(x))) + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class PatchEmbed(nn.Module): + """ Image to Patch Embedding + """ + + def __init__(self, + img_size=(32, 100), + in_channels=3, + embed_dim=768, + sub_num=2): + super().__init__() + num_patches = (img_size[1] // (2 ** sub_num)) * \ + (img_size[0] // (2 ** sub_num)) + self.img_size = img_size + self.num_patches = num_patches + self.embed_dim = embed_dim + self.norm = None + if sub_num == 2: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=False), + ConvBNLayer( + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=False)) + if sub_num == 3: + self.proj = nn.Sequential( + ConvBNLayer( + in_channels=in_channels, + out_channels=embed_dim // 4, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=False), + ConvBNLayer( + in_channels=embed_dim // 4, + out_channels=embed_dim // 2, + kernel_size=3, + stride=2, + padding=1, + act=nn.GELU, + bias_attr=False), + ConvBNLayer( + in_channels=embed_dim // 2, + out_channels=embed_dim, + kernel_size=3, + stride=2, + padding=1, + 
act=nn.GELU, + bias_attr=False)) + + def forward(self, x): + B, C, H, W = x.shape + assert H == self.img_size[0] and W == self.img_size[1], \ + f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})." + x = self.proj(x).flatten(2).permute(0, 2, 1) + return x + + +class SubSample(nn.Module): + def __init__(self, + in_channels, + out_channels, + types='Pool', + stride=(2, 1), + sub_norm='nn.LayerNorm', + act=None): + super().__init__() + self.types = types + if types == 'Pool': + self.avgpool = nn.AvgPool2d( + kernel_size=(3, 5), stride=stride, padding=(1, 2)) + self.maxpool = nn.MaxPool2d( + kernel_size=(3, 5), stride=stride, padding=(1, 2)) + self.proj = nn.Linear(in_channels, out_channels) + else: + self.conv = nn.Conv2d( + in_channels, + out_channels, + kernel_size=3, + stride=stride, + padding=1, + # weight_attr=ParamAttr(initializer=KaimingNormal()) + ) + self.norm = eval(sub_norm)(out_channels) + if act is not None: + self.act = act() + else: + self.act = None + + def forward(self, x): + + if self.types == 'Pool': + x1 = self.avgpool(x) + x2 = self.maxpool(x) + x = (x1 + x2) * 0.5 + out = self.proj(x.flatten(2).permute((0, 2, 1))) + else: + x = self.conv(x) + out = x.flatten(2).permute((0, 2, 1)) + out = self.norm(out) + if self.act is not None: + out = self.act(out) + + return out + + +class SVTRNet(nn.Module): + def __init__( + self, + img_size=[48, 100], + in_channels=3, + embed_dim=[64, 128, 256], + depth=[3, 6, 3], + num_heads=[2, 4, 8], + mixer=['Local'] * 6 + ['Global'] * + 6, # Local atten, Global atten, Conv + local_mixer=[[7, 11], [7, 11], [7, 11]], + patch_merging='Conv', # Conv, Pool, None + mlp_ratio=4, + qkv_bias=True, + qk_scale=None, + drop_rate=0., + last_drop=0.1, + attn_drop_rate=0., + drop_path_rate=0.1, + norm_layer='nn.LayerNorm', + sub_norm='nn.LayerNorm', + epsilon=1e-6, + out_channels=192, + out_char_num=25, + block_unit='Block', + act='nn.GELU', + last_stage=True, + sub_num=2, + prenorm=True, + use_lenhead=False, + **kwargs): + super().__init__() + self.img_size = img_size + self.embed_dim = embed_dim + self.out_channels = out_channels + self.prenorm = prenorm + patch_merging = None if patch_merging != 'Conv' and patch_merging != 'Pool' else patch_merging + self.patch_embed = PatchEmbed( + img_size=img_size, + in_channels=in_channels, + embed_dim=embed_dim[0], + sub_num=sub_num) + num_patches = self.patch_embed.num_patches + self.HW = [img_size[0] // (2**sub_num), img_size[1] // (2**sub_num)] + self.pos_embed = nn.Parameter(torch.zeros(1, num_patches, embed_dim[0])) + # self.pos_embed = self.create_parameter( + # shape=[1, num_patches, embed_dim[0]], default_initializer=zeros_) + + # self.add_parameter("pos_embed", self.pos_embed) + + self.pos_drop = nn.Dropout(p=drop_rate) + Block_unit = eval(block_unit) + + dpr = np.linspace(0, drop_path_rate, sum(depth)) + self.blocks1 = nn.ModuleList( + [ + Block_unit( + dim=embed_dim[0], + num_heads=num_heads[0], + mixer=mixer[0:depth[0]][i], + HW=self.HW, + local_mixer=local_mixer[0], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[0:depth[0]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[0]) + ] + ) + if patch_merging is not None: + self.sub_sample1 = SubSample( + embed_dim[0], + embed_dim[1], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging) + HW = [self.HW[0] // 2, self.HW[1]] + else: + HW = self.HW + self.patch_merging = 
patch_merging + self.blocks2 = nn.ModuleList([ + Block_unit( + dim=embed_dim[1], + num_heads=num_heads[1], + mixer=mixer[depth[0]:depth[0] + depth[1]][i], + HW=HW, + local_mixer=local_mixer[1], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0]:depth[0] + depth[1]][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[1]) + ]) + if patch_merging is not None: + self.sub_sample2 = SubSample( + embed_dim[1], + embed_dim[2], + sub_norm=sub_norm, + stride=[2, 1], + types=patch_merging) + HW = [self.HW[0] // 4, self.HW[1]] + else: + HW = self.HW + self.blocks3 = nn.ModuleList([ + Block_unit( + dim=embed_dim[2], + num_heads=num_heads[2], + mixer=mixer[depth[0] + depth[1]:][i], + HW=HW, + local_mixer=local_mixer[2], + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop_rate, + act_layer=eval(act), + attn_drop=attn_drop_rate, + drop_path=dpr[depth[0] + depth[1]:][i], + norm_layer=norm_layer, + epsilon=epsilon, + prenorm=prenorm) for i in range(depth[2]) + ]) + self.last_stage = last_stage + if last_stage: + self.avg_pool = nn.AdaptiveAvgPool2d((1, out_char_num)) + self.last_conv = nn.Conv2d( + in_channels=embed_dim[2], + out_channels=self.out_channels, + kernel_size=1, + stride=1, + padding=0, + bias=False) + self.hardswish = nn.Hardswish() + self.dropout = nn.Dropout(p=last_drop) + if not prenorm: + self.norm = eval(norm_layer)(embed_dim[-1], epsilon=epsilon) + self.use_lenhead = use_lenhead + if use_lenhead: + self.len_conv = nn.Linear(embed_dim[2], self.out_channels) + self.hardswish_len = nn.Hardswish() + self.dropout_len = nn.Dropout( + p=last_drop) + + trunc_normal_(self.pos_embed,std=.02) + self.apply(self._init_weights) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight,std=.02) + if isinstance(m, nn.Linear) and m.bias is not None: + zeros_(m.bias) + elif isinstance(m, nn.LayerNorm): + zeros_(m.bias) + ones_(m.weight) + + def forward_features(self, x): + x = self.patch_embed(x) + x = x + self.pos_embed + x = self.pos_drop(x) + for blk in self.blocks1: + x = blk(x) + if self.patch_merging is not None: + x = self.sub_sample1( + x.permute([0, 2, 1]).reshape( + [-1, self.embed_dim[0], self.HW[0], self.HW[1]])) + for blk in self.blocks2: + x = blk(x) + if self.patch_merging is not None: + x = self.sub_sample2( + x.permute([0, 2, 1]).reshape( + [-1, self.embed_dim[1], self.HW[0] // 2, self.HW[1]])) + for blk in self.blocks3: + x = blk(x) + if not self.prenorm: + x = self.norm(x) + return x + + def forward(self, x): + x = self.forward_features(x) + if self.use_lenhead: + len_x = self.len_conv(x.mean(1)) + len_x = self.dropout_len(self.hardswish_len(len_x)) + if self.last_stage: + if self.patch_merging is not None: + h = self.HW[0] // 4 + else: + h = self.HW[0] + x = self.avg_pool( + x.permute([0, 2, 1]).reshape( + [-1, self.embed_dim[2], h, self.HW[1]])) + x = self.last_conv(x) + x = self.hardswish(x) + x = self.dropout(x) + if self.use_lenhead: + return x, len_x + return x + + +if __name__=="__main__": + a = torch.rand(1,3,48,100) + svtr = SVTRNet() + + out = svtr(a) + print(svtr) + print(out.size()) \ No newline at end of file diff --git a/inpaint/model/anytext/ocr_recog/__init__.py b/inpaint/model/anytext/ocr_recog/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/anytext/ocr_recog/common.py b/inpaint/model/anytext/ocr_recog/common.py new file mode 100644 index 
0000000..a328bb0 --- /dev/null +++ b/inpaint/model/anytext/ocr_recog/common.py @@ -0,0 +1,74 @@ + + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class Hswish(nn.Module): + def __init__(self, inplace=True): + super(Hswish, self).__init__() + self.inplace = inplace + + def forward(self, x): + return x * F.relu6(x + 3., inplace=self.inplace) / 6. + +# out = max(0, min(1, slop*x+offset)) +# paddle.fluid.layers.hard_sigmoid(x, slope=0.2, offset=0.5, name=None) +class Hsigmoid(nn.Module): + def __init__(self, inplace=True): + super(Hsigmoid, self).__init__() + self.inplace = inplace + + def forward(self, x): + # torch: F.relu6(x + 3., inplace=self.inplace) / 6. + # paddle: F.relu6(1.2 * x + 3., inplace=self.inplace) / 6. + return F.relu6(1.2 * x + 3., inplace=self.inplace) / 6. + +class GELU(nn.Module): + def __init__(self, inplace=True): + super(GELU, self).__init__() + self.inplace = inplace + + def forward(self, x): + return torch.nn.functional.gelu(x) + + +class Swish(nn.Module): + def __init__(self, inplace=True): + super(Swish, self).__init__() + self.inplace = inplace + + def forward(self, x): + if self.inplace: + x.mul_(torch.sigmoid(x)) + return x + else: + return x*torch.sigmoid(x) + + +class Activation(nn.Module): + def __init__(self, act_type, inplace=True): + super(Activation, self).__init__() + act_type = act_type.lower() + if act_type == 'relu': + self.act = nn.ReLU(inplace=inplace) + elif act_type == 'relu6': + self.act = nn.ReLU6(inplace=inplace) + elif act_type == 'sigmoid': + raise NotImplementedError + elif act_type == 'hard_sigmoid': + self.act = Hsigmoid(inplace) + elif act_type == 'hard_swish': + self.act = Hswish(inplace=inplace) + elif act_type == 'leakyrelu': + self.act = nn.LeakyReLU(inplace=inplace) + elif act_type == 'gelu': + self.act = GELU(inplace=inplace) + elif act_type == 'swish': + self.act = Swish(inplace=inplace) + else: + raise NotImplementedError + + def forward(self, inputs): + return self.act(inputs) \ No newline at end of file diff --git a/inpaint/model/anytext/ocr_recog/en_dict.txt b/inpaint/model/anytext/ocr_recog/en_dict.txt new file mode 100644 index 0000000..7677d31 --- /dev/null +++ b/inpaint/model/anytext/ocr_recog/en_dict.txt @@ -0,0 +1,95 @@ +0 +1 +2 +3 +4 +5 +6 +7 +8 +9 +: +; +< += +> +? +@ +A +B +C +D +E +F +G +H +I +J +K +L +M +N +O +P +Q +R +S +T +U +V +W +X +Y +Z +[ +\ +] +^ +_ +` +a +b +c +d +e +f +g +h +i +j +k +l +m +n +o +p +q +r +s +t +u +v +w +x +y +z +{ +| +} +~ +! +" +# +$ +% +& +' +( +) +* ++ +, +- +. +/ + diff --git a/inpaint/model/anytext/ocr_recog/ppocr_keys_v1.txt b/inpaint/model/anytext/ocr_recog/ppocr_keys_v1.txt new file mode 100644 index 0000000..84b885d --- /dev/null +++ b/inpaint/model/anytext/ocr_recog/ppocr_keys_v1.txt @@ -0,0 +1,6623 @@ +' +疗 +绚 +诚 +娇 +溜 +题 +贿 +者 +廖 +更 +纳 +加 +奉 +公 +一 +就 +汴 +计 +与 +路 +房 +原 +妇 +2 +0 +8 +- +7 +其 +> +: +] +, +, +骑 +刈 +全 +消 +昏 +傈 +安 +久 +钟 +嗅 +不 +影 +处 +驽 +蜿 +资 +关 +椤 +地 +瘸 +专 +问 +忖 +票 +嫉 +炎 +韵 +要 +月 +田 +节 +陂 +鄙 +捌 +备 +拳 +伺 +眼 +网 +盎 +大 +傍 +心 +东 +愉 +汇 +蹿 +科 +每 +业 +里 +航 +晏 +字 +平 +录 +先 +1 +3 +彤 +鲶 +产 +稍 +督 +腴 +有 +象 +岳 +注 +绍 +在 +泺 +文 +定 +核 +名 +水 +过 +理 +让 +偷 +率 +等 +这 +发 +” +为 +含 +肥 +酉 +相 +鄱 +七 +编 +猥 +锛 +日 +镀 +蒂 +掰 +倒 +辆 +栾 +栗 +综 +涩 +州 +雌 +滑 +馀 +了 +机 +块 +司 +宰 +甙 +兴 +矽 +抚 +保 +用 +沧 +秩 +如 +收 +息 +滥 +页 +疑 +埠 +! +! 
+姥 +异 +橹 +钇 +向 +下 +跄 +的 +椴 +沫 +国 +绥 +獠 +报 +开 +民 +蜇 +何 +分 +凇 +长 +讥 +藏 +掏 +施 +羽 +中 +讲 +派 +嘟 +人 +提 +浼 +间 +世 +而 +古 +多 +倪 +唇 +饯 +控 +庚 +首 +赛 +蜓 +味 +断 +制 +觉 +技 +替 +艰 +溢 +潮 +夕 +钺 +外 +摘 +枋 +动 +双 +单 +啮 +户 +枇 +确 +锦 +曜 +杜 +或 +能 +效 +霜 +盒 +然 +侗 +电 +晁 +放 +步 +鹃 +新 +杖 +蜂 +吒 +濂 +瞬 +评 +总 +隍 +对 +独 +合 +也 +是 +府 +青 +天 +诲 +墙 +组 +滴 +级 +邀 +帘 +示 +已 +时 +骸 +仄 +泅 +和 +遨 +店 +雇 +疫 +持 +巍 +踮 +境 +只 +亨 +目 +鉴 +崤 +闲 +体 +泄 +杂 +作 +般 +轰 +化 +解 +迂 +诿 +蛭 +璀 +腾 +告 +版 +服 +省 +师 +小 +规 +程 +线 +海 +办 +引 +二 +桧 +牌 +砺 +洄 +裴 +修 +图 +痫 +胡 +许 +犊 +事 +郛 +基 +柴 +呼 +食 +研 +奶 +律 +蛋 +因 +葆 +察 +戏 +褒 +戒 +再 +李 +骁 +工 +貂 +油 +鹅 +章 +啄 +休 +场 +给 +睡 +纷 +豆 +器 +捎 +说 +敏 +学 +会 +浒 +设 +诊 +格 +廓 +查 +来 +霓 +室 +溆 +¢ +诡 +寥 +焕 +舜 +柒 +狐 +回 +戟 +砾 +厄 +实 +翩 +尿 +五 +入 +径 +惭 +喹 +股 +宇 +篝 +| +; +美 +期 +云 +九 +祺 +扮 +靠 +锝 +槌 +系 +企 +酰 +阊 +暂 +蚕 +忻 +豁 +本 +羹 +执 +条 +钦 +H +獒 +限 +进 +季 +楦 +于 +芘 +玖 +铋 +茯 +未 +答 +粘 +括 +样 +精 +欠 +矢 +甥 +帷 +嵩 +扣 +令 +仔 +风 +皈 +行 +支 +部 +蓉 +刮 +站 +蜡 +救 +钊 +汗 +松 +嫌 +成 +可 +. +鹤 +院 +从 +交 +政 +怕 +活 +调 +球 +局 +验 +髌 +第 +韫 +谗 +串 +到 +圆 +年 +米 +/ +* +友 +忿 +检 +区 +看 +自 +敢 +刃 +个 +兹 +弄 +流 +留 +同 +没 +齿 +星 +聆 +轼 +湖 +什 +三 +建 +蛔 +儿 +椋 +汕 +震 +颧 +鲤 +跟 +力 +情 +璺 +铨 +陪 +务 +指 +族 +训 +滦 +鄣 +濮 +扒 +商 +箱 +十 +召 +慷 +辗 +所 +莞 +管 +护 +臭 +横 +硒 +嗓 +接 +侦 +六 +露 +党 +馋 +驾 +剖 +高 +侬 +妪 +幂 +猗 +绺 +骐 +央 +酐 +孝 +筝 +课 +徇 +缰 +门 +男 +西 +项 +句 +谙 +瞒 +秃 +篇 +教 +碲 +罚 +声 +呐 +景 +前 +富 +嘴 +鳌 +稀 +免 +朋 +啬 +睐 +去 +赈 +鱼 +住 +肩 +愕 +速 +旁 +波 +厅 +健 +茼 +厥 +鲟 +谅 +投 +攸 +炔 +数 +方 +击 +呋 +谈 +绩 +别 +愫 +僚 +躬 +鹧 +胪 +炳 +招 +喇 +膨 +泵 +蹦 +毛 +结 +5 +4 +谱 +识 +陕 +粽 +婚 +拟 +构 +且 +搜 +任 +潘 +比 +郢 +妨 +醪 +陀 +桔 +碘 +扎 +选 +哈 +骷 +楷 +亿 +明 +缆 +脯 +监 +睫 +逻 +婵 +共 +赴 +淝 +凡 +惦 +及 +达 +揖 +谩 +澹 +减 +焰 +蛹 +番 +祁 +柏 +员 +禄 +怡 +峤 +龙 +白 +叽 +生 +闯 +起 +细 +装 +谕 +竟 +聚 +钙 +上 +导 +渊 +按 +艾 +辘 +挡 +耒 +盹 +饪 +臀 +记 +邮 +蕙 +受 +各 +医 +搂 +普 +滇 +朗 +茸 +带 +翻 +酚 +( +光 +堤 +墟 +蔷 +万 +幻 +〓 +瑙 +辈 +昧 +盏 +亘 +蛀 +吉 +铰 +请 +子 +假 +闻 +税 +井 +诩 +哨 +嫂 +好 +面 +琐 +校 +馊 +鬣 +缂 +营 +访 +炖 +占 +农 +缀 +否 +经 +钚 +棵 +趟 +张 +亟 +吏 +茶 +谨 +捻 +论 +迸 +堂 +玉 +信 +吧 +瞠 +乡 +姬 +寺 +咬 +溏 +苄 +皿 +意 +赉 +宝 +尔 +钰 +艺 +特 +唳 +踉 +都 +荣 +倚 +登 +荐 +丧 +奇 +涵 +批 +炭 +近 +符 +傩 +感 +道 +着 +菊 +虹 +仲 +众 +懈 +濯 +颞 +眺 +南 +释 +北 +缝 +标 +既 +茗 +整 +撼 +迤 +贲 +挎 +耱 +拒 +某 +妍 +卫 +哇 +英 +矶 +藩 +治 +他 +元 +领 +膜 +遮 +穗 +蛾 +飞 +荒 +棺 +劫 +么 +市 +火 +温 +拈 +棚 +洼 +转 +果 +奕 +卸 +迪 +伸 +泳 +斗 +邡 +侄 +涨 +屯 +萋 +胭 +氡 +崮 +枞 +惧 +冒 +彩 +斜 +手 +豚 +随 +旭 +淑 +妞 +形 +菌 +吲 +沱 +争 +驯 +歹 +挟 +兆 +柱 +传 +至 +包 +内 +响 +临 +红 +功 +弩 +衡 +寂 +禁 +老 +棍 +耆 +渍 +织 +害 +氵 +渑 +布 +载 +靥 +嗬 +虽 +苹 +咨 +娄 +库 +雉 +榜 +帜 +嘲 +套 +瑚 +亲 +簸 +欧 +边 +6 +腿 +旮 +抛 +吹 +瞳 +得 +镓 +梗 +厨 +继 +漾 +愣 +憨 +士 +策 +窑 +抑 +躯 +襟 +脏 +参 +贸 +言 +干 +绸 +鳄 +穷 +藜 +音 +折 +详 +) +举 +悍 +甸 +癌 +黎 +谴 +死 +罩 +迁 +寒 +驷 +袖 +媒 +蒋 +掘 +模 +纠 +恣 +观 +祖 +蛆 +碍 +位 +稿 +主 +澧 +跌 +筏 +京 +锏 +帝 +贴 +证 +糠 +才 +黄 +鲸 +略 +炯 +饱 +四 +出 +园 +犀 +牧 +容 +汉 +杆 +浈 +汰 +瑷 +造 +虫 +瘩 +怪 +驴 +济 +应 +花 +沣 +谔 +夙 +旅 +价 +矿 +以 +考 +s +u +呦 +晒 +巡 +茅 +准 +肟 +瓴 +詹 +仟 +褂 +译 +桌 +混 +宁 +怦 +郑 +抿 +些 +余 +鄂 +饴 +攒 +珑 +群 +阖 +岔 +琨 +藓 +预 +环 +洮 +岌 +宀 +杲 +瀵 +最 +常 +囡 +周 +踊 +女 +鼓 +袭 +喉 +简 +范 +薯 +遐 +疏 +粱 +黜 +禧 +法 +箔 +斤 +遥 +汝 +奥 +直 +贞 +撑 +置 +绱 +集 +她 +馅 +逗 +钧 +橱 +魉 +[ +恙 +躁 +唤 +9 +旺 +膘 +待 +脾 +惫 +购 +吗 +依 +盲 +度 +瘿 +蠖 +俾 +之 +镗 +拇 +鲵 +厝 +簧 +续 +款 +展 +啃 +表 +剔 +品 +钻 +腭 +损 +清 +锶 +统 +涌 +寸 +滨 +贪 +链 +吠 +冈 +伎 +迥 +咏 +吁 +览 +防 +迅 +失 +汾 +阔 +逵 +绀 +蔑 +列 +川 +凭 +努 +熨 +揪 +利 +俱 +绉 +抢 +鸨 +我 +即 +责 +膦 +易 +毓 +鹊 +刹 +玷 +岿 +空 +嘞 +绊 +排 +术 +估 +锷 +违 +们 +苟 +铜 +播 +肘 +件 +烫 +审 +鲂 +广 +像 +铌 +惰 +铟 +巳 +胍 +鲍 +康 +憧 +色 +恢 +想 +拷 +尤 +疳 +知 +S +Y +F +D +A +峄 +裕 +帮 +握 +搔 +氐 +氘 +难 +墒 +沮 +雨 +叁 +缥 +悴 +藐 +湫 +娟 +苑 +稠 +颛 +簇 +后 +阕 +闭 +蕤 +缚 +怎 +佞 +码 +嘤 +蔡 +痊 +舱 +螯 +帕 +赫 +昵 +升 +烬 +岫 +、 +疵 +蜻 +髁 +蕨 +隶 +烛 +械 +丑 +盂 +梁 +强 +鲛 +由 +拘 +揉 +劭 +龟 +撤 +钩 +呕 +孛 +费 +妻 +漂 +求 +阑 +崖 +秤 +甘 +通 +深 +补 +赃 +坎 +床 +啪 +承 +吼 +量 +暇 +钼 +烨 +阂 +擎 +脱 +逮 +称 +P +神 +属 +矗 +华 +届 +狍 +葑 +汹 +育 +患 +窒 +蛰 +佼 +静 +槎 +运 +鳗 +庆 +逝 +曼 +疱 +克 +代 +官 +此 +麸 +耧 +蚌 +晟 +例 +础 +榛 +副 +测 +唰 +缢 +迹 +灬 +霁 +身 +岁 +赭 
+扛 +又 +菡 +乜 +雾 +板 +读 +陷 +徉 +贯 +郁 +虑 +变 +钓 +菜 +圾 +现 +琢 +式 +乐 +维 +渔 +浜 +左 +吾 +脑 +钡 +警 +T +啵 +拴 +偌 +漱 +湿 +硕 +止 +骼 +魄 +积 +燥 +联 +踢 +玛 +则 +窿 +见 +振 +畿 +送 +班 +钽 +您 +赵 +刨 +印 +讨 +踝 +籍 +谡 +舌 +崧 +汽 +蔽 +沪 +酥 +绒 +怖 +财 +帖 +肱 +私 +莎 +勋 +羔 +霸 +励 +哼 +帐 +将 +帅 +渠 +纪 +婴 +娩 +岭 +厘 +滕 +吻 +伤 +坝 +冠 +戊 +隆 +瘁 +介 +涧 +物 +黍 +并 +姗 +奢 +蹑 +掣 +垸 +锴 +命 +箍 +捉 +病 +辖 +琰 +眭 +迩 +艘 +绌 +繁 +寅 +若 +毋 +思 +诉 +类 +诈 +燮 +轲 +酮 +狂 +重 +反 +职 +筱 +县 +委 +磕 +绣 +奖 +晋 +濉 +志 +徽 +肠 +呈 +獐 +坻 +口 +片 +碰 +几 +村 +柿 +劳 +料 +获 +亩 +惕 +晕 +厌 +号 +罢 +池 +正 +鏖 +煨 +家 +棕 +复 +尝 +懋 +蜥 +锅 +岛 +扰 +队 +坠 +瘾 +钬 +@ +卧 +疣 +镇 +譬 +冰 +彷 +频 +黯 +据 +垄 +采 +八 +缪 +瘫 +型 +熹 +砰 +楠 +襁 +箐 +但 +嘶 +绳 +啤 +拍 +盥 +穆 +傲 +洗 +盯 +塘 +怔 +筛 +丿 +台 +恒 +喂 +葛 +永 +¥ +烟 +酒 +桦 +书 +砂 +蚝 +缉 +态 +瀚 +袄 +圳 +轻 +蛛 +超 +榧 +遛 +姒 +奘 +铮 +右 +荽 +望 +偻 +卡 +丶 +氰 +附 +做 +革 +索 +戚 +坨 +桷 +唁 +垅 +榻 +岐 +偎 +坛 +莨 +山 +殊 +微 +骇 +陈 +爨 +推 +嗝 +驹 +澡 +藁 +呤 +卤 +嘻 +糅 +逛 +侵 +郓 +酌 +德 +摇 +※ +鬃 +被 +慨 +殡 +羸 +昌 +泡 +戛 +鞋 +河 +宪 +沿 +玲 +鲨 +翅 +哽 +源 +铅 +语 +照 +邯 +址 +荃 +佬 +顺 +鸳 +町 +霭 +睾 +瓢 +夸 +椁 +晓 +酿 +痈 +咔 +侏 +券 +噎 +湍 +签 +嚷 +离 +午 +尚 +社 +锤 +背 +孟 +使 +浪 +缦 +潍 +鞅 +军 +姹 +驶 +笑 +鳟 +鲁 +》 +孽 +钜 +绿 +洱 +礴 +焯 +椰 +颖 +囔 +乌 +孔 +巴 +互 +性 +椽 +哞 +聘 +昨 +早 +暮 +胶 +炀 +隧 +低 +彗 +昝 +铁 +呓 +氽 +藉 +喔 +癖 +瑗 +姨 +权 +胱 +韦 +堑 +蜜 +酋 +楝 +砝 +毁 +靓 +歙 +锲 +究 +屋 +喳 +骨 +辨 +碑 +武 +鸠 +宫 +辜 +烊 +适 +坡 +殃 +培 +佩 +供 +走 +蜈 +迟 +翼 +况 +姣 +凛 +浔 +吃 +飘 +债 +犟 +金 +促 +苛 +崇 +坂 +莳 +畔 +绂 +兵 +蠕 +斋 +根 +砍 +亢 +欢 +恬 +崔 +剁 +餐 +榫 +快 +扶 +‖ +濒 +缠 +鳜 +当 +彭 +驭 +浦 +篮 +昀 +锆 +秸 +钳 +弋 +娣 +瞑 +夷 +龛 +苫 +拱 +致 +% +嵊 +障 +隐 +弑 +初 +娓 +抉 +汩 +累 +蓖 +" +唬 +助 +苓 +昙 +押 +毙 +破 +城 +郧 +逢 +嚏 +獭 +瞻 +溱 +婿 +赊 +跨 +恼 +璧 +萃 +姻 +貉 +灵 +炉 +密 +氛 +陶 +砸 +谬 +衔 +点 +琛 +沛 +枳 +层 +岱 +诺 +脍 +榈 +埂 +征 +冷 +裁 +打 +蹴 +素 +瘘 +逞 +蛐 +聊 +激 +腱 +萘 +踵 +飒 +蓟 +吆 +取 +咙 +簋 +涓 +矩 +曝 +挺 +揣 +座 +你 +史 +舵 +焱 +尘 +苏 +笈 +脚 +溉 +榨 +诵 +樊 +邓 +焊 +义 +庶 +儋 +蟋 +蒲 +赦 +呷 +杞 +诠 +豪 +还 +试 +颓 +茉 +太 +除 +紫 +逃 +痴 +草 +充 +鳕 +珉 +祗 +墨 +渭 +烩 +蘸 +慕 +璇 +镶 +穴 +嵘 +恶 +骂 +险 +绋 +幕 +碉 +肺 +戳 +刘 +潞 +秣 +纾 +潜 +銮 +洛 +须 +罘 +销 +瘪 +汞 +兮 +屉 +r +林 +厕 +质 +探 +划 +狸 +殚 +善 +煊 +烹 +〒 +锈 +逯 +宸 +辍 +泱 +柚 +袍 +远 +蹋 +嶙 +绝 +峥 +娥 +缍 +雀 +徵 +认 +镱 +谷 += +贩 +勉 +撩 +鄯 +斐 +洋 +非 +祚 +泾 +诒 +饿 +撬 +威 +晷 +搭 +芍 +锥 +笺 +蓦 +候 +琊 +档 +礁 +沼 +卵 +荠 +忑 +朝 +凹 +瑞 +头 +仪 +弧 +孵 +畏 +铆 +突 +衲 +车 +浩 +气 +茂 +悖 +厢 +枕 +酝 +戴 +湾 +邹 +飚 +攘 +锂 +写 +宵 +翁 +岷 +无 +喜 +丈 +挑 +嗟 +绛 +殉 +议 +槽 +具 +醇 +淞 +笃 +郴 +阅 +饼 +底 +壕 +砚 +弈 +询 +缕 +庹 +翟 +零 +筷 +暨 +舟 +闺 +甯 +撞 +麂 +茌 +蔼 +很 +珲 +捕 +棠 +角 +阉 +媛 +娲 +诽 +剿 +尉 +爵 +睬 +韩 +诰 +匣 +危 +糍 +镯 +立 +浏 +阳 +少 +盆 +舔 +擘 +匪 +申 +尬 +铣 +旯 +抖 +赘 +瓯 +居 +ˇ +哮 +游 +锭 +茏 +歌 +坏 +甚 +秒 +舞 +沙 +仗 +劲 +潺 +阿 +燧 +郭 +嗖 +霏 +忠 +材 +奂 +耐 +跺 +砀 +输 +岖 +媳 +氟 +极 +摆 +灿 +今 +扔 +腻 +枝 +奎 +药 +熄 +吨 +话 +q +额 +慑 +嘌 +协 +喀 +壳 +埭 +视 +著 +於 +愧 +陲 +翌 +峁 +颅 +佛 +腹 +聋 +侯 +咎 +叟 +秀 +颇 +存 +较 +罪 +哄 +岗 +扫 +栏 +钾 +羌 +己 +璨 +枭 +霉 +煌 +涸 +衿 +键 +镝 +益 +岢 +奏 +连 +夯 +睿 +冥 +均 +糖 +狞 +蹊 +稻 +爸 +刿 +胥 +煜 +丽 +肿 +璃 +掸 +跚 +灾 +垂 +樾 +濑 +乎 +莲 +窄 +犹 +撮 +战 +馄 +软 +络 +显 +鸢 +胸 +宾 +妲 +恕 +埔 +蝌 +份 +遇 +巧 +瞟 +粒 +恰 +剥 +桡 +博 +讯 +凯 +堇 +阶 +滤 +卖 +斌 +骚 +彬 +兑 +磺 +樱 +舷 +两 +娱 +福 +仃 +差 +找 +桁 +÷ +净 +把 +阴 +污 +戬 +雷 +碓 +蕲 +楚 +罡 +焖 +抽 +妫 +咒 +仑 +闱 +尽 +邑 +菁 +爱 +贷 +沥 +鞑 +牡 +嗉 +崴 +骤 +塌 +嗦 +订 +拮 +滓 +捡 +锻 +次 +坪 +杩 +臃 +箬 +融 +珂 +鹗 +宗 +枚 +降 +鸬 +妯 +阄 +堰 +盐 +毅 +必 +杨 +崃 +俺 +甬 +状 +莘 +货 +耸 +菱 +腼 +铸 +唏 +痤 +孚 +澳 +懒 +溅 +翘 +疙 +杷 +淼 +缙 +骰 +喊 +悉 +砻 +坷 +艇 +赁 +界 +谤 +纣 +宴 +晃 +茹 +归 +饭 +梢 +铡 +街 +抄 +肼 +鬟 +苯 +颂 +撷 +戈 +炒 +咆 +茭 +瘙 +负 +仰 +客 +琉 +铢 +封 +卑 +珥 +椿 +镧 +窨 +鬲 +寿 +御 +袤 +铃 +萎 +砖 +餮 +脒 +裳 +肪 +孕 +嫣 +馗 +嵇 +恳 +氯 +江 +石 +褶 +冢 +祸 +阻 +狈 +羞 +银 +靳 +透 +咳 +叼 +敷 +芷 +啥 +它 +瓤 +兰 +痘 +懊 +逑 +肌 +往 +捺 +坊 +甩 +呻 +〃 +沦 +忘 +膻 +祟 +菅 +剧 +崆 +智 +坯 +臧 +霍 +墅 +攻 +眯 +倘 +拢 +骠 +铐 +庭 +岙 +瓠 +′ +缺 +泥 +迢 +捶 +? +? 
+郏 +喙 +掷 +沌 +纯 +秘 +种 +听 +绘 +固 +螨 +团 +香 +盗 +妒 +埚 +蓝 +拖 +旱 +荞 +铀 +血 +遏 +汲 +辰 +叩 +拽 +幅 +硬 +惶 +桀 +漠 +措 +泼 +唑 +齐 +肾 +念 +酱 +虚 +屁 +耶 +旗 +砦 +闵 +婉 +馆 +拭 +绅 +韧 +忏 +窝 +醋 +葺 +顾 +辞 +倜 +堆 +辋 +逆 +玟 +贱 +疾 +董 +惘 +倌 +锕 +淘 +嘀 +莽 +俭 +笏 +绑 +鲷 +杈 +择 +蟀 +粥 +嗯 +驰 +逾 +案 +谪 +褓 +胫 +哩 +昕 +颚 +鲢 +绠 +躺 +鹄 +崂 +儒 +俨 +丝 +尕 +泌 +啊 +萸 +彰 +幺 +吟 +骄 +苣 +弦 +脊 +瑰 +〈 +诛 +镁 +析 +闪 +剪 +侧 +哟 +框 +螃 +守 +嬗 +燕 +狭 +铈 +缮 +概 +迳 +痧 +鲲 +俯 +售 +笼 +痣 +扉 +挖 +满 +咋 +援 +邱 +扇 +歪 +便 +玑 +绦 +峡 +蛇 +叨 +〖 +泽 +胃 +斓 +喋 +怂 +坟 +猪 +该 +蚬 +炕 +弥 +赞 +棣 +晔 +娠 +挲 +狡 +创 +疖 +铕 +镭 +稷 +挫 +弭 +啾 +翔 +粉 +履 +苘 +哦 +楼 +秕 +铂 +土 +锣 +瘟 +挣 +栉 +习 +享 +桢 +袅 +磨 +桂 +谦 +延 +坚 +蔚 +噗 +署 +谟 +猬 +钎 +恐 +嬉 +雒 +倦 +衅 +亏 +璩 +睹 +刻 +殿 +王 +算 +雕 +麻 +丘 +柯 +骆 +丸 +塍 +谚 +添 +鲈 +垓 +桎 +蚯 +芥 +予 +飕 +镦 +谌 +窗 +醚 +菀 +亮 +搪 +莺 +蒿 +羁 +足 +J +真 +轶 +悬 +衷 +靛 +翊 +掩 +哒 +炅 +掐 +冼 +妮 +l +谐 +稚 +荆 +擒 +犯 +陵 +虏 +浓 +崽 +刍 +陌 +傻 +孜 +千 +靖 +演 +矜 +钕 +煽 +杰 +酗 +渗 +伞 +栋 +俗 +泫 +戍 +罕 +沾 +疽 +灏 +煦 +芬 +磴 +叱 +阱 +榉 +湃 +蜀 +叉 +醒 +彪 +租 +郡 +篷 +屎 +良 +垢 +隗 +弱 +陨 +峪 +砷 +掴 +颁 +胎 +雯 +绵 +贬 +沐 +撵 +隘 +篙 +暖 +曹 +陡 +栓 +填 +臼 +彦 +瓶 +琪 +潼 +哪 +鸡 +摩 +啦 +俟 +锋 +域 +耻 +蔫 +疯 +纹 +撇 +毒 +绶 +痛 +酯 +忍 +爪 +赳 +歆 +嘹 +辕 +烈 +册 +朴 +钱 +吮 +毯 +癜 +娃 +谀 +邵 +厮 +炽 +璞 +邃 +丐 +追 +词 +瓒 +忆 +轧 +芫 +谯 +喷 +弟 +半 +冕 +裙 +掖 +墉 +绮 +寝 +苔 +势 +顷 +褥 +切 +衮 +君 +佳 +嫒 +蚩 +霞 +佚 +洙 +逊 +镖 +暹 +唛 +& +殒 +顶 +碗 +獗 +轭 +铺 +蛊 +废 +恹 +汨 +崩 +珍 +那 +杵 +曲 +纺 +夏 +薰 +傀 +闳 +淬 +姘 +舀 +拧 +卷 +楂 +恍 +讪 +厩 +寮 +篪 +赓 +乘 +灭 +盅 +鞣 +沟 +慎 +挂 +饺 +鼾 +杳 +树 +缨 +丛 +絮 +娌 +臻 +嗳 +篡 +侩 +述 +衰 +矛 +圈 +蚜 +匕 +筹 +匿 +濞 +晨 +叶 +骋 +郝 +挚 +蚴 +滞 +增 +侍 +描 +瓣 +吖 +嫦 +蟒 +匾 +圣 +赌 +毡 +癞 +恺 +百 +曳 +需 +篓 +肮 +庖 +帏 +卿 +驿 +遗 +蹬 +鬓 +骡 +歉 +芎 +胳 +屐 +禽 +烦 +晌 +寄 +媾 +狄 +翡 +苒 +船 +廉 +终 +痞 +殇 +々 +畦 +饶 +改 +拆 +悻 +萄 +£ +瓿 +乃 +訾 +桅 +匮 +溧 +拥 +纱 +铍 +骗 +蕃 +龋 +缬 +父 +佐 +疚 +栎 +醍 +掳 +蓄 +x +惆 +颜 +鲆 +榆 +〔 +猎 +敌 +暴 +谥 +鲫 +贾 +罗 +玻 +缄 +扦 +芪 +癣 +落 +徒 +臾 +恿 +猩 +托 +邴 +肄 +牵 +春 +陛 +耀 +刊 +拓 +蓓 +邳 +堕 +寇 +枉 +淌 +啡 +湄 +兽 +酷 +萼 +碚 +濠 +萤 +夹 +旬 +戮 +梭 +琥 +椭 +昔 +勺 +蜊 +绐 +晚 +孺 +僵 +宣 +摄 +冽 +旨 +萌 +忙 +蚤 +眉 +噼 +蟑 +付 +契 +瓜 +悼 +颡 +壁 +曾 +窕 +颢 +澎 +仿 +俑 +浑 +嵌 +浣 +乍 +碌 +褪 +乱 +蔟 +隙 +玩 +剐 +葫 +箫 +纲 +围 +伐 +决 +伙 +漩 +瑟 +刑 +肓 +镳 +缓 +蹭 +氨 +皓 +典 +畲 +坍 +铑 +檐 +塑 +洞 +倬 +储 +胴 +淳 +戾 +吐 +灼 +惺 +妙 +毕 +珐 +缈 +虱 +盖 +羰 +鸿 +磅 +谓 +髅 +娴 +苴 +唷 +蚣 +霹 +抨 +贤 +唠 +犬 +誓 +逍 +庠 +逼 +麓 +籼 +釉 +呜 +碧 +秧 +氩 +摔 +霄 +穸 +纨 +辟 +妈 +映 +完 +牛 +缴 +嗷 +炊 +恩 +荔 +茆 +掉 +紊 +慌 +莓 +羟 +阙 +萁 +磐 +另 +蕹 +辱 +鳐 +湮 +吡 +吩 +唐 +睦 +垠 +舒 +圜 +冗 +瞿 +溺 +芾 +囱 +匠 +僳 +汐 +菩 +饬 +漓 +黑 +霰 +浸 +濡 +窥 +毂 +蒡 +兢 +驻 +鹉 +芮 +诙 +迫 +雳 +厂 +忐 +臆 +猴 +鸣 +蚪 +栈 +箕 +羡 +渐 +莆 +捍 +眈 +哓 +趴 +蹼 +埕 +嚣 +骛 +宏 +淄 +斑 +噜 +严 +瑛 +垃 +椎 +诱 +压 +庾 +绞 +焘 +廿 +抡 +迄 +棘 +夫 +纬 +锹 +眨 +瞌 +侠 +脐 +竞 +瀑 +孳 +骧 +遁 +姜 +颦 +荪 +滚 +萦 +伪 +逸 +粳 +爬 +锁 +矣 +役 +趣 +洒 +颔 +诏 +逐 +奸 +甭 +惠 +攀 +蹄 +泛 +尼 +拼 +阮 +鹰 +亚 +颈 +惑 +勒 +〉 +际 +肛 +爷 +刚 +钨 +丰 +养 +冶 +鲽 +辉 +蔻 +画 +覆 +皴 +妊 +麦 +返 +醉 +皂 +擀 +〗 +酶 +凑 +粹 +悟 +诀 +硖 +港 +卜 +z +杀 +涕 +± +舍 +铠 +抵 +弛 +段 +敝 +镐 +奠 +拂 +轴 +跛 +袱 +e +t +沉 +菇 +俎 +薪 +峦 +秭 +蟹 +历 +盟 +菠 +寡 +液 +肢 +喻 +染 +裱 +悱 +抱 +氙 +赤 +捅 +猛 +跑 +氮 +谣 +仁 +尺 +辊 +窍 +烙 +衍 +架 +擦 +倏 +璐 +瑁 +币 +楞 +胖 +夔 +趸 +邛 +惴 +饕 +虔 +蝎 +§ +哉 +贝 +宽 +辫 +炮 +扩 +饲 +籽 +魏 +菟 +锰 +伍 +猝 +末 +琳 +哚 +蛎 +邂 +呀 +姿 +鄞 +却 +歧 +仙 +恸 +椐 +森 +牒 +寤 +袒 +婆 +虢 +雅 +钉 +朵 +贼 +欲 +苞 +寰 +故 +龚 +坭 +嘘 +咫 +礼 +硷 +兀 +睢 +汶 +’ +铲 +烧 +绕 +诃 +浃 +钿 +哺 +柜 +讼 +颊 +璁 +腔 +洽 +咐 +脲 +簌 +筠 +镣 +玮 +鞠 +谁 +兼 +姆 +挥 +梯 +蝴 +谘 +漕 +刷 +躏 +宦 +弼 +b +垌 +劈 +麟 +莉 +揭 +笙 +渎 +仕 +嗤 +仓 +配 +怏 +抬 +错 +泯 +镊 +孰 +猿 +邪 +仍 +秋 +鼬 +壹 +歇 +吵 +炼 +< +尧 +射 +柬 +廷 +胧 +霾 +凳 +隋 +肚 +浮 +梦 +祥 +株 +堵 +退 +L +鹫 +跎 +凶 +毽 +荟 +炫 +栩 +玳 +甜 +沂 +鹿 +顽 +伯 +爹 +赔 +蛴 +徐 +匡 +欣 +狰 +缸 +雹 +蟆 +疤 +默 +沤 +啜 +痂 +衣 +禅 +w +i +h +辽 +葳 +黝 +钗 +停 +沽 +棒 +馨 +颌 +肉 +吴 +硫 +悯 +劾 +娈 +马 +啧 +吊 +悌 +镑 +峭 +帆 +瀣 +涉 +咸 +疸 +滋 +泣 +翦 +拙 +癸 +钥 +蜒 ++ +尾 +庄 +凝 +泉 +婢 +渴 +谊 +乞 +陆 +锉 +糊 +鸦 +淮 +I +B +N +晦 +弗 +乔 +庥 +葡 +尻 +席 +橡 +傣 +渣 +拿 +惩 +麋 +斛 +缃 +矮 +蛏 +岘 +鸽 +姐 +膏 +催 +奔 +镒 +喱 +蠡 +摧 +钯 +胤 +柠 +拐 +璋 +鸥 +卢 +荡 +倾 +^ +_ +珀 +逄 +萧 +塾 +掇 +贮 +笆 +聂 +圃 +冲 +嵬 +M +滔 +笕 +值 
+炙 +偶 +蜱 +搐 +梆 +汪 +蔬 +腑 +鸯 +蹇 +敞 +绯 +仨 +祯 +谆 +梧 +糗 +鑫 +啸 +豺 +囹 +猾 +巢 +柄 +瀛 +筑 +踌 +沭 +暗 +苁 +鱿 +蹉 +脂 +蘖 +牢 +热 +木 +吸 +溃 +宠 +序 +泞 +偿 +拜 +檩 +厚 +朐 +毗 +螳 +吞 +媚 +朽 +担 +蝗 +橘 +畴 +祈 +糟 +盱 +隼 +郜 +惜 +珠 +裨 +铵 +焙 +琚 +唯 +咚 +噪 +骊 +丫 +滢 +勤 +棉 +呸 +咣 +淀 +隔 +蕾 +窈 +饨 +挨 +煅 +短 +匙 +粕 +镜 +赣 +撕 +墩 +酬 +馁 +豌 +颐 +抗 +酣 +氓 +佑 +搁 +哭 +递 +耷 +涡 +桃 +贻 +碣 +截 +瘦 +昭 +镌 +蔓 +氚 +甲 +猕 +蕴 +蓬 +散 +拾 +纛 +狼 +猷 +铎 +埋 +旖 +矾 +讳 +囊 +糜 +迈 +粟 +蚂 +紧 +鲳 +瘢 +栽 +稼 +羊 +锄 +斟 +睁 +桥 +瓮 +蹙 +祉 +醺 +鼻 +昱 +剃 +跳 +篱 +跷 +蒜 +翎 +宅 +晖 +嗑 +壑 +峻 +癫 +屏 +狠 +陋 +袜 +途 +憎 +祀 +莹 +滟 +佶 +溥 +臣 +约 +盛 +峰 +磁 +慵 +婪 +拦 +莅 +朕 +鹦 +粲 +裤 +哎 +疡 +嫖 +琵 +窟 +堪 +谛 +嘉 +儡 +鳝 +斩 +郾 +驸 +酊 +妄 +胜 +贺 +徙 +傅 +噌 +钢 +栅 +庇 +恋 +匝 +巯 +邈 +尸 +锚 +粗 +佟 +蛟 +薹 +纵 +蚊 +郅 +绢 +锐 +苗 +俞 +篆 +淆 +膀 +鲜 +煎 +诶 +秽 +寻 +涮 +刺 +怀 +噶 +巨 +褰 +魅 +灶 +灌 +桉 +藕 +谜 +舸 +薄 +搀 +恽 +借 +牯 +痉 +渥 +愿 +亓 +耘 +杠 +柩 +锔 +蚶 +钣 +珈 +喘 +蹒 +幽 +赐 +稗 +晤 +莱 +泔 +扯 +肯 +菪 +裆 +腩 +豉 +疆 +骜 +腐 +倭 +珏 +唔 +粮 +亡 +润 +慰 +伽 +橄 +玄 +誉 +醐 +胆 +龊 +粼 +塬 +陇 +彼 +削 +嗣 +绾 +芽 +妗 +垭 +瘴 +爽 +薏 +寨 +龈 +泠 +弹 +赢 +漪 +猫 +嘧 +涂 +恤 +圭 +茧 +烽 +屑 +痕 +巾 +赖 +荸 +凰 +腮 +畈 +亵 +蹲 +偃 +苇 +澜 +艮 +换 +骺 +烘 +苕 +梓 +颉 +肇 +哗 +悄 +氤 +涠 +葬 +屠 +鹭 +植 +竺 +佯 +诣 +鲇 +瘀 +鲅 +邦 +移 +滁 +冯 +耕 +癔 +戌 +茬 +沁 +巩 +悠 +湘 +洪 +痹 +锟 +循 +谋 +腕 +鳃 +钠 +捞 +焉 +迎 +碱 +伫 +急 +榷 +奈 +邝 +卯 +辄 +皲 +卟 +醛 +畹 +忧 +稳 +雄 +昼 +缩 +阈 +睑 +扌 +耗 +曦 +涅 +捏 +瞧 +邕 +淖 +漉 +铝 +耦 +禹 +湛 +喽 +莼 +琅 +诸 +苎 +纂 +硅 +始 +嗨 +傥 +燃 +臂 +赅 +嘈 +呆 +贵 +屹 +壮 +肋 +亍 +蚀 +卅 +豹 +腆 +邬 +迭 +浊 +} +童 +螂 +捐 +圩 +勐 +触 +寞 +汊 +壤 +荫 +膺 +渌 +芳 +懿 +遴 +螈 +泰 +蓼 +蛤 +茜 +舅 +枫 +朔 +膝 +眙 +避 +梅 +判 +鹜 +璜 +牍 +缅 +垫 +藻 +黔 +侥 +惚 +懂 +踩 +腰 +腈 +札 +丞 +唾 +慈 +顿 +摹 +荻 +琬 +~ +斧 +沈 +滂 +胁 +胀 +幄 +莜 +Z +匀 +鄄 +掌 +绰 +茎 +焚 +赋 +萱 +谑 +汁 +铒 +瞎 +夺 +蜗 +野 +娆 +冀 +弯 +篁 +懵 +灞 +隽 +芡 +脘 +俐 +辩 +芯 +掺 +喏 +膈 +蝈 +觐 +悚 +踹 +蔗 +熠 +鼠 +呵 +抓 +橼 +峨 +畜 +缔 +禾 +崭 +弃 +熊 +摒 +凸 +拗 +穹 +蒙 +抒 +祛 +劝 +闫 +扳 +阵 +醌 +踪 +喵 +侣 +搬 +仅 +荧 +赎 +蝾 +琦 +买 +婧 +瞄 +寓 +皎 +冻 +赝 +箩 +莫 +瞰 +郊 +笫 +姝 +筒 +枪 +遣 +煸 +袋 +舆 +痱 +涛 +母 +〇 +启 +践 +耙 +绲 +盘 +遂 +昊 +搞 +槿 +诬 +纰 +泓 +惨 +檬 +亻 +越 +C +o +憩 +熵 +祷 +钒 +暧 +塔 +阗 +胰 +咄 +娶 +魔 +琶 +钞 +邻 +扬 +杉 +殴 +咽 +弓 +〆 +髻 +】 +吭 +揽 +霆 +拄 +殖 +脆 +彻 +岩 +芝 +勃 +辣 +剌 +钝 +嘎 +甄 +佘 +皖 +伦 +授 +徕 +憔 +挪 +皇 +庞 +稔 +芜 +踏 +溴 +兖 +卒 +擢 +饥 +鳞 +煲 +‰ +账 +颗 +叻 +斯 +捧 +鳍 +琮 +讹 +蛙 +纽 +谭 +酸 +兔 +莒 +睇 +伟 +觑 +羲 +嗜 +宜 +褐 +旎 +辛 +卦 +诘 +筋 +鎏 +溪 +挛 +熔 +阜 +晰 +鳅 +丢 +奚 +灸 +呱 +献 +陉 +黛 +鸪 +甾 +萨 +疮 +拯 +洲 +疹 +辑 +叙 +恻 +谒 +允 +柔 +烂 +氏 +逅 +漆 +拎 +惋 +扈 +湟 +纭 +啕 +掬 +擞 +哥 +忽 +涤 +鸵 +靡 +郗 +瓷 +扁 +廊 +怨 +雏 +钮 +敦 +E +懦 +憋 +汀 +拚 +啉 +腌 +岸 +f +痼 +瞅 +尊 +咀 +眩 +飙 +忌 +仝 +迦 +熬 +毫 +胯 +篑 +茄 +腺 +凄 +舛 +碴 +锵 +诧 +羯 +後 +漏 +汤 +宓 +仞 +蚁 +壶 +谰 +皑 +铄 +棰 +罔 +辅 +晶 +苦 +牟 +闽 +\ +烃 +饮 +聿 +丙 +蛳 +朱 +煤 +涔 +鳖 +犁 +罐 +荼 +砒 +淦 +妤 +黏 +戎 +孑 +婕 +瑾 +戢 +钵 +枣 +捋 +砥 +衩 +狙 +桠 +稣 +阎 +肃 +梏 +诫 +孪 +昶 +婊 +衫 +嗔 +侃 +塞 +蜃 +樵 +峒 +貌 +屿 +欺 +缫 +阐 +栖 +诟 +珞 +荭 +吝 +萍 +嗽 +恂 +啻 +蜴 +磬 +峋 +俸 +豫 +谎 +徊 +镍 +韬 +魇 +晴 +U +囟 +猜 +蛮 +坐 +囿 +伴 +亭 +肝 +佗 +蝠 +妃 +胞 +滩 +榴 +氖 +垩 +苋 +砣 +扪 +馏 +姓 +轩 +厉 +夥 +侈 +禀 +垒 +岑 +赏 +钛 +辐 +痔 +披 +纸 +碳 +“ +坞 +蠓 +挤 +荥 +沅 +悔 +铧 +帼 +蒌 +蝇 +a +p +y +n +g +哀 +浆 +瑶 +凿 +桶 +馈 +皮 +奴 +苜 +佤 +伶 +晗 +铱 +炬 +优 +弊 +氢 +恃 +甫 +攥 +端 +锌 +灰 +稹 +炝 +曙 +邋 +亥 +眶 +碾 +拉 +萝 +绔 +捷 +浍 +腋 +姑 +菖 +凌 +涞 +麽 +锢 +桨 +潢 +绎 +镰 +殆 +锑 +渝 +铬 +困 +绽 +觎 +匈 +糙 +暑 +裹 +鸟 +盔 +肽 +迷 +綦 +『 +亳 +佝 +俘 +钴 +觇 +骥 +仆 +疝 +跪 +婶 +郯 +瀹 +唉 +脖 +踞 +针 +晾 +忒 +扼 +瞩 +叛 +椒 +疟 +嗡 +邗 +肆 +跆 +玫 +忡 +捣 +咧 +唆 +艄 +蘑 +潦 +笛 +阚 +沸 +泻 +掊 +菽 +贫 +斥 +髂 +孢 +镂 +赂 +麝 +鸾 +屡 +衬 +苷 +恪 +叠 +希 +粤 +爻 +喝 +茫 +惬 +郸 +绻 +庸 +撅 +碟 +宄 +妹 +膛 +叮 +饵 +崛 +嗲 +椅 +冤 +搅 +咕 +敛 +尹 +垦 +闷 +蝉 +霎 +勰 +败 +蓑 +泸 +肤 +鹌 +幌 +焦 +浠 +鞍 +刁 +舰 +乙 +竿 +裔 +。 +茵 +函 +伊 +兄 +丨 +娜 +匍 +謇 +莪 +宥 +似 +蝽 +翳 +酪 +翠 +粑 +薇 +祢 +骏 +赠 +叫 +Q +噤 +噻 +竖 +芗 +莠 +潭 +俊 +羿 +耜 +O +郫 +趁 +嗪 +囚 +蹶 +芒 +洁 +笋 +鹑 +敲 +硝 +啶 +堡 +渲 +揩 +』 +携 +宿 +遒 +颍 +扭 +棱 +割 +萜 +蔸 +葵 +琴 +捂 +饰 +衙 +耿 +掠 +募 +岂 +窖 +涟 +蔺 +瘤 +柞 +瞪 +怜 +匹 +距 +楔 +炜 +哆 +秦 +缎 +幼 +茁 +绪 +痨 +恨 +楸 +娅 +瓦 +桩 +雪 +嬴 +伏 +榔 +妥 +铿 +拌 +眠 +雍 +缇 +‘ +卓 +搓 +哌 +觞 +噩 +屈 +哧 +髓 +咦 +巅 +娑 +侑 +淫 +膳 +祝 +勾 +姊 +莴 
+胄 +疃 +薛 +蜷 +胛 +巷 +芙 +芋 +熙 +闰 +勿 +窃 +狱 +剩 +钏 +幢 +陟 +铛 +慧 +靴 +耍 +k +浙 +浇 +飨 +惟 +绗 +祜 +澈 +啼 +咪 +磷 +摞 +诅 +郦 +抹 +跃 +壬 +吕 +肖 +琏 +颤 +尴 +剡 +抠 +凋 +赚 +泊 +津 +宕 +殷 +倔 +氲 +漫 +邺 +涎 +怠 +$ +垮 +荬 +遵 +俏 +叹 +噢 +饽 +蜘 +孙 +筵 +疼 +鞭 +羧 +牦 +箭 +潴 +c +眸 +祭 +髯 +啖 +坳 +愁 +芩 +驮 +倡 +巽 +穰 +沃 +胚 +怒 +凤 +槛 +剂 +趵 +嫁 +v +邢 +灯 +鄢 +桐 +睽 +檗 +锯 +槟 +婷 +嵋 +圻 +诗 +蕈 +颠 +遭 +痢 +芸 +怯 +馥 +竭 +锗 +徜 +恭 +遍 +籁 +剑 +嘱 +苡 +龄 +僧 +桑 +潸 +弘 +澶 +楹 +悲 +讫 +愤 +腥 +悸 +谍 +椹 +呢 +桓 +葭 +攫 +阀 +翰 +躲 +敖 +柑 +郎 +笨 +橇 +呃 +魁 +燎 +脓 +葩 +磋 +垛 +玺 +狮 +沓 +砜 +蕊 +锺 +罹 +蕉 +翱 +虐 +闾 +巫 +旦 +茱 +嬷 +枯 +鹏 +贡 +芹 +汛 +矫 +绁 +拣 +禺 +佃 +讣 +舫 +惯 +乳 +趋 +疲 +挽 +岚 +虾 +衾 +蠹 +蹂 +飓 +氦 +铖 +孩 +稞 +瑜 +壅 +掀 +勘 +妓 +畅 +髋 +W +庐 +牲 +蓿 +榕 +练 +垣 +唱 +邸 +菲 +昆 +婺 +穿 +绡 +麒 +蚱 +掂 +愚 +泷 +涪 +漳 +妩 +娉 +榄 +讷 +觅 +旧 +藤 +煮 +呛 +柳 +腓 +叭 +庵 +烷 +阡 +罂 +蜕 +擂 +猖 +咿 +媲 +脉 +【 +沏 +貅 +黠 +熏 +哲 +烁 +坦 +酵 +兜 +× +潇 +撒 +剽 +珩 +圹 +乾 +摸 +樟 +帽 +嗒 +襄 +魂 +轿 +憬 +锡 +〕 +喃 +皆 +咖 +隅 +脸 +残 +泮 +袂 +鹂 +珊 +囤 +捆 +咤 +误 +徨 +闹 +淙 +芊 +淋 +怆 +囗 +拨 +梳 +渤 +R +G +绨 +蚓 +婀 +幡 +狩 +麾 +谢 +唢 +裸 +旌 +伉 +纶 +裂 +驳 +砼 +咛 +澄 +樨 +蹈 +宙 +澍 +倍 +貔 +操 +勇 +蟠 +摈 +砧 +虬 +够 +缁 +悦 +藿 +撸 +艹 +摁 +淹 +豇 +虎 +榭 +ˉ +吱 +d +° +喧 +荀 +踱 +侮 +奋 +偕 +饷 +犍 +惮 +坑 +璎 +徘 +宛 +妆 +袈 +倩 +窦 +昂 +荏 +乖 +K +怅 +撰 +鳙 +牙 +袁 +酞 +X +痿 +琼 +闸 +雁 +趾 +荚 +虻 +涝 +《 +杏 +韭 +偈 +烤 +绫 +鞘 +卉 +症 +遢 +蓥 +诋 +杭 +荨 +匆 +竣 +簪 +辙 +敕 +虞 +丹 +缭 +咩 +黟 +m +淤 +瑕 +咂 +铉 +硼 +茨 +嶂 +痒 +畸 +敬 +涿 +粪 +窘 +熟 +叔 +嫔 +盾 +忱 +裘 +憾 +梵 +赡 +珙 +咯 +娘 +庙 +溯 +胺 +葱 +痪 +摊 +荷 +卞 +乒 +髦 +寐 +铭 +坩 +胗 +枷 +爆 +溟 +嚼 +羚 +砬 +轨 +惊 +挠 +罄 +竽 +菏 +氧 +浅 +楣 +盼 +枢 +炸 +阆 +杯 +谏 +噬 +淇 +渺 +俪 +秆 +墓 +泪 +跻 +砌 +痰 +垡 +渡 +耽 +釜 +讶 +鳎 +煞 +呗 +韶 +舶 +绷 +鹳 +缜 +旷 +铊 +皱 +龌 +檀 +霖 +奄 +槐 +艳 +蝶 +旋 +哝 +赶 +骞 +蚧 +腊 +盈 +丁 +` +蜚 +矸 +蝙 +睨 +嚓 +僻 +鬼 +醴 +夜 +彝 +磊 +笔 +拔 +栀 +糕 +厦 +邰 +纫 +逭 +纤 +眦 +膊 +馍 +躇 +烯 +蘼 +冬 +诤 +暄 +骶 +哑 +瘠 +」 +臊 +丕 +愈 +咱 +螺 +擅 +跋 +搏 +硪 +谄 +笠 +淡 +嘿 +骅 +谧 +鼎 +皋 +姚 +歼 +蠢 +驼 +耳 +胬 +挝 +涯 +狗 +蒽 +孓 +犷 +凉 +芦 +箴 +铤 +孤 +嘛 +坤 +V +茴 +朦 +挞 +尖 +橙 +诞 +搴 +碇 +洵 +浚 +帚 +蜍 +漯 +柘 +嚎 +讽 +芭 +荤 +咻 +祠 +秉 +跖 +埃 +吓 +糯 +眷 +馒 +惹 +娼 +鲑 +嫩 +讴 +轮 +瞥 +靶 +褚 +乏 +缤 +宋 +帧 +删 +驱 +碎 +扑 +俩 +俄 +偏 +涣 +竹 +噱 +皙 +佰 +渚 +唧 +斡 +# +镉 +刀 +崎 +筐 +佣 +夭 +贰 +肴 +峙 +哔 +艿 +匐 +牺 +镛 +缘 +仡 +嫡 +劣 +枸 +堀 +梨 +簿 +鸭 +蒸 +亦 +稽 +浴 +{ +衢 +束 +槲 +j +阁 +揍 +疥 +棋 +潋 +聪 +窜 +乓 +睛 +插 +冉 +阪 +苍 +搽 +「 +蟾 +螟 +幸 +仇 +樽 +撂 +慢 +跤 +幔 +俚 +淅 +覃 +觊 +溶 +妖 +帛 +侨 +曰 +妾 +泗 +· +: +瀘 +風 +Ë +( +) +∶ +紅 +紗 +瑭 +雲 +頭 +鶏 +財 +許 +• +¥ +樂 +焗 +麗 +— +; +滙 +東 +榮 +繪 +興 +… +門 +業 +π +楊 +國 +顧 +é +盤 +寳 +Λ +龍 +鳳 +島 +誌 +緣 +結 +銭 +萬 +勝 +祎 +璟 +優 +歡 +臨 +時 +購 += +★ +藍 +昇 +鐵 +觀 +勅 +農 +聲 +畫 +兿 +術 +發 +劉 +記 +專 +耑 +園 +書 +壴 +種 +Ο +● +褀 +號 +銀 +匯 +敟 +锘 +葉 +橪 +廣 +進 +蒄 +鑽 +阝 +祙 +貢 +鍋 +豊 +夬 +喆 +團 +閣 +開 +燁 +賓 +館 +酡 +沔 +順 ++ +硚 +劵 +饸 +陽 +車 +湓 +復 +萊 +氣 +軒 +華 +堃 +迮 +纟 +戶 +馬 +學 +裡 +電 +嶽 +獨 +マ +シ +サ +ジ +燘 +袪 +環 +❤ +臺 +灣 +専 +賣 +孖 +聖 +攝 +線 +▪ +α +傢 +俬 +夢 +達 +莊 +喬 +貝 +薩 +劍 +羅 +壓 +棛 +饦 +尃 +璈 +囍 +醫 +G +I +A +# +N +鷄 +髙 +嬰 +啓 +約 +隹 +潔 +賴 +藝 +~ +寶 +籣 +麺 +  +嶺 +√ +義 +網 +峩 +長 +∧ +魚 +機 +構 +② +鳯 +偉 +L +B +㙟 +畵 +鴿 +' +詩 +溝 +嚞 +屌 +藔 +佧 +玥 +蘭 +織 +1 +3 +9 +0 +7 +點 +砭 +鴨 +鋪 +銘 +廳 +弍 +‧ +創 +湯 +坶 +℃ +卩 +骝 +& +烜 +荘 +當 +潤 +扞 +係 +懷 +碶 +钅 +蚨 +讠 +☆ +叢 +爲 +埗 +涫 +塗 +→ +楽 +現 +鯨 +愛 +瑪 +鈺 +忄 +悶 +藥 +飾 +樓 +視 +孬 +ㆍ +燚 +苪 +師 +① +丼 +锽 +│ +韓 +標 +è +兒 +閏 +匋 +張 +漢 +Ü +髪 +會 +閑 +檔 +習 +裝 +の +峯 +菘 +輝 +И +雞 +釣 +億 +浐 +K +O +R +8 +H +E +P +T +W +D +S +C +M +F +姌 +饹 +» +晞 +廰 +ä +嵯 +鷹 +負 +飲 +絲 +冚 +楗 +澤 +綫 +區 +❋ +← +質 +靑 +揚 +③ +滬 +統 +産 +協 +﹑ +乸 +畐 +經 +運 +際 +洺 +岽 +為 +粵 +諾 +崋 +豐 +碁 +ɔ +V +2 +6 +齋 +誠 +訂 +´ +勑 +雙 +陳 +無 +í +泩 +媄 +夌 +刂 +i +c +t +o +r +a +嘢 +耄 +燴 +暃 +壽 +媽 +靈 +抻 +體 +唻 +É +冮 +甹 +鎮 +錦 +ʌ +蜛 +蠄 +尓 +駕 +戀 +飬 +逹 +倫 +貴 +極 +Я +Й +寬 +磚 +嶪 +郎 +職 +| +間 +n +d +剎 +伈 +課 +飛 +橋 +瘊 +№ +譜 +骓 +圗 +滘 +縣 +粿 +咅 +養 +濤 +彳 +® +% +Ⅱ +啰 +㴪 +見 +矞 +薬 +糁 +邨 +鲮 +顔 +罱 +З +選 +話 +贏 +氪 +俵 +競 +瑩 +繡 +枱 +β +綉 +á +獅 +爾 +™ +麵 +戋 +淩 +徳 +個 +劇 +場 +務 +簡 +寵 +h +實 +膠 +轱 +圖 +築 +嘣 +樹 +㸃 +營 +耵 +孫 +饃 +鄺 +飯 +麯 +遠 +輸 +坫 +孃 +乚 
+閃 +鏢 +㎡ +題 +廠 +關 +↑ +爺 +將 +軍 +連 +篦 +覌 +參 +箸 +- +窠 +棽 +寕 +夀 +爰 +歐 +呙 +閥 +頡 +熱 +雎 +垟 +裟 +凬 +勁 +帑 +馕 +夆 +疌 +枼 +馮 +貨 +蒤 +樸 +彧 +旸 +靜 +龢 +暢 +㐱 +鳥 +珺 +鏡 +灡 +爭 +堷 +廚 +Ó +騰 +診 +┅ +蘇 +褔 +凱 +頂 +豕 +亞 +帥 +嘬 +⊥ +仺 +桖 +複 +饣 +絡 +穂 +顏 +棟 +納 +▏ +濟 +親 +設 +計 +攵 +埌 +烺 +ò +頤 +燦 +蓮 +撻 +節 +講 +濱 +濃 +娽 +洳 +朿 +燈 +鈴 +護 +膚 +铔 +過 +補 +Z +U +5 +4 +坋 +闿 +䖝 +餘 +缐 +铞 +貿 +铪 +桼 +趙 +鍊 +[ +㐂 +垚 +菓 +揸 +捲 +鐘 +滏 +𣇉 +爍 +輪 +燜 +鴻 +鮮 +動 +鹞 +鷗 +丄 +慶 +鉌 +翥 +飮 +腸 +⇋ +漁 +覺 +來 +熘 +昴 +翏 +鲱 +圧 +鄉 +萭 +頔 +爐 +嫚 +г +貭 +類 +聯 +幛 +輕 +訓 +鑒 +夋 +锨 +芃 +珣 +䝉 +扙 +嵐 +銷 +處 +ㄱ +語 +誘 +苝 +歸 +儀 +燒 +楿 +內 +粢 +葒 +奧 +麥 +礻 +滿 +蠔 +穵 +瞭 +態 +鱬 +榞 +硂 +鄭 +黃 +煙 +祐 +奓 +逺 +* +瑄 +獲 +聞 +薦 +讀 +這 +樣 +決 +問 +啟 +們 +執 +説 +轉 +單 +隨 +唘 +帶 +倉 +庫 +還 +贈 +尙 +皺 +■ +餅 +產 +○ +∈ +報 +狀 +楓 +賠 +琯 +嗮 +禮 +` +傳 +> +≤ +嗞 +Φ +≥ +換 +咭 +∣ +↓ +曬 +ε +応 +寫 +″ +終 +様 +純 +費 +療 +聨 +凍 +壐 +郵 +ü +黒 +∫ +製 +塊 +調 +軽 +確 +撃 +級 +馴 +Ⅲ +涇 +繹 +數 +碼 +證 +狒 +処 +劑 +< +晧 +賀 +衆 +] +櫥 +兩 +陰 +絶 +對 +鯉 +憶 +◎ +p +e +Y +蕒 +煖 +頓 +測 +試 +鼽 +僑 +碩 +妝 +帯 +≈ +鐡 +舖 +權 +喫 +倆 +ˋ +該 +悅 +ā +俫 +. +f +s +b +m +k +g +u +j +貼 +淨 +濕 +針 +適 +備 +l +/ +給 +謢 +強 +觸 +衛 +與 +⊙ +$ +緯 +變 +⑴ +⑵ +⑶ +㎏ +殺 +∩ +幚 +─ +價 +▲ +離 +ú +ó +飄 +烏 +関 +閟 +﹝ +﹞ +邏 +輯 +鍵 +驗 +訣 +導 +歷 +屆 +層 +▼ +儱 +錄 +熳 +ē +艦 +吋 +錶 +辧 +飼 +顯 +④ +禦 +販 +気 +対 +枰 +閩 +紀 +幹 +瞓 +貊 +淚 +△ +眞 +墊 +Ω +獻 +褲 +縫 +緑 +亜 +鉅 +餠 +{ +} +◆ +蘆 +薈 +█ +◇ +溫 +彈 +晳 +粧 +犸 +穩 +訊 +崬 +凖 +熥 +П +舊 +條 +紋 +圍 +Ⅳ +筆 +尷 +難 +雜 +錯 +綁 +識 +頰 +鎖 +艶 +□ +殁 +殼 +⑧ +├ +▕ +鵬 +ǐ +ō +ǒ +糝 +綱 +▎ +μ +盜 +饅 +醬 +籤 +蓋 +釀 +鹽 +據 +à +ɡ +辦 +◥ +彐 +┌ +婦 +獸 +鲩 +伱 +ī +蒟 +蒻 +齊 +袆 +腦 +寧 +凈 +妳 +煥 +詢 +偽 +謹 +啫 +鯽 +騷 +鱸 +損 +傷 +鎻 +髮 +買 +冏 +儥 +両 +﹢ +∞ +載 +喰 +z +羙 +悵 +燙 +曉 +員 +組 +徹 +艷 +痠 +鋼 +鼙 +縮 +細 +嚒 +爯 +≠ +維 +" +鱻 +壇 +厍 +帰 +浥 +犇 +薡 +軎 +² +應 +醜 +刪 +緻 +鶴 +賜 +噁 +軌 +尨 +镔 +鷺 +槗 +彌 +葚 +濛 +請 +溇 +緹 +賢 +訪 +獴 +瑅 +資 +縤 +陣 +蕟 +栢 +韻 +祼 +恁 +伢 +謝 +劃 +涑 +總 +衖 +踺 +砋 +凉 +籃 +駿 +苼 +瘋 +昽 +紡 +驊 +腎 +﹗ +響 +杋 +剛 +嚴 +禪 +歓 +槍 +傘 +檸 +檫 +炣 +勢 +鏜 +鎢 +銑 +尐 +減 +奪 +惡 +θ +僮 +婭 +臘 +ū +ì +殻 +鉄 +∑ +蛲 +焼 +緖 +續 +紹 +懮 \ No newline at end of file diff --git a/inpaint/model/anytext/utils.py b/inpaint/model/anytext/utils.py new file mode 100644 index 0000000..c9f55b8 --- /dev/null +++ b/inpaint/model/anytext/utils.py @@ -0,0 +1,151 @@ +import os +import datetime +import cv2 +import numpy as np +from PIL import Image, ImageDraw + + +def save_images(img_list, folder): + if not os.path.exists(folder): + os.makedirs(folder) + now = datetime.datetime.now() + date_str = now.strftime("%Y-%m-%d") + folder_path = os.path.join(folder, date_str) + if not os.path.exists(folder_path): + os.makedirs(folder_path) + time_str = now.strftime("%H_%M_%S") + for idx, img in enumerate(img_list): + image_number = idx + 1 + filename = f"{time_str}_{image_number}.jpg" + save_path = os.path.join(folder_path, filename) + cv2.imwrite(save_path, img[..., ::-1]) + + +def check_channels(image): + channels = image.shape[2] if len(image.shape) == 3 else 1 + if channels == 1: + image = cv2.cvtColor(image, cv2.COLOR_GRAY2BGR) + elif channels > 3: + image = image[:, :, :3] + return image + + +def resize_image(img, max_length=768): + height, width = img.shape[:2] + max_dimension = max(height, width) + + if max_dimension > max_length: + scale_factor = max_length / max_dimension + new_width = int(round(width * scale_factor)) + new_height = int(round(height * scale_factor)) + new_size = (new_width, new_height) + img = cv2.resize(img, new_size) + height, width = img.shape[:2] + img = cv2.resize(img, (width - (width % 64), height - (height % 64))) + return img + + +def insert_spaces(string, nSpace): + if nSpace == 0: + return string + new_string = "" + for char in string: + new_string += char + " " * nSpace + return new_string[:-nSpace] + + +def 
draw_glyph(font, text): + g_size = 50 + W, H = (512, 80) + new_font = font.font_variant(size=g_size) + img = Image.new(mode="1", size=(W, H), color=0) + draw = ImageDraw.Draw(img) + left, top, right, bottom = new_font.getbbox(text) + text_width = max(right - left, 5) + text_height = max(bottom - top, 5) + ratio = min(W * 0.9 / text_width, H * 0.9 / text_height) + new_font = font.font_variant(size=int(g_size * ratio)) + + text_width, text_height = new_font.getsize(text) + offset_x, offset_y = new_font.getoffset(text) + x = (img.width - text_width) // 2 + y = (img.height - text_height) // 2 - offset_y // 2 + draw.text((x, y), text, font=new_font, fill="white") + img = np.expand_dims(np.array(img), axis=2).astype(np.float64) + return img + + +def draw_glyph2( + font, text, polygon, vertAng=10, scale=1, width=512, height=512, add_space=True +): + enlarge_polygon = polygon * scale + rect = cv2.minAreaRect(enlarge_polygon) + box = cv2.boxPoints(rect) + box = np.int0(box) + w, h = rect[1] + angle = rect[2] + if angle < -45: + angle += 90 + angle = -angle + if w < h: + angle += 90 + + vert = False + if abs(angle) % 90 < vertAng or abs(90 - abs(angle) % 90) % 90 < vertAng: + _w = max(box[:, 0]) - min(box[:, 0]) + _h = max(box[:, 1]) - min(box[:, 1]) + if _h >= _w: + vert = True + angle = 0 + + img = np.zeros((height * scale, width * scale, 3), np.uint8) + img = Image.fromarray(img) + + # infer font size + image4ratio = Image.new("RGB", img.size, "white") + draw = ImageDraw.Draw(image4ratio) + _, _, _tw, _th = draw.textbbox(xy=(0, 0), text=text, font=font) + text_w = min(w, h) * (_tw / _th) + if text_w <= max(w, h): + # add space + if len(text) > 1 and not vert and add_space: + for i in range(1, 100): + text_space = insert_spaces(text, i) + _, _, _tw2, _th2 = draw.textbbox(xy=(0, 0), text=text_space, font=font) + if min(w, h) * (_tw2 / _th2) > max(w, h): + break + text = insert_spaces(text, i - 1) + font_size = min(w, h) * 0.80 + else: + shrink = 0.75 if vert else 0.85 + font_size = min(w, h) / (text_w / max(w, h)) * shrink + new_font = font.font_variant(size=int(font_size)) + + left, top, right, bottom = new_font.getbbox(text) + text_width = right - left + text_height = bottom - top + + layer = Image.new("RGBA", img.size, (0, 0, 0, 0)) + draw = ImageDraw.Draw(layer) + if not vert: + draw.text( + (rect[0][0] - text_width // 2, rect[0][1] - text_height // 2 - top), + text, + font=new_font, + fill=(255, 255, 255, 255), + ) + else: + x_s = min(box[:, 0]) + _w // 2 - text_height // 2 + y_s = min(box[:, 1]) + for c in text: + draw.text((x_s, y_s), c, font=new_font, fill=(255, 255, 255, 255)) + _, _t, _, _b = new_font.getbbox(c) + y_s += _b + + rotated_layer = layer.rotate(angle, expand=1, center=(rect[0][0], rect[0][1])) + + x_offset = int((img.width - rotated_layer.width) / 2) + y_offset = int((img.height - rotated_layer.height) / 2) + img.paste(rotated_layer, (x_offset, y_offset), rotated_layer) + img = np.expand_dims(np.array(img.convert("1")), axis=2).astype(np.float64) + return img diff --git a/inpaint/model/base.py b/inpaint/model/base.py new file mode 100644 index 0000000..433ad68 --- /dev/null +++ b/inpaint/model/base.py @@ -0,0 +1,405 @@ +import abc +from typing import Optional + +import cv2 +import torch +import numpy as np +from loguru import logger + +from iopaint.helper import ( + boxes_from_mask, + resize_max_size, + pad_img_to_modulo, + switch_mps_device, +) +from iopaint.schema import InpaintRequest, HDStrategy, SDSampler +from .helper.g_diffuser_bot import expand_image +from .utils 
import get_scheduler + + +class InpaintModel: + name = "base" + min_size: Optional[int] = None + pad_mod = 8 + pad_to_square = False + is_erase_model = False + + def __init__(self, device, **kwargs): + """ + + Args: + device: + """ + device = switch_mps_device(self.name, device) + self.device = device + self.init_model(device, **kwargs) + + @abc.abstractmethod + def init_model(self, device, **kwargs): ... + + @staticmethod + @abc.abstractmethod + def is_downloaded() -> bool: + return False + + @abc.abstractmethod + def forward(self, image, mask, config: InpaintRequest): + """Input images and output images have same size + images: [H, W, C] RGB + masks: [H, W, 1] 255 为 masks 区域 + return: BGR IMAGE + """ + ... + + @staticmethod + def download(): ... + + def _pad_forward(self, image, mask, config: InpaintRequest): + origin_height, origin_width = image.shape[:2] + pad_image = pad_img_to_modulo( + image, mod=self.pad_mod, square=self.pad_to_square, min_size=self.min_size + ) + pad_mask = pad_img_to_modulo( + mask, mod=self.pad_mod, square=self.pad_to_square, min_size=self.min_size + ) + + # logger.info(f"final forward pad size: {pad_image.shape}") + + image, mask = self.forward_pre_process(image, mask, config) + + result = self.forward(pad_image, pad_mask, config) + result = result[0:origin_height, 0:origin_width, :] + + result, image, mask = self.forward_post_process(result, image, mask, config) + + if config.sd_keep_unmasked_area: + mask = mask[:, :, np.newaxis] + result = result * (mask / 255) + image[:, :, ::-1] * (1 - (mask / 255)) + return result + + def forward_pre_process(self, image, mask, config): + return image, mask + + def forward_post_process(self, result, image, mask, config): + return result, image, mask + + @torch.no_grad() + def __call__(self, image, mask, config: InpaintRequest): + """ + images: [H, W, C] RGB, not normalized + masks: [H, W] + return: BGR IMAGE + """ + inpaint_result = None + # logger.info(f"hd_strategy: {config.hd_strategy}") + if config.hd_strategy == HDStrategy.CROP: + if max(image.shape) > config.hd_strategy_crop_trigger_size: + logger.info("Run crop strategy") + boxes = boxes_from_mask(mask) + crop_result = [] + for box in boxes: + crop_image, crop_box = self._run_box(image, mask, box, config) + crop_result.append((crop_image, crop_box)) + + inpaint_result = image[:, :, ::-1] + for crop_image, crop_box in crop_result: + x1, y1, x2, y2 = crop_box + inpaint_result[y1:y2, x1:x2, :] = crop_image + + elif config.hd_strategy == HDStrategy.RESIZE: + if max(image.shape) > config.hd_strategy_resize_limit: + origin_size = image.shape[:2] + downsize_image = resize_max_size( + image, size_limit=config.hd_strategy_resize_limit + ) + downsize_mask = resize_max_size( + mask, size_limit=config.hd_strategy_resize_limit + ) + + logger.info( + f"Run resize strategy, origin size: {image.shape} forward size: {downsize_image.shape}" + ) + inpaint_result = self._pad_forward( + downsize_image, downsize_mask, config + ) + + # only paste masked area result + inpaint_result = cv2.resize( + inpaint_result, + (origin_size[1], origin_size[0]), + interpolation=cv2.INTER_CUBIC, + ) + original_pixel_indices = mask < 127 + inpaint_result[original_pixel_indices] = image[:, :, ::-1][ + original_pixel_indices + ] + + if inpaint_result is None: + inpaint_result = self._pad_forward(image, mask, config) + + return inpaint_result + + def _crop_box(self, image, mask, box, config: InpaintRequest): + """ + + Args: + image: [H, W, C] RGB + mask: [H, W, 1] + box: [left,top,right,bottom] + + Returns: 
+ BGR IMAGE, (l, r, r, b) + """ + box_h = box[3] - box[1] + box_w = box[2] - box[0] + cx = (box[0] + box[2]) // 2 + cy = (box[1] + box[3]) // 2 + img_h, img_w = image.shape[:2] + + w = box_w + config.hd_strategy_crop_margin * 2 + h = box_h + config.hd_strategy_crop_margin * 2 + + _l = cx - w // 2 + _r = cx + w // 2 + _t = cy - h // 2 + _b = cy + h // 2 + + l = max(_l, 0) + r = min(_r, img_w) + t = max(_t, 0) + b = min(_b, img_h) + + # try to get more context when crop around image edge + if _l < 0: + r += abs(_l) + if _r > img_w: + l -= _r - img_w + if _t < 0: + b += abs(_t) + if _b > img_h: + t -= _b - img_h + + l = max(l, 0) + r = min(r, img_w) + t = max(t, 0) + b = min(b, img_h) + + crop_img = image[t:b, l:r, :] + crop_mask = mask[t:b, l:r] + + # logger.info(f"box size: ({box_h},{box_w}) crop size: {crop_img.shape}") + + return crop_img, crop_mask, [l, t, r, b] + + def _calculate_cdf(self, histogram): + cdf = histogram.cumsum() + normalized_cdf = cdf / float(cdf.max()) + return normalized_cdf + + def _calculate_lookup(self, source_cdf, reference_cdf): + lookup_table = np.zeros(256) + lookup_val = 0 + for source_index, source_val in enumerate(source_cdf): + for reference_index, reference_val in enumerate(reference_cdf): + if reference_val >= source_val: + lookup_val = reference_index + break + lookup_table[source_index] = lookup_val + return lookup_table + + def _match_histograms(self, source, reference, mask): + transformed_channels = [] + if len(mask.shape) == 3: + mask = mask[:, :, -1] + + for channel in range(source.shape[-1]): + source_channel = source[:, :, channel] + reference_channel = reference[:, :, channel] + + # only calculate histograms for non-masked parts + source_histogram, _ = np.histogram(source_channel[mask == 0], 256, [0, 256]) + reference_histogram, _ = np.histogram( + reference_channel[mask == 0], 256, [0, 256] + ) + + source_cdf = self._calculate_cdf(source_histogram) + reference_cdf = self._calculate_cdf(reference_histogram) + + lookup = self._calculate_lookup(source_cdf, reference_cdf) + + transformed_channels.append(cv2.LUT(source_channel, lookup)) + + result = cv2.merge(transformed_channels) + result = cv2.convertScaleAbs(result) + + return result + + def _apply_cropper(self, image, mask, config: InpaintRequest): + img_h, img_w = image.shape[:2] + l, t, w, h = ( + config.croper_x, + config.croper_y, + config.croper_width, + config.croper_height, + ) + r = l + w + b = t + h + + l = max(l, 0) + r = min(r, img_w) + t = max(t, 0) + b = min(b, img_h) + + crop_img = image[t:b, l:r, :] + crop_mask = mask[t:b, l:r] + return crop_img, crop_mask, (l, t, r, b) + + def _run_box(self, image, mask, box, config: InpaintRequest): + """ + + Args: + image: [H, W, C] RGB + mask: [H, W, 1] + box: [left,top,right,bottom] + + Returns: + BGR IMAGE + """ + crop_img, crop_mask, [l, t, r, b] = self._crop_box(image, mask, box, config) + + return self._pad_forward(crop_img, crop_mask, config), [l, t, r, b] + + +class DiffusionInpaintModel(InpaintModel): + def __init__(self, device, **kwargs): + self.model_info = kwargs["model_info"] + self.model_id_or_path = self.model_info.path + super().__init__(device, **kwargs) + + @torch.no_grad() + def __call__(self, image, mask, config: InpaintRequest): + """ + images: [H, W, C] RGB, not normalized + masks: [H, W] + return: BGR IMAGE + """ + # boxes = boxes_from_mask(mask) + if config.use_croper: + crop_img, crop_mask, (l, t, r, b) = self._apply_cropper(image, mask, config) + crop_image = self._scaled_pad_forward(crop_img, crop_mask, config) + 
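+            # Write the inpainted crop back over the cropper rectangle (l, t, r, b) of the
+            # BGR-ordered full-resolution result; pixels outside the rectangle are passed through.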
inpaint_result = image[:, :, ::-1] + inpaint_result[t:b, l:r, :] = crop_image + elif config.use_extender: + inpaint_result = self._do_outpainting(image, config) + else: + inpaint_result = self._scaled_pad_forward(image, mask, config) + + return inpaint_result + + def _do_outpainting(self, image, config: InpaintRequest): + # cropper 和 image 在同一个坐标系下,croper_x/y 可能为负数 + # 从 image 中 crop 出 outpainting 区域 + image_h, image_w = image.shape[:2] + cropper_l = config.extender_x + cropper_t = config.extender_y + cropper_r = config.extender_x + config.extender_width + cropper_b = config.extender_y + config.extender_height + image_l = 0 + image_t = 0 + image_r = image_w + image_b = image_h + + # 类似求 IOU + l = max(cropper_l, image_l) + t = max(cropper_t, image_t) + r = min(cropper_r, image_r) + b = min(cropper_b, image_b) + + assert ( + 0 <= l < r and 0 <= t < b + ), f"cropper and image not overlap, {l},{t},{r},{b}" + + cropped_image = image[t:b, l:r, :] + padding_l = max(0, image_l - cropper_l) + padding_t = max(0, image_t - cropper_t) + padding_r = max(0, cropper_r - image_r) + padding_b = max(0, cropper_b - image_b) + + expanded_image, mask_image = expand_image( + cropped_image, + left=padding_l, + top=padding_t, + right=padding_r, + bottom=padding_b, + ) + + # 最终扩大了的 image, BGR + expanded_cropped_result_image = self._scaled_pad_forward( + expanded_image, mask_image, config + ) + + # RGB -> BGR + outpainting_image = cv2.copyMakeBorder( + image, + left=padding_l, + top=padding_t, + right=padding_r, + bottom=padding_b, + borderType=cv2.BORDER_CONSTANT, + value=0, + )[:, :, ::-1] + + # 把 cropped_result_image 贴到 outpainting_image 上,这一步不需要 blend + paste_t = 0 if config.extender_y < 0 else config.extender_y + paste_l = 0 if config.extender_x < 0 else config.extender_x + + outpainting_image[ + paste_t : paste_t + expanded_cropped_result_image.shape[0], + paste_l : paste_l + expanded_cropped_result_image.shape[1], + :, + ] = expanded_cropped_result_image + return outpainting_image + + def _scaled_pad_forward(self, image, mask, config: InpaintRequest): + longer_side_length = int(config.sd_scale * max(image.shape[:2])) + origin_size = image.shape[:2] + downsize_image = resize_max_size(image, size_limit=longer_side_length) + downsize_mask = resize_max_size(mask, size_limit=longer_side_length) + if config.sd_scale != 1: + logger.info( + f"Resize image to do sd inpainting: {image.shape} -> {downsize_image.shape}" + ) + inpaint_result = self._pad_forward(downsize_image, downsize_mask, config) + # only paste masked area result + inpaint_result = cv2.resize( + inpaint_result, + (origin_size[1], origin_size[0]), + interpolation=cv2.INTER_CUBIC, + ) + + return inpaint_result + + def set_scheduler(self, config: InpaintRequest): + scheduler_config = self.model.scheduler.config + sd_sampler = config.sd_sampler + if config.sd_lcm_lora and self.model_info.support_lcm_lora: + sd_sampler = SDSampler.lcm + logger.info(f"LCM Lora enabled, use {sd_sampler} sampler") + scheduler = get_scheduler(sd_sampler, scheduler_config) + self.model.scheduler = scheduler + + def forward_pre_process(self, image, mask, config): + if config.sd_mask_blur != 0: + k = 2 * config.sd_mask_blur + 1 + mask = cv2.GaussianBlur(mask, (k, k), 0) + + return image, mask + + def forward_post_process(self, result, image, mask, config): + if config.sd_match_histograms: + result = self._match_histograms(result, image[:, :, ::-1], mask) + + if config.use_extender and config.sd_mask_blur != 0: + k = 2 * config.sd_mask_blur + 1 + mask = cv2.GaussianBlur(mask, (k, 
k), 0) + return result, image, mask diff --git a/inpaint/model/brushnet/__init__.py b/inpaint/model/brushnet/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/brushnet/brushnet.py b/inpaint/model/brushnet/brushnet.py new file mode 100644 index 0000000..b3a045b --- /dev/null +++ b/inpaint/model/brushnet/brushnet.py @@ -0,0 +1,931 @@ +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from torch import nn + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.utils import BaseOutput, logging +from diffusers.models.attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from diffusers.models.embeddings import TextImageProjection, TextImageTimeEmbedding, TextTimeEmbedding, \ + TimestepEmbedding, Timesteps +from diffusers.models.modeling_utils import ModelMixin +from diffusers.models.unets.unet_2d_blocks import ( + CrossAttnDownBlock2D, + DownBlock2D, get_down_block, get_up_block, +) + +from diffusers.models.unets.unet_2d_condition import UNet2DConditionModel +from .unet_2d_blocks import MidBlock2D + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class BrushNetOutput(BaseOutput): + """ + The output of [`BrushNetModel`]. + + Args: + up_block_res_samples (`tuple[torch.Tensor]`): + A tuple of upsample activations at different resolutions for each upsampling block. Each tensor should + be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be + used to condition the original UNet's upsampling activations. + down_block_res_samples (`tuple[torch.Tensor]`): + A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should + be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be + used to condition the original UNet's downsampling activations. + mid_down_block_re_sample (`torch.Tensor`): + The activation of the midde block (the lowest sample resolution). Each tensor should be of shape + `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`. + Output can be used to condition the original UNet's middle block activation. + """ + + up_block_res_samples: Tuple[torch.Tensor] + down_block_res_samples: Tuple[torch.Tensor] + mid_block_res_sample: torch.Tensor + + +class BrushNetModel(ModelMixin, ConfigMixin): + """ + A BrushNet model. + + Args: + in_channels (`int`, defaults to 4): + The number of channels in the input sample. + flip_sin_to_cos (`bool`, defaults to `True`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, defaults to 0): + The frequency shift to apply to the time embedding. + down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): + The tuple of downsample blocks to use. + mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`): + Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or + `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`): + The tuple of upsample blocks to use. 
+ only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`): + block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, defaults to 2): + The number of layers per block. + downsample_padding (`int`, defaults to 1): + The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, defaults to 1): + The scale factor to use for the mid block. + act_fn (`str`, defaults to "silu"): + The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups to use for the normalization. If None, normalization and activation layers is skipped + in post-processing. + norm_eps (`float`, defaults to 1e-5): + The epsilon to use for the normalization. + cross_attention_dim (`int`, defaults to 1280): + The dimension of the cross attention features. + transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for + [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`], + [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`]. + encoder_hid_dim (`int`, *optional*, defaults to None): + If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` + dimension to `cross_attention_dim`. + encoder_hid_dim_type (`str`, *optional*, defaults to `None`): + If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text + embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. + attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8): + The dimension of the attention heads. + use_linear_projection (`bool`, defaults to `False`): + class_embed_type (`str`, *optional*, defaults to `None`): + The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None, + `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + addition_embed_type (`str`, *optional*, defaults to `None`): + Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or + "text". "text" will use the `TextTimeEmbedding` layer. + num_class_embeds (`int`, *optional*, defaults to 0): + Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing + class conditioning with `class_embed_type` equal to `None`. + upcast_attention (`bool`, defaults to `False`): + resnet_time_scale_shift (`str`, defaults to `"default"`): + Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`. + projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`): + The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when + `class_embed_type="projection"`. + brushnet_conditioning_channel_order (`str`, defaults to `"rgb"`): + The channel order of conditional image. Will convert to `rgb` if it's `bgr`. + conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`): + The tuple of output channel for each block in the `conditioning_embedding` layer. + global_pool_conditions (`bool`, defaults to `False`): + TODO(Patrick) - unused parameter. + addition_embed_type_num_heads (`int`, defaults to 64): + The number of heads to use for the `TextTimeEmbedding` layer. 
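+
+    Example (editor's sketch, not part of the original docstring; it only exercises the
+    constructor defaults documented above and assumes the surrounding diffusers imports
+    resolve as in this module):
+
+        >>> brushnet = BrushNetModel()  # 4 latent + 5 conditioning input channels by default
+        >>> brushnet.conv_in_condition.in_channels
+        9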
+ """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + in_channels: int = 4, + conditioning_channels: int = 5, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str, ...] = ( + "DownBlock2D", + "DownBlock2D", + "DownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2D", + up_block_types: Tuple[str, ...] = ( + "UpBlock2D", + "UpBlock2D", + "UpBlock2D", + "UpBlock2D", + ), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: int = 1280, + transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1, + encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, + attention_head_dim: Union[int, Tuple[int, ...]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, + addition_time_embed_dim: Optional[int] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + projection_class_embeddings_input_dim: Optional[int] = None, + brushnet_conditioning_channel_order: str = "rgb", + conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256), + global_pool_conditions: bool = False, + addition_embed_type_num_heads: int = 64, + ): + super().__init__() + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + + # Check inputs + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." + ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(only_cross_attention, bool) and len(only_cross_attention) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." 
+ ) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len(down_block_types) + + # input + conv_in_kernel = 3 + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in_condition = nn.Conv2d( + in_channels + conditioning_channels, block_out_channels[0], kernel_size=conv_in_kernel, + padding=conv_in_padding + ) + + # time + time_embed_dim = block_out_channels[0] * 4 + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + ) + + if encoder_hid_dim_type is None and encoder_hid_dim is not None: + encoder_hid_dim_type = "text_proj" + self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type) + logger.info("encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined.") + + if encoder_hid_dim is None and encoder_hid_dim_type is not None: + raise ValueError( + f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}." + ) + + if encoder_hid_dim_type == "text_proj": + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) + elif encoder_hid_dim_type == "text_image_proj": + # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)` + self.encoder_hid_proj = TextImageProjection( + text_embed_dim=encoder_hid_dim, + image_embed_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim, + ) + + elif encoder_hid_dim_type is not None: + raise ValueError( + f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." + ) + else: + self.encoder_hid_proj = None + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + elif class_embed_type == "projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" + ) + # The projection `class_embed_type` is the same as the timestep `class_embed_type` except + # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings + # 2. it projects from an arbitrary input dimension. + # + # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. + # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. + # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
+ self.class_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + else: + self.class_embedding = None + + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + + self.add_embedding = TextTimeEmbedding( + text_time_embedding_from_dim, time_embed_dim, num_heads=addition_embed_type_num_heads + ) + elif addition_embed_type == "text_image": + # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)` + self.add_embedding = TextImageTimeEmbedding( + text_embed_dim=cross_attention_dim, image_embed_dim=cross_attention_dim, time_embed_dim=time_embed_dim + ) + elif addition_embed_type == "text_time": + self.add_time_proj = Timesteps(addition_time_embed_dim, flip_sin_to_cos, freq_shift) + self.add_embedding = TimestepEmbedding(projection_class_embeddings_input_dim, time_embed_dim) + + elif addition_embed_type is not None: + raise ValueError(f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'.") + + self.down_blocks = nn.ModuleList([]) + self.brushnet_down_blocks = nn.ModuleList([]) + + if isinstance(only_cross_attention, bool): + only_cross_attention = [only_cross_attention] * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim,) * len(down_block_types) + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + + # down + output_channel = block_out_channels[0] + + brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + brushnet_block = zero_module(brushnet_block) + self.brushnet_down_blocks.append(brushnet_block) + + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block, + transformer_layers_per_block=transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads[i], + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, + downsample_padding=downsample_padding, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + + self.down_blocks.append(down_block) + + for _ in range(layers_per_block): + brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + brushnet_block = zero_module(brushnet_block) + self.brushnet_down_blocks.append(brushnet_block) + + if not is_final_block: + brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + brushnet_block = zero_module(brushnet_block) + self.brushnet_down_blocks.append(brushnet_block) + + # mid + mid_block_channel = block_out_channels[-1] + + brushnet_block = nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1) + 
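+        # Like the per-resolution `brushnet_down_blocks` projections above, this 1x1 conv is
+        # zero-initialized (via `zero_module` on the next line) so the BrushNet branch contributes
+        # nothing at the start of training and its influence is learned gradually, mirroring
+        # ControlNet's zero-conv design.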
brushnet_block = zero_module(brushnet_block) + self.brushnet_mid_block = brushnet_block + + self.mid_block = MidBlock2D( + in_channels=mid_block_channel, + temb_channels=time_embed_dim, + dropout=0.0, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + resnet_groups=norm_num_groups, + use_linear_projection=use_linear_projection, + ) + + # count how many layers upsample the images + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + reversed_transformer_layers_per_block = (list(reversed(transformer_layers_per_block))) + only_cross_attention = list(reversed(only_cross_attention)) + + output_channel = reversed_block_out_channels[0] + + self.up_blocks = nn.ModuleList([]) + self.brushnet_up_blocks = nn.ModuleList([]) + + for i, up_block_type in enumerate(up_block_types): + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=layers_per_block + 1, + transformer_layers_per_block=reversed_transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resolution_idx=i, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=reversed_num_attention_heads[i], + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_head_dim=attention_head_dim[i] if attention_head_dim[i] is not None else output_channel, + ) + + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + for _ in range(layers_per_block + 1): + brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + brushnet_block = zero_module(brushnet_block) + self.brushnet_up_blocks.append(brushnet_block) + + if not is_final_block: + brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + brushnet_block = zero_module(brushnet_block) + self.brushnet_up_blocks.append(brushnet_block) + + @classmethod + def from_unet( + cls, + unet: UNet2DConditionModel, + brushnet_conditioning_channel_order: str = "rgb", + conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = (16, 32, 96, 256), + load_weights_from_unet: bool = True, + conditioning_channels: int = 5, + ): + r""" + Instantiate a [`BrushNetModel`] from [`UNet2DConditionModel`]. + + Parameters: + unet (`UNet2DConditionModel`): + The UNet model weights to copy to the [`BrushNetModel`]. All configuration options are also copied + where applicable. 
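+            brushnet_conditioning_channel_order (`str`, defaults to `"rgb"`):
+                The channel order of the conditioning image. Will convert to `rgb` if it's `bgr`.
+            conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`):
+                The tuple of output channels for each block in the `conditioning_embedding` layer.
+            load_weights_from_unet (`bool`, defaults to `True`):
+                Whether to copy the matching weights from `unet` into the new [`BrushNetModel`].
+            conditioning_channels (`int`, defaults to `5`):
+                The number of channels in the conditioning input that is concatenated with the latent sample
+                (typically the masked-image latents plus the mask).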
+ """ + transformer_layers_per_block = ( + unet.config.transformer_layers_per_block if "transformer_layers_per_block" in unet.config else 1 + ) + encoder_hid_dim = unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None + encoder_hid_dim_type = unet.config.encoder_hid_dim_type if "encoder_hid_dim_type" in unet.config else None + addition_embed_type = unet.config.addition_embed_type if "addition_embed_type" in unet.config else None + addition_time_embed_dim = ( + unet.config.addition_time_embed_dim if "addition_time_embed_dim" in unet.config else None + ) + + brushnet = cls( + in_channels=unet.config.in_channels, + conditioning_channels=conditioning_channels, + flip_sin_to_cos=unet.config.flip_sin_to_cos, + freq_shift=unet.config.freq_shift, + down_block_types=['DownBlock2D', 'DownBlock2D', 'DownBlock2D', 'DownBlock2D'], + mid_block_type='MidBlock2D', + up_block_types=['UpBlock2D', 'UpBlock2D', 'UpBlock2D', 'UpBlock2D'], + only_cross_attention=unet.config.only_cross_attention, + block_out_channels=unet.config.block_out_channels, + layers_per_block=unet.config.layers_per_block, + downsample_padding=unet.config.downsample_padding, + mid_block_scale_factor=unet.config.mid_block_scale_factor, + act_fn=unet.config.act_fn, + norm_num_groups=unet.config.norm_num_groups, + norm_eps=unet.config.norm_eps, + cross_attention_dim=unet.config.cross_attention_dim, + transformer_layers_per_block=transformer_layers_per_block, + encoder_hid_dim=encoder_hid_dim, + encoder_hid_dim_type=encoder_hid_dim_type, + attention_head_dim=unet.config.attention_head_dim, + num_attention_heads=unet.config.num_attention_heads, + use_linear_projection=unet.config.use_linear_projection, + class_embed_type=unet.config.class_embed_type, + addition_embed_type=addition_embed_type, + addition_time_embed_dim=addition_time_embed_dim, + num_class_embeds=unet.config.num_class_embeds, + upcast_attention=unet.config.upcast_attention, + resnet_time_scale_shift=unet.config.resnet_time_scale_shift, + projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim, + brushnet_conditioning_channel_order=brushnet_conditioning_channel_order, + conditioning_embedding_out_channels=conditioning_embedding_out_channels, + ) + + if load_weights_from_unet: + conv_in_condition_weight = torch.zeros_like(brushnet.conv_in_condition.weight) + conv_in_condition_weight[:, :4, ...] = unet.conv_in.weight + conv_in_condition_weight[:, 4:8, ...] = unet.conv_in.weight + brushnet.conv_in_condition.weight = torch.nn.Parameter(conv_in_condition_weight) + brushnet.conv_in_condition.bias = unet.conv_in.bias + + brushnet.time_proj.load_state_dict(unet.time_proj.state_dict()) + brushnet.time_embedding.load_state_dict(unet.time_embedding.state_dict()) + + if brushnet.class_embedding: + brushnet.class_embedding.load_state_dict(unet.class_embedding.state_dict()) + + brushnet.down_blocks.load_state_dict(unet.down_blocks.state_dict(), strict=False) + brushnet.mid_block.load_state_dict(unet.mid_block.state_dict(), strict=False) + brushnet.up_blocks.load_state_dict(unet.up_blocks.state_dict(), strict=False) + + return brushnet + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. 
+ """ + # set recursively + processors = {} + + def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor(return_deprecated_lora=True) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. + """ + if all(proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnAddedKVProcessor() + elif all(proc.__class__ in CROSS_ATTENTION_PROCESSORS for proc in self.attn_processors.values()): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice + def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None: + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. 
If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. + """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = num_sliceable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. + # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value: bool = False) -> None: + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): + module.gradient_checkpointing = value + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + brushnet_cond: torch.FloatTensor, + conditioning_scale: float = 1.0, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, + return_dict: bool = True, + ) -> Union[BrushNetOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]: + """ + The [`BrushNetModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor. + timestep (`Union[torch.Tensor, float, int]`): + The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): + The encoder hidden states. + brushnet_cond (`torch.FloatTensor`): + The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. + conditioning_scale (`float`, defaults to `1.0`): + The scale factor for BrushNet outputs. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): + Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. 
+ timestep_cond (`torch.Tensor`, *optional*, defaults to `None`): + Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the + timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep + embeddings. + attention_mask (`torch.Tensor`, *optional*, defaults to `None`): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + added_cond_kwargs (`dict`): + Additional conditions for the Stable Diffusion XL UNet. + cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`): + A kwargs dictionary that if specified is passed along to the `AttnProcessor`. + guess_mode (`bool`, defaults to `False`): + In this mode, the BrushNet encoder tries its best to recognize the input content of the input even if + you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended. + return_dict (`bool`, defaults to `True`): + Whether or not to return a [`~models.brushnet.BrushNetOutput`] instead of a plain tuple. + + Returns: + [`~models.brushnet.BrushNetOutput`] **or** `tuple`: + If `return_dict` is `True`, a [`~models.brushnet.BrushNetOutput`] is returned, otherwise a tuple is + returned where the first element is the sample tensor. + """ + # check channel order + channel_order = self.config.brushnet_conditioning_channel_order + + if channel_order == "rgb": + # in rgb order by default + ... + elif channel_order == "bgr": + brushnet_cond = torch.flip(brushnet_cond, dims=[1]) + else: + raise ValueError(f"unknown `brushnet_conditioning_channel_order`: {channel_order}") + + # prepare attention_mask + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. 
+ t_emb = t_emb.to(dtype=sample.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError("class_labels should be provided when num_class_embeds > 0") + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) + emb = emb + class_emb + + if self.config.addition_embed_type is not None: + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + + elif self.config.addition_embed_type == "text_time": + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + + emb = emb + aug_emb if aug_emb is not None else emb + + # 2. pre-process + brushnet_cond = torch.concat([sample, brushnet_cond], 1) + sample = self.conv_in_condition(brushnet_cond) + + # 3. down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) + + down_block_res_samples += res_samples + + # 4. PaintingNet down blocks + brushnet_down_block_res_samples = () + for down_block_res_sample, brushnet_down_block in zip(down_block_res_samples, self.brushnet_down_blocks): + down_block_res_sample = brushnet_down_block(down_block_res_sample) + brushnet_down_block_res_samples = brushnet_down_block_res_samples + (down_block_res_sample,) + + # 5. mid + if self.mid_block is not None: + if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample = self.mid_block(sample, emb) + + # 6. BrushNet mid blocks + brushnet_mid_block_res_sample = self.brushnet_mid_block(sample) + + # 7. 
up + up_block_res_samples = () + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets):] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + sample, up_res_samples = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + return_res_samples=True + ) + else: + sample, up_res_samples = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size, + return_res_samples=True + ) + + up_block_res_samples += up_res_samples + + # 8. BrushNet up blocks + brushnet_up_block_res_samples = () + for up_block_res_sample, brushnet_up_block in zip(up_block_res_samples, self.brushnet_up_blocks): + up_block_res_sample = brushnet_up_block(up_block_res_sample) + brushnet_up_block_res_samples = brushnet_up_block_res_samples + (up_block_res_sample,) + + # 6. scaling + if guess_mode and not self.config.global_pool_conditions: + scales = torch.logspace(-1, 0, + len(brushnet_down_block_res_samples) + 1 + len(brushnet_up_block_res_samples), + device=sample.device) # 0.1 to 1.0 + scales = scales * conditioning_scale + + brushnet_down_block_res_samples = [sample * scale for sample, scale in zip(brushnet_down_block_res_samples, + scales[:len( + brushnet_down_block_res_samples)])] + brushnet_mid_block_res_sample = brushnet_mid_block_res_sample * scales[len(brushnet_down_block_res_samples)] + brushnet_up_block_res_samples = [sample * scale for sample, scale in zip(brushnet_up_block_res_samples, + scales[ + len(brushnet_down_block_res_samples) + 1:])] + else: + brushnet_down_block_res_samples = [sample * conditioning_scale for sample in + brushnet_down_block_res_samples] + brushnet_mid_block_res_sample = brushnet_mid_block_res_sample * conditioning_scale + brushnet_up_block_res_samples = [sample * conditioning_scale for sample in brushnet_up_block_res_samples] + + if self.config.global_pool_conditions: + brushnet_down_block_res_samples = [ + torch.mean(sample, dim=(2, 3), keepdim=True) for sample in brushnet_down_block_res_samples + ] + brushnet_mid_block_res_sample = torch.mean(brushnet_mid_block_res_sample, dim=(2, 3), keepdim=True) + brushnet_up_block_res_samples = [ + torch.mean(sample, dim=(2, 3), keepdim=True) for sample in brushnet_up_block_res_samples + ] + + if not return_dict: + return (brushnet_down_block_res_samples, brushnet_mid_block_res_sample, brushnet_up_block_res_samples) + + return BrushNetOutput( + down_block_res_samples=brushnet_down_block_res_samples, + mid_block_res_sample=brushnet_mid_block_res_sample, + up_block_res_samples=brushnet_up_block_res_samples + ) + + +def zero_module(module): + for p in module.parameters(): + nn.init.zeros_(p) + return module + + +if __name__ == "__main__": + BrushNetModel.from_pretrained("/Users/cwq/data/models/brushnet/brushnet_random_mask", variant='fp16', + use_safetensors=True) diff --git a/inpaint/model/brushnet/brushnet_unet_forward.py b/inpaint/model/brushnet/brushnet_unet_forward.py new file 
mode 100644 index 0000000..04e8f0a --- /dev/null +++ b/inpaint/model/brushnet/brushnet_unet_forward.py @@ -0,0 +1,322 @@ +from typing import Union, Optional, Dict, Any, Tuple + +import torch +from diffusers.models.unet_2d_condition import UNet2DConditionOutput +from diffusers.utils import USE_PEFT_BACKEND, unscale_lora_layers, deprecate, scale_lora_layers + + +def brushnet_unet_forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, + down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + return_dict: bool = True, + down_block_add_samples: Optional[Tuple[torch.Tensor]] = None, + mid_block_add_sample: Optional[Tuple[torch.Tensor]] = None, + up_block_add_samples: Optional[Tuple[torch.Tensor]] = None, +) -> Union[UNet2DConditionOutput, Tuple]: + r""" + The [`UNet2DConditionModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): + Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. + timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`): + Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed + through the `self.time_embedding` layer to obtain the timestep embeddings. + attention_mask (`torch.Tensor`, *optional*, defaults to `None`): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under + `self.processor` in + [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. + down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*): + A tuple of tensors that if specified are added to the residuals of down unet blocks. + mid_block_additional_residual: (`torch.Tensor`, *optional*): + A tensor that if specified is added to the residual of the middle unet block. + encoder_attention_mask (`torch.Tensor`): + A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If + `True` the mask is kept, otherwise if `False` it is discarded. 
Mask will be converted into a bias, + which adds large negative values to the attention scores corresponding to "discard" tokens. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain + tuple. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttnProcessor`]. + added_cond_kwargs: (`dict`, *optional*): + A kwargs dictionary containin additional embeddings that if specified are added to the embeddings that + are passed along to the UNet blocks. + down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*): + additional residuals to be added to UNet long skip connections from down blocks to up blocks for + example from ControlNet side model(s) + mid_block_additional_residual (`torch.Tensor`, *optional*): + additional residual to be added to UNet mid block output, for example from ControlNet side model + down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*): + additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s) + + Returns: + [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`: + If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise + a `tuple` is returned where the first element is the sample tensor. + """ + # By default samples have to be AT least a multiple of the overall upsampling factor. + # The overall upsampling factor is equal to 2 ** (# num of upsampling layers). + # However, the upsampling interpolation output size can be forced to fit any upsampling size + # on the fly if necessary. + default_overall_up_factor = 2 ** self.num_upsamplers + + # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor` + forward_upsample_size = False + upsample_size = None + + for dim in sample.shape[-2:]: + if dim % default_overall_up_factor != 0: + # Forward upsample size to force interpolation output size. + forward_upsample_size = True + break + + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) + if attention_mask is not None: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None: + encoder_attention_mask = (1 - encoder_attention_mask.to(sample.dtype)) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + # 0. center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # 1. 
time + t_emb = self.get_time_embed(sample=sample, timestep=timestep) + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + class_emb = self.get_class_embed(sample=sample, class_labels=class_labels) + if class_emb is not None: + if self.config.class_embeddings_concat: + emb = torch.cat([emb, class_emb], dim=-1) + else: + emb = emb + class_emb + + aug_emb = self.get_aug_embed( + emb=emb, encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs + ) + if self.config.addition_embed_type == "image_hint": + aug_emb, hint = aug_emb + sample = torch.cat([sample, hint], dim=1) + + emb = emb + aug_emb if aug_emb is not None else emb + + if self.time_embed_act is not None: + emb = self.time_embed_act(emb) + + encoder_hidden_states = self.process_encoder_hidden_states( + encoder_hidden_states=encoder_hidden_states, added_cond_kwargs=added_cond_kwargs + ) + + # 2. pre-process + sample = self.conv_in(sample) + + # 2.5 GLIGEN position net + if cross_attention_kwargs is not None and cross_attention_kwargs.get("gligen", None) is not None: + cross_attention_kwargs = cross_attention_kwargs.copy() + gligen_args = cross_attention_kwargs.pop("gligen") + cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)} + + # 3. down + lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0 + if USE_PEFT_BACKEND: + # weight the lora layers by setting `lora_scale` for each PEFT layer + scale_lora_layers(self, lora_scale) + + is_controlnet = mid_block_additional_residual is not None and down_block_additional_residuals is not None + # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets + is_adapter = down_intrablock_additional_residuals is not None + # maintain backward compatibility for legacy usage, where + # T2I-Adapter and ControlNet both use down_block_additional_residuals arg + # but can only use one or the other + is_brushnet = down_block_add_samples is not None and mid_block_add_sample is not None and up_block_add_samples is not None + if not is_adapter and mid_block_additional_residual is None and down_block_additional_residuals is not None: + deprecate( + "T2I should not use down_block_additional_residuals", + "1.3.0", + "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \ + and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \ + for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. 
", + standard_warn=False, + ) + down_intrablock_additional_residuals = down_block_additional_residuals + is_adapter = True + + down_block_res_samples = (sample,) + + if is_brushnet: + sample = sample + down_block_add_samples.pop(0) + + for downsample_block in self.down_blocks: + if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention: + # For t2i-adapter CrossAttnDownBlock2D + additional_residuals = {} + if is_adapter and len(down_intrablock_additional_residuals) > 0: + additional_residuals["additional_residuals"] = down_intrablock_additional_residuals.pop(0) + + if is_brushnet and len(down_block_add_samples) > 0: + additional_residuals["down_block_add_samples"] = [down_block_add_samples.pop(0) + for _ in range( + len(downsample_block.resnets) + (downsample_block.downsamplers != None))] + + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + **additional_residuals, + ) + else: + additional_residuals = {} + if is_brushnet and len(down_block_add_samples) > 0: + additional_residuals["down_block_add_samples"] = [down_block_add_samples.pop(0) + for _ in range( + len(downsample_block.resnets) + (downsample_block.downsamplers != None))] + + sample, res_samples = downsample_block(hidden_states=sample, temb=emb, scale=lora_scale, + **additional_residuals) + if is_adapter and len(down_intrablock_additional_residuals) > 0: + sample += down_intrablock_additional_residuals.pop(0) + + down_block_res_samples += res_samples + + if is_controlnet: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = down_block_res_sample + down_block_additional_residual + new_down_block_res_samples = new_down_block_res_samples + (down_block_res_sample,) + + down_block_res_samples = new_down_block_res_samples + + # 4. mid + if self.mid_block is not None: + if hasattr(self.mid_block, "has_cross_attention") and self.mid_block.has_cross_attention: + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + ) + else: + sample = self.mid_block(sample, emb) + + # To support T2I-Adapter-XL + if ( + is_adapter + and len(down_intrablock_additional_residuals) > 0 + and sample.shape == down_intrablock_additional_residuals[0].shape + ): + sample += down_intrablock_additional_residuals.pop(0) + + if is_controlnet: + sample = sample + mid_block_additional_residual + + if is_brushnet: + sample = sample + mid_block_add_sample + + # 5. 
up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets):] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention: + additional_residuals = {} + if is_brushnet and len(up_block_add_samples) > 0: + additional_residuals["up_block_add_samples"] = [up_block_add_samples.pop(0) + for _ in range( + len(upsample_block.resnets) + (upsample_block.upsamplers != None))] + + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + **additional_residuals, + ) + else: + additional_residuals = {} + if is_brushnet and len(up_block_add_samples) > 0: + additional_residuals["up_block_add_samples"] = [up_block_add_samples.pop(0) + for _ in range( + len(upsample_block.resnets) + (upsample_block.upsamplers != None))] + + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size, + scale=lora_scale, + **additional_residuals, + ) + + # 6. post-process + if self.conv_norm_out: + sample = self.conv_norm_out(sample) + sample = self.conv_act(sample) + sample = self.conv_out(sample) + + if USE_PEFT_BACKEND: + # remove `lora_scale` from each PEFT layer + unscale_lora_layers(self, lora_scale) + + if not return_dict: + return (sample,) + + return UNet2DConditionOutput(sample=sample) diff --git a/inpaint/model/brushnet/brushnet_wrapper.py b/inpaint/model/brushnet/brushnet_wrapper.py new file mode 100644 index 0000000..c7343d2 --- /dev/null +++ b/inpaint/model/brushnet/brushnet_wrapper.py @@ -0,0 +1,157 @@ +import PIL.Image +import cv2 +import torch +from loguru import logger +import numpy as np + +from ..base import DiffusionInpaintModel +from ..helper.cpu_text_encoder import CPUTextEncoderWrapper +from ..original_sd_configs import get_config_files +from ..utils import ( + handle_from_pretrained_exceptions, + get_torch_dtype, + enable_low_mem, + is_local_files_only, +) +from .brushnet import BrushNetModel +from .brushnet_unet_forward import brushnet_unet_forward +from .unet_2d_blocks import CrossAttnDownBlock2D_forward, DownBlock2D_forward, CrossAttnUpBlock2D_forward, \ + UpBlock2D_forward +from ...schema import InpaintRequest, ModelType + + +class BrushNetWrapper(DiffusionInpaintModel): + pad_mod = 8 + min_size = 512 + + def init_model(self, device: torch.device, **kwargs): + from .pipeline_brushnet import StableDiffusionBrushNetPipeline + self.model_info = kwargs["model_info"] + self.brushnet_method = kwargs["brushnet_method"] + + use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False)) + self.torch_dtype = torch_dtype + + model_kwargs = { + **kwargs.get("pipe_components", {}), + "local_files_only": is_local_files_only(**kwargs), + } + self.local_files_only = model_kwargs["local_files_only"] + + disable_nsfw_checker = kwargs["disable_nsfw"] or kwargs.get( + "cpu_offload", False + ) + if disable_nsfw_checker: + logger.info("Disable Stable 
Diffusion Model NSFW checker") + model_kwargs.update( + dict( + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) + ) + + logger.info(f"Loading BrushNet model from {self.brushnet_method}") + brushnet = BrushNetModel.from_pretrained(self.brushnet_method, torch_dtype=torch_dtype) + + if self.model_info.is_single_file_diffusers: + if self.model_info.model_type == ModelType.DIFFUSERS_SD: + model_kwargs["num_in_channels"] = 4 + else: + model_kwargs["num_in_channels"] = 9 + + self.model = StableDiffusionBrushNetPipeline.from_single_file( + self.model_id_or_path, + torch_dtype=torch_dtype, + load_safety_checker=not disable_nsfw_checker, + original_config_file=get_config_files()['v1'], + brushnet=brushnet, + **model_kwargs, + ) + else: + self.model = handle_from_pretrained_exceptions( + StableDiffusionBrushNetPipeline.from_pretrained, + pretrained_model_name_or_path=self.model_id_or_path, + variant="fp16", + torch_dtype=torch_dtype, + brushnet=brushnet, + **model_kwargs, + ) + + enable_low_mem(self.model, kwargs.get("low_mem", False)) + + if kwargs.get("cpu_offload", False) and use_gpu: + logger.info("Enable sequential cpu offload") + self.model.enable_sequential_cpu_offload(gpu_id=0) + else: + self.model = self.model.to(device) + if kwargs["sd_cpu_textencoder"]: + logger.info("Run Stable Diffusion TextEncoder on CPU") + self.model.text_encoder = CPUTextEncoderWrapper( + self.model.text_encoder, torch_dtype + ) + + self.callback = kwargs.pop("callback", None) + + # Monkey patch the forward method of the UNet to use the brushnet_unet_forward method + self.model.unet.forward = brushnet_unet_forward.__get__(self.model.unet, self.model.unet.__class__) + + for down_block in self.model.brushnet.down_blocks: + down_block.forward = DownBlock2D_forward.__get__(down_block, down_block.__class__) + for up_block in self.model.brushnet.up_blocks: + up_block.forward = UpBlock2D_forward.__get__(up_block, up_block.__class__) + + # Monkey patch unet down_blocks to use CrossAttnDownBlock2D_forward + for down_block in self.model.unet.down_blocks: + if down_block.__class__.__name__ == "CrossAttnDownBlock2D": + down_block.forward = CrossAttnDownBlock2D_forward.__get__(down_block, down_block.__class__) + else: + down_block.forward = DownBlock2D_forward.__get__(down_block, down_block.__class__) + + for up_block in self.model.unet.up_blocks: + if up_block.__class__.__name__ == "CrossAttnUpBlock2D": + up_block.forward = CrossAttnUpBlock2D_forward.__get__(up_block, up_block.__class__) + else: + up_block.forward = UpBlock2D_forward.__get__(up_block, up_block.__class__) + + def switch_brushnet_method(self, new_method: str): + self.brushnet_method = new_method + brushnet = BrushNetModel.from_pretrained( + new_method, + resume_download=True, + local_files_only=self.local_files_only, + torch_dtype=self.torch_dtype, + ).to(self.model.device) + self.model.brushnet = brushnet + + def forward(self, image, mask, config: InpaintRequest): + """Input image and output image have same size + image: [H, W, C] RGB + mask: [H, W, 1] 255 means area to repaint + return: BGR IMAGE + """ + self.set_scheduler(config) + + img_h, img_w = image.shape[:2] + normalized_mask = mask[:, :].astype("float32") / 255.0 + image = image * (1 - normalized_mask) + image = image.astype(np.uint8) + output = self.model( + image=PIL.Image.fromarray(image), + prompt=config.prompt, + negative_prompt=config.negative_prompt, + mask=PIL.Image.fromarray(mask[:, :, -1], mode="L").convert("RGB"), + num_inference_steps=config.sd_steps, + # 
strength=config.sd_strength, + guidance_scale=config.sd_guidance_scale, + output_type="np", + callback_on_step_end=self.callback, + height=img_h, + width=img_w, + generator=torch.manual_seed(config.sd_seed), + brushnet_conditioning_scale=config.brushnet_conditioning_scale, + ).images[0] + + output = (output * 255).round().astype("uint8") + output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) + return output diff --git a/inpaint/model/brushnet/pipeline_brushnet.py b/inpaint/model/brushnet/pipeline_brushnet.py new file mode 100644 index 0000000..2826e77 --- /dev/null +++ b/inpaint/model/brushnet/pipeline_brushnet.py @@ -0,0 +1,1279 @@ +# https://github.com/TencentARC/BrushNet +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer, CLIPVisionModelWithProjection + +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor +from diffusers.loaders import FromSingleFileMixin, IPAdapterMixin, LoraLoaderMixin, TextualInversionLoaderMixin +from diffusers.models import AutoencoderKL, ImageProjection, UNet2DConditionModel +from diffusers.models.lora import adjust_lora_scale_text_encoder +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from diffusers.utils.torch_utils import is_compiled_module, is_torch_version, randn_tensor +from diffusers.pipelines.pipeline_utils import DiffusionPipeline, StableDiffusionMixin +from diffusers.pipelines.stable_diffusion.pipeline_output import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker + +from .brushnet import BrushNetModel + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + from diffusers import StableDiffusionBrushNetPipeline, BrushNetModel, UniPCMultistepScheduler + from diffusers.utils import load_image + import torch + import cv2 + import numpy as np + from PIL import Image + + base_model_path = "runwayml/stable-diffusion-v1-5" + brushnet_path = "ckpt_path" + + brushnet = BrushNetModel.from_pretrained(brushnet_path, torch_dtype=torch.float16) + pipe = StableDiffusionBrushNetPipeline.from_pretrained( + base_model_path, brushnet=brushnet, torch_dtype=torch.float16, low_cpu_mem_usage=False + ) + + # speed up diffusion process with faster scheduler and memory optimization + pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) + # remove following line if xformers is not installed or when using Torch 2.0. + # pipe.enable_xformers_memory_efficient_attention() + # memory optimization. + pipe.enable_model_cpu_offload() + + image_path="examples/brushnet/src/test_image.jpg" + mask_path="examples/brushnet/src/test_mask.jpg" + caption="A cake on the table." 
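+
+    # The mask is read as a binary map (pixels whose channel sum exceeds 255 mark the region to
+    # repaint) and the masked region is blanked out of the input image before both are passed to
+    # the pipeline.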
+ + init_image = cv2.imread(image_path) + mask_image = 1.*(cv2.imread(mask_path).sum(-1)>255)[:,:,np.newaxis] + init_image = init_image * (1-mask_image) + + init_image = Image.fromarray(init_image.astype(np.uint8)).convert("RGB") + mask_image = Image.fromarray(mask_image.astype(np.uint8).repeat(3,-1)*255).convert("RGB") + + generator = torch.Generator("cuda").manual_seed(1234) + + image = pipe( + caption, + init_image, + mask_image, + num_inference_steps=50, + generator=generator, + paintingnet_conditioning_scale=1.0 + ).images[0] + image.save("output.png") + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys()) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." + ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class StableDiffusionBrushNetPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, +): + r""" + Pipeline for text-to-image generation using Stable Diffusion with BrushNet guidance. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). 
+ + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + brushnet ([`BrushNetModel`]`): + Provides additional conditioning to the `unet` during the denoising process. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + brushnet: BrushNetModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." 
+ ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + brushnet=brushnet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + **kwargs, + ): + deprecation_message = "`_encode_prompt()` is deprecated and it will be removed in a future version. Use `encode_prompt()` instead. Also, be aware that the output format changed from a concatenated tensor to a tuple." + deprecate("_encode_prompt()", "1.0.0", deprecation_message, standard_warn=False) + + prompt_embeds_tuple = self.encode_prompt( + prompt=prompt, + device=device, + num_images_per_prompt=num_images_per_prompt, + do_classifier_free_guidance=do_classifier_free_guidance, + negative_prompt=negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=lora_scale, + **kwargs, + ) + + # concatenate for backwards comp + prompt_embeds = torch.cat([prompt_embeds_tuple[1], prompt_embeds_tuple[0]]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. 
A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal( + text_input_ids, untruncated_ids + ): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1: -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder(text_input_ids.to(device), attention_mask=attention_mask) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask, output_hidden_states=True + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. + prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. 
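+                # Note: with `clip_skip=1` the indexing above selects `hidden_states[-2]`,
+                # i.e. the penultimate text encoder layer, which is why the final
+                # LayerNorm still has to be applied manually just below.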
+ prompt_embeds = self.text_encoder.text_model.final_layer_norm(prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view(bs_embed * num_images_per_prompt, seq_len, -1) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask: + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + negative_prompt_embeds = negative_prompt_embeds.repeat(1, num_images_per_prompt, 1) + negative_prompt_embeds = negative_prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + return prompt_embeds, negative_prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image(self, image, device, num_images_per_prompt, output_hidden_states=None): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder(image, output_hidden_states=True).hidden_states[-2] + image_enc_hidden_states = 
image_enc_hidden_states.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, ip_adapter_image, ip_adapter_image_embeds, device, num_images_per_prompt, do_classifier_free_guidance + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len(self.unet.encoder_hid_proj.image_projection_layers): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." + ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack([single_image_embeds] * num_images_per_prompt, dim=0) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = single_image_embeds.chunk(2) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_negative_image_embeds.shape[1:])) + ) + single_image_embeds = torch.cat([single_negative_image_embeds, single_image_embeds]) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, *(repeat_dims * len(single_image_embeds.shape[1:])) + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess(image, output_type="pil") + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor(feature_extractor_input, return_tensors="pt").to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, 
clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys()) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys()) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + mask, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + brushnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + callback_on_step_end_tensor_inputs=None, + ): + if callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)): + raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}") + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Check `image` + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.brushnet, torch._dynamo.eval_frame.OptimizedModule + ) + if ( + isinstance(self.brushnet, BrushNetModel) + or is_compiled + and isinstance(self.brushnet._orig_mod, BrushNetModel) + ): + self.check_image(image, mask, prompt, prompt_embeds) + else: + assert False + + # Check `brushnet_conditioning_scale` + if ( + isinstance(self.brushnet, BrushNetModel) + or is_compiled + and isinstance(self.brushnet._orig_mod, BrushNetModel) + ): + if not isinstance(brushnet_conditioning_scale, float): + raise TypeError("For single brushnet: `brushnet_conditioning_scale` must be type `float`.") + else: + assert False + + if not isinstance(control_guidance_start, (tuple, list)): + control_guidance_start = [control_guidance_start] + + if not isinstance(control_guidance_end, (tuple, list)): + control_guidance_end = [control_guidance_end] + + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError(f"control guidance start: {start} can't be smaller than 0.") + if end > 1.0: + raise ValueError(f"control guidance end: {end} can't be larger than 1.0.") + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." 
+ ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def check_image(self, image, mask, prompt, prompt_embeds): + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance(image[0], PIL.Image.Image) + image_is_tensor_list = isinstance(image, list) and isinstance(image[0], torch.Tensor) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + mask_is_pil = isinstance(mask, PIL.Image.Image) + mask_is_tensor = isinstance(mask, torch.Tensor) + mask_is_np = isinstance(mask, np.ndarray) + mask_is_pil_list = isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image) + mask_is_tensor_list = isinstance(mask, list) and isinstance(mask[0], torch.Tensor) + mask_is_np_list = isinstance(mask, list) and isinstance(mask[0], np.ndarray) + + if ( + not mask_is_pil + and not mask_is_tensor + and not mask_is_np + and not mask_is_pil_list + and not mask_is_tensor_list + and not mask_is_np_list + ): + raise TypeError( + f"mask must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(mask)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. 
image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.image_processor.preprocess(image, height=height, width=width).to(dtype=torch.float32) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents(self, batch_size, num_channels_latents, height, width, dtype, device, generator, latents=None): + shape = (batch_size, num_channels_latents, height // self.vae_scale_factor, width // self.vae_scale_factor) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + noise = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = noise * self.scheduler.init_noise_sigma + return latents, noise + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
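+    # In this pipeline that weight is applied in the denoising loop as
+    #   noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+    # and classifier-free guidance is skipped when `guidance_scale <= 1` or when the
+    # UNet expects a guidance-scale embedding instead (`time_cond_proj_dim` is set).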
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + prompt: Union[str, List[str]] = None, + image: PipelineImageInput = None, + mask: PipelineImageInput = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.5, + negative_prompt: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + brushnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The BrushNet input condition to provide guidance to the `unet` for generation. If the type is + specified as `torch.FloatTensor`, it is passed to BrushNet as is. `PIL.Image.Image` can also be + accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height + and/or width are passed, `image` is resized accordingly. If multiple BrushNets are specified in + `init`, images must be passed as a list such that each element of the list can be correctly batched for + input to a single BrushNet. When `prompt` is a list, and if a list of images is passed for a single BrushNet, + each will be paired with each prompt in the `prompt` list. This also applies to multiple BrushNets, + where a list of image lists can be passed to batch for each prompt and each BrushNet. + mask (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The BrushNet input condition to provide guidance to the `unet` for generation. If the type is + specified as `torch.FloatTensor`, it is passed to BrushNet as is. `PIL.Image.Image` can also be + accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height + and/or width are passed, `image` is resized accordingly. 
If multiple BrushNets are specified in + `init`, images must be passed as a list such that each element of the list can be correctly batched for + input to a single BrushNet. When `prompt` is a list, and if a list of images is passed for a single BrushNet, + each will be paired with each prompt in the `prompt` list. This also applies to multiple BrushNets, + where a list of image lists can be passed to batch for each prompt and each BrushNet. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. + ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*): + Pre-generated image embeddings for IP-Adapter. It should be a list of length same as number of IP-adapters. + Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`. It should contain the negative image embedding + if `do_classifier_free_guidance` is set to `True`. 
+ If not provided, embeddings are computed from the `ip_adapter_image` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. + callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + brushnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0): + The outputs of the BrushNet are multiplied by `brushnet_conditioning_scale` before they are added + to the residual in the original `unet`. If multiple BrushNets are specified in `init`, you can set + the corresponding scale as a list. + guess_mode (`bool`, *optional*, defaults to `False`): + The BrushNet encoder tries to recognize the content of the input image even if you remove all + prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended. + control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0): + The percentage of total steps at which the BrushNet starts applying. + control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0): + The percentage of total steps at which the BrushNet stops applying. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + callback_on_step_end (`Callable`, *optional*): + A function that calls at the end of each denoising steps during the inference. The function is called + with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int, + callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by + `callback_on_step_end_tensor_inputs`. + callback_on_step_end_tensor_inputs (`List`, *optional*): + The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list + will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the + `._callback_tensor_inputs` attribute of your pipeine class. + + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. 
+ """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + brushnet = self.brushnet._orig_mod if is_compiled_module(self.brushnet) else self.brushnet + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance(control_guidance_end, list): + control_guidance_start = len(control_guidance_end) * [control_guidance_start] + elif not isinstance(control_guidance_end, list) and isinstance(control_guidance_start, list): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance(control_guidance_end, list): + control_guidance_start, control_guidance_end = ( + [control_guidance_start], + [control_guidance_end], + ) + + # 1. Check inputs. Raise error if not correct + self.check_inputs( + prompt, + image, + mask, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + brushnet_conditioning_scale, + control_guidance_start, + control_guidance_end, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + global_pool_conditions = ( + brushnet.config.global_pool_conditions + if isinstance(brushnet, BrushNetModel) + else brushnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3. Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) if self.cross_attention_kwargs is not None else None + ) + prompt_embeds, negative_prompt_embeds = self.encode_prompt( + prompt, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_prompt, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + clip_skip=self.clip_skip, + ) + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + if self.do_classifier_free_guidance: + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. 
Prepare image + if isinstance(brushnet, BrushNetModel): + image = self.prepare_image( + image=image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=brushnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + original_mask = self.prepare_image( + image=mask, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=brushnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + original_mask = (original_mask.sum(1)[:, None, :, :] < 0).to(image.dtype) + height, width = image.shape[-2:] + else: + assert False + + # 5. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps(self.scheduler, num_inference_steps, device, timesteps) + self._num_timesteps = len(timesteps) + + # 6. Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents, noise = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6.1 prepare condition latents + conditioning_latents = self.vae.encode(image).latent_dist.sample() * self.vae.config.scaling_factor + mask = torch.nn.functional.interpolate( + original_mask, + size=( + conditioning_latents.shape[-2], + conditioning_latents.shape[-1] + ) + ) + conditioning_latents = torch.concat([conditioning_latents, mask], 1) + + # 6.5 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat(batch_size * num_images_per_prompt) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + # 7.2 Create tensor stating which brushnets to keep + brushnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + brushnet_keep.append(keeps[0] if isinstance(brushnet, BrushNetModel) else keeps) + + # 8. 
Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + is_unet_compiled = is_compiled_module(self.unet) + is_brushnet_compiled = is_compiled_module(self.brushnet) + is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1") + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # Relevant thread: + # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428 + if (is_unet_compiled and is_brushnet_compiled) and is_torch_higher_equal_2_1: + torch._inductor.cudagraph_mark_step_begin() + # expand the latents if we are doing classifier free guidance + latent_model_input = torch.cat([latents] * 2) if self.do_classifier_free_guidance else latents + latent_model_input = self.scheduler.scale_model_input(latent_model_input, t) + + # brushnet(s) inference + if guess_mode and self.do_classifier_free_guidance: + # Infer BrushNet only for the conditional batch. + control_model_input = latents + control_model_input = self.scheduler.scale_model_input(control_model_input, t) + brushnet_prompt_embeds = prompt_embeds.chunk(2)[1] + else: + control_model_input = latent_model_input + brushnet_prompt_embeds = prompt_embeds + + if isinstance(brushnet_keep[i], list): + cond_scale = [c * s for c, s in zip(brushnet_conditioning_scale, brushnet_keep[i])] + else: + brushnet_cond_scale = brushnet_conditioning_scale + if isinstance(brushnet_cond_scale, list): + brushnet_cond_scale = brushnet_cond_scale[0] + cond_scale = brushnet_cond_scale * brushnet_keep[i] + + down_block_res_samples, mid_block_res_sample, up_block_res_samples = self.brushnet( + control_model_input, + t, + encoder_hidden_states=brushnet_prompt_embeds, + brushnet_cond=conditioning_latents, + conditioning_scale=cond_scale, + guess_mode=guess_mode, + return_dict=False, + ) + + if guess_mode and self.do_classifier_free_guidance: + # Infered BrushNet only for the conditional batch. + # To apply the output of BrushNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged. 
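+                    # (the BrushNet residuals are added onto the UNet features, so a zero
+                    # residual leaves the unconditional half of the batch untouched)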
+ down_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in down_block_res_samples] + mid_block_res_sample = torch.cat([torch.zeros_like(mid_block_res_sample), mid_block_res_sample]) + up_block_res_samples = [torch.cat([torch.zeros_like(d), d]) for d in up_block_res_samples] + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + down_block_add_samples=down_block_res_samples, + mid_block_add_sample=mid_block_res_sample, + up_block_add_samples=up_block_res_samples, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * (noise_pred_text - noise_pred_uncond) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs, return_dict=False)[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop("negative_prompt_embeds", negative_prompt_embeds) + + # call the callback, if provided + if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # If we do sequential model offloading, let's offload unet and brushnet + # manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + self.brushnet.to("cpu") + torch.cuda.empty_cache() + + if not output_type == "latent": + image = self.vae.decode(latents / self.vae.config.scaling_factor, return_dict=False, generator=generator)[ + 0 + ] + image, has_nsfw_concept = self.run_safety_checker(image, device, prompt_embeds.dtype) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess(image, output_type=output_type, do_denormalize=do_denormalize) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept) diff --git a/inpaint/model/brushnet/unet_2d_blocks.py b/inpaint/model/brushnet/unet_2d_blocks.py new file mode 100644 index 0000000..dcaae8e --- /dev/null +++ b/inpaint/model/brushnet/unet_2d_blocks.py @@ -0,0 +1,388 @@ +from typing import Dict, Any, Optional, Tuple + +import torch +from diffusers.models.resnet import ResnetBlock2D +from diffusers.utils import is_torch_version +from diffusers.utils.torch_utils import apply_freeu +from torch import nn + + +class MidBlock2D(nn.Module): + def __init__( + self, + in_channels: int, + temb_channels: int, + dropout: float = 0.0, + num_layers: int = 1, + resnet_eps: float = 1e-6, + resnet_time_scale_shift: str = "default", + resnet_act_fn: str = 
"swish", + resnet_groups: int = 32, + resnet_pre_norm: bool = True, + output_scale_factor: float = 1.0, + use_linear_projection: bool = False, + ): + super().__init__() + + self.has_cross_attention = False + resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32) + + # there is always at least one resnet + resnets = [ + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ] + + for i in range(num_layers): + resnets.append( + ResnetBlock2D( + in_channels=in_channels, + out_channels=in_channels, + temb_channels=temb_channels, + eps=resnet_eps, + groups=resnet_groups, + dropout=dropout, + time_embedding_norm=resnet_time_scale_shift, + non_linearity=resnet_act_fn, + output_scale_factor=output_scale_factor, + pre_norm=resnet_pre_norm, + ) + ) + + self.resnets = nn.ModuleList(resnets) + + self.gradient_checkpointing = False + + def forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + ) -> torch.FloatTensor: + lora_scale = 1.0 + hidden_states = self.resnets[0](hidden_states, temb, scale=lora_scale) + for resnet in self.resnets[1:]: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + else: + hidden_states = resnet(hidden_states, temb, scale=lora_scale) + + return hidden_states + + +def DownBlock2D_forward( + self, hidden_states: torch.FloatTensor, temb: Optional[torch.FloatTensor] = None, scale: float = 1.0, + down_block_add_samples: Optional[torch.FloatTensor] = None, +) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + output_states = () + + for resnet in self.resnets: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb, scale=scale) + + if down_block_add_samples is not None: + hidden_states = hidden_states + down_block_add_samples.pop(0) + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states, scale=scale) + + if down_block_add_samples is not None: + hidden_states = hidden_states + down_block_add_samples.pop(0) # todo: add before or after + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +def CrossAttnDownBlock2D_forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + 
attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + additional_residuals: Optional[torch.FloatTensor] = None, + down_block_add_samples: Optional[torch.FloatTensor] = None, +) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + output_states = () + + lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0 + + blocks = list(zip(self.resnets, self.attentions)) + + for i, (resnet, attn) in enumerate(blocks): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + else: + hidden_states = resnet(hidden_states, temb, scale=lora_scale) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + + # apply additional residuals to the output of the last pair of resnet and attention blocks + if i == len(blocks) - 1 and additional_residuals is not None: + hidden_states = hidden_states + additional_residuals + + if down_block_add_samples is not None: + hidden_states = hidden_states + down_block_add_samples.pop(0) + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states, scale=lora_scale) + + if down_block_add_samples is not None: + hidden_states = hidden_states + down_block_add_samples.pop(0) # todo: add before or after + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +def CrossAttnUpBlock2D_forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + return_res_samples: Optional[bool] = False, + up_block_add_samples: Optional[torch.FloatTensor] = None, +) -> torch.FloatTensor: + lora_scale = cross_attention_kwargs.get("scale", 1.0) if cross_attention_kwargs is not None else 1.0 + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + if return_res_samples: + output_states = () + + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if 
is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + else: + hidden_states = resnet(hidden_states, temb, scale=lora_scale) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + if return_res_samples: + output_states = output_states + (hidden_states,) + if up_block_add_samples is not None: + hidden_states = hidden_states + up_block_add_samples.pop(0) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size, scale=lora_scale) + if return_res_samples: + output_states = output_states + (hidden_states,) + if up_block_add_samples is not None: + hidden_states = hidden_states + up_block_add_samples.pop(0) + + if return_res_samples: + return hidden_states, output_states + else: + return hidden_states + + +def UpBlock2D_forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + upsample_size: Optional[int] = None, + scale: float = 1.0, + return_res_samples: Optional[bool] = False, + up_block_add_samples: Optional[torch.FloatTensor] = None, +) -> torch.FloatTensor: + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + if return_res_samples: + output_states = () + + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb, use_reentrant=False + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb, scale=scale) + + if return_res_samples: + 
output_states = output_states + (hidden_states,) + if up_block_add_samples is not None: + hidden_states = hidden_states + up_block_add_samples.pop(0) # todo: add before or after + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size, scale=scale) + + if return_res_samples: + output_states = output_states + (hidden_states,) + if up_block_add_samples is not None: + hidden_states = hidden_states + up_block_add_samples.pop(0) # todo: add before or after + + if return_res_samples: + return hidden_states, output_states + else: + return hidden_states diff --git a/inpaint/model/controlnet.py b/inpaint/model/controlnet.py new file mode 100644 index 0000000..7b4d243 --- /dev/null +++ b/inpaint/model/controlnet.py @@ -0,0 +1,194 @@ +import PIL.Image +import cv2 +import torch +from diffusers import ControlNetModel +from loguru import logger +from iopaint.schema import InpaintRequest, ModelType + +from .base import DiffusionInpaintModel +from .helper.controlnet_preprocess import ( + make_canny_control_image, + make_openpose_control_image, + make_depth_control_image, + make_inpaint_control_image, +) +from .helper.cpu_text_encoder import CPUTextEncoderWrapper +from .original_sd_configs import get_config_files +from .utils import ( + get_scheduler, + handle_from_pretrained_exceptions, + get_torch_dtype, + enable_low_mem, + is_local_files_only, +) + + +class ControlNet(DiffusionInpaintModel): + name = "controlnet" + pad_mod = 8 + min_size = 512 + + @property + def lcm_lora_id(self): + if self.model_info.model_type in [ + ModelType.DIFFUSERS_SD, + ModelType.DIFFUSERS_SD_INPAINT, + ]: + return "latent-consistency/lcm-lora-sdv1-5" + if self.model_info.model_type in [ + ModelType.DIFFUSERS_SDXL, + ModelType.DIFFUSERS_SDXL_INPAINT, + ]: + return "latent-consistency/lcm-lora-sdxl" + raise NotImplementedError(f"Unsupported controlnet lcm model {self.model_info}") + + def init_model(self, device: torch.device, **kwargs): + model_info = kwargs["model_info"] + controlnet_method = kwargs["controlnet_method"] + + self.model_info = model_info + self.controlnet_method = controlnet_method + + model_kwargs = { + **kwargs.get("pipe_components", {}), + "local_files_only": is_local_files_only(**kwargs), + } + self.local_files_only = model_kwargs["local_files_only"] + + disable_nsfw_checker = kwargs["disable_nsfw"] or kwargs.get( + "cpu_offload", False + ) + if disable_nsfw_checker: + logger.info("Disable Stable Diffusion Model NSFW checker") + model_kwargs.update( + dict( + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) + ) + + use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False)) + self.torch_dtype = torch_dtype + + original_config_file_name = "v1" + if model_info.model_type in [ + ModelType.DIFFUSERS_SD, + ModelType.DIFFUSERS_SD_INPAINT, + ]: + from diffusers import ( + StableDiffusionControlNetInpaintPipeline as PipeClass, + ) + original_config_file_name = "v1" + + elif model_info.model_type in [ + ModelType.DIFFUSERS_SDXL, + ModelType.DIFFUSERS_SDXL_INPAINT, + ]: + from diffusers import ( + StableDiffusionXLControlNetInpaintPipeline as PipeClass, + ) + original_config_file_name = "xl" + + controlnet = ControlNetModel.from_pretrained( + pretrained_model_name_or_path=controlnet_method, + resume_download=True, + local_files_only=model_kwargs["local_files_only"], + torch_dtype=self.torch_dtype, + ) + if model_info.is_single_file_diffusers: + if self.model_info.model_type == 
ModelType.DIFFUSERS_SD: + model_kwargs["num_in_channels"] = 4 + else: + model_kwargs["num_in_channels"] = 9 + + self.model = PipeClass.from_single_file( + model_info.path, + controlnet=controlnet, + load_safety_checker=not disable_nsfw_checker, + torch_dtype=torch_dtype, + original_config_file=get_config_files()[original_config_file_name], + **model_kwargs, + ) + else: + self.model = handle_from_pretrained_exceptions( + PipeClass.from_pretrained, + pretrained_model_name_or_path=model_info.path, + controlnet=controlnet, + variant="fp16", + torch_dtype=torch_dtype, + **model_kwargs, + ) + + enable_low_mem(self.model, kwargs.get("low_mem", False)) + + if kwargs.get("cpu_offload", False) and use_gpu: + logger.info("Enable sequential cpu offload") + self.model.enable_sequential_cpu_offload(gpu_id=0) + else: + self.model = self.model.to(device) + if kwargs["sd_cpu_textencoder"]: + logger.info("Run Stable Diffusion TextEncoder on CPU") + self.model.text_encoder = CPUTextEncoderWrapper( + self.model.text_encoder, torch_dtype + ) + + self.callback = kwargs.pop("callback", None) + + def switch_controlnet_method(self, new_method: str): + self.controlnet_method = new_method + controlnet = ControlNetModel.from_pretrained( + new_method, + resume_download=True, + local_files_only=self.local_files_only, + torch_dtype=self.torch_dtype, + ).to(self.model.device) + self.model.controlnet = controlnet + + def _get_control_image(self, image, mask): + if "canny" in self.controlnet_method: + control_image = make_canny_control_image(image) + elif "openpose" in self.controlnet_method: + control_image = make_openpose_control_image(image) + elif "depth" in self.controlnet_method: + control_image = make_depth_control_image(image) + elif "inpaint" in self.controlnet_method: + control_image = make_inpaint_control_image(image, mask) + else: + raise NotImplementedError(f"{self.controlnet_method} not implemented") + return control_image + + def forward(self, image, mask, config: InpaintRequest): + """Input image and output image have same size + image: [H, W, C] RGB + mask: [H, W, 1] 255 means area to repaint + return: BGR IMAGE + """ + scheduler_config = self.model.scheduler.config + scheduler = get_scheduler(config.sd_sampler, scheduler_config) + self.model.scheduler = scheduler + + img_h, img_w = image.shape[:2] + control_image = self._get_control_image(image, mask) + mask_image = PIL.Image.fromarray(mask[:, :, -1], mode="L") + image = PIL.Image.fromarray(image) + + output = self.model( + image=image, + mask_image=mask_image, + control_image=control_image, + prompt=config.prompt, + negative_prompt=config.negative_prompt, + num_inference_steps=config.sd_steps, + guidance_scale=config.sd_guidance_scale, + output_type="np", + callback_on_step_end=self.callback, + height=img_h, + width=img_w, + generator=torch.manual_seed(config.sd_seed), + controlnet_conditioning_scale=config.controlnet_conditioning_scale, + ).images[0] + + output = (output * 255).round().astype("uint8") + output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) + return output diff --git a/inpaint/model/ddim_sampler.py b/inpaint/model/ddim_sampler.py new file mode 100644 index 0000000..a3f44fd --- /dev/null +++ b/inpaint/model/ddim_sampler.py @@ -0,0 +1,193 @@ +import torch +import numpy as np +from tqdm import tqdm + +from .utils import make_ddim_timesteps, make_ddim_sampling_parameters, noise_like + +from loguru import logger + + +class DDIMSampler(object): + def __init__(self, model, schedule="linear"): + super().__init__() + self.model = model + 
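+ # The wrapped latent diffusion model supplies betas/alphas_cumprod; the DDIM-specific schedule buffers are registered later in make_schedule().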
self.ddpm_num_timesteps = model.num_timesteps + self.schedule = schedule + + def register_buffer(self, name, attr): + setattr(self, name, attr) + + def make_schedule( + self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0.0, verbose=True + ): + self.ddim_timesteps = make_ddim_timesteps( + ddim_discr_method=ddim_discretize, + num_ddim_timesteps=ddim_num_steps, + # array([1]) + num_ddpm_timesteps=self.ddpm_num_timesteps, + verbose=verbose, + ) + alphas_cumprod = self.model.alphas_cumprod # torch.Size([1000]) + assert ( + alphas_cumprod.shape[0] == self.ddpm_num_timesteps + ), "alphas have to be defined for each timestep" + to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device) + + self.register_buffer("betas", to_torch(self.model.betas)) + self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod)) + self.register_buffer( + "alphas_cumprod_prev", to_torch(self.model.alphas_cumprod_prev) + ) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer( + "sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod.cpu())) + ) + self.register_buffer( + "sqrt_one_minus_alphas_cumprod", + to_torch(np.sqrt(1.0 - alphas_cumprod.cpu())), + ) + self.register_buffer( + "log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod.cpu())) + ) + self.register_buffer( + "sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod.cpu())) + ) + self.register_buffer( + "sqrt_recipm1_alphas_cumprod", + to_torch(np.sqrt(1.0 / alphas_cumprod.cpu() - 1)), + ) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters( + alphacums=alphas_cumprod.cpu(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta, + verbose=verbose, + ) + self.register_buffer("ddim_sigmas", ddim_sigmas) + self.register_buffer("ddim_alphas", ddim_alphas) + self.register_buffer("ddim_alphas_prev", ddim_alphas_prev) + self.register_buffer("ddim_sqrt_one_minus_alphas", np.sqrt(1.0 - ddim_alphas)) + sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt( + (1 - self.alphas_cumprod_prev) + / (1 - self.alphas_cumprod) + * (1 - self.alphas_cumprod / self.alphas_cumprod_prev) + ) + self.register_buffer( + "ddim_sigmas_for_original_num_steps", sigmas_for_original_sampling_steps + ) + + @torch.no_grad() + def sample(self, steps, conditioning, batch_size, shape): + self.make_schedule(ddim_num_steps=steps, ddim_eta=0, verbose=False) + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + + # samples: 1,3,128,128 + return self.ddim_sampling( + conditioning, + size, + quantize_denoised=False, + ddim_use_original_steps=False, + noise_dropout=0, + temperature=1.0, + ) + + @torch.no_grad() + def ddim_sampling( + self, + cond, + shape, + ddim_use_original_steps=False, + quantize_denoised=False, + temperature=1.0, + noise_dropout=0.0, + ): + device = self.model.betas.device + b = shape[0] + img = torch.randn(shape, device=device, dtype=cond.dtype) + timesteps = ( + self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps + ) + + time_range = ( + reversed(range(0, timesteps)) + if ddim_use_original_steps + else np.flip(timesteps) + ) + total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0] + logger.info(f"Running DDIM Sampling with {total_steps} timesteps") + + iterator = tqdm(time_range, desc="DDIM Sampler", total=total_steps) + + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((b,), step, device=device, dtype=torch.long) + + outs = self.p_sample_ddim( + 
img, + cond, + ts, + index=index, + use_original_steps=ddim_use_original_steps, + quantize_denoised=quantize_denoised, + temperature=temperature, + noise_dropout=noise_dropout, + ) + img, _ = outs + + return img + + @torch.no_grad() + def p_sample_ddim( + self, + x, + c, + t, + index, + repeat_noise=False, + use_original_steps=False, + quantize_denoised=False, + temperature=1.0, + noise_dropout=0.0, + ): + b, *_, device = *x.shape, x.device + e_t = self.model.apply_model(x, t, c) + + alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas + alphas_prev = ( + self.model.alphas_cumprod_prev + if use_original_steps + else self.ddim_alphas_prev + ) + sqrt_one_minus_alphas = ( + self.model.sqrt_one_minus_alphas_cumprod + if use_original_steps + else self.ddim_sqrt_one_minus_alphas + ) + sigmas = ( + self.model.ddim_sigmas_for_original_num_steps + if use_original_steps + else self.ddim_sigmas + ) + # select parameters corresponding to the currently considered timestep + a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) + a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device) + sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) + sqrt_one_minus_at = torch.full( + (b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device + ) + + # current prediction for x_0 + pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() + if quantize_denoised: # not used + pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) + # direction pointing to x_t + dir_xt = (1.0 - a_prev - sigma_t ** 2).sqrt() * e_t + noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature + if noise_dropout > 0.0: # not used + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise + return x_prev, pred_x0 diff --git a/inpaint/model/fcf.py b/inpaint/model/fcf.py new file mode 100644 index 0000000..a6f2d42 --- /dev/null +++ b/inpaint/model/fcf.py @@ -0,0 +1,1737 @@ +import os +import random + +import cv2 +import torch +import numpy as np +import torch.fft as fft + +from iopaint.schema import InpaintRequest + +from iopaint.helper import ( + load_model, + get_cache_path_by_url, + norm_img, + boxes_from_mask, + resize_max_size, + download_model, +) +from .base import InpaintModel +from torch import conv2d, nn +import torch.nn.functional as F + +from .utils import ( + setup_filter, + _parse_scaling, + _parse_padding, + Conv2dLayer, + FullyConnectedLayer, + MinibatchStdLayer, + activation_funcs, + conv2d_resample, + bias_act, + upsample2d, + normalize_2nd_moment, + downsample2d, +) + + +def upfirdn2d(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1, impl="cuda"): + assert isinstance(x, torch.Tensor) + return _upfirdn2d_ref( + x, f, up=up, down=down, padding=padding, flip_filter=flip_filter, gain=gain + ) + + +def _upfirdn2d_ref(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1): + """Slow reference implementation of `upfirdn2d()` using standard PyTorch ops.""" + # Validate arguments. + assert isinstance(x, torch.Tensor) and x.ndim == 4 + if f is None: + f = torch.ones([1, 1], dtype=torch.float32, device=x.device) + assert isinstance(f, torch.Tensor) and f.ndim in [1, 2] + assert f.dtype == torch.float32 and not f.requires_grad + batch_size, num_channels, in_height, in_width = x.shape + upx, upy = _parse_scaling(up) + downx, downy = _parse_scaling(down) + padx0, padx1, pady0, pady1 = _parse_padding(padding) + + # Upsample by inserting zeros.
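+ # Reshape (N, C, H, W) -> (N, C, H, 1, W, 1), pad the singleton dims with (up - 1) zeros, then collapse back so every input pixel is followed by zeros.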
+ x = x.reshape([batch_size, num_channels, in_height, 1, in_width, 1]) + x = torch.nn.functional.pad(x, [0, upx - 1, 0, 0, 0, upy - 1]) + x = x.reshape([batch_size, num_channels, in_height * upy, in_width * upx]) + + # Pad or crop. + x = torch.nn.functional.pad( + x, [max(padx0, 0), max(padx1, 0), max(pady0, 0), max(pady1, 0)] + ) + x = x[ + :, + :, + max(-pady0, 0) : x.shape[2] - max(-pady1, 0), + max(-padx0, 0) : x.shape[3] - max(-padx1, 0), + ] + + # Setup filter. + f = f * (gain ** (f.ndim / 2)) + f = f.to(x.dtype) + if not flip_filter: + f = f.flip(list(range(f.ndim))) + + # Convolve with the filter. + f = f[np.newaxis, np.newaxis].repeat([num_channels, 1] + [1] * f.ndim) + if f.ndim == 4: + x = conv2d(input=x, weight=f, groups=num_channels) + else: + x = conv2d(input=x, weight=f.unsqueeze(2), groups=num_channels) + x = conv2d(input=x, weight=f.unsqueeze(3), groups=num_channels) + + # Downsample by throwing away pixels. + x = x[:, :, ::downy, ::downx] + return x + + +class EncoderEpilogue(torch.nn.Module): + def __init__( + self, + in_channels, # Number of input channels. + cmap_dim, # Dimensionality of mapped conditioning label, 0 = no label. + z_dim, # Output Latent (Z) dimensionality. + resolution, # Resolution of this block. + img_channels, # Number of input color channels. + architecture="resnet", # Architecture: 'orig', 'skip', 'resnet'. + mbstd_group_size=4, # Group size for the minibatch standard deviation layer, None = entire minibatch. + mbstd_num_channels=1, # Number of features for the minibatch standard deviation layer, 0 = disable. + activation="lrelu", # Activation function: 'relu', 'lrelu', etc. + conv_clamp=None, # Clamp the output of convolution layers to +-X, None = disable clamping. + ): + assert architecture in ["orig", "skip", "resnet"] + super().__init__() + self.in_channels = in_channels + self.cmap_dim = cmap_dim + self.resolution = resolution + self.img_channels = img_channels + self.architecture = architecture + + if architecture == "skip": + self.fromrgb = Conv2dLayer( + self.img_channels, in_channels, kernel_size=1, activation=activation + ) + self.mbstd = ( + MinibatchStdLayer( + group_size=mbstd_group_size, num_channels=mbstd_num_channels + ) + if mbstd_num_channels > 0 + else None + ) + self.conv = Conv2dLayer( + in_channels + mbstd_num_channels, + in_channels, + kernel_size=3, + activation=activation, + conv_clamp=conv_clamp, + ) + self.fc = FullyConnectedLayer( + in_channels * (resolution**2), z_dim, activation=activation + ) + self.dropout = torch.nn.Dropout(p=0.5) + + def forward(self, x, cmap, force_fp32=False): + _ = force_fp32 # unused + dtype = torch.float32 + memory_format = torch.contiguous_format + + # FromRGB. + x = x.to(dtype=dtype, memory_format=memory_format) + + # Main layers. + if self.mbstd is not None: + x = self.mbstd(x) + const_e = self.conv(x) + x = self.fc(const_e.flatten(1)) + x = self.dropout(x) + + # Conditioning. + if self.cmap_dim > 0: + x = (x * cmap).sum(dim=1, keepdim=True) * (1 / np.sqrt(self.cmap_dim)) + + assert x.dtype == dtype + return x, const_e + + +class EncoderBlock(torch.nn.Module): + def __init__( + self, + in_channels, # Number of input channels, 0 = first block. + tmp_channels, # Number of intermediate channels. + out_channels, # Number of output channels. + resolution, # Resolution of this block. + img_channels, # Number of input color channels. + first_layer_idx, # Index of the first layer. + architecture="skip", # Architecture: 'orig', 'skip', 'resnet'. 
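+ # Note: the encoder input carries an extra mask channel alongside the image, which is why __init__ below uses img_channels + 1.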
+ activation="lrelu", # Activation function: 'relu', 'lrelu', etc. + resample_filter=[ + 1, + 3, + 3, + 1, + ], # Low-pass filter to apply when resampling activations. + conv_clamp=None, # Clamp the output of convolution layers to +-X, None = disable clamping. + use_fp16=False, # Use FP16 for this block? + fp16_channels_last=False, # Use channels-last memory format with FP16? + freeze_layers=0, # Freeze-D: Number of layers to freeze. + ): + assert in_channels in [0, tmp_channels] + assert architecture in ["orig", "skip", "resnet"] + super().__init__() + self.in_channels = in_channels + self.resolution = resolution + self.img_channels = img_channels + 1 + self.first_layer_idx = first_layer_idx + self.architecture = architecture + self.use_fp16 = use_fp16 + self.channels_last = use_fp16 and fp16_channels_last + self.register_buffer("resample_filter", setup_filter(resample_filter)) + + self.num_layers = 0 + + def trainable_gen(): + while True: + layer_idx = self.first_layer_idx + self.num_layers + trainable = layer_idx >= freeze_layers + self.num_layers += 1 + yield trainable + + trainable_iter = trainable_gen() + + if in_channels == 0: + self.fromrgb = Conv2dLayer( + self.img_channels, + tmp_channels, + kernel_size=1, + activation=activation, + trainable=next(trainable_iter), + conv_clamp=conv_clamp, + channels_last=self.channels_last, + ) + + self.conv0 = Conv2dLayer( + tmp_channels, + tmp_channels, + kernel_size=3, + activation=activation, + trainable=next(trainable_iter), + conv_clamp=conv_clamp, + channels_last=self.channels_last, + ) + + self.conv1 = Conv2dLayer( + tmp_channels, + out_channels, + kernel_size=3, + activation=activation, + down=2, + trainable=next(trainable_iter), + resample_filter=resample_filter, + conv_clamp=conv_clamp, + channels_last=self.channels_last, + ) + + if architecture == "resnet": + self.skip = Conv2dLayer( + tmp_channels, + out_channels, + kernel_size=1, + bias=False, + down=2, + trainable=next(trainable_iter), + resample_filter=resample_filter, + channels_last=self.channels_last, + ) + + def forward(self, x, img, force_fp32=False): + # dtype = torch.float16 if self.use_fp16 and not force_fp32 else torch.float32 + dtype = torch.float32 + memory_format = ( + torch.channels_last + if self.channels_last and not force_fp32 + else torch.contiguous_format + ) + + # Input. + if x is not None: + x = x.to(dtype=dtype, memory_format=memory_format) + + # FromRGB. + if self.in_channels == 0: + img = img.to(dtype=dtype, memory_format=memory_format) + y = self.fromrgb(img) + x = x + y if x is not None else y + img = ( + downsample2d(img, self.resample_filter) + if self.architecture == "skip" + else None + ) + + # Main layers. + if self.architecture == "resnet": + y = self.skip(x, gain=np.sqrt(0.5)) + x = self.conv0(x) + feat = x.clone() + x = self.conv1(x, gain=np.sqrt(0.5)) + x = y.add_(x) + else: + x = self.conv0(x) + feat = x.clone() + x = self.conv1(x) + + assert x.dtype == dtype + return x, img, feat + + +class EncoderNetwork(torch.nn.Module): + def __init__( + self, + c_dim, # Conditioning label (C) dimensionality. + z_dim, # Input latent (Z) dimensionality. + img_resolution, # Input resolution. + img_channels, # Number of input color channels. + architecture="orig", # Architecture: 'orig', 'skip', 'resnet'. + channel_base=16384, # Overall multiplier for the number of channels. + channel_max=512, # Maximum number of channels in any layer. + num_fp16_res=0, # Use FP16 for the N highest resolutions. 
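+ # (use_fp16 is forced to False when the blocks are constructed below, so this option is effectively ignored)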
+ conv_clamp=None, # Clamp the output of convolution layers to +-X, None = disable clamping. + cmap_dim=None, # Dimensionality of mapped conditioning label, None = default. + block_kwargs={}, # Arguments for DiscriminatorBlock. + mapping_kwargs={}, # Arguments for MappingNetwork. + epilogue_kwargs={}, # Arguments for EncoderEpilogue. + ): + super().__init__() + self.c_dim = c_dim + self.z_dim = z_dim + self.img_resolution = img_resolution + self.img_resolution_log2 = int(np.log2(img_resolution)) + self.img_channels = img_channels + self.block_resolutions = [ + 2**i for i in range(self.img_resolution_log2, 2, -1) + ] + channels_dict = { + res: min(channel_base // res, channel_max) + for res in self.block_resolutions + [4] + } + fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8) + + if cmap_dim is None: + cmap_dim = channels_dict[4] + if c_dim == 0: + cmap_dim = 0 + + common_kwargs = dict( + img_channels=img_channels, architecture=architecture, conv_clamp=conv_clamp + ) + cur_layer_idx = 0 + for res in self.block_resolutions: + in_channels = channels_dict[res] if res < img_resolution else 0 + tmp_channels = channels_dict[res] + out_channels = channels_dict[res // 2] + use_fp16 = res >= fp16_resolution + use_fp16 = False + block = EncoderBlock( + in_channels, + tmp_channels, + out_channels, + resolution=res, + first_layer_idx=cur_layer_idx, + use_fp16=use_fp16, + **block_kwargs, + **common_kwargs, + ) + setattr(self, f"b{res}", block) + cur_layer_idx += block.num_layers + if c_dim > 0: + self.mapping = MappingNetwork( + z_dim=0, + c_dim=c_dim, + w_dim=cmap_dim, + num_ws=None, + w_avg_beta=None, + **mapping_kwargs, + ) + self.b4 = EncoderEpilogue( + channels_dict[4], + cmap_dim=cmap_dim, + z_dim=z_dim * 2, + resolution=4, + **epilogue_kwargs, + **common_kwargs, + ) + + def forward(self, img, c, **block_kwargs): + x = None + feats = {} + for res in self.block_resolutions: + block = getattr(self, f"b{res}") + x, img, feat = block(x, img, **block_kwargs) + feats[res] = feat + + cmap = None + if self.c_dim > 0: + cmap = self.mapping(None, c) + x, const_e = self.b4(x, cmap) + feats[4] = const_e + + B, _ = x.shape + z = torch.zeros( + (B, self.z_dim), requires_grad=False, dtype=x.dtype, device=x.device + ) ## Noise for Co-Modulation + return x, z, feats + + +def fma(a, b, c): # => a * b + c + return _FusedMultiplyAdd.apply(a, b, c) + + +class _FusedMultiplyAdd(torch.autograd.Function): # a * b + c + @staticmethod + def forward(ctx, a, b, c): # pylint: disable=arguments-differ + out = torch.addcmul(c, a, b) + ctx.save_for_backward(a, b) + ctx.c_shape = c.shape + return out + + @staticmethod + def backward(ctx, dout): # pylint: disable=arguments-differ + a, b = ctx.saved_tensors + c_shape = ctx.c_shape + da = None + db = None + dc = None + + if ctx.needs_input_grad[0]: + da = _unbroadcast(dout * b, a.shape) + + if ctx.needs_input_grad[1]: + db = _unbroadcast(dout * a, b.shape) + + if ctx.needs_input_grad[2]: + dc = _unbroadcast(dout, c_shape) + + return da, db, dc + + +def _unbroadcast(x, shape): + extra_dims = x.ndim - len(shape) + assert extra_dims >= 0 + dim = [ + i + for i in range(x.ndim) + if x.shape[i] > 1 and (i < extra_dims or shape[i - extra_dims] == 1) + ] + if len(dim): + x = x.sum(dim=dim, keepdim=True) + if extra_dims: + x = x.reshape(-1, *x.shape[extra_dims + 1 :]) + assert x.shape == shape + return x + + +def modulated_conv2d( + x, # Input tensor of shape [batch_size, in_channels, in_height, in_width]. 
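+ # The styles scale the conv weights per sample; with fused_modconv=True this is folded into a grouped convolution, otherwise the activations are scaled before/after a plain conv.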
+ weight, # Weight tensor of shape [out_channels, in_channels, kernel_height, kernel_width]. + styles, # Modulation coefficients of shape [batch_size, in_channels]. + noise=None, # Optional noise tensor to add to the output activations. + up=1, # Integer upsampling factor. + down=1, # Integer downsampling factor. + padding=0, # Padding with respect to the upsampled image. + resample_filter=None, + # Low-pass filter to apply when resampling activations. Must be prepared beforehand by calling upfirdn2d.setup_filter(). + demodulate=True, # Apply weight demodulation? + flip_weight=True, # False = convolution, True = correlation (matches torch.nn.functional.conv2d). + fused_modconv=True, # Perform modulation, convolution, and demodulation as a single fused operation? +): + batch_size = x.shape[0] + out_channels, in_channels, kh, kw = weight.shape + + # Pre-normalize inputs to avoid FP16 overflow. + if x.dtype == torch.float16 and demodulate: + weight = weight * ( + 1 + / np.sqrt(in_channels * kh * kw) + / weight.norm(float("inf"), dim=[1, 2, 3], keepdim=True) + ) # max_Ikk + styles = styles / styles.norm(float("inf"), dim=1, keepdim=True) # max_I + + # Calculate per-sample weights and demodulation coefficients. + w = None + dcoefs = None + if demodulate or fused_modconv: + w = weight.unsqueeze(0) # [NOIkk] + w = w * styles.reshape(batch_size, 1, -1, 1, 1) # [NOIkk] + if demodulate: + dcoefs = (w.square().sum(dim=[2, 3, 4]) + 1e-8).rsqrt() # [NO] + if demodulate and fused_modconv: + w = w * dcoefs.reshape(batch_size, -1, 1, 1, 1) # [NOIkk] + # Execute by scaling the activations before and after the convolution. + if not fused_modconv: + x = x * styles.to(x.dtype).reshape(batch_size, -1, 1, 1) + x = conv2d_resample.conv2d_resample( + x=x, + w=weight.to(x.dtype), + f=resample_filter, + up=up, + down=down, + padding=padding, + flip_weight=flip_weight, + ) + if demodulate and noise is not None: + x = fma( + x, dcoefs.to(x.dtype).reshape(batch_size, -1, 1, 1), noise.to(x.dtype) + ) + elif demodulate: + x = x * dcoefs.to(x.dtype).reshape(batch_size, -1, 1, 1) + elif noise is not None: + x = x.add_(noise.to(x.dtype)) + return x + + # Execute as one fused op using grouped convolution. + batch_size = int(batch_size) + x = x.reshape(1, -1, *x.shape[2:]) + w = w.reshape(-1, in_channels, kh, kw) + x = conv2d_resample( + x=x, + w=w.to(x.dtype), + f=resample_filter, + up=up, + down=down, + padding=padding, + groups=batch_size, + flip_weight=flip_weight, + ) + x = x.reshape(batch_size, -1, *x.shape[2:]) + if noise is not None: + x = x.add_(noise) + return x + + +class SynthesisLayer(torch.nn.Module): + def __init__( + self, + in_channels, # Number of input channels. + out_channels, # Number of output channels. + w_dim, # Intermediate latent (W) dimensionality. + resolution, # Resolution of this layer. + kernel_size=3, # Convolution kernel size. + up=1, # Integer upsampling factor. + use_noise=True, # Enable noise input? + activation="lrelu", # Activation function: 'relu', 'lrelu', etc. + resample_filter=[ + 1, + 3, + 3, + 1, + ], # Low-pass filter to apply when resampling activations. + conv_clamp=None, # Clamp the output of convolution layers to +-X, None = disable clamping. + channels_last=False, # Use channels_last format for the weights? 
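+ # Each layer modulates its conv weights with a style computed from w, optionally adds per-pixel noise, then applies a leaky ReLU with gain/clamp.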
+ ): + super().__init__() + self.resolution = resolution + self.up = up + self.use_noise = use_noise + self.activation = activation + self.conv_clamp = conv_clamp + self.register_buffer("resample_filter", setup_filter(resample_filter)) + self.padding = kernel_size // 2 + self.act_gain = activation_funcs[activation].def_gain + + self.affine = FullyConnectedLayer(w_dim, in_channels, bias_init=1) + memory_format = ( + torch.channels_last if channels_last else torch.contiguous_format + ) + self.weight = torch.nn.Parameter( + torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to( + memory_format=memory_format + ) + ) + if use_noise: + self.register_buffer("noise_const", torch.randn([resolution, resolution])) + self.noise_strength = torch.nn.Parameter(torch.zeros([])) + self.bias = torch.nn.Parameter(torch.zeros([out_channels])) + + def forward(self, x, w, noise_mode="none", fused_modconv=True, gain=1): + assert noise_mode in ["random", "const", "none"] + in_resolution = self.resolution // self.up + styles = self.affine(w) + + noise = None + if self.use_noise and noise_mode == "random": + noise = ( + torch.randn( + [x.shape[0], 1, self.resolution, self.resolution], device=x.device + ) + * self.noise_strength + ) + if self.use_noise and noise_mode == "const": + noise = self.noise_const * self.noise_strength + + flip_weight = self.up == 1 # slightly faster + x = modulated_conv2d( + x=x, + weight=self.weight, + styles=styles, + noise=noise, + up=self.up, + padding=self.padding, + resample_filter=self.resample_filter, + flip_weight=flip_weight, + fused_modconv=fused_modconv, + ) + + act_gain = self.act_gain * gain + act_clamp = self.conv_clamp * gain if self.conv_clamp is not None else None + x = F.leaky_relu(x, negative_slope=0.2, inplace=False) + if act_gain != 1: + x = x * act_gain + if act_clamp is not None: + x = x.clamp(-act_clamp, act_clamp) + return x + + +class ToRGBLayer(torch.nn.Module): + def __init__( + self, + in_channels, + out_channels, + w_dim, + kernel_size=1, + conv_clamp=None, + channels_last=False, + ): + super().__init__() + self.conv_clamp = conv_clamp + self.affine = FullyConnectedLayer(w_dim, in_channels, bias_init=1) + memory_format = ( + torch.channels_last if channels_last else torch.contiguous_format + ) + self.weight = torch.nn.Parameter( + torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to( + memory_format=memory_format + ) + ) + self.bias = torch.nn.Parameter(torch.zeros([out_channels])) + self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size**2)) + + def forward(self, x, w, fused_modconv=True): + styles = self.affine(w) * self.weight_gain + x = modulated_conv2d( + x=x, + weight=self.weight, + styles=styles, + demodulate=False, + fused_modconv=fused_modconv, + ) + x = bias_act(x, self.bias.to(x.dtype), clamp=self.conv_clamp) + return x + + +class SynthesisForeword(torch.nn.Module): + def __init__( + self, + z_dim, # Output Latent (Z) dimensionality. + resolution, # Resolution of this block. + in_channels, + img_channels, # Number of input color channels. + architecture="skip", # Architecture: 'orig', 'skip', 'resnet'. + activation="lrelu", # Activation function: 'relu', 'lrelu', etc. 
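+ # Projects the global latent to a 4x4 feature map and fuses it with the 4x4 encoder feature before the first synthesis block.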
+ ): + super().__init__() + self.in_channels = in_channels + self.z_dim = z_dim + self.resolution = resolution + self.img_channels = img_channels + self.architecture = architecture + + self.fc = FullyConnectedLayer( + self.z_dim, (self.z_dim // 2) * 4 * 4, activation=activation + ) + self.conv = SynthesisLayer( + self.in_channels, self.in_channels, w_dim=(z_dim // 2) * 3, resolution=4 + ) + + if architecture == "skip": + self.torgb = ToRGBLayer( + self.in_channels, + self.img_channels, + kernel_size=1, + w_dim=(z_dim // 2) * 3, + ) + + def forward(self, x, ws, feats, img, force_fp32=False): + _ = force_fp32 # unused + dtype = torch.float32 + memory_format = torch.contiguous_format + + x_global = x.clone() + # ToRGB. + x = self.fc(x) + x = x.view(-1, self.z_dim // 2, 4, 4) + x = x.to(dtype=dtype, memory_format=memory_format) + + # Main layers. + x_skip = feats[4].clone() + x = x + x_skip + + mod_vector = [] + mod_vector.append(ws[:, 0]) + mod_vector.append(x_global.clone()) + mod_vector = torch.cat(mod_vector, dim=1) + + x = self.conv(x, mod_vector) + + mod_vector = [] + mod_vector.append(ws[:, 2 * 2 - 3]) + mod_vector.append(x_global.clone()) + mod_vector = torch.cat(mod_vector, dim=1) + + if self.architecture == "skip": + img = self.torgb(x, mod_vector) + img = img.to(dtype=torch.float32, memory_format=torch.contiguous_format) + + assert x.dtype == dtype + return x, img + + +class SELayer(nn.Module): + def __init__(self, channel, reduction=16): + super(SELayer, self).__init__() + self.avg_pool = nn.AdaptiveAvgPool2d(1) + self.fc = nn.Sequential( + nn.Linear(channel, channel // reduction, bias=False), + nn.ReLU(inplace=False), + nn.Linear(channel // reduction, channel, bias=False), + nn.Sigmoid(), + ) + + def forward(self, x): + b, c, _, _ = x.size() + y = self.avg_pool(x).view(b, c) + y = self.fc(y).view(b, c, 1, 1) + res = x * y.expand_as(x) + return res + + +class FourierUnit(nn.Module): + def __init__( + self, + in_channels, + out_channels, + groups=1, + spatial_scale_factor=None, + spatial_scale_mode="bilinear", + spectral_pos_encoding=False, + use_se=False, + se_kwargs=None, + ffc3d=False, + fft_norm="ortho", + ): + # bn_layer not used + super(FourierUnit, self).__init__() + self.groups = groups + + self.conv_layer = torch.nn.Conv2d( + in_channels=in_channels * 2 + (2 if spectral_pos_encoding else 0), + out_channels=out_channels * 2, + kernel_size=1, + stride=1, + padding=0, + groups=self.groups, + bias=False, + ) + self.relu = torch.nn.ReLU(inplace=False) + + # squeeze and excitation block + self.use_se = use_se + if use_se: + if se_kwargs is None: + se_kwargs = {} + self.se = SELayer(self.conv_layer.in_channels, **se_kwargs) + + self.spatial_scale_factor = spatial_scale_factor + self.spatial_scale_mode = spatial_scale_mode + self.spectral_pos_encoding = spectral_pos_encoding + self.ffc3d = ffc3d + self.fft_norm = fft_norm + + def forward(self, x): + batch = x.shape[0] + + if self.spatial_scale_factor is not None: + orig_size = x.shape[-2:] + x = F.interpolate( + x, + scale_factor=self.spatial_scale_factor, + mode=self.spatial_scale_mode, + align_corners=False, + ) + + r_size = x.size() + # (batch, c, h, w/2+1, 2) + fft_dim = (-3, -2, -1) if self.ffc3d else (-2, -1) + ffted = fft.rfftn(x, dim=fft_dim, norm=self.fft_norm) + ffted = torch.stack((ffted.real, ffted.imag), dim=-1) + ffted = ffted.permute(0, 1, 4, 2, 3).contiguous() # (batch, c, 2, h, w/2+1) + ffted = ffted.view( + ( + batch, + -1, + ) + + ffted.size()[3:] + ) + + if self.spectral_pos_encoding: + height, width = 
ffted.shape[-2:] + coords_vert = ( + torch.linspace(0, 1, height)[None, None, :, None] + .expand(batch, 1, height, width) + .to(ffted) + ) + coords_hor = ( + torch.linspace(0, 1, width)[None, None, None, :] + .expand(batch, 1, height, width) + .to(ffted) + ) + ffted = torch.cat((coords_vert, coords_hor, ffted), dim=1) + + if self.use_se: + ffted = self.se(ffted) + + ffted = self.conv_layer(ffted) # (batch, c*2, h, w/2+1) + ffted = self.relu(ffted) + + ffted = ( + ffted.view( + ( + batch, + -1, + 2, + ) + + ffted.size()[2:] + ) + .permute(0, 1, 3, 4, 2) + .contiguous() + ) # (batch,c, t, h, w/2+1, 2) + ffted = torch.complex(ffted[..., 0], ffted[..., 1]) + + ifft_shape_slice = x.shape[-3:] if self.ffc3d else x.shape[-2:] + output = torch.fft.irfftn( + ffted, s=ifft_shape_slice, dim=fft_dim, norm=self.fft_norm + ) + + if self.spatial_scale_factor is not None: + output = F.interpolate( + output, + size=orig_size, + mode=self.spatial_scale_mode, + align_corners=False, + ) + + return output + + +class SpectralTransform(nn.Module): + def __init__( + self, + in_channels, + out_channels, + stride=1, + groups=1, + enable_lfu=True, + **fu_kwargs, + ): + # bn_layer not used + super(SpectralTransform, self).__init__() + self.enable_lfu = enable_lfu + if stride == 2: + self.downsample = nn.AvgPool2d(kernel_size=(2, 2), stride=2) + else: + self.downsample = nn.Identity() + + self.stride = stride + self.conv1 = nn.Sequential( + nn.Conv2d( + in_channels, out_channels // 2, kernel_size=1, groups=groups, bias=False + ), + # nn.BatchNorm2d(out_channels // 2), + nn.ReLU(inplace=True), + ) + self.fu = FourierUnit(out_channels // 2, out_channels // 2, groups, **fu_kwargs) + if self.enable_lfu: + self.lfu = FourierUnit(out_channels // 2, out_channels // 2, groups) + self.conv2 = torch.nn.Conv2d( + out_channels // 2, out_channels, kernel_size=1, groups=groups, bias=False + ) + + def forward(self, x): + x = self.downsample(x) + x = self.conv1(x) + output = self.fu(x) + + if self.enable_lfu: + n, c, h, w = x.shape + split_no = 2 + split_s = h // split_no + xs = torch.cat( + torch.split(x[:, : c // 4], split_s, dim=-2), dim=1 + ).contiguous() + xs = torch.cat(torch.split(xs, split_s, dim=-1), dim=1).contiguous() + xs = self.lfu(xs) + xs = xs.repeat(1, 1, split_no, split_no).contiguous() + else: + xs = 0 + + output = self.conv2(x + output + xs) + + return output + + +class FFC(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + enable_lfu=True, + padding_type="reflect", + gated=False, + **spectral_kwargs, + ): + super(FFC, self).__init__() + + assert stride == 1 or stride == 2, "Stride should be 1 or 2." 
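+ # Channels are split into a local and a global branch by ratio_gin/ratio_gout; local and cross paths use plain convs while the global->global path goes through the FFT-based SpectralTransform.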
+ self.stride = stride + + in_cg = int(in_channels * ratio_gin) + in_cl = in_channels - in_cg + out_cg = int(out_channels * ratio_gout) + out_cl = out_channels - out_cg + # groups_g = 1 if groups == 1 else int(groups * ratio_gout) + # groups_l = 1 if groups == 1 else groups - groups_g + + self.ratio_gin = ratio_gin + self.ratio_gout = ratio_gout + self.global_in_num = in_cg + + module = nn.Identity if in_cl == 0 or out_cl == 0 else nn.Conv2d + self.convl2l = module( + in_cl, + out_cl, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type, + ) + module = nn.Identity if in_cl == 0 or out_cg == 0 else nn.Conv2d + self.convl2g = module( + in_cl, + out_cg, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type, + ) + module = nn.Identity if in_cg == 0 or out_cl == 0 else nn.Conv2d + self.convg2l = module( + in_cg, + out_cl, + kernel_size, + stride, + padding, + dilation, + groups, + bias, + padding_mode=padding_type, + ) + module = nn.Identity if in_cg == 0 or out_cg == 0 else SpectralTransform + self.convg2g = module( + in_cg, + out_cg, + stride, + 1 if groups == 1 else groups // 2, + enable_lfu, + **spectral_kwargs, + ) + + self.gated = gated + module = ( + nn.Identity if in_cg == 0 or out_cl == 0 or not self.gated else nn.Conv2d + ) + self.gate = module(in_channels, 2, 1) + + def forward(self, x, fname=None): + x_l, x_g = x if type(x) is tuple else (x, 0) + out_xl, out_xg = 0, 0 + + if self.gated: + total_input_parts = [x_l] + if torch.is_tensor(x_g): + total_input_parts.append(x_g) + total_input = torch.cat(total_input_parts, dim=1) + + gates = torch.sigmoid(self.gate(total_input)) + g2l_gate, l2g_gate = gates.chunk(2, dim=1) + else: + g2l_gate, l2g_gate = 1, 1 + + spec_x = self.convg2g(x_g) + + if self.ratio_gout != 1: + out_xl = self.convl2l(x_l) + self.convg2l(x_g) * g2l_gate + if self.ratio_gout != 0: + out_xg = self.convl2g(x_l) * l2g_gate + spec_x + + return out_xl, out_xg + + +class FFC_BN_ACT(nn.Module): + def __init__( + self, + in_channels, + out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride=1, + padding=0, + dilation=1, + groups=1, + bias=False, + norm_layer=nn.SyncBatchNorm, + activation_layer=nn.Identity, + padding_type="reflect", + enable_lfu=True, + **kwargs, + ): + super(FFC_BN_ACT, self).__init__() + self.ffc = FFC( + in_channels, + out_channels, + kernel_size, + ratio_gin, + ratio_gout, + stride, + padding, + dilation, + groups, + bias, + enable_lfu, + padding_type=padding_type, + **kwargs, + ) + lnorm = nn.Identity if ratio_gout == 1 else norm_layer + gnorm = nn.Identity if ratio_gout == 0 else norm_layer + global_channels = int(out_channels * ratio_gout) + # self.bn_l = lnorm(out_channels - global_channels) + # self.bn_g = gnorm(global_channels) + + lact = nn.Identity if ratio_gout == 1 else activation_layer + gact = nn.Identity if ratio_gout == 0 else activation_layer + self.act_l = lact(inplace=True) + self.act_g = gact(inplace=True) + + def forward(self, x, fname=None): + x_l, x_g = self.ffc( + x, + fname=fname, + ) + x_l = self.act_l(x_l) + x_g = self.act_g(x_g) + return x_l, x_g + + +class FFCResnetBlock(nn.Module): + def __init__( + self, + dim, + padding_type, + norm_layer, + activation_layer=nn.ReLU, + dilation=1, + spatial_transform_kwargs=None, + inline=False, + ratio_gin=0.75, + ratio_gout=0.75, + ): + super().__init__() + self.conv1 = FFC_BN_ACT( + dim, + dim, + kernel_size=3, + padding=dilation, + dilation=dilation, + norm_layer=norm_layer, + 
activation_layer=activation_layer, + padding_type=padding_type, + ratio_gin=ratio_gin, + ratio_gout=ratio_gout, + ) + self.conv2 = FFC_BN_ACT( + dim, + dim, + kernel_size=3, + padding=dilation, + dilation=dilation, + norm_layer=norm_layer, + activation_layer=activation_layer, + padding_type=padding_type, + ratio_gin=ratio_gin, + ratio_gout=ratio_gout, + ) + self.inline = inline + + def forward(self, x, fname=None): + if self.inline: + x_l, x_g = ( + x[:, : -self.conv1.ffc.global_in_num], + x[:, -self.conv1.ffc.global_in_num :], + ) + else: + x_l, x_g = x if type(x) is tuple else (x, 0) + + id_l, id_g = x_l, x_g + + x_l, x_g = self.conv1((x_l, x_g), fname=fname) + x_l, x_g = self.conv2((x_l, x_g), fname=fname) + + x_l, x_g = id_l + x_l, id_g + x_g + out = x_l, x_g + if self.inline: + out = torch.cat(out, dim=1) + return out + + +class ConcatTupleLayer(nn.Module): + def forward(self, x): + assert isinstance(x, tuple) + x_l, x_g = x + assert torch.is_tensor(x_l) or torch.is_tensor(x_g) + if not torch.is_tensor(x_g): + return x_l + return torch.cat(x, dim=1) + + +class FFCBlock(torch.nn.Module): + def __init__( + self, + dim, # Number of output/input channels. + kernel_size, # Width and height of the convolution kernel. + padding, + ratio_gin=0.75, + ratio_gout=0.75, + activation="linear", # Activation function: 'relu', 'lrelu', etc. + ): + super().__init__() + if activation == "linear": + self.activation = nn.Identity + else: + self.activation = nn.ReLU + self.padding = padding + self.kernel_size = kernel_size + self.ffc_block = FFCResnetBlock( + dim=dim, + padding_type="reflect", + norm_layer=nn.SyncBatchNorm, + activation_layer=self.activation, + dilation=1, + ratio_gin=ratio_gin, + ratio_gout=ratio_gout, + ) + + self.concat_layer = ConcatTupleLayer() + + def forward(self, gen_ft, mask, fname=None): + x = gen_ft.float() + + x_l, x_g = ( + x[:, : -self.ffc_block.conv1.ffc.global_in_num], + x[:, -self.ffc_block.conv1.ffc.global_in_num :], + ) + id_l, id_g = x_l, x_g + + x_l, x_g = self.ffc_block((x_l, x_g), fname=fname) + x_l, x_g = id_l + x_l, id_g + x_g + x = self.concat_layer((x_l, x_g)) + + return x + gen_ft.float() + + +class FFCSkipLayer(torch.nn.Module): + def __init__( + self, + dim, # Number of input/output channels. + kernel_size=3, # Convolution kernel size. + ratio_gin=0.75, + ratio_gout=0.75, + ): + super().__init__() + self.padding = kernel_size // 2 + + self.ffc_act = FFCBlock( + dim=dim, + kernel_size=kernel_size, + activation=nn.ReLU, + padding=self.padding, + ratio_gin=ratio_gin, + ratio_gout=ratio_gout, + ) + + def forward(self, gen_ft, mask, fname=None): + x = self.ffc_act(gen_ft, mask, fname=fname) + return x + + +class SynthesisBlock(torch.nn.Module): + def __init__( + self, + in_channels, # Number of input channels, 0 = first block. + out_channels, # Number of output channels. + w_dim, # Intermediate latent (W) dimensionality. + resolution, # Resolution of this block. + img_channels, # Number of output color channels. + is_last, # Is this the last block? + architecture="skip", # Architecture: 'orig', 'skip', 'resnet'. + resample_filter=[ + 1, + 3, + 3, + 1, + ], # Low-pass filter to apply when resampling activations. + conv_clamp=None, # Clamp the output of convolution layers to +-X, None = disable clamping. + use_fp16=False, # Use FP16 for this block? + fp16_channels_last=False, # Use channels-last memory format with FP16? + **layer_kwargs, # Arguments for SynthesisLayer. 
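+ # res_ffc below sets how many FFC skip layers each resolution gets; they mix the encoder skip feature into the upsampled feature using the resized mask.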
+ ): + assert architecture in ["orig", "skip", "resnet"] + super().__init__() + self.in_channels = in_channels + self.w_dim = w_dim + self.resolution = resolution + self.img_channels = img_channels + self.is_last = is_last + self.architecture = architecture + self.use_fp16 = use_fp16 + self.channels_last = use_fp16 and fp16_channels_last + self.register_buffer("resample_filter", setup_filter(resample_filter)) + self.num_conv = 0 + self.num_torgb = 0 + self.res_ffc = {4: 0, 8: 0, 16: 0, 32: 1, 64: 1, 128: 1, 256: 1, 512: 1} + + if in_channels != 0 and resolution >= 8: + self.ffc_skip = nn.ModuleList() + for _ in range(self.res_ffc[resolution]): + self.ffc_skip.append(FFCSkipLayer(dim=out_channels)) + + if in_channels == 0: + self.const = torch.nn.Parameter( + torch.randn([out_channels, resolution, resolution]) + ) + + if in_channels != 0: + self.conv0 = SynthesisLayer( + in_channels, + out_channels, + w_dim=w_dim * 3, + resolution=resolution, + up=2, + resample_filter=resample_filter, + conv_clamp=conv_clamp, + channels_last=self.channels_last, + **layer_kwargs, + ) + self.num_conv += 1 + + self.conv1 = SynthesisLayer( + out_channels, + out_channels, + w_dim=w_dim * 3, + resolution=resolution, + conv_clamp=conv_clamp, + channels_last=self.channels_last, + **layer_kwargs, + ) + self.num_conv += 1 + + if is_last or architecture == "skip": + self.torgb = ToRGBLayer( + out_channels, + img_channels, + w_dim=w_dim * 3, + conv_clamp=conv_clamp, + channels_last=self.channels_last, + ) + self.num_torgb += 1 + + if in_channels != 0 and architecture == "resnet": + self.skip = Conv2dLayer( + in_channels, + out_channels, + kernel_size=1, + bias=False, + up=2, + resample_filter=resample_filter, + channels_last=self.channels_last, + ) + + def forward( + self, + x, + mask, + feats, + img, + ws, + fname=None, + force_fp32=False, + fused_modconv=None, + **layer_kwargs, + ): + dtype = torch.float16 if self.use_fp16 and not force_fp32 else torch.float32 + dtype = torch.float32 + memory_format = ( + torch.channels_last + if self.channels_last and not force_fp32 + else torch.contiguous_format + ) + if fused_modconv is None: + fused_modconv = (not self.training) and ( + dtype == torch.float32 or int(x.shape[0]) == 1 + ) + + x = x.to(dtype=dtype, memory_format=memory_format) + x_skip = ( + feats[self.resolution].clone().to(dtype=dtype, memory_format=memory_format) + ) + + # Main layers. + if self.in_channels == 0: + x = self.conv1(x, ws[1], fused_modconv=fused_modconv, **layer_kwargs) + elif self.architecture == "resnet": + y = self.skip(x, gain=np.sqrt(0.5)) + x = self.conv0( + x, ws[0].clone(), fused_modconv=fused_modconv, **layer_kwargs + ) + if len(self.ffc_skip) > 0: + mask = F.interpolate( + mask, + size=x_skip.shape[2:], + ) + z = x + x_skip + for fres in self.ffc_skip: + z = fres(z, mask) + x = x + z + else: + x = x + x_skip + x = self.conv1( + x, + ws[1].clone(), + fused_modconv=fused_modconv, + gain=np.sqrt(0.5), + **layer_kwargs, + ) + x = y.add_(x) + else: + x = self.conv0( + x, ws[0].clone(), fused_modconv=fused_modconv, **layer_kwargs + ) + if len(self.ffc_skip) > 0: + mask = F.interpolate( + mask, + size=x_skip.shape[2:], + ) + z = x + x_skip + for fres in self.ffc_skip: + z = fres(z, mask) + x = x + z + else: + x = x + x_skip + x = self.conv1( + x, ws[1].clone(), fused_modconv=fused_modconv, **layer_kwargs + ) + # ToRGB. 
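+ # With the 'skip' architecture every resolution contributes an RGB image: the running image is upsampled and the new contribution is added on top.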
+ if img is not None: + img = upsample2d(img, self.resample_filter) + if self.is_last or self.architecture == "skip": + y = self.torgb(x, ws[2].clone(), fused_modconv=fused_modconv) + y = y.to(dtype=torch.float32, memory_format=torch.contiguous_format) + img = img.add_(y) if img is not None else y + + x = x.to(dtype=dtype) + assert x.dtype == dtype + assert img is None or img.dtype == torch.float32 + return x, img + + +class SynthesisNetwork(torch.nn.Module): + def __init__( + self, + w_dim, # Intermediate latent (W) dimensionality. + z_dim, # Output Latent (Z) dimensionality. + img_resolution, # Output image resolution. + img_channels, # Number of color channels. + channel_base=16384, # Overall multiplier for the number of channels. + channel_max=512, # Maximum number of channels in any layer. + num_fp16_res=0, # Use FP16 for the N highest resolutions. + **block_kwargs, # Arguments for SynthesisBlock. + ): + assert img_resolution >= 4 and img_resolution & (img_resolution - 1) == 0 + super().__init__() + self.w_dim = w_dim + self.img_resolution = img_resolution + self.img_resolution_log2 = int(np.log2(img_resolution)) + self.img_channels = img_channels + self.block_resolutions = [ + 2**i for i in range(3, self.img_resolution_log2 + 1) + ] + channels_dict = { + res: min(channel_base // res, channel_max) for res in self.block_resolutions + } + fp16_resolution = max(2 ** (self.img_resolution_log2 + 1 - num_fp16_res), 8) + + self.foreword = SynthesisForeword( + img_channels=img_channels, + in_channels=min(channel_base // 4, channel_max), + z_dim=z_dim * 2, + resolution=4, + ) + + self.num_ws = self.img_resolution_log2 * 2 - 2 + for res in self.block_resolutions: + if res // 2 in channels_dict.keys(): + in_channels = channels_dict[res // 2] if res > 4 else 0 + else: + in_channels = min(channel_base // (res // 2), channel_max) + out_channels = channels_dict[res] + use_fp16 = res >= fp16_resolution + use_fp16 = False + is_last = res == self.img_resolution + block = SynthesisBlock( + in_channels, + out_channels, + w_dim=w_dim, + resolution=res, + img_channels=img_channels, + is_last=is_last, + use_fp16=use_fp16, + **block_kwargs, + ) + setattr(self, f"b{res}", block) + + def forward(self, x_global, mask, feats, ws, fname=None, **block_kwargs): + img = None + + x, img = self.foreword(x_global, ws, feats, img) + + for res in self.block_resolutions: + block = getattr(self, f"b{res}") + mod_vector0 = [] + mod_vector0.append(ws[:, int(np.log2(res)) * 2 - 5]) + mod_vector0.append(x_global.clone()) + mod_vector0 = torch.cat(mod_vector0, dim=1) + + mod_vector1 = [] + mod_vector1.append(ws[:, int(np.log2(res)) * 2 - 4]) + mod_vector1.append(x_global.clone()) + mod_vector1 = torch.cat(mod_vector1, dim=1) + + mod_vector_rgb = [] + mod_vector_rgb.append(ws[:, int(np.log2(res)) * 2 - 3]) + mod_vector_rgb.append(x_global.clone()) + mod_vector_rgb = torch.cat(mod_vector_rgb, dim=1) + x, img = block( + x, + mask, + feats, + img, + (mod_vector0, mod_vector1, mod_vector_rgb), + fname=fname, + **block_kwargs, + ) + return img + + +class MappingNetwork(torch.nn.Module): + def __init__( + self, + z_dim, # Input latent (Z) dimensionality, 0 = no latent. + c_dim, # Conditioning label (C) dimensionality, 0 = no label. + w_dim, # Intermediate latent (W) dimensionality. + num_ws, # Number of intermediate latents to output, None = do not broadcast. + num_layers=8, # Number of mapping layers. + embed_features=None, # Label embedding dimensionality, None = same as w_dim. 
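+ # (only used when c_dim > 0: the label embedding is concatenated with z before the mapping MLP)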
+ layer_features=None, # Number of intermediate features in the mapping layers, None = same as w_dim. + activation="lrelu", # Activation function: 'relu', 'lrelu', etc. + lr_multiplier=0.01, # Learning rate multiplier for the mapping layers. + w_avg_beta=0.995, # Decay for tracking the moving average of W during training, None = do not track. + ): + super().__init__() + self.z_dim = z_dim + self.c_dim = c_dim + self.w_dim = w_dim + self.num_ws = num_ws + self.num_layers = num_layers + self.w_avg_beta = w_avg_beta + + if embed_features is None: + embed_features = w_dim + if c_dim == 0: + embed_features = 0 + if layer_features is None: + layer_features = w_dim + features_list = ( + [z_dim + embed_features] + [layer_features] * (num_layers - 1) + [w_dim] + ) + + if c_dim > 0: + self.embed = FullyConnectedLayer(c_dim, embed_features) + for idx in range(num_layers): + in_features = features_list[idx] + out_features = features_list[idx + 1] + layer = FullyConnectedLayer( + in_features, + out_features, + activation=activation, + lr_multiplier=lr_multiplier, + ) + setattr(self, f"fc{idx}", layer) + + if num_ws is not None and w_avg_beta is not None: + self.register_buffer("w_avg", torch.zeros([w_dim])) + + def forward( + self, z, c, truncation_psi=1, truncation_cutoff=None, skip_w_avg_update=False + ): + # Embed, normalize, and concat inputs. + x = None + with torch.autograd.profiler.record_function("input"): + if self.z_dim > 0: + x = normalize_2nd_moment(z.to(torch.float32)) + if self.c_dim > 0: + y = normalize_2nd_moment(self.embed(c.to(torch.float32))) + x = torch.cat([x, y], dim=1) if x is not None else y + + # Main layers. + for idx in range(self.num_layers): + layer = getattr(self, f"fc{idx}") + x = layer(x) + + # Update moving average of W. + if self.w_avg_beta is not None and self.training and not skip_w_avg_update: + with torch.autograd.profiler.record_function("update_w_avg"): + self.w_avg.copy_( + x.detach().mean(dim=0).lerp(self.w_avg, self.w_avg_beta) + ) + + # Broadcast. + if self.num_ws is not None: + with torch.autograd.profiler.record_function("broadcast"): + x = x.unsqueeze(1).repeat([1, self.num_ws, 1]) + + # Apply truncation. + if truncation_psi != 1: + with torch.autograd.profiler.record_function("truncate"): + assert self.w_avg_beta is not None + if self.num_ws is None or truncation_cutoff is None: + x = self.w_avg.lerp(x, truncation_psi) + else: + x[:, :truncation_cutoff] = self.w_avg.lerp( + x[:, :truncation_cutoff], truncation_psi + ) + return x + + +class Generator(torch.nn.Module): + def __init__( + self, + z_dim, # Input latent (Z) dimensionality. + c_dim, # Conditioning label (C) dimensionality. + w_dim, # Intermediate latent (W) dimensionality. + img_resolution, # Output resolution. + img_channels, # Number of output color channels. + encoder_kwargs={}, # Arguments for EncoderNetwork. + mapping_kwargs={}, # Arguments for MappingNetwork. + synthesis_kwargs={}, # Arguments for SynthesisNetwork. 
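+ # Co-modulated generator: the encoder yields per-resolution features plus a global code, the mapping network produces ws, and the synthesis network conditions on both.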
+ ): + super().__init__() + self.z_dim = z_dim + self.c_dim = c_dim + self.w_dim = w_dim + self.img_resolution = img_resolution + self.img_channels = img_channels + self.encoder = EncoderNetwork( + c_dim=c_dim, + z_dim=z_dim, + img_resolution=img_resolution, + img_channels=img_channels, + **encoder_kwargs, + ) + self.synthesis = SynthesisNetwork( + z_dim=z_dim, + w_dim=w_dim, + img_resolution=img_resolution, + img_channels=img_channels, + **synthesis_kwargs, + ) + self.num_ws = self.synthesis.num_ws + self.mapping = MappingNetwork( + z_dim=z_dim, c_dim=c_dim, w_dim=w_dim, num_ws=self.num_ws, **mapping_kwargs + ) + + def forward( + self, + img, + c, + fname=None, + truncation_psi=1, + truncation_cutoff=None, + **synthesis_kwargs, + ): + mask = img[:, -1].unsqueeze(1) + x_global, z, feats = self.encoder(img, c) + ws = self.mapping( + z, c, truncation_psi=truncation_psi, truncation_cutoff=truncation_cutoff + ) + img = self.synthesis(x_global, mask, feats, ws, fname=fname, **synthesis_kwargs) + return img + + +FCF_MODEL_URL = os.environ.get( + "FCF_MODEL_URL", + "https://github.com/Sanster/models/releases/download/add_fcf/places_512_G.pth", +) +FCF_MODEL_MD5 = os.environ.get("FCF_MODEL_MD5", "3323152bc01bf1c56fd8aba74435a211") + + +class FcF(InpaintModel): + name = "fcf" + min_size = 512 + pad_mod = 512 + pad_to_square = True + is_erase_model = True + + def init_model(self, device, **kwargs): + seed = 0 + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + torch.backends.cudnn.deterministic = True + torch.backends.cudnn.benchmark = False + + kwargs = { + "channel_base": 1 * 32768, + "channel_max": 512, + "num_fp16_res": 4, + "conv_clamp": 256, + } + G = Generator( + z_dim=512, + c_dim=0, + w_dim=512, + img_resolution=512, + img_channels=3, + synthesis_kwargs=kwargs, + encoder_kwargs=kwargs, + mapping_kwargs={"num_layers": 2}, + ) + self.model = load_model(G, FCF_MODEL_URL, device, FCF_MODEL_MD5) + self.label = torch.zeros([1, self.model.c_dim], device=device) + + @staticmethod + def download(): + download_model(FCF_MODEL_URL, FCF_MODEL_MD5) + + @staticmethod + def is_downloaded() -> bool: + return os.path.exists(get_cache_path_by_url(FCF_MODEL_URL)) + + @torch.no_grad() + def __call__(self, image, mask, config: InpaintRequest): + """ + images: [H, W, C] RGB, not normalized + masks: [H, W] + return: BGR IMAGE + """ + if image.shape[0] == 512 and image.shape[1] == 512: + return self._pad_forward(image, mask, config) + + boxes = boxes_from_mask(mask) + crop_result = [] + config.hd_strategy_crop_margin = 128 + for box in boxes: + crop_image, crop_mask, crop_box = self._crop_box(image, mask, box, config) + origin_size = crop_image.shape[:2] + resize_image = resize_max_size(crop_image, size_limit=512) + resize_mask = resize_max_size(crop_mask, size_limit=512) + inpaint_result = self._pad_forward(resize_image, resize_mask, config) + + # only paste masked area result + inpaint_result = cv2.resize( + inpaint_result, + (origin_size[1], origin_size[0]), + interpolation=cv2.INTER_CUBIC, + ) + + original_pixel_indices = crop_mask < 127 + inpaint_result[original_pixel_indices] = crop_image[:, :, ::-1][ + original_pixel_indices + ] + + crop_result.append((inpaint_result, crop_box)) + + inpaint_result = image[:, :, ::-1].copy() + for crop_image, crop_box in crop_result: + x1, y1, x2, y2 = crop_box + inpaint_result[y1:y2, x1:x2, :] = crop_image + + return inpaint_result + + def forward(self, image, mask, config: InpaintRequest): + """Input images and output 
images have same size + images: [H, W, C] RGB + masks: [H, W] mask area == 255 + return: BGR IMAGE + """ + + image = norm_img(image) # [0, 1] + image = image * 2 - 1 # [0, 1] -> [-1, 1] + mask = (mask > 120) * 255 + mask = norm_img(mask) + + image = torch.from_numpy(image).unsqueeze(0).to(self.device) + mask = torch.from_numpy(mask).unsqueeze(0).to(self.device) + + erased_img = image * (1 - mask) + input_image = torch.cat([0.5 - mask, erased_img], dim=1) + + output = self.model( + input_image, self.label, truncation_psi=0.1, noise_mode="none" + ) + output = ( + (output.permute(0, 2, 3, 1) * 127.5 + 127.5) + .round() + .clamp(0, 255) + .to(torch.uint8) + ) + output = output[0].cpu().numpy() + cur_res = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) + return cur_res diff --git a/inpaint/model/helper/__init__.py b/inpaint/model/helper/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/helper/controlnet_preprocess.py b/inpaint/model/helper/controlnet_preprocess.py new file mode 100644 index 0000000..75c409f --- /dev/null +++ b/inpaint/model/helper/controlnet_preprocess.py @@ -0,0 +1,68 @@ +import torch +import PIL +import cv2 +from PIL import Image +import numpy as np + +from iopaint.helper import pad_img_to_modulo + + +def make_canny_control_image(image: np.ndarray) -> Image: + canny_image = cv2.Canny(image, 100, 200) + canny_image = canny_image[:, :, None] + canny_image = np.concatenate([canny_image, canny_image, canny_image], axis=2) + canny_image = PIL.Image.fromarray(canny_image) + control_image = canny_image + return control_image + + +def make_openpose_control_image(image: np.ndarray) -> Image: + from controlnet_aux import OpenposeDetector + + processor = OpenposeDetector.from_pretrained("lllyasviel/ControlNet") + control_image = processor(image, hand_and_face=True) + return control_image + + +def resize_image(input_image, resolution): + H, W, C = input_image.shape + H = float(H) + W = float(W) + k = float(resolution) / min(H, W) + H *= k + W *= k + H = int(np.round(H / 64.0)) * 64 + W = int(np.round(W / 64.0)) * 64 + img = cv2.resize( + input_image, + (W, H), + interpolation=cv2.INTER_LANCZOS4 if k > 1 else cv2.INTER_AREA, + ) + return img + + +def make_depth_control_image(image: np.ndarray) -> Image: + from controlnet_aux import MidasDetector + + midas = MidasDetector.from_pretrained("lllyasviel/Annotators") + + origin_height, origin_width = image.shape[:2] + pad_image = pad_img_to_modulo(image, mod=64, square=False, min_size=512) + depth_image = midas(pad_image) + depth_image = depth_image[0:origin_height, 0:origin_width] + depth_image = depth_image[:, :, None] + depth_image = np.concatenate([depth_image, depth_image, depth_image], axis=2) + control_image = PIL.Image.fromarray(depth_image) + return control_image + + +def make_inpaint_control_image(image: np.ndarray, mask: np.ndarray) -> torch.Tensor: + """ + image: [H, W, C] RGB + mask: [H, W, 1] 255 means area to repaint + """ + image = image.astype(np.float32) / 255.0 + image[mask[:, :, -1] > 128] = -1.0 # set as masked pixel + image = np.expand_dims(image, 0).transpose(0, 3, 1, 2) + image = torch.from_numpy(image) + return image diff --git a/inpaint/model/helper/cpu_text_encoder.py b/inpaint/model/helper/cpu_text_encoder.py new file mode 100644 index 0000000..116eb48 --- /dev/null +++ b/inpaint/model/helper/cpu_text_encoder.py @@ -0,0 +1,41 @@ +import torch +from transformers import PreTrainedModel + +from ..utils import torch_gc + + +class CPUTextEncoderWrapper(PreTrainedModel): + def __init__(self, 
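+ # text_encoder: the pipeline's text encoder, moved to CPU as float32 below (float16 is
+ # not supported for CPU inference); torch_dtype: dtype expected by the rest of the
+ # pipeline, which __call__ casts the encoder outputs back to.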
text_encoder, torch_dtype): + super().__init__(text_encoder.config) + self.config = text_encoder.config + self._device = text_encoder.device + # cpu not support float16 + self.text_encoder = text_encoder.to(torch.device("cpu"), non_blocking=True) + self.text_encoder = self.text_encoder.to(torch.float32, non_blocking=True) + self.torch_dtype = torch_dtype + del text_encoder + torch_gc() + + def __call__(self, x, **kwargs): + input_device = x.device + original_output = self.text_encoder(x.to(self.text_encoder.device), **kwargs) + for k, v in original_output.items(): + if isinstance(v, tuple): + original_output[k] = [ + v[i].to(input_device).to(self.torch_dtype) for i in range(len(v)) + ] + else: + original_output[k] = v.to(input_device).to(self.torch_dtype) + return original_output + + @property + def dtype(self): + return self.torch_dtype + + @property + def device(self) -> torch.device: + """ + `torch.device`: The device on which the module is (assuming that all the module parameters are on the same + device). + """ + return self._device \ No newline at end of file diff --git a/inpaint/model/helper/g_diffuser_bot.py b/inpaint/model/helper/g_diffuser_bot.py new file mode 100644 index 0000000..79b19aa --- /dev/null +++ b/inpaint/model/helper/g_diffuser_bot.py @@ -0,0 +1,62 @@ +import cv2 +import numpy as np + + +def expand_image(cv2_img, top: int, right: int, bottom: int, left: int): + assert cv2_img.shape[2] == 3 + origin_h, origin_w = cv2_img.shape[:2] + + # TODO: which is better? + # new_img = np.ones((new_height, new_width, 3), np.uint8) * 255 + new_img = cv2.copyMakeBorder( + cv2_img, top, bottom, left, right, cv2.BORDER_REPLICATE + ) + + inner_padding_left = 0 if left > 0 else 0 + inner_padding_right = 0 if right > 0 else 0 + inner_padding_top = 0 if top > 0 else 0 + inner_padding_bottom = 0 if bottom > 0 else 0 + + mask_image = np.zeros( + ( + origin_h - inner_padding_top - inner_padding_bottom, + origin_w - inner_padding_left - inner_padding_right, + ), + np.uint8, + ) + mask_image = cv2.copyMakeBorder( + mask_image, + top + inner_padding_top, + bottom + inner_padding_bottom, + left + inner_padding_left, + right + inner_padding_right, + cv2.BORDER_CONSTANT, + value=255, + ) + # k = 2*int(min(origin_h, origin_w) // 6)+1 + # k = 7 + # mask_image = cv2.GaussianBlur(mask_image, (k, k), 0) + return new_img, mask_image + + +if __name__ == "__main__": + from pathlib import Path + + current_dir = Path(__file__).parent.absolute().resolve() + image_path = "/Users/cwq/code/github/IOPaint/iopaint/tests/bunny.jpeg" + init_image = cv2.imread(str(image_path)) + init_image, mask_image = expand_image( + init_image, + top=0, + right=0, + bottom=0, + left=100, + softness=20, + space=20, + ) + print(mask_image.dtype, mask_image.min(), mask_image.max()) + print(init_image.dtype, init_image.min(), init_image.max()) + mask_image = mask_image.astype(np.uint8) + init_image = init_image.astype(np.uint8) + cv2.imwrite("expanded_image.png", init_image) + cv2.imwrite("expanded_mask.png", mask_image) diff --git a/inpaint/model/instruct_pix2pix.py b/inpaint/model/instruct_pix2pix.py new file mode 100644 index 0000000..fc8cd26 --- /dev/null +++ b/inpaint/model/instruct_pix2pix.py @@ -0,0 +1,64 @@ +import PIL.Image +import cv2 +import torch +from loguru import logger + +from iopaint.const import INSTRUCT_PIX2PIX_NAME +from .base import DiffusionInpaintModel +from iopaint.schema import InpaintRequest +from .utils import get_torch_dtype, enable_low_mem, is_local_files_only + + +class 
InstructPix2Pix(DiffusionInpaintModel): + name = INSTRUCT_PIX2PIX_NAME + pad_mod = 8 + min_size = 512 + + def init_model(self, device: torch.device, **kwargs): + from diffusers import StableDiffusionInstructPix2PixPipeline + + use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False)) + + model_kwargs = {"local_files_only": is_local_files_only(**kwargs)} + if kwargs["disable_nsfw"] or kwargs.get("cpu_offload", False): + logger.info("Disable Stable Diffusion Model NSFW checker") + model_kwargs.update( + dict( + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) + ) + + self.model = StableDiffusionInstructPix2PixPipeline.from_pretrained( + self.name, variant="fp16", torch_dtype=torch_dtype, **model_kwargs + ) + enable_low_mem(self.model, kwargs.get("low_mem", False)) + + if kwargs.get("cpu_offload", False) and use_gpu: + logger.info("Enable sequential cpu offload") + self.model.enable_sequential_cpu_offload(gpu_id=0) + else: + self.model = self.model.to(device) + + def forward(self, image, mask, config: InpaintRequest): + """Input image and output image have same size + image: [H, W, C] RGB + mask: [H, W, 1] 255 means area to repaint + return: BGR IMAGE + edit = pipe(prompt, image=image, num_inference_steps=20, image_guidance_scale=1.5, guidance_scale=7).images[0] + """ + output = self.model( + image=PIL.Image.fromarray(image), + prompt=config.prompt, + negative_prompt=config.negative_prompt, + num_inference_steps=config.sd_steps, + image_guidance_scale=config.p2p_image_guidance_scale, + guidance_scale=config.sd_guidance_scale, + output_type="np", + generator=torch.manual_seed(config.sd_seed), + ).images[0] + + output = (output * 255).round().astype("uint8") + output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) + return output diff --git a/inpaint/model/kandinsky.py b/inpaint/model/kandinsky.py new file mode 100644 index 0000000..1a0bf1c --- /dev/null +++ b/inpaint/model/kandinsky.py @@ -0,0 +1,65 @@ +import PIL.Image +import cv2 +import numpy as np +import torch + +from iopaint.const import KANDINSKY22_NAME +from .base import DiffusionInpaintModel +from iopaint.schema import InpaintRequest +from .utils import get_torch_dtype, enable_low_mem, is_local_files_only + + +class Kandinsky(DiffusionInpaintModel): + pad_mod = 64 + min_size = 512 + + def init_model(self, device: torch.device, **kwargs): + from diffusers import AutoPipelineForInpainting + + use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False)) + + model_kwargs = { + "torch_dtype": torch_dtype, + "local_files_only": is_local_files_only(**kwargs), + } + self.model = AutoPipelineForInpainting.from_pretrained( + self.name, **model_kwargs + ).to(device) + enable_low_mem(self.model, kwargs.get("low_mem", False)) + + self.callback = kwargs.pop("callback", None) + + def forward(self, image, mask, config: InpaintRequest): + """Input image and output image have same size + image: [H, W, C] RGB + mask: [H, W, 1] 255 means area to repaint + return: BGR IMAGE + """ + self.set_scheduler(config) + + generator = torch.manual_seed(config.sd_seed) + mask = mask.astype(np.float32) / 255 + img_h, img_w = image.shape[:2] + + # kandinsky 没有 strength + output = self.model( + prompt=config.prompt, + negative_prompt=config.negative_prompt, + image=PIL.Image.fromarray(image), + mask_image=mask[:, :, 0], + height=img_h, + width=img_w, + num_inference_steps=config.sd_steps, + guidance_scale=config.sd_guidance_scale, + output_type="np", + callback_on_step_end=self.callback, + 
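+ # the Kandinsky inpaint pipeline exposes no `strength` argument, so none is passed here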
generator=generator, + ).images[0] + + output = (output * 255).round().astype("uint8") + output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) + return output + + +class Kandinsky22(Kandinsky): + name = KANDINSKY22_NAME diff --git a/inpaint/model/lama.py b/inpaint/model/lama.py new file mode 100644 index 0000000..7aba242 --- /dev/null +++ b/inpaint/model/lama.py @@ -0,0 +1,57 @@ +import os + +import cv2 +import numpy as np +import torch + +from iopaint.helper import ( + norm_img, + get_cache_path_by_url, + load_jit_model, + download_model, +) +from iopaint.schema import InpaintRequest +from .base import InpaintModel + +LAMA_MODEL_URL = os.environ.get( + "LAMA_MODEL_URL", + "https://github.com/Sanster/models/releases/download/add_big_lama/big-lama.pt", +) +LAMA_MODEL_MD5 = os.environ.get("LAMA_MODEL_MD5", "e3aa4aaa15225a33ec84f9f4bc47e500") + + +class LaMa(InpaintModel): + name = "lama" + pad_mod = 8 + is_erase_model = True + + @staticmethod + def download(): + download_model(LAMA_MODEL_URL, LAMA_MODEL_MD5) + + def init_model(self, device, **kwargs): + self.model = load_jit_model(LAMA_MODEL_URL, device, LAMA_MODEL_MD5).eval() + + @staticmethod + def is_downloaded() -> bool: + return os.path.exists(get_cache_path_by_url(LAMA_MODEL_URL)) + + def forward(self, image, mask, config: InpaintRequest): + """Input image and output image have same size + image: [H, W, C] RGB + mask: [H, W] + return: BGR IMAGE + """ + image = norm_img(image) + mask = norm_img(mask) + + mask = (mask > 0) * 1 + image = torch.from_numpy(image).unsqueeze(0).to(self.device) + mask = torch.from_numpy(mask).unsqueeze(0).to(self.device) + + inpainted_image = self.model(image, mask) + + cur_res = inpainted_image[0].permute(1, 2, 0).detach().cpu().numpy() + cur_res = np.clip(cur_res * 255, 0, 255).astype("uint8") + cur_res = cv2.cvtColor(cur_res, cv2.COLOR_RGB2BGR) + return cur_res diff --git a/inpaint/model/ldm.py b/inpaint/model/ldm.py new file mode 100644 index 0000000..19e51a3 --- /dev/null +++ b/inpaint/model/ldm.py @@ -0,0 +1,336 @@ +import os + +import numpy as np +import torch +from loguru import logger + +from .base import InpaintModel +from .ddim_sampler import DDIMSampler +from .plms_sampler import PLMSSampler +from iopaint.schema import InpaintRequest, LDMSampler + +torch.manual_seed(42) +import torch.nn as nn +from iopaint.helper import ( + download_model, + norm_img, + get_cache_path_by_url, + load_jit_model, +) +from .utils import ( + make_beta_schedule, + timestep_embedding, +) + +LDM_ENCODE_MODEL_URL = os.environ.get( + "LDM_ENCODE_MODEL_URL", + "https://github.com/Sanster/models/releases/download/add_ldm/cond_stage_model_encode.pt", +) +LDM_ENCODE_MODEL_MD5 = os.environ.get( + "LDM_ENCODE_MODEL_MD5", "23239fc9081956a3e70de56472b3f296" +) + +LDM_DECODE_MODEL_URL = os.environ.get( + "LDM_DECODE_MODEL_URL", + "https://github.com/Sanster/models/releases/download/add_ldm/cond_stage_model_decode.pt", +) +LDM_DECODE_MODEL_MD5 = os.environ.get( + "LDM_DECODE_MODEL_MD5", "fe419cd15a750d37a4733589d0d3585c" +) + +LDM_DIFFUSION_MODEL_URL = os.environ.get( + "LDM_DIFFUSION_MODEL_URL", + "https://github.com/Sanster/models/releases/download/add_ldm/diffusion.pt", +) + +LDM_DIFFUSION_MODEL_MD5 = os.environ.get( + "LDM_DIFFUSION_MODEL_MD5", "b0afda12bf790c03aba2a7431f11d22d" +) + + +class DDPM(nn.Module): + # classic DDPM with Gaussian diffusion, in image space + def __init__( + self, + device, + timesteps=1000, + beta_schedule="linear", + linear_start=0.0015, + linear_end=0.0205, + cosine_s=0.008, + original_elbo_weight=0.0, + 
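+ # original_elbo_weight and l_simple_weight are training-loss weights; this inference-only
+ # port only stores them as attributes and never computes a loss.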
v_posterior=0.0, # weight for choosing posterior variance as sigma = (1-v) * beta_tilde + v * beta + l_simple_weight=1.0, + parameterization="eps", # all assuming fixed variance schedules + use_positional_encodings=False, + ): + super().__init__() + self.device = device + self.parameterization = parameterization + self.use_positional_encodings = use_positional_encodings + + self.v_posterior = v_posterior + self.original_elbo_weight = original_elbo_weight + self.l_simple_weight = l_simple_weight + + self.register_schedule( + beta_schedule=beta_schedule, + timesteps=timesteps, + linear_start=linear_start, + linear_end=linear_end, + cosine_s=cosine_s, + ) + + def register_schedule( + self, + given_betas=None, + beta_schedule="linear", + timesteps=1000, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3, + ): + betas = make_beta_schedule( + self.device, + beta_schedule, + timesteps, + linear_start=linear_start, + linear_end=linear_end, + cosine_s=cosine_s, + ) + alphas = 1.0 - betas + alphas_cumprod = np.cumprod(alphas, axis=0) + alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1]) + + (timesteps,) = betas.shape + self.num_timesteps = int(timesteps) + self.linear_start = linear_start + self.linear_end = linear_end + assert ( + alphas_cumprod.shape[0] == self.num_timesteps + ), "alphas have to be defined for each timestep" + + to_torch = lambda x: torch.tensor(x, dtype=torch.float32).to(self.device) + + self.register_buffer("betas", to_torch(betas)) + self.register_buffer("alphas_cumprod", to_torch(alphas_cumprod)) + self.register_buffer("alphas_cumprod_prev", to_torch(alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer("sqrt_alphas_cumprod", to_torch(np.sqrt(alphas_cumprod))) + self.register_buffer( + "sqrt_one_minus_alphas_cumprod", to_torch(np.sqrt(1.0 - alphas_cumprod)) + ) + self.register_buffer( + "log_one_minus_alphas_cumprod", to_torch(np.log(1.0 - alphas_cumprod)) + ) + self.register_buffer( + "sqrt_recip_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod)) + ) + self.register_buffer( + "sqrt_recipm1_alphas_cumprod", to_torch(np.sqrt(1.0 / alphas_cumprod - 1)) + ) + + # calculations for posterior q(x_{t-1} | x_t, x_0) + posterior_variance = (1 - self.v_posterior) * betas * ( + 1.0 - alphas_cumprod_prev + ) / (1.0 - alphas_cumprod) + self.v_posterior * betas + # above: equal to 1. / (1. / (1. 
- alpha_cumprod_tm1) + alpha_t / beta_t) + self.register_buffer("posterior_variance", to_torch(posterior_variance)) + # below: log calculation clipped because the posterior variance is 0 at the beginning of the diffusion chain + self.register_buffer( + "posterior_log_variance_clipped", + to_torch(np.log(np.maximum(posterior_variance, 1e-20))), + ) + self.register_buffer( + "posterior_mean_coef1", + to_torch(betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod)), + ) + self.register_buffer( + "posterior_mean_coef2", + to_torch( + (1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod) + ), + ) + + if self.parameterization == "eps": + lvlb_weights = self.betas**2 / ( + 2 + * self.posterior_variance + * to_torch(alphas) + * (1 - self.alphas_cumprod) + ) + elif self.parameterization == "x0": + lvlb_weights = ( + 0.5 + * np.sqrt(torch.Tensor(alphas_cumprod)) + / (2.0 * 1 - torch.Tensor(alphas_cumprod)) + ) + else: + raise NotImplementedError("mu not supported") + # TODO how to choose this term + lvlb_weights[0] = lvlb_weights[1] + self.register_buffer("lvlb_weights", lvlb_weights, persistent=False) + assert not torch.isnan(self.lvlb_weights).all() + + +class LatentDiffusion(DDPM): + def __init__( + self, + diffusion_model, + device, + cond_stage_key="image", + cond_stage_trainable=False, + concat_mode=True, + scale_factor=1.0, + scale_by_std=False, + *args, + **kwargs, + ): + self.num_timesteps_cond = 1 + self.scale_by_std = scale_by_std + super().__init__(device, *args, **kwargs) + self.diffusion_model = diffusion_model + self.concat_mode = concat_mode + self.cond_stage_trainable = cond_stage_trainable + self.cond_stage_key = cond_stage_key + self.num_downs = 2 + self.scale_factor = scale_factor + + def make_cond_schedule( + self, + ): + self.cond_ids = torch.full( + size=(self.num_timesteps,), + fill_value=self.num_timesteps - 1, + dtype=torch.long, + ) + ids = torch.round( + torch.linspace(0, self.num_timesteps - 1, self.num_timesteps_cond) + ).long() + self.cond_ids[: self.num_timesteps_cond] = ids + + def register_schedule( + self, + given_betas=None, + beta_schedule="linear", + timesteps=1000, + linear_start=1e-4, + linear_end=2e-2, + cosine_s=8e-3, + ): + super().register_schedule( + given_betas, beta_schedule, timesteps, linear_start, linear_end, cosine_s + ) + + self.shorten_cond_schedule = self.num_timesteps_cond > 1 + if self.shorten_cond_schedule: + self.make_cond_schedule() + + def apply_model(self, x_noisy, t, cond): + # x_recon = self.model(x_noisy, t, cond['c_concat'][0]) # cond['c_concat'][0].shape 1,4,128,128 + t_emb = timestep_embedding(x_noisy.device, t, 256, repeat_only=False) + x_recon = self.diffusion_model(x_noisy, t_emb, cond) + return x_recon + + +class LDM(InpaintModel): + name = "ldm" + pad_mod = 32 + is_erase_model = True + + def __init__(self, device, fp16: bool = True, **kwargs): + self.fp16 = fp16 + super().__init__(device) + self.device = device + + def init_model(self, device, **kwargs): + self.diffusion_model = load_jit_model( + LDM_DIFFUSION_MODEL_URL, device, LDM_DIFFUSION_MODEL_MD5 + ) + self.cond_stage_model_decode = load_jit_model( + LDM_DECODE_MODEL_URL, device, LDM_DECODE_MODEL_MD5 + ) + self.cond_stage_model_encode = load_jit_model( + LDM_ENCODE_MODEL_URL, device, LDM_ENCODE_MODEL_MD5 + ) + if self.fp16 and "cuda" in str(device): + self.diffusion_model = self.diffusion_model.half() + self.cond_stage_model_decode = self.cond_stage_model_decode.half() + self.cond_stage_model_encode = self.cond_stage_model_encode.half() + + 
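+ # wrap the scripted UNet in the LatentDiffusion schedule wrapper; the cond-stage
+ # encode/decode models stay standalone scripted modules used directly in forward()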
self.model = LatentDiffusion(self.diffusion_model, device) + + @staticmethod + def download(): + download_model(LDM_DIFFUSION_MODEL_URL, LDM_DIFFUSION_MODEL_MD5) + download_model(LDM_DECODE_MODEL_URL, LDM_DECODE_MODEL_MD5) + download_model(LDM_ENCODE_MODEL_URL, LDM_ENCODE_MODEL_MD5) + + @staticmethod + def is_downloaded() -> bool: + model_paths = [ + get_cache_path_by_url(LDM_DIFFUSION_MODEL_URL), + get_cache_path_by_url(LDM_DECODE_MODEL_URL), + get_cache_path_by_url(LDM_ENCODE_MODEL_URL), + ] + return all([os.path.exists(it) for it in model_paths]) + + @torch.cuda.amp.autocast() + def forward(self, image, mask, config: InpaintRequest): + """ + image: [H, W, C] RGB + mask: [H, W, 1] + return: BGR IMAGE + """ + # image [1,3,512,512] float32 + # mask: [1,1,512,512] float32 + # masked_image: [1,3,512,512] float32 + if config.ldm_sampler == LDMSampler.ddim: + sampler = DDIMSampler(self.model) + elif config.ldm_sampler == LDMSampler.plms: + sampler = PLMSSampler(self.model) + else: + raise ValueError() + + steps = config.ldm_steps + image = norm_img(image) + mask = norm_img(mask) + + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + image = torch.from_numpy(image).unsqueeze(0).to(self.device) + mask = torch.from_numpy(mask).unsqueeze(0).to(self.device) + masked_image = (1 - mask) * image + + mask = self._norm(mask) + masked_image = self._norm(masked_image) + + c = self.cond_stage_model_encode(masked_image) + torch.cuda.empty_cache() + + cc = torch.nn.functional.interpolate(mask, size=c.shape[-2:]) # 1,1,128,128 + c = torch.cat((c, cc), dim=1) # 1,4,128,128 + + shape = (c.shape[1] - 1,) + c.shape[2:] + samples_ddim = sampler.sample( + steps=steps, conditioning=c, batch_size=c.shape[0], shape=shape + ) + torch.cuda.empty_cache() + x_samples_ddim = self.cond_stage_model_decode( + samples_ddim + ) # samples_ddim: 1, 3, 128, 128 float32 + torch.cuda.empty_cache() + + # image = torch.clamp((image + 1.0) / 2.0, min=0.0, max=1.0) + # mask = torch.clamp((mask + 1.0) / 2.0, min=0.0, max=1.0) + inpainted_image = torch.clamp((x_samples_ddim + 1.0) / 2.0, min=0.0, max=1.0) + + # inpainted = (1 - mask) * image + mask * predicted_image + inpainted_image = inpainted_image.cpu().numpy().transpose(0, 2, 3, 1)[0] * 255 + inpainted_image = inpainted_image.astype(np.uint8)[:, :, ::-1] + return inpainted_image + + def _norm(self, tensor): + return tensor * 2.0 - 1.0 diff --git a/inpaint/model/manga.py b/inpaint/model/manga.py new file mode 100644 index 0000000..1f58251 --- /dev/null +++ b/inpaint/model/manga.py @@ -0,0 +1,97 @@ +import os +import random + +import cv2 +import numpy as np +import torch +import time +from loguru import logger + +from iopaint.helper import get_cache_path_by_url, load_jit_model, download_model +from .base import InpaintModel +from iopaint.schema import InpaintRequest + + +MANGA_INPAINTOR_MODEL_URL = os.environ.get( + "MANGA_INPAINTOR_MODEL_URL", + "https://github.com/Sanster/models/releases/download/manga/manga_inpaintor.jit", +) +MANGA_INPAINTOR_MODEL_MD5 = os.environ.get( + "MANGA_INPAINTOR_MODEL_MD5", "7d8b269c4613b6b3768af714610da86c" +) + +MANGA_LINE_MODEL_URL = os.environ.get( + "MANGA_LINE_MODEL_URL", + "https://github.com/Sanster/models/releases/download/manga/erika.jit", +) +MANGA_LINE_MODEL_MD5 = os.environ.get( + "MANGA_LINE_MODEL_MD5", "0c926d5a4af8450b0d00bc5b9a095644" +) + + +class Manga(InpaintModel): + name = "manga" + pad_mod = 16 + is_erase_model = True + + def init_model(self, device, **kwargs): + self.inpaintor_model = load_jit_model( + MANGA_INPAINTOR_MODEL_URL, 
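+ # two-model pipeline: the erika line model extracts line art from the grayscale page,
+ # and the inpaintor fills the masked region given the gray image, lines, mask and noise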
device, MANGA_INPAINTOR_MODEL_MD5 + ) + self.line_model = load_jit_model( + MANGA_LINE_MODEL_URL, device, MANGA_LINE_MODEL_MD5 + ) + self.seed = 42 + + @staticmethod + def download(): + download_model(MANGA_INPAINTOR_MODEL_URL, MANGA_INPAINTOR_MODEL_MD5) + download_model(MANGA_LINE_MODEL_URL, MANGA_LINE_MODEL_MD5) + + @staticmethod + def is_downloaded() -> bool: + model_paths = [ + get_cache_path_by_url(MANGA_INPAINTOR_MODEL_URL), + get_cache_path_by_url(MANGA_LINE_MODEL_URL), + ] + return all([os.path.exists(it) for it in model_paths]) + + def forward(self, image, mask, config: InpaintRequest): + """ + image: [H, W, C] RGB + mask: [H, W, 1] + return: BGR IMAGE + """ + seed = self.seed + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + gray_img = cv2.cvtColor(image, cv2.COLOR_RGB2GRAY) + gray_img = torch.from_numpy( + gray_img[np.newaxis, np.newaxis, :, :].astype(np.float32) + ).to(self.device) + start = time.time() + lines = self.line_model(gray_img) + torch.cuda.empty_cache() + lines = torch.clamp(lines, 0, 255) + logger.info(f"erika_model time: {time.time() - start}") + + mask = torch.from_numpy(mask[np.newaxis, :, :, :]).to(self.device) + mask = mask.permute(0, 3, 1, 2) + mask = torch.where(mask > 0.5, 1.0, 0.0) + noise = torch.randn_like(mask) + ones = torch.ones_like(mask) + + gray_img = gray_img / 255 * 2 - 1.0 + lines = lines / 255 * 2 - 1.0 + + start = time.time() + inpainted_image = self.inpaintor_model(gray_img, lines, mask, noise, ones) + logger.info(f"image_inpaintor_model time: {time.time() - start}") + + cur_res = inpainted_image[0].permute(1, 2, 0).detach().cpu().numpy() + cur_res = (cur_res * 127.5 + 127.5).astype(np.uint8) + cur_res = cv2.cvtColor(cur_res, cv2.COLOR_GRAY2BGR) + return cur_res diff --git a/inpaint/model/mat.py b/inpaint/model/mat.py new file mode 100644 index 0000000..0c5360f --- /dev/null +++ b/inpaint/model/mat.py @@ -0,0 +1,1945 @@ +import os +import random + +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint + +from iopaint.helper import ( + load_model, + get_cache_path_by_url, + norm_img, + download_model, +) +from iopaint.schema import InpaintRequest +from .base import InpaintModel +from .utils import ( + setup_filter, + Conv2dLayer, + FullyConnectedLayer, + conv2d_resample, + bias_act, + upsample2d, + activation_funcs, + MinibatchStdLayer, + to_2tuple, + normalize_2nd_moment, + set_seed, +) + + +class ModulatedConv2d(nn.Module): + def __init__( + self, + in_channels, # Number of input channels. + out_channels, # Number of output channels. + kernel_size, # Width and height of the convolution kernel. + style_dim, # dimension of the style code + demodulate=True, # perfrom demodulation + up=1, # Integer upsampling factor. + down=1, # Integer downsampling factor. + resample_filter=[ + 1, + 3, + 3, + 1, + ], # Low-pass filter to apply when resampling activations. + conv_clamp=None, # Clamp the output to +-X, None = disable clamping. 
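+ # StyleGAN2-style modulated convolution: forward() scales the weight's input channels
+ # per sample with the affine-transformed style, optionally demodulates, and folds the
+ # batch into the group dimension of a single conv2d_resample call.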
+ ): + super().__init__() + self.demodulate = demodulate + + self.weight = torch.nn.Parameter( + torch.randn([1, out_channels, in_channels, kernel_size, kernel_size]) + ) + self.out_channels = out_channels + self.kernel_size = kernel_size + self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size**2)) + self.padding = self.kernel_size // 2 + self.up = up + self.down = down + self.register_buffer("resample_filter", setup_filter(resample_filter)) + self.conv_clamp = conv_clamp + + self.affine = FullyConnectedLayer(style_dim, in_channels, bias_init=1) + + def forward(self, x, style): + batch, in_channels, height, width = x.shape + style = self.affine(style).view(batch, 1, in_channels, 1, 1) + weight = self.weight * self.weight_gain * style + + if self.demodulate: + decoefs = (weight.pow(2).sum(dim=[2, 3, 4]) + 1e-8).rsqrt() + weight = weight * decoefs.view(batch, self.out_channels, 1, 1, 1) + + weight = weight.view( + batch * self.out_channels, in_channels, self.kernel_size, self.kernel_size + ) + x = x.view(1, batch * in_channels, height, width) + x = conv2d_resample( + x=x, + w=weight, + f=self.resample_filter, + up=self.up, + down=self.down, + padding=self.padding, + groups=batch, + ) + out = x.view(batch, self.out_channels, *x.shape[2:]) + + return out + + +class StyleConv(torch.nn.Module): + def __init__( + self, + in_channels, # Number of input channels. + out_channels, # Number of output channels. + style_dim, # Intermediate latent (W) dimensionality. + resolution, # Resolution of this layer. + kernel_size=3, # Convolution kernel size. + up=1, # Integer upsampling factor. + use_noise=False, # Enable noise input? + activation="lrelu", # Activation function: 'relu', 'lrelu', etc. + resample_filter=[ + 1, + 3, + 3, + 1, + ], # Low-pass filter to apply when resampling activations. + conv_clamp=None, # Clamp the output of convolution layers to +-X, None = disable clamping. 
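+ # StyleConv = ModulatedConv2d followed by optional per-pixel noise injection and a
+ # bias_act activation (with optional clamping); up=2 gives the upsampling variant.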
+ demodulate=True, # perform demodulation + ): + super().__init__() + + self.conv = ModulatedConv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + style_dim=style_dim, + demodulate=demodulate, + up=up, + resample_filter=resample_filter, + conv_clamp=conv_clamp, + ) + + self.use_noise = use_noise + self.resolution = resolution + if use_noise: + self.register_buffer("noise_const", torch.randn([resolution, resolution])) + self.noise_strength = torch.nn.Parameter(torch.zeros([])) + + self.bias = torch.nn.Parameter(torch.zeros([out_channels])) + self.activation = activation + self.act_gain = activation_funcs[activation].def_gain + self.conv_clamp = conv_clamp + + def forward(self, x, style, noise_mode="random", gain=1): + x = self.conv(x, style) + + assert noise_mode in ["random", "const", "none"] + + if self.use_noise: + if noise_mode == "random": + xh, xw = x.size()[-2:] + noise = ( + torch.randn([x.shape[0], 1, xh, xw], device=x.device) + * self.noise_strength + ) + if noise_mode == "const": + noise = self.noise_const * self.noise_strength + x = x + noise + + act_gain = self.act_gain * gain + act_clamp = self.conv_clamp * gain if self.conv_clamp is not None else None + out = bias_act( + x, self.bias, act=self.activation, gain=act_gain, clamp=act_clamp + ) + + return out + + +class ToRGB(torch.nn.Module): + def __init__( + self, + in_channels, + out_channels, + style_dim, + kernel_size=1, + resample_filter=[1, 3, 3, 1], + conv_clamp=None, + demodulate=False, + ): + super().__init__() + + self.conv = ModulatedConv2d( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=kernel_size, + style_dim=style_dim, + demodulate=demodulate, + resample_filter=resample_filter, + conv_clamp=conv_clamp, + ) + self.bias = torch.nn.Parameter(torch.zeros([out_channels])) + self.register_buffer("resample_filter", setup_filter(resample_filter)) + self.conv_clamp = conv_clamp + + def forward(self, x, style, skip=None): + x = self.conv(x, style) + out = bias_act(x, self.bias, clamp=self.conv_clamp) + + if skip is not None: + if skip.shape != out.shape: + skip = upsample2d(skip, self.resample_filter) + out = out + skip + + return out + + +def get_style_code(a, b): + return torch.cat([a, b], dim=1) + + +class DecBlockFirst(nn.Module): + def __init__( + self, + in_channels, + out_channels, + activation, + style_dim, + use_noise, + demodulate, + img_channels, + ): + super().__init__() + self.fc = FullyConnectedLayer( + in_features=in_channels * 2, + out_features=in_channels * 4**2, + activation=activation, + ) + self.conv = StyleConv( + in_channels=in_channels, + out_channels=out_channels, + style_dim=style_dim, + resolution=4, + kernel_size=3, + use_noise=use_noise, + activation=activation, + demodulate=demodulate, + ) + self.toRGB = ToRGB( + in_channels=out_channels, + out_channels=img_channels, + style_dim=style_dim, + kernel_size=1, + demodulate=False, + ) + + def forward(self, x, ws, gs, E_features, noise_mode="random"): + x = self.fc(x).view(x.shape[0], -1, 4, 4) + x = x + E_features[2] + style = get_style_code(ws[:, 0], gs) + x = self.conv(x, style, noise_mode=noise_mode) + style = get_style_code(ws[:, 1], gs) + img = self.toRGB(x, style, skip=None) + + return x, img + + +class DecBlockFirstV2(nn.Module): + def __init__( + self, + in_channels, + out_channels, + activation, + style_dim, + use_noise, + demodulate, + img_channels, + ): + super().__init__() + self.conv0 = Conv2dLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + 
activation=activation, + ) + self.conv1 = StyleConv( + in_channels=in_channels, + out_channels=out_channels, + style_dim=style_dim, + resolution=4, + kernel_size=3, + use_noise=use_noise, + activation=activation, + demodulate=demodulate, + ) + self.toRGB = ToRGB( + in_channels=out_channels, + out_channels=img_channels, + style_dim=style_dim, + kernel_size=1, + demodulate=False, + ) + + def forward(self, x, ws, gs, E_features, noise_mode="random"): + # x = self.fc(x).view(x.shape[0], -1, 4, 4) + x = self.conv0(x) + x = x + E_features[2] + style = get_style_code(ws[:, 0], gs) + x = self.conv1(x, style, noise_mode=noise_mode) + style = get_style_code(ws[:, 1], gs) + img = self.toRGB(x, style, skip=None) + + return x, img + + +class DecBlock(nn.Module): + def __init__( + self, + res, + in_channels, + out_channels, + activation, + style_dim, + use_noise, + demodulate, + img_channels, + ): # res = 2, ..., resolution_log2 + super().__init__() + self.res = res + + self.conv0 = StyleConv( + in_channels=in_channels, + out_channels=out_channels, + style_dim=style_dim, + resolution=2**res, + kernel_size=3, + up=2, + use_noise=use_noise, + activation=activation, + demodulate=demodulate, + ) + self.conv1 = StyleConv( + in_channels=out_channels, + out_channels=out_channels, + style_dim=style_dim, + resolution=2**res, + kernel_size=3, + use_noise=use_noise, + activation=activation, + demodulate=demodulate, + ) + self.toRGB = ToRGB( + in_channels=out_channels, + out_channels=img_channels, + style_dim=style_dim, + kernel_size=1, + demodulate=False, + ) + + def forward(self, x, img, ws, gs, E_features, noise_mode="random"): + style = get_style_code(ws[:, self.res * 2 - 5], gs) + x = self.conv0(x, style, noise_mode=noise_mode) + x = x + E_features[self.res] + style = get_style_code(ws[:, self.res * 2 - 4], gs) + x = self.conv1(x, style, noise_mode=noise_mode) + style = get_style_code(ws[:, self.res * 2 - 3], gs) + img = self.toRGB(x, style, skip=img) + + return x, img + + +class MappingNet(torch.nn.Module): + def __init__( + self, + z_dim, # Input latent (Z) dimensionality, 0 = no latent. + c_dim, # Conditioning label (C) dimensionality, 0 = no label. + w_dim, # Intermediate latent (W) dimensionality. + num_ws, # Number of intermediate latents to output, None = do not broadcast. + num_layers=8, # Number of mapping layers. + embed_features=None, # Label embedding dimensionality, None = same as w_dim. + layer_features=None, # Number of intermediate features in the mapping layers, None = same as w_dim. + activation="lrelu", # Activation function: 'relu', 'lrelu', etc. + lr_multiplier=0.01, # Learning rate multiplier for the mapping layers. + w_avg_beta=0.995, # Decay for tracking the moving average of W during training, None = do not track. 
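+ # Maps z (plus an optional label embedding) to w, broadcasts it to num_ws entries and,
+ # when truncation_psi != 1, blends towards the tracked w_avg in forward().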
+ torch_dtype=torch.float32, + ): + super().__init__() + self.z_dim = z_dim + self.c_dim = c_dim + self.w_dim = w_dim + self.num_ws = num_ws + self.num_layers = num_layers + self.w_avg_beta = w_avg_beta + self.torch_dtype = torch_dtype + + if embed_features is None: + embed_features = w_dim + if c_dim == 0: + embed_features = 0 + if layer_features is None: + layer_features = w_dim + features_list = ( + [z_dim + embed_features] + [layer_features] * (num_layers - 1) + [w_dim] + ) + + if c_dim > 0: + self.embed = FullyConnectedLayer(c_dim, embed_features) + for idx in range(num_layers): + in_features = features_list[idx] + out_features = features_list[idx + 1] + layer = FullyConnectedLayer( + in_features, + out_features, + activation=activation, + lr_multiplier=lr_multiplier, + ) + setattr(self, f"fc{idx}", layer) + + if num_ws is not None and w_avg_beta is not None: + self.register_buffer("w_avg", torch.zeros([w_dim])) + + def forward( + self, z, c, truncation_psi=1, truncation_cutoff=None, skip_w_avg_update=False + ): + # Embed, normalize, and concat inputs. + x = None + if self.z_dim > 0: + x = normalize_2nd_moment(z) + if self.c_dim > 0: + y = normalize_2nd_moment(self.embed(c)) + x = torch.cat([x, y], dim=1) if x is not None else y + + # Main layers. + for idx in range(self.num_layers): + layer = getattr(self, f"fc{idx}") + x = layer(x) + + # Update moving average of W. + if self.w_avg_beta is not None and self.training and not skip_w_avg_update: + self.w_avg.copy_(x.detach().mean(dim=0).lerp(self.w_avg, self.w_avg_beta)) + + # Broadcast. + if self.num_ws is not None: + x = x.unsqueeze(1).repeat([1, self.num_ws, 1]) + + # Apply truncation. + if truncation_psi != 1: + assert self.w_avg_beta is not None + if self.num_ws is None or truncation_cutoff is None: + x = self.w_avg.lerp(x, truncation_psi) + else: + x[:, :truncation_cutoff] = self.w_avg.lerp( + x[:, :truncation_cutoff], truncation_psi + ) + + return x + + +class DisFromRGB(nn.Module): + def __init__( + self, in_channels, out_channels, activation + ): # res = 2, ..., resolution_log2 + super().__init__() + self.conv = Conv2dLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + activation=activation, + ) + + def forward(self, x): + return self.conv(x) + + +class DisBlock(nn.Module): + def __init__( + self, in_channels, out_channels, activation + ): # res = 2, ..., resolution_log2 + super().__init__() + self.conv0 = Conv2dLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + activation=activation, + ) + self.conv1 = Conv2dLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + down=2, + activation=activation, + ) + self.skip = Conv2dLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + down=2, + bias=False, + ) + + def forward(self, x): + skip = self.skip(x, gain=np.sqrt(0.5)) + x = self.conv0(x) + x = self.conv1(x, gain=np.sqrt(0.5)) + out = skip + x + + return out + + +class Discriminator(torch.nn.Module): + def __init__( + self, + c_dim, # Conditioning label (C) dimensionality. + img_resolution, # Input resolution. + img_channels, # Number of input color channels. + channel_base=32768, # Overall multiplier for the number of channels. + channel_max=512, # Maximum number of channels in any layer. + channel_decay=1, + cmap_dim=None, # Dimensionality of mapped conditioning label, None = default. + activation="lrelu", + mbstd_group_size=4, # Group size for the minibatch standard deviation layer, None = entire minibatch. 
+ mbstd_num_channels=1, # Number of features for the minibatch standard deviation layer, 0 = disable. + ): + super().__init__() + self.c_dim = c_dim + self.img_resolution = img_resolution + self.img_channels = img_channels + + resolution_log2 = int(np.log2(img_resolution)) + assert img_resolution == 2**resolution_log2 and img_resolution >= 4 + self.resolution_log2 = resolution_log2 + + def nf(stage): + return np.clip( + int(channel_base / 2 ** (stage * channel_decay)), 1, channel_max + ) + + if cmap_dim == None: + cmap_dim = nf(2) + if c_dim == 0: + cmap_dim = 0 + self.cmap_dim = cmap_dim + + if c_dim > 0: + self.mapping = MappingNet( + z_dim=0, c_dim=c_dim, w_dim=cmap_dim, num_ws=None, w_avg_beta=None + ) + + Dis = [DisFromRGB(img_channels + 1, nf(resolution_log2), activation)] + for res in range(resolution_log2, 2, -1): + Dis.append(DisBlock(nf(res), nf(res - 1), activation)) + + if mbstd_num_channels > 0: + Dis.append( + MinibatchStdLayer( + group_size=mbstd_group_size, num_channels=mbstd_num_channels + ) + ) + Dis.append( + Conv2dLayer( + nf(2) + mbstd_num_channels, nf(2), kernel_size=3, activation=activation + ) + ) + self.Dis = nn.Sequential(*Dis) + + self.fc0 = FullyConnectedLayer(nf(2) * 4**2, nf(2), activation=activation) + self.fc1 = FullyConnectedLayer(nf(2), 1 if cmap_dim == 0 else cmap_dim) + + def forward(self, images_in, masks_in, c): + x = torch.cat([masks_in - 0.5, images_in], dim=1) + x = self.Dis(x) + x = self.fc1(self.fc0(x.flatten(start_dim=1))) + + if self.c_dim > 0: + cmap = self.mapping(None, c) + + if self.cmap_dim > 0: + x = (x * cmap).sum(dim=1, keepdim=True) * (1 / np.sqrt(self.cmap_dim)) + + return x + + +def nf(stage, channel_base=32768, channel_decay=1.0, channel_max=512): + NF = {512: 64, 256: 128, 128: 256, 64: 512, 32: 512, 16: 512, 8: 512, 4: 512} + return NF[2**stage] + + +class Mlp(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.fc1 = FullyConnectedLayer( + in_features=in_features, out_features=hidden_features, activation="lrelu" + ) + self.fc2 = FullyConnectedLayer( + in_features=hidden_features, out_features=out_features + ) + + def forward(self, x): + x = self.fc1(x) + x = self.fc2(x) + return x + + +def window_partition(x, window_size): + """ + Args: + x: (B, H, W, C) + window_size (int): window size + Returns: + windows: (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = x.shape + x = x.view(B, H // window_size, window_size, W // window_size, window_size, C) + windows = ( + x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + ) + return windows + + +def window_reverse(windows, window_size: int, H: int, W: int): + """ + Args: + windows: (num_windows*B, window_size, window_size, C) + window_size (int): Window size + H (int): Height of image + W (int): Width of image + Returns: + x: (B, H, W, C) + """ + B = int(windows.shape[0] / (H * W / window_size / window_size)) + # B = windows.shape[0] / (H * W / window_size / window_size) + x = windows.view( + B, H // window_size, W // window_size, window_size, window_size, -1 + ) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, H, W, -1) + return x + + +class Conv2dLayerPartial(nn.Module): + def __init__( + self, + in_channels, # Number of input channels. + out_channels, # Number of output channels. 
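+ # Partial convolution: forward() convolves the 0/1 mask with an all-ones kernel to
+ # count valid pixels per window, rescales the features by slide_winsize / valid_count,
+ # and returns the updated (clamped) mask together with the features.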
+ kernel_size, # Width and height of the convolution kernel. + bias=True, # Apply additive bias before the activation function? + activation="linear", # Activation function: 'relu', 'lrelu', etc. + up=1, # Integer upsampling factor. + down=1, # Integer downsampling factor. + resample_filter=[ + 1, + 3, + 3, + 1, + ], # Low-pass filter to apply when resampling activations. + conv_clamp=None, # Clamp the output to +-X, None = disable clamping. + trainable=True, # Update the weights of this layer during training? + ): + super().__init__() + self.conv = Conv2dLayer( + in_channels, + out_channels, + kernel_size, + bias, + activation, + up, + down, + resample_filter, + conv_clamp, + trainable, + ) + + self.weight_maskUpdater = torch.ones(1, 1, kernel_size, kernel_size) + self.slide_winsize = kernel_size**2 + self.stride = down + self.padding = kernel_size // 2 if kernel_size % 2 == 1 else 0 + + def forward(self, x, mask=None): + if mask is not None: + with torch.no_grad(): + if self.weight_maskUpdater.type() != x.type(): + self.weight_maskUpdater = self.weight_maskUpdater.to(x) + update_mask = F.conv2d( + mask, + self.weight_maskUpdater, + bias=None, + stride=self.stride, + padding=self.padding, + ) + mask_ratio = self.slide_winsize / (update_mask.to(torch.float32) + 1e-8) + update_mask = torch.clamp(update_mask, 0, 1) # 0 or 1 + mask_ratio = torch.mul(mask_ratio, update_mask).to(x.dtype) + x = self.conv(x) + x = torch.mul(x, mask_ratio) + return x, update_mask + else: + x = self.conv(x) + return x, None + + +class WindowAttention(nn.Module): + r"""Window based multi-head self attention (W-MSA) module with relative position bias. + It supports both of shifted and non-shifted window. + Args: + dim (int): Number of input channels. + window_size (tuple[int]): The height and width of the window. + num_heads (int): Number of attention heads. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set + attn_drop (float, optional): Dropout ratio of attention weight. Default: 0.0 + proj_drop (float, optional): Dropout ratio of output. 
Default: 0.0 + """ + + def __init__( + self, + dim, + window_size, + num_heads, + down_ratio=1, + qkv_bias=True, + qk_scale=None, + attn_drop=0.0, + proj_drop=0.0, + ): + super().__init__() + self.dim = dim + self.window_size = window_size # Wh, Ww + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = qk_scale or head_dim**-0.5 + + self.q = FullyConnectedLayer(in_features=dim, out_features=dim) + self.k = FullyConnectedLayer(in_features=dim, out_features=dim) + self.v = FullyConnectedLayer(in_features=dim, out_features=dim) + self.proj = FullyConnectedLayer(in_features=dim, out_features=dim) + + self.softmax = nn.Softmax(dim=-1) + + def forward(self, x, mask_windows=None, mask=None): + """ + Args: + x: input features with shape of (num_windows*B, N, C) + mask: (0/-inf) mask with shape of (num_windows, Wh*Ww, Wh*Ww) or None + """ + B_, N, C = x.shape + norm_x = F.normalize(x, p=2.0, dim=-1, eps=torch.finfo(x.dtype).eps) + q = ( + self.q(norm_x) + .reshape(B_, N, self.num_heads, C // self.num_heads) + .permute(0, 2, 1, 3) + ) + k = ( + self.k(norm_x) + .view(B_, -1, self.num_heads, C // self.num_heads) + .permute(0, 2, 3, 1) + ) + v = ( + self.v(x) + .view(B_, -1, self.num_heads, C // self.num_heads) + .permute(0, 2, 1, 3) + ) + + attn = (q @ k) * self.scale + + if mask is not None: + nW = mask.shape[0] + attn = attn.view(B_ // nW, nW, self.num_heads, N, N) + mask.unsqueeze( + 1 + ).unsqueeze(0) + attn = attn.view(-1, self.num_heads, N, N) + + if mask_windows is not None: + attn_mask_windows = mask_windows.squeeze(-1).unsqueeze(1).unsqueeze(1) + attn = attn + attn_mask_windows.masked_fill( + attn_mask_windows == 0, float(-100.0) + ).masked_fill(attn_mask_windows == 1, float(0.0)) + with torch.no_grad(): + mask_windows = torch.clamp( + torch.sum(mask_windows, dim=1, keepdim=True), 0, 1 + ).repeat(1, N, 1) + + attn = self.softmax(attn) + x = (attn @ v).transpose(1, 2).reshape(B_, N, C) + x = self.proj(x) + return x, mask_windows + + +class SwinTransformerBlock(nn.Module): + r"""Swin Transformer Block. + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resulotion. + num_heads (int): Number of attention heads. + window_size (int): Window size. + shift_size (int): Shift size for SW-MSA. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + act_layer (nn.Module, optional): Activation layer. Default: nn.GELU + norm_layer (nn.Module, optional): Normalization layer. 
Default: nn.LayerNorm + """ + + def __init__( + self, + dim, + input_resolution, + num_heads, + down_ratio=1, + window_size=7, + shift_size=0, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + act_layer=nn.GELU, + norm_layer=nn.LayerNorm, + ): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + self.window_size = window_size + self.shift_size = shift_size + self.mlp_ratio = mlp_ratio + if min(self.input_resolution) <= self.window_size: + # if window size is larger than input resolution, we don't partition windows + self.shift_size = 0 + self.window_size = min(self.input_resolution) + assert ( + 0 <= self.shift_size < self.window_size + ), "shift_size must in 0-window_size" + + if self.shift_size > 0: + down_ratio = 1 + self.attn = WindowAttention( + dim, + window_size=to_2tuple(self.window_size), + num_heads=num_heads, + down_ratio=down_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + attn_drop=attn_drop, + proj_drop=drop, + ) + + self.fuse = FullyConnectedLayer( + in_features=dim * 2, out_features=dim, activation="lrelu" + ) + + mlp_hidden_dim = int(dim * mlp_ratio) + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=act_layer, + drop=drop, + ) + + if self.shift_size > 0: + attn_mask = self.calculate_mask(self.input_resolution) + else: + attn_mask = None + + self.register_buffer("attn_mask", attn_mask) + + def calculate_mask(self, x_size): + # calculate attention mask for SW-MSA + H, W = x_size + img_mask = torch.zeros((1, H, W, 1)) # 1 H W 1 + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + cnt = 0 + for h in h_slices: + for w in w_slices: + img_mask[:, h, w, :] = cnt + cnt += 1 + + mask_windows = window_partition( + img_mask, self.window_size + ) # nW, window_size, window_size, 1 + mask_windows = mask_windows.view(-1, self.window_size * self.window_size) + attn_mask = mask_windows.unsqueeze(1) - mask_windows.unsqueeze(2) + attn_mask = attn_mask.masked_fill(attn_mask != 0, float(-100.0)).masked_fill( + attn_mask == 0, float(0.0) + ) + + return attn_mask + + def forward(self, x, x_size, mask=None): + # H, W = self.input_resolution + H, W = x_size + B, L, C = x.shape + # assert L == H * W, "input feature has wrong size" + + shortcut = x + x = x.view(B, H, W, C) + if mask is not None: + mask = mask.view(B, H, W, 1) + + # cyclic shift + if self.shift_size > 0: + shifted_x = torch.roll( + x, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2) + ) + if mask is not None: + shifted_mask = torch.roll( + mask, shifts=(-self.shift_size, -self.shift_size), dims=(1, 2) + ) + else: + shifted_x = x + if mask is not None: + shifted_mask = mask + + # partition windows + x_windows = window_partition( + shifted_x, self.window_size + ) # nW*B, window_size, window_size, C + x_windows = x_windows.view( + -1, self.window_size * self.window_size, C + ) # nW*B, window_size*window_size, C + if mask is not None: + mask_windows = window_partition(shifted_mask, self.window_size) + mask_windows = mask_windows.view(-1, self.window_size * self.window_size, 1) + else: + mask_windows = None + + # W-MSA/SW-MSA (to be compatible for testing on images whose shapes are the multiple of window size + if self.input_resolution == x_size: + attn_windows, mask_windows = self.attn( 
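+ # resolution matches the one the attn_mask buffer was built for, so the cached mask is valid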
+ x_windows, mask_windows, mask=self.attn_mask + ) # nW*B, window_size*window_size, C + else: + attn_windows, mask_windows = self.attn( + x_windows, + mask_windows, + mask=self.calculate_mask(x_size).to(x.dtype).to(x.device), + ) # nW*B, window_size*window_size, C + + # merge windows + attn_windows = attn_windows.view(-1, self.window_size, self.window_size, C) + shifted_x = window_reverse(attn_windows, self.window_size, H, W) # B H' W' C + if mask is not None: + mask_windows = mask_windows.view(-1, self.window_size, self.window_size, 1) + shifted_mask = window_reverse(mask_windows, self.window_size, H, W) + + # reverse cyclic shift + if self.shift_size > 0: + x = torch.roll( + shifted_x, shifts=(self.shift_size, self.shift_size), dims=(1, 2) + ) + if mask is not None: + mask = torch.roll( + shifted_mask, shifts=(self.shift_size, self.shift_size), dims=(1, 2) + ) + else: + x = shifted_x + if mask is not None: + mask = shifted_mask + x = x.view(B, H * W, C) + if mask is not None: + mask = mask.view(B, H * W, 1) + + # FFN + x = self.fuse(torch.cat([shortcut, x], dim=-1)) + x = self.mlp(x) + + return x, mask + + +class PatchMerging(nn.Module): + def __init__(self, in_channels, out_channels, down=2): + super().__init__() + self.conv = Conv2dLayerPartial( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + activation="lrelu", + down=down, + ) + self.down = down + + def forward(self, x, x_size, mask=None): + x = token2feature(x, x_size) + if mask is not None: + mask = token2feature(mask, x_size) + x, mask = self.conv(x, mask) + if self.down != 1: + ratio = 1 / self.down + x_size = (int(x_size[0] * ratio), int(x_size[1] * ratio)) + x = feature2token(x) + if mask is not None: + mask = feature2token(mask) + return x, x_size, mask + + +class PatchUpsampling(nn.Module): + def __init__(self, in_channels, out_channels, up=2): + super().__init__() + self.conv = Conv2dLayerPartial( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + activation="lrelu", + up=up, + ) + self.up = up + + def forward(self, x, x_size, mask=None): + x = token2feature(x, x_size) + if mask is not None: + mask = token2feature(mask, x_size) + x, mask = self.conv(x, mask) + if self.up != 1: + x_size = (int(x_size[0] * self.up), int(x_size[1] * self.up)) + x = feature2token(x) + if mask is not None: + mask = feature2token(mask) + return x, x_size, mask + + +class BasicLayer(nn.Module): + """A basic Swin Transformer layer for one stage. + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool, optional): If True, add a learnable bias to query, key, value. Default: True + qk_scale (float | None, optional): Override default qk scale of head_dim ** -0.5 if set. + drop (float, optional): Dropout rate. Default: 0.0 + attn_drop (float, optional): Attention dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + norm_layer (nn.Module, optional): Normalization layer. Default: nn.LayerNorm + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. 
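+ Each stage optionally down/up-samples first, runs `depth` SwinTransformerBlocks, then
+ fuses the result through a partial convolution with a residual (identity) connection.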
+ """ + + def __init__( + self, + dim, + input_resolution, + depth, + num_heads, + window_size, + down_ratio=1, + mlp_ratio=2.0, + qkv_bias=True, + qk_scale=None, + drop=0.0, + attn_drop=0.0, + drop_path=0.0, + norm_layer=nn.LayerNorm, + downsample=None, + use_checkpoint=False, + ): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # patch merging layer + if downsample is not None: + # self.downsample = downsample(input_resolution, dim=dim, norm_layer=norm_layer) + self.downsample = downsample + else: + self.downsample = None + + # build blocks + self.blocks = nn.ModuleList( + [ + SwinTransformerBlock( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + down_ratio=down_ratio, + window_size=window_size, + shift_size=0 if (i % 2 == 0) else window_size // 2, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + drop=drop, + attn_drop=attn_drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) + else drop_path, + norm_layer=norm_layer, + ) + for i in range(depth) + ] + ) + + self.conv = Conv2dLayerPartial( + in_channels=dim, out_channels=dim, kernel_size=3, activation="lrelu" + ) + + def forward(self, x, x_size, mask=None): + if self.downsample is not None: + x, x_size, mask = self.downsample(x, x_size, mask) + identity = x + for blk in self.blocks: + if self.use_checkpoint: + x, mask = checkpoint.checkpoint(blk, x, x_size, mask) + else: + x, mask = blk(x, x_size, mask) + if mask is not None: + mask = token2feature(mask, x_size) + x, mask = self.conv(token2feature(x, x_size), mask) + x = feature2token(x) + identity + if mask is not None: + mask = feature2token(mask) + return x, x_size, mask + + +class ToToken(nn.Module): + def __init__(self, in_channels=3, dim=128, kernel_size=5, stride=1): + super().__init__() + + self.proj = Conv2dLayerPartial( + in_channels=in_channels, + out_channels=dim, + kernel_size=kernel_size, + activation="lrelu", + ) + + def forward(self, x, mask): + x, mask = self.proj(x, mask) + + return x, mask + + +class EncFromRGB(nn.Module): + def __init__( + self, in_channels, out_channels, activation + ): # res = 2, ..., resolution_log2 + super().__init__() + self.conv0 = Conv2dLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=1, + activation=activation, + ) + self.conv1 = Conv2dLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + activation=activation, + ) + + def forward(self, x): + x = self.conv0(x) + x = self.conv1(x) + + return x + + +class ConvBlockDown(nn.Module): + def __init__( + self, in_channels, out_channels, activation + ): # res = 2, ..., resolution_log + super().__init__() + + self.conv0 = Conv2dLayer( + in_channels=in_channels, + out_channels=out_channels, + kernel_size=3, + activation=activation, + down=2, + ) + self.conv1 = Conv2dLayer( + in_channels=out_channels, + out_channels=out_channels, + kernel_size=3, + activation=activation, + ) + + def forward(self, x): + x = self.conv0(x) + x = self.conv1(x) + + return x + + +def token2feature(x, x_size): + B, N, C = x.shape + h, w = x_size + x = x.permute(0, 2, 1).reshape(B, C, h, w) + return x + + +def feature2token(x): + B, C, H, W = x.shape + x = x.view(B, C, -1).transpose(1, 2) + return x + + +class Encoder(nn.Module): + def __init__( + self, + res_log2, + img_channels, + activation, + patch_size=5, + channels=16, + drop_path_rate=0.1, + ): + super().__init__() + + self.resolution = [] + + for idx, i in 
enumerate(range(res_log2, 3, -1)): # from input size to 16x16 + res = 2**i + self.resolution.append(res) + if i == res_log2: + block = EncFromRGB(img_channels * 2 + 1, nf(i), activation) + else: + block = ConvBlockDown(nf(i + 1), nf(i), activation) + setattr(self, "EncConv_Block_%dx%d" % (res, res), block) + + def forward(self, x): + out = {} + for res in self.resolution: + res_log2 = int(np.log2(res)) + x = getattr(self, "EncConv_Block_%dx%d" % (res, res))(x) + out[res_log2] = x + + return out + + +class ToStyle(nn.Module): + def __init__(self, in_channels, out_channels, activation, drop_rate): + super().__init__() + self.conv = nn.Sequential( + Conv2dLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + activation=activation, + down=2, + ), + Conv2dLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + activation=activation, + down=2, + ), + Conv2dLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + activation=activation, + down=2, + ), + ) + + self.pool = nn.AdaptiveAvgPool2d(1) + self.fc = FullyConnectedLayer( + in_features=in_channels, out_features=out_channels, activation=activation + ) + # self.dropout = nn.Dropout(drop_rate) + + def forward(self, x): + x = self.conv(x) + x = self.pool(x) + x = self.fc(x.flatten(start_dim=1)) + # x = self.dropout(x) + + return x + + +class DecBlockFirstV2(nn.Module): + def __init__( + self, + res, + in_channels, + out_channels, + activation, + style_dim, + use_noise, + demodulate, + img_channels, + ): + super().__init__() + self.res = res + + self.conv0 = Conv2dLayer( + in_channels=in_channels, + out_channels=in_channels, + kernel_size=3, + activation=activation, + ) + self.conv1 = StyleConv( + in_channels=in_channels, + out_channels=out_channels, + style_dim=style_dim, + resolution=2**res, + kernel_size=3, + use_noise=use_noise, + activation=activation, + demodulate=demodulate, + ) + self.toRGB = ToRGB( + in_channels=out_channels, + out_channels=img_channels, + style_dim=style_dim, + kernel_size=1, + demodulate=False, + ) + + def forward(self, x, ws, gs, E_features, noise_mode="random"): + # x = self.fc(x).view(x.shape[0], -1, 4, 4) + x = self.conv0(x) + x = x + E_features[self.res] + style = get_style_code(ws[:, 0], gs) + x = self.conv1(x, style, noise_mode=noise_mode) + style = get_style_code(ws[:, 1], gs) + img = self.toRGB(x, style, skip=None) + + return x, img + + +class DecBlock(nn.Module): + def __init__( + self, + res, + in_channels, + out_channels, + activation, + style_dim, + use_noise, + demodulate, + img_channels, + ): # res = 4, ..., resolution_log2 + super().__init__() + self.res = res + + self.conv0 = StyleConv( + in_channels=in_channels, + out_channels=out_channels, + style_dim=style_dim, + resolution=2**res, + kernel_size=3, + up=2, + use_noise=use_noise, + activation=activation, + demodulate=demodulate, + ) + self.conv1 = StyleConv( + in_channels=out_channels, + out_channels=out_channels, + style_dim=style_dim, + resolution=2**res, + kernel_size=3, + use_noise=use_noise, + activation=activation, + demodulate=demodulate, + ) + self.toRGB = ToRGB( + in_channels=out_channels, + out_channels=img_channels, + style_dim=style_dim, + kernel_size=1, + demodulate=False, + ) + + def forward(self, x, img, ws, gs, E_features, noise_mode="random"): + style = get_style_code(ws[:, self.res * 2 - 9], gs) + x = self.conv0(x, style, noise_mode=noise_mode) + x = x + E_features[self.res] + style = get_style_code(ws[:, self.res * 2 - 8], gs) + x = self.conv1(x, style, 
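+ # each DecBlock at resolution 2**res consumes three consecutive w vectors:
+ # ws[:, res*2-9] for conv0, ws[:, res*2-8] for conv1, ws[:, res*2-7] for toRGB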
noise_mode=noise_mode) + style = get_style_code(ws[:, self.res * 2 - 7], gs) + img = self.toRGB(x, style, skip=img) + + return x, img + + +class Decoder(nn.Module): + def __init__( + self, res_log2, activation, style_dim, use_noise, demodulate, img_channels + ): + super().__init__() + self.Dec_16x16 = DecBlockFirstV2( + 4, nf(4), nf(4), activation, style_dim, use_noise, demodulate, img_channels + ) + for res in range(5, res_log2 + 1): + setattr( + self, + "Dec_%dx%d" % (2**res, 2**res), + DecBlock( + res, + nf(res - 1), + nf(res), + activation, + style_dim, + use_noise, + demodulate, + img_channels, + ), + ) + self.res_log2 = res_log2 + + def forward(self, x, ws, gs, E_features, noise_mode="random"): + x, img = self.Dec_16x16(x, ws, gs, E_features, noise_mode=noise_mode) + for res in range(5, self.res_log2 + 1): + block = getattr(self, "Dec_%dx%d" % (2**res, 2**res)) + x, img = block(x, img, ws, gs, E_features, noise_mode=noise_mode) + + return img + + +class DecStyleBlock(nn.Module): + def __init__( + self, + res, + in_channels, + out_channels, + activation, + style_dim, + use_noise, + demodulate, + img_channels, + ): + super().__init__() + self.res = res + + self.conv0 = StyleConv( + in_channels=in_channels, + out_channels=out_channels, + style_dim=style_dim, + resolution=2**res, + kernel_size=3, + up=2, + use_noise=use_noise, + activation=activation, + demodulate=demodulate, + ) + self.conv1 = StyleConv( + in_channels=out_channels, + out_channels=out_channels, + style_dim=style_dim, + resolution=2**res, + kernel_size=3, + use_noise=use_noise, + activation=activation, + demodulate=demodulate, + ) + self.toRGB = ToRGB( + in_channels=out_channels, + out_channels=img_channels, + style_dim=style_dim, + kernel_size=1, + demodulate=False, + ) + + def forward(self, x, img, style, skip, noise_mode="random"): + x = self.conv0(x, style, noise_mode=noise_mode) + x = x + skip + x = self.conv1(x, style, noise_mode=noise_mode) + img = self.toRGB(x, style, skip=img) + + return x, img + + +class FirstStage(nn.Module): + def __init__( + self, + img_channels, + img_resolution=256, + dim=180, + w_dim=512, + use_noise=False, + demodulate=True, + activation="lrelu", + ): + super().__init__() + res = 64 + + self.conv_first = Conv2dLayerPartial( + in_channels=img_channels + 1, + out_channels=dim, + kernel_size=3, + activation=activation, + ) + self.enc_conv = nn.ModuleList() + down_time = int(np.log2(img_resolution // res)) + # 根据图片尺寸构建 swim transformer 的层数 + for i in range(down_time): # from input size to 64 + self.enc_conv.append( + Conv2dLayerPartial( + in_channels=dim, + out_channels=dim, + kernel_size=3, + down=2, + activation=activation, + ) + ) + + # from 64 -> 16 -> 64 + depths = [2, 3, 4, 3, 2] + ratios = [1, 1 / 2, 1 / 2, 2, 2] + num_heads = 6 + window_sizes = [8, 16, 16, 16, 8] + drop_path_rate = 0.1 + dpr = [x.item() for x in torch.linspace(0, drop_path_rate, sum(depths))] + + self.tran = nn.ModuleList() + for i, depth in enumerate(depths): + res = int(res * ratios[i]) + if ratios[i] < 1: + merge = PatchMerging(dim, dim, down=int(1 / ratios[i])) + elif ratios[i] > 1: + merge = PatchUpsampling(dim, dim, up=ratios[i]) + else: + merge = None + self.tran.append( + BasicLayer( + dim=dim, + input_resolution=[res, res], + depth=depth, + num_heads=num_heads, + window_size=window_sizes[i], + drop_path=dpr[sum(depths[:i]) : sum(depths[: i + 1])], + downsample=merge, + ) + ) + + # global style + down_conv = [] + for i in range(int(np.log2(16))): + down_conv.append( + Conv2dLayer( + in_channels=dim, + 
out_channels=dim, + kernel_size=3, + down=2, + activation=activation, + ) + ) + down_conv.append(nn.AdaptiveAvgPool2d((1, 1))) + self.down_conv = nn.Sequential(*down_conv) + self.to_style = FullyConnectedLayer( + in_features=dim, out_features=dim * 2, activation=activation + ) + self.ws_style = FullyConnectedLayer( + in_features=w_dim, out_features=dim, activation=activation + ) + self.to_square = FullyConnectedLayer( + in_features=dim, out_features=16 * 16, activation=activation + ) + + style_dim = dim * 3 + self.dec_conv = nn.ModuleList() + for i in range(down_time): # from 64 to input size + res = res * 2 + self.dec_conv.append( + DecStyleBlock( + res, + dim, + dim, + activation, + style_dim, + use_noise, + demodulate, + img_channels, + ) + ) + + def forward(self, images_in, masks_in, ws, noise_mode="random"): + x = torch.cat([masks_in - 0.5, images_in * masks_in], dim=1) + + skips = [] + x, mask = self.conv_first(x, masks_in) # input size + skips.append(x) + for i, block in enumerate(self.enc_conv): # input size to 64 + x, mask = block(x, mask) + if i != len(self.enc_conv) - 1: + skips.append(x) + + x_size = x.size()[-2:] + x = feature2token(x) + mask = feature2token(mask) + mid = len(self.tran) // 2 + for i, block in enumerate(self.tran): # 64 to 16 + if i < mid: + x, x_size, mask = block(x, x_size, mask) + skips.append(x) + elif i > mid: + x, x_size, mask = block(x, x_size, None) + x = x + skips[mid - i] + else: + x, x_size, mask = block(x, x_size, None) + + mul_map = torch.ones_like(x) * 0.5 + mul_map = F.dropout(mul_map, training=True) + ws = self.ws_style(ws[:, -1]) + add_n = self.to_square(ws).unsqueeze(1) + add_n = ( + F.interpolate( + add_n, size=x.size(1), mode="linear", align_corners=False + ) + .squeeze(1) + .unsqueeze(-1) + ) + x = x * mul_map + add_n * (1 - mul_map) + gs = self.to_style( + self.down_conv(token2feature(x, x_size)).flatten(start_dim=1) + ) + style = torch.cat([gs, ws], dim=1) + + x = token2feature(x, x_size).contiguous() + img = None + for i, block in enumerate(self.dec_conv): + x, img = block( + x, img, style, skips[len(self.dec_conv) - i - 1], noise_mode=noise_mode + ) + + # ensemble + img = img * (1 - masks_in) + images_in * masks_in + + return img + + +class SynthesisNet(nn.Module): + def __init__( + self, + w_dim, # Intermediate latent (W) dimensionality. + img_resolution, # Output image resolution. + img_channels=3, # Number of color channels. + channel_base=32768, # Overall multiplier for the number of channels. + channel_decay=1.0, + channel_max=512, # Maximum number of channels in any layer. + activation="lrelu", # Activation function: 'relu', 'lrelu', etc. 
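+ # not annotated in the original signature: drop_rate is forwarded to ToStyle,
+ # while use_noise and demodulate configure the StyleConv layers of the decoder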
+ drop_rate=0.5, + use_noise=False, + demodulate=True, + ): + super().__init__() + resolution_log2 = int(np.log2(img_resolution)) + assert img_resolution == 2**resolution_log2 and img_resolution >= 4 + + self.num_layers = resolution_log2 * 2 - 3 * 2 + self.img_resolution = img_resolution + self.resolution_log2 = resolution_log2 + + # first stage + self.first_stage = FirstStage( + img_channels, + img_resolution=img_resolution, + w_dim=w_dim, + use_noise=False, + demodulate=demodulate, + ) + + # second stage + self.enc = Encoder( + resolution_log2, img_channels, activation, patch_size=5, channels=16 + ) + self.to_square = FullyConnectedLayer( + in_features=w_dim, out_features=16 * 16, activation=activation + ) + self.to_style = ToStyle( + in_channels=nf(4), + out_channels=nf(2) * 2, + activation=activation, + drop_rate=drop_rate, + ) + style_dim = w_dim + nf(2) * 2 + self.dec = Decoder( + resolution_log2, activation, style_dim, use_noise, demodulate, img_channels + ) + + def forward(self, images_in, masks_in, ws, noise_mode="random", return_stg1=False): + out_stg1 = self.first_stage(images_in, masks_in, ws, noise_mode=noise_mode) + + # encoder + x = images_in * masks_in + out_stg1 * (1 - masks_in) + x = torch.cat([masks_in - 0.5, x, images_in * masks_in], dim=1) + E_features = self.enc(x) + + fea_16 = E_features[4] + mul_map = torch.ones_like(fea_16) * 0.5 + mul_map = F.dropout(mul_map, training=True) + add_n = self.to_square(ws[:, 0]).view(-1, 16, 16).unsqueeze(1) + add_n = F.interpolate( + add_n, size=fea_16.size()[-2:], mode="bilinear", align_corners=False + ) + fea_16 = fea_16 * mul_map + add_n * (1 - mul_map) + E_features[4] = fea_16 + + # style + gs = self.to_style(fea_16) + + # decoder + img = self.dec(fea_16, ws, gs, E_features, noise_mode=noise_mode) + + # ensemble + img = img * (1 - masks_in) + images_in * masks_in + + if not return_stg1: + return img + else: + return img, out_stg1 + + +class Generator(nn.Module): + def __init__( + self, + z_dim, # Input latent (Z) dimensionality, 0 = no latent. + c_dim, # Conditioning label (C) dimensionality, 0 = no label. + w_dim, # Intermediate latent (W) dimensionality. + img_resolution, # resolution of generated image + img_channels, # Number of input color channels. + synthesis_kwargs={}, # Arguments for SynthesisNetwork. + mapping_kwargs={}, # Arguments for MappingNetwork. + ): + super().__init__() + self.z_dim = z_dim + self.c_dim = c_dim + self.w_dim = w_dim + self.img_resolution = img_resolution + self.img_channels = img_channels + + self.synthesis = SynthesisNet( + w_dim=w_dim, + img_resolution=img_resolution, + img_channels=img_channels, + **synthesis_kwargs, + ) + self.mapping = MappingNet( + z_dim=z_dim, + c_dim=c_dim, + w_dim=w_dim, + num_ws=self.synthesis.num_layers, + **mapping_kwargs, + ) + + def forward( + self, + images_in, + masks_in, + z, + c, + truncation_psi=1, + truncation_cutoff=None, + skip_w_avg_update=False, + noise_mode="none", + return_stg1=False, + ): + ws = self.mapping( + z, + c, + truncation_psi=truncation_psi, + truncation_cutoff=truncation_cutoff, + skip_w_avg_update=skip_w_avg_update, + ) + img = self.synthesis(images_in, masks_in, ws, noise_mode=noise_mode) + return img + + +class Discriminator(torch.nn.Module): + def __init__( + self, + c_dim, # Conditioning label (C) dimensionality. + img_resolution, # Input resolution. + img_channels, # Number of input color channels. + channel_base=32768, # Overall multiplier for the number of channels. + channel_max=512, # Maximum number of channels in any layer. 
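+ # channel_decay mirrors the generator's channel-schedule parameters; the
+ # per-resolution widths used below are taken from nf()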
+ channel_decay=1, + cmap_dim=None, # Dimensionality of mapped conditioning label, None = default. + activation="lrelu", + mbstd_group_size=4, # Group size for the minibatch standard deviation layer, None = entire minibatch. + mbstd_num_channels=1, # Number of features for the minibatch standard deviation layer, 0 = disable. + ): + super().__init__() + self.c_dim = c_dim + self.img_resolution = img_resolution + self.img_channels = img_channels + + resolution_log2 = int(np.log2(img_resolution)) + assert img_resolution == 2**resolution_log2 and img_resolution >= 4 + self.resolution_log2 = resolution_log2 + + if cmap_dim == None: + cmap_dim = nf(2) + if c_dim == 0: + cmap_dim = 0 + self.cmap_dim = cmap_dim + + if c_dim > 0: + self.mapping = MappingNet( + z_dim=0, c_dim=c_dim, w_dim=cmap_dim, num_ws=None, w_avg_beta=None + ) + + Dis = [DisFromRGB(img_channels + 1, nf(resolution_log2), activation)] + for res in range(resolution_log2, 2, -1): + Dis.append(DisBlock(nf(res), nf(res - 1), activation)) + + if mbstd_num_channels > 0: + Dis.append( + MinibatchStdLayer( + group_size=mbstd_group_size, num_channels=mbstd_num_channels + ) + ) + Dis.append( + Conv2dLayer( + nf(2) + mbstd_num_channels, nf(2), kernel_size=3, activation=activation + ) + ) + self.Dis = nn.Sequential(*Dis) + + self.fc0 = FullyConnectedLayer(nf(2) * 4**2, nf(2), activation=activation) + self.fc1 = FullyConnectedLayer(nf(2), 1 if cmap_dim == 0 else cmap_dim) + + # for 64x64 + Dis_stg1 = [DisFromRGB(img_channels + 1, nf(resolution_log2) // 2, activation)] + for res in range(resolution_log2, 2, -1): + Dis_stg1.append(DisBlock(nf(res) // 2, nf(res - 1) // 2, activation)) + + if mbstd_num_channels > 0: + Dis_stg1.append( + MinibatchStdLayer( + group_size=mbstd_group_size, num_channels=mbstd_num_channels + ) + ) + Dis_stg1.append( + Conv2dLayer( + nf(2) // 2 + mbstd_num_channels, + nf(2) // 2, + kernel_size=3, + activation=activation, + ) + ) + self.Dis_stg1 = nn.Sequential(*Dis_stg1) + + self.fc0_stg1 = FullyConnectedLayer( + nf(2) // 2 * 4**2, nf(2) // 2, activation=activation + ) + self.fc1_stg1 = FullyConnectedLayer( + nf(2) // 2, 1 if cmap_dim == 0 else cmap_dim + ) + + def forward(self, images_in, masks_in, images_stg1, c): + x = self.Dis(torch.cat([masks_in - 0.5, images_in], dim=1)) + x = self.fc1(self.fc0(x.flatten(start_dim=1))) + + x_stg1 = self.Dis_stg1(torch.cat([masks_in - 0.5, images_stg1], dim=1)) + x_stg1 = self.fc1_stg1(self.fc0_stg1(x_stg1.flatten(start_dim=1))) + + if self.c_dim > 0: + cmap = self.mapping(None, c) + + if self.cmap_dim > 0: + x = (x * cmap).sum(dim=1, keepdim=True) * (1 / np.sqrt(self.cmap_dim)) + x_stg1 = (x_stg1 * cmap).sum(dim=1, keepdim=True) * ( + 1 / np.sqrt(self.cmap_dim) + ) + + return x, x_stg1 + + +MAT_MODEL_URL = os.environ.get( + "MAT_MODEL_URL", + "https://github.com/Sanster/models/releases/download/add_mat/Places_512_FullData_G.pth", +) + +MAT_MODEL_MD5 = os.environ.get("MAT_MODEL_MD5", "8ca927835fa3f5e21d65ffcb165377ed") + + +class MAT(InpaintModel): + name = "mat" + min_size = 512 + pad_mod = 512 + pad_to_square = True + is_erase_model = True + + def init_model(self, device, **kwargs): + seed = 240 # pick up a random number + set_seed(seed) + + fp16 = not kwargs.get("no_half", False) + use_gpu = "cuda" in str(device) and torch.cuda.is_available() + self.torch_dtype = torch.float16 if use_gpu and fp16 else torch.float32 + + G = Generator( + z_dim=512, + c_dim=0, + w_dim=512, + img_resolution=512, + img_channels=3, + mapping_kwargs={"torch_dtype": self.torch_dtype}, + 
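+ # self.torch_dtype is float16 only on CUDA with half precision enabled
+ # (i.e. no_half not set); otherwise the generator stays in float32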
).to(self.torch_dtype) + # fmt: off + self.model = load_model(G, MAT_MODEL_URL, device, MAT_MODEL_MD5) + self.z = torch.from_numpy(np.random.randn(1, G.z_dim)).to(self.torch_dtype).to(device) + self.label = torch.zeros([1, self.model.c_dim], device=device).to(self.torch_dtype) + # fmt: on + + @staticmethod + def download(): + download_model(MAT_MODEL_URL, MAT_MODEL_MD5) + + @staticmethod + def is_downloaded() -> bool: + return os.path.exists(get_cache_path_by_url(MAT_MODEL_URL)) + + def forward(self, image, mask, config: InpaintRequest): + """Input images and output images have same size + images: [H, W, C] RGB + masks: [H, W] mask area == 255 + return: BGR IMAGE + """ + + image = norm_img(image) # [0, 1] + image = image * 2 - 1 # [0, 1] -> [-1, 1] + + mask = (mask > 127) * 255 + mask = 255 - mask + mask = norm_img(mask) + + image = ( + torch.from_numpy(image).unsqueeze(0).to(self.torch_dtype).to(self.device) + ) + mask = torch.from_numpy(mask).unsqueeze(0).to(self.torch_dtype).to(self.device) + + output = self.model( + image, mask, self.z, self.label, truncation_psi=1, noise_mode="none" + ) + output = ( + (output.permute(0, 2, 3, 1) * 127.5 + 127.5) + .round() + .clamp(0, 255) + .to(torch.uint8) + ) + output = output[0].cpu().numpy() + cur_res = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) + return cur_res diff --git a/inpaint/model/mi_gan.py b/inpaint/model/mi_gan.py new file mode 100644 index 0000000..f1ce25f --- /dev/null +++ b/inpaint/model/mi_gan.py @@ -0,0 +1,110 @@ +import os + +import cv2 +import torch + +from iopaint.helper import ( + load_jit_model, + download_model, + get_cache_path_by_url, + boxes_from_mask, + resize_max_size, + norm_img, +) +from .base import InpaintModel +from iopaint.schema import InpaintRequest + +MIGAN_MODEL_URL = os.environ.get( + "MIGAN_MODEL_URL", + "https://github.com/Sanster/models/releases/download/migan/migan_traced.pt", +) +MIGAN_MODEL_MD5 = os.environ.get("MIGAN_MODEL_MD5", "76eb3b1a71c400ee3290524f7a11b89c") + + +class MIGAN(InpaintModel): + name = "migan" + min_size = 512 + pad_mod = 512 + pad_to_square = True + is_erase_model = True + + def init_model(self, device, **kwargs): + self.model = load_jit_model(MIGAN_MODEL_URL, device, MIGAN_MODEL_MD5).eval() + + @staticmethod + def download(): + download_model(MIGAN_MODEL_URL, MIGAN_MODEL_MD5) + + @staticmethod + def is_downloaded() -> bool: + return os.path.exists(get_cache_path_by_url(MIGAN_MODEL_URL)) + + @torch.no_grad() + def __call__(self, image, mask, config: InpaintRequest): + """ + images: [H, W, C] RGB, not normalized + masks: [H, W] + return: BGR IMAGE + """ + if image.shape[0] == 512 and image.shape[1] == 512: + return self._pad_forward(image, mask, config) + + boxes = boxes_from_mask(mask) + crop_result = [] + config.hd_strategy_crop_margin = 128 + for box in boxes: + crop_image, crop_mask, crop_box = self._crop_box(image, mask, box, config) + origin_size = crop_image.shape[:2] + resize_image = resize_max_size(crop_image, size_limit=512) + resize_mask = resize_max_size(crop_mask, size_limit=512) + inpaint_result = self._pad_forward(resize_image, resize_mask, config) + + # only paste masked area result + inpaint_result = cv2.resize( + inpaint_result, + (origin_size[1], origin_size[0]), + interpolation=cv2.INTER_CUBIC, + ) + + original_pixel_indices = crop_mask < 127 + inpaint_result[original_pixel_indices] = crop_image[:, :, ::-1][ + original_pixel_indices + ] + + crop_result.append((inpaint_result, crop_box)) + + inpaint_result = image[:, :, ::-1].copy() + for crop_image, crop_box in 
crop_result: + x1, y1, x2, y2 = crop_box + inpaint_result[y1:y2, x1:x2, :] = crop_image + + return inpaint_result + + def forward(self, image, mask, config: InpaintRequest): + """Input images and output images have same size + images: [H, W, C] RGB + masks: [H, W] mask area == 255 + return: BGR IMAGE + """ + + image = norm_img(image) # [0, 1] + image = image * 2 - 1 # [0, 1] -> [-1, 1] + mask = (mask > 120) * 255 + mask = norm_img(mask) + + image = torch.from_numpy(image).unsqueeze(0).to(self.device) + mask = torch.from_numpy(mask).unsqueeze(0).to(self.device) + + erased_img = image * (1 - mask) + input_image = torch.cat([0.5 - mask, erased_img], dim=1) + + output = self.model(input_image) + output = ( + (output.permute(0, 2, 3, 1) * 127.5 + 127.5) + .round() + .clamp(0, 255) + .to(torch.uint8) + ) + output = output[0].cpu().numpy() + cur_res = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) + return cur_res diff --git a/inpaint/model/opencv2.py b/inpaint/model/opencv2.py new file mode 100644 index 0000000..de47209 --- /dev/null +++ b/inpaint/model/opencv2.py @@ -0,0 +1,29 @@ +import cv2 +from .base import InpaintModel +from iopaint.schema import InpaintRequest + +flag_map = {"INPAINT_NS": cv2.INPAINT_NS, "INPAINT_TELEA": cv2.INPAINT_TELEA} + + +class OpenCV2(InpaintModel): + name = "cv2" + pad_mod = 1 + is_erase_model = True + + @staticmethod + def is_downloaded() -> bool: + return True + + def forward(self, image, mask, config: InpaintRequest): + """Input image and output image have same size + image: [H, W, C] RGB + mask: [H, W, 1] + return: BGR IMAGE + """ + cur_res = cv2.inpaint( + image[:, :, ::-1], + mask, + inpaintRadius=config.cv2_radius, + flags=flag_map[config.cv2_flag], + ) + return cur_res diff --git a/inpaint/model/original_sd_configs/__init__.py b/inpaint/model/original_sd_configs/__init__.py new file mode 100644 index 0000000..23896a7 --- /dev/null +++ b/inpaint/model/original_sd_configs/__init__.py @@ -0,0 +1,19 @@ +from pathlib import Path +from typing import Dict + +CURRENT_DIR = Path(__file__).parent.absolute() + + +def get_config_files() -> Dict[str, Path]: + """ + - `v1`: Config file for Stable Diffusion v1 + - `v2`: Config file for Stable Diffusion v2 + - `xl`: Config file for Stable Diffusion XL + - `xl_refiner`: Config file for Stable Diffusion XL Refiner + """ + return { + "v1": CURRENT_DIR / "v1-inference.yaml", + "v2": CURRENT_DIR / "v2-inference-v.yaml", + "xl": CURRENT_DIR / "sd_xl_base.yaml", + "xl_refiner": CURRENT_DIR / "sd_xl_refiner.yaml", + } diff --git a/inpaint/model/original_sd_configs/sd_xl_base.yaml b/inpaint/model/original_sd_configs/sd_xl_base.yaml new file mode 100644 index 0000000..6047379 --- /dev/null +++ b/inpaint/model/original_sd_configs/sd_xl_base.yaml @@ -0,0 +1,93 @@ +model: + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.13025 + disable_first_stage_autocast: True + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser + params: + num_idx: 1000 + + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + adm_in_channels: 2816 + num_classes: sequential + use_checkpoint: True + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [4, 2] + num_res_blocks: 2 + channel_mult: [1, 2, 4] + num_head_channels: 64 + use_linear_in_transformer: True + 
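+ # transformer blocks per attention level of the UNet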
transformer_depth: [1, 2, 10] + context_dim: 2048 + spatial_transformer_attn_type: softmax-xformers + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: False + input_key: txt + target: sgm.modules.encoders.modules.FrozenCLIPEmbedder + params: + layer: hidden + layer_idx: 11 + + - is_trainable: False + input_key: txt + target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2 + params: + arch: ViT-bigG-14 + version: laion2b_s39b_b160k + freeze: True + layer: penultimate + always_return_pooled: True + legacy: False + + - is_trainable: False + input_key: original_size_as_tuple + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - is_trainable: False + input_key: crop_coords_top_left + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - is_trainable: False + input_key: target_size_as_tuple + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + first_stage_config: + target: sgm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity diff --git a/inpaint/model/original_sd_configs/sd_xl_refiner.yaml b/inpaint/model/original_sd_configs/sd_xl_refiner.yaml new file mode 100644 index 0000000..2d5ab44 --- /dev/null +++ b/inpaint/model/original_sd_configs/sd_xl_refiner.yaml @@ -0,0 +1,86 @@ +model: + target: sgm.models.diffusion.DiffusionEngine + params: + scale_factor: 0.13025 + disable_first_stage_autocast: True + + denoiser_config: + target: sgm.modules.diffusionmodules.denoiser.DiscreteDenoiser + params: + num_idx: 1000 + + scaling_config: + target: sgm.modules.diffusionmodules.denoiser_scaling.EpsScaling + discretization_config: + target: sgm.modules.diffusionmodules.discretizer.LegacyDDPMDiscretization + + network_config: + target: sgm.modules.diffusionmodules.openaimodel.UNetModel + params: + adm_in_channels: 2560 + num_classes: sequential + use_checkpoint: True + in_channels: 4 + out_channels: 4 + model_channels: 384 + attention_resolutions: [4, 2] + num_res_blocks: 2 + channel_mult: [1, 2, 4, 4] + num_head_channels: 64 + use_linear_in_transformer: True + transformer_depth: 4 + context_dim: [1280, 1280, 1280, 1280] + spatial_transformer_attn_type: softmax-xformers + + conditioner_config: + target: sgm.modules.GeneralConditioner + params: + emb_models: + - is_trainable: False + input_key: txt + target: sgm.modules.encoders.modules.FrozenOpenCLIPEmbedder2 + params: + arch: ViT-bigG-14 + version: laion2b_s39b_b160k + legacy: False + freeze: True + layer: penultimate + always_return_pooled: True + + - is_trainable: False + input_key: original_size_as_tuple + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - is_trainable: False + input_key: crop_coords_top_left + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + - is_trainable: False + input_key: aesthetic_score + target: sgm.modules.encoders.modules.ConcatTimestepEmbedderND + params: + outdim: 256 + + first_stage_config: + target: sgm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + attn_type: vanilla-xformers + double_z: true + z_channels: 4 + resolution: 256 + 
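+ # autoencoder (ddconfig) settings are identical to the SDXL base config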
in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: [1, 2, 4, 4] + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity diff --git a/inpaint/model/original_sd_configs/v1-inference.yaml b/inpaint/model/original_sd_configs/v1-inference.yaml new file mode 100644 index 0000000..d4effe5 --- /dev/null +++ b/inpaint/model/original_sd_configs/v1-inference.yaml @@ -0,0 +1,70 @@ +model: + base_learning_rate: 1.0e-04 + target: ldm.models.diffusion.ddpm.LatentDiffusion + params: + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + image_size: 64 + channels: 4 + cond_stage_trainable: false # Note: different from the one we trained before + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False + + scheduler_config: # 10000 warmup steps + target: ldm.lr_scheduler.LambdaLinearScheduler + params: + warm_up_steps: [ 10000 ] + cycle_lengths: [ 10000000000000 ] # incredibly large number to prevent corner cases + f_start: [ 1.e-6 ] + f_max: [ 1. ] + f_min: [ 1. ] + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_heads: 8 + use_spatial_transformer: True + transformer_depth: 1 + context_dim: 768 + use_checkpoint: True + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenCLIPEmbedder diff --git a/inpaint/model/original_sd_configs/v2-inference-v.yaml b/inpaint/model/original_sd_configs/v2-inference-v.yaml new file mode 100644 index 0000000..8ec8dfb --- /dev/null +++ b/inpaint/model/original_sd_configs/v2-inference-v.yaml @@ -0,0 +1,68 @@ +model: + base_learning_rate: 1.0e-4 + target: ldm.models.diffusion.ddpm.LatentDiffusion + params: + parameterization: "v" + linear_start: 0.00085 + linear_end: 0.0120 + num_timesteps_cond: 1 + log_every_t: 200 + timesteps: 1000 + first_stage_key: "jpg" + cond_stage_key: "txt" + image_size: 64 + channels: 4 + cond_stage_trainable: false + conditioning_key: crossattn + monitor: val/loss_simple_ema + scale_factor: 0.18215 + use_ema: False # we set this to false because this is an inference only config + + unet_config: + target: ldm.modules.diffusionmodules.openaimodel.UNetModel + params: + use_checkpoint: True + use_fp16: True + image_size: 32 # unused + in_channels: 4 + out_channels: 4 + model_channels: 320 + attention_resolutions: [ 4, 2, 1 ] + num_res_blocks: 2 + channel_mult: [ 1, 2, 4, 4 ] + num_head_channels: 64 # need to fix for flash-attn + use_spatial_transformer: True + use_linear_in_transformer: True + transformer_depth: 1 + context_dim: 1024 + legacy: False + + first_stage_config: + target: ldm.models.autoencoder.AutoencoderKL + params: + embed_dim: 4 + monitor: val/rec_loss + ddconfig: + #attn_type: "vanilla-xformers" + double_z: true + z_channels: 4 + resolution: 256 + in_channels: 3 + out_ch: 3 + ch: 128 + ch_mult: + - 1 + - 2 + - 4 + - 4 + num_res_blocks: 2 + attn_resolutions: [] + 
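+ # same KL autoencoder configuration as the v1 config above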
dropout: 0.0 + lossconfig: + target: torch.nn.Identity + + cond_stage_config: + target: ldm.modules.encoders.modules.FrozenOpenCLIPEmbedder + params: + freeze: True + layer: "penultimate" diff --git a/inpaint/model/paint_by_example.py b/inpaint/model/paint_by_example.py new file mode 100644 index 0000000..bf1e5b7 --- /dev/null +++ b/inpaint/model/paint_by_example.py @@ -0,0 +1,68 @@ +import PIL +import PIL.Image +import cv2 +import torch +from loguru import logger + +from iopaint.helper import decode_base64_to_image +from .base import DiffusionInpaintModel +from iopaint.schema import InpaintRequest +from .utils import get_torch_dtype, enable_low_mem, is_local_files_only + + +class PaintByExample(DiffusionInpaintModel): + name = "Fantasy-Studio/Paint-by-Example" + pad_mod = 8 + min_size = 512 + + def init_model(self, device: torch.device, **kwargs): + from diffusers import DiffusionPipeline + + use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False)) + model_kwargs = { + "local_files_only": is_local_files_only(**kwargs), + } + + if kwargs["disable_nsfw"] or kwargs.get("cpu_offload", False): + logger.info("Disable Paint By Example Model NSFW checker") + model_kwargs.update( + dict(safety_checker=None, requires_safety_checker=False) + ) + + self.model = DiffusionPipeline.from_pretrained( + self.name, torch_dtype=torch_dtype, **model_kwargs + ) + enable_low_mem(self.model, kwargs.get("low_mem", False)) + + # TODO: gpu_id + if kwargs.get("cpu_offload", False) and use_gpu: + self.model.image_encoder = self.model.image_encoder.to(device) + self.model.enable_sequential_cpu_offload(gpu_id=0) + else: + self.model = self.model.to(device) + + def forward(self, image, mask, config: InpaintRequest): + """Input image and output image have same size + image: [H, W, C] RGB + mask: [H, W, 1] 255 means area to repaint + return: BGR IMAGE + """ + if config.paint_by_example_example_image is None: + raise ValueError("paint_by_example_example_image is required") + example_image, _, _ = decode_base64_to_image( + config.paint_by_example_example_image + ) + output = self.model( + image=PIL.Image.fromarray(image), + mask_image=PIL.Image.fromarray(mask[:, :, -1], mode="L"), + example_image=PIL.Image.fromarray(example_image), + num_inference_steps=config.sd_steps, + guidance_scale=config.sd_guidance_scale, + negative_prompt="out of frame, lowres, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, disfigured, gross proportions, malformed limbs, watermark, signature", + output_type="np.array", + generator=torch.manual_seed(config.sd_seed), + ).images[0] + + output = (output * 255).round().astype("uint8") + output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) + return output diff --git a/inpaint/model/plms_sampler.py b/inpaint/model/plms_sampler.py new file mode 100644 index 0000000..131a8f4 --- /dev/null +++ b/inpaint/model/plms_sampler.py @@ -0,0 +1,225 @@ +# From: https://github.com/CompVis/latent-diffusion/blob/main/ldm/models/diffusion/plms.py +import torch +import numpy as np +from .utils import make_ddim_timesteps, make_ddim_sampling_parameters, noise_like +from tqdm import tqdm + + +class PLMSSampler(object): + def __init__(self, model, schedule="linear", **kwargs): + super().__init__() + self.model = model + self.ddpm_num_timesteps = model.num_timesteps + self.schedule = schedule + + def register_buffer(self, name, attr): + setattr(self, name, attr) + + def 
make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True): + if ddim_eta != 0: + raise ValueError('ddim_eta must be 0 for PLMS') + self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps, + num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=verbose) + alphas_cumprod = self.model.alphas_cumprod + assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep' + to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device) + + self.register_buffer('betas', to_torch(self.model.betas)) + self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod)) + self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev)) + + # calculations for diffusion q(x_t | x_{t-1}) and others + self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu()))) + self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu()))) + self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu()))) + self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1))) + + # ddim sampling parameters + ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(), + ddim_timesteps=self.ddim_timesteps, + eta=ddim_eta, verbose=verbose) + self.register_buffer('ddim_sigmas', ddim_sigmas) + self.register_buffer('ddim_alphas', ddim_alphas) + self.register_buffer('ddim_alphas_prev', ddim_alphas_prev) + self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas)) + sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt( + (1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * ( + 1 - self.alphas_cumprod / self.alphas_cumprod_prev)) + self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps) + + @torch.no_grad() + def sample(self, + steps, + batch_size, + shape, + conditioning=None, + callback=None, + normals_sequence=None, + img_callback=None, + quantize_x0=False, + eta=0., + mask=None, + x0=None, + temperature=1., + noise_dropout=0., + score_corrector=None, + corrector_kwargs=None, + verbose=False, + x_T=None, + log_every_t=100, + unconditional_guidance_scale=1., + unconditional_conditioning=None, + # this has to come in the same format as the conditioning, # e.g. as encoded tokens, ... 
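+ # any additional keyword arguments are accepted but ignored by this sampler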
+ **kwargs + ): + if conditioning is not None: + if isinstance(conditioning, dict): + cbs = conditioning[list(conditioning.keys())[0]].shape[0] + if cbs != batch_size: + print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}") + else: + if conditioning.shape[0] != batch_size: + print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}") + + self.make_schedule(ddim_num_steps=steps, ddim_eta=eta, verbose=verbose) + # sampling + C, H, W = shape + size = (batch_size, C, H, W) + print(f'Data shape for PLMS sampling is {size}') + + samples = self.plms_sampling(conditioning, size, + callback=callback, + img_callback=img_callback, + quantize_denoised=quantize_x0, + mask=mask, x0=x0, + ddim_use_original_steps=False, + noise_dropout=noise_dropout, + temperature=temperature, + score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + x_T=x_T, + log_every_t=log_every_t, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + ) + return samples + + @torch.no_grad() + def plms_sampling(self, cond, shape, + x_T=None, ddim_use_original_steps=False, + callback=None, timesteps=None, quantize_denoised=False, + mask=None, x0=None, img_callback=None, log_every_t=100, + temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, + unconditional_guidance_scale=1., unconditional_conditioning=None, ): + device = self.model.betas.device + b = shape[0] + if x_T is None: + img = torch.randn(shape, device=device) + else: + img = x_T + + if timesteps is None: + timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps + elif timesteps is not None and not ddim_use_original_steps: + subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1 + timesteps = self.ddim_timesteps[:subset_end] + + time_range = list(reversed(range(0, timesteps))) if ddim_use_original_steps else np.flip(timesteps) + total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0] + print(f"Running PLMS Sampling with {total_steps} timesteps") + + iterator = tqdm(time_range, desc='PLMS Sampler', total=total_steps) + old_eps = [] + + for i, step in enumerate(iterator): + index = total_steps - i - 1 + ts = torch.full((b,), step, device=device, dtype=torch.long) + ts_next = torch.full((b,), time_range[min(i + 1, len(time_range) - 1)], device=device, dtype=torch.long) + + if mask is not None: + assert x0 is not None + img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass? + img = img_orig * mask + (1. 
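+ # where mask == 1 the latent is reset to the re-noised x0 (known region);
+ # the sampler only updates the mask == 0 (inpainted) area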
- mask) * img + + outs = self.p_sample_plms(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps, + quantize_denoised=quantize_denoised, temperature=temperature, + noise_dropout=noise_dropout, score_corrector=score_corrector, + corrector_kwargs=corrector_kwargs, + unconditional_guidance_scale=unconditional_guidance_scale, + unconditional_conditioning=unconditional_conditioning, + old_eps=old_eps, t_next=ts_next) + img, pred_x0, e_t = outs + old_eps.append(e_t) + if len(old_eps) >= 4: + old_eps.pop(0) + if callback: callback(i) + if img_callback: img_callback(pred_x0, i) + + return img + + @torch.no_grad() + def p_sample_plms(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False, + temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None, + unconditional_guidance_scale=1., unconditional_conditioning=None, old_eps=None, t_next=None): + b, *_, device = *x.shape, x.device + + def get_model_output(x, t): + if unconditional_conditioning is None or unconditional_guidance_scale == 1.: + e_t = self.model.apply_model(x, t, c) + else: + x_in = torch.cat([x] * 2) + t_in = torch.cat([t] * 2) + c_in = torch.cat([unconditional_conditioning, c]) + e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2) + e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond) + + if score_corrector is not None: + assert self.model.parameterization == "eps" + e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs) + + return e_t + + alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas + alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev + sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas + sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas + + def get_x_prev_and_pred_x0(e_t, index): + # select parameters corresponding to the currently considered timestep + a_t = torch.full((b, 1, 1, 1), alphas[index], device=device) + a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device) + sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device) + sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device) + + # current prediction for x_0 + pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt() + if quantize_denoised: + pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0) + # direction pointing to x_t + dir_xt = (1. 
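+ # DDIM-style update: x_prev = sqrt(a_prev) * pred_x0 + dir_xt + noise,
+ # with the noise term scaled by sigma_t (always 0 here since eta must be 0)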
- a_prev - sigma_t ** 2).sqrt() * e_t + noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature + if noise_dropout > 0.: + noise = torch.nn.functional.dropout(noise, p=noise_dropout) + x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise + return x_prev, pred_x0 + + e_t = get_model_output(x, t) + if len(old_eps) == 0: + # Pseudo Improved Euler (2nd order) + x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t, index) + e_t_next = get_model_output(x_prev, t_next) + e_t_prime = (e_t + e_t_next) / 2 + elif len(old_eps) == 1: + # 2nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (3 * e_t - old_eps[-1]) / 2 + elif len(old_eps) == 2: + # 3nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (23 * e_t - 16 * old_eps[-1] + 5 * old_eps[-2]) / 12 + elif len(old_eps) >= 3: + # 4nd order Pseudo Linear Multistep (Adams-Bashforth) + e_t_prime = (55 * e_t - 59 * old_eps[-1] + 37 * old_eps[-2] - 9 * old_eps[-3]) / 24 + + x_prev, pred_x0 = get_x_prev_and_pred_x0(e_t_prime, index) + + return x_prev, pred_x0, e_t diff --git a/inpaint/model/power_paint/__init__.py b/inpaint/model/power_paint/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/power_paint/pipeline_powerpaint.py b/inpaint/model/power_paint/pipeline_powerpaint.py new file mode 100644 index 0000000..13c1d27 --- /dev/null +++ b/inpaint/model/power_paint/pipeline_powerpaint.py @@ -0,0 +1,1243 @@ +# Copyright 2023 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL +import torch +from packaging import version +from transformers import CLIPImageProcessor, CLIPTextModel, CLIPTokenizer +from diffusers.configuration_utils import FrozenDict +from diffusers.image_processor import VaeImageProcessor +from diffusers.loaders import ( + FromSingleFileMixin, + LoraLoaderMixin, + TextualInversionLoaderMixin, +) +from diffusers.models import ( + AsymmetricAutoencoderKL, + AutoencoderKL, + UNet2DConditionModel, +) +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import ( + deprecate, + is_accelerate_available, + is_accelerate_version, + logging, +) +from diffusers.utils.torch_utils import randn_tensor +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput +from diffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) + + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def prepare_mask_and_masked_image( + image, mask, height, width, return_image: bool = False +): + """ + Prepares a pair (image, mask) to be consumed by the Stable Diffusion pipeline. This means that those inputs will be + converted to ``torch.Tensor`` with shapes ``batch x channels x height x width`` where ``channels`` is ``3`` for the + ``image`` and ``1`` for the ``mask``. 
+ + The ``image`` will be converted to ``torch.float32`` and normalized to be in ``[-1, 1]``. The ``mask`` will be + binarized (``mask > 0.5``) and cast to ``torch.float32`` too. + + Args: + image (Union[np.array, PIL.Image, torch.Tensor]): The image to inpaint. + It can be a ``PIL.Image``, or a ``height x width x 3`` ``np.array`` or a ``channels x height x width`` + ``torch.Tensor`` or a ``batch x channels x height x width`` ``torch.Tensor``. + mask (_type_): The mask to apply to the image, i.e. regions to inpaint. + It can be a ``PIL.Image``, or a ``height x width`` ``np.array`` or a ``1 x height x width`` + ``torch.Tensor`` or a ``batch x 1 x height x width`` ``torch.Tensor``. + + + Raises: + ValueError: ``torch.Tensor`` images should be in the ``[-1, 1]`` range. ValueError: ``torch.Tensor`` mask + should be in the ``[0, 1]`` range. ValueError: ``mask`` and ``image`` should have the same spatial dimensions. + TypeError: ``mask`` is a ``torch.Tensor`` but ``image`` is not + (ot the other way around). + + Returns: + tuple[torch.Tensor]: The pair (mask, masked_image) as ``torch.Tensor`` with 4 + dimensions: ``batch x channels x height x width``. + """ + + if image is None: + raise ValueError("`image` input cannot be undefined.") + + if mask is None: + raise ValueError("`mask_image` input cannot be undefined.") + + if isinstance(image, torch.Tensor): + if not isinstance(mask, torch.Tensor): + raise TypeError( + f"`image` is a torch.Tensor but `mask` (type: {type(mask)} is not" + ) + + # Batch single image + if image.ndim == 3: + assert ( + image.shape[0] == 3 + ), "Image outside a batch should be of shape (3, H, W)" + image = image.unsqueeze(0) + + # Batch and add channel dim for single mask + if mask.ndim == 2: + mask = mask.unsqueeze(0).unsqueeze(0) + + # Batch single mask or add channel dim + if mask.ndim == 3: + # Single batched mask, no channel dim or single mask not batched but channel dim + if mask.shape[0] == 1: + mask = mask.unsqueeze(0) + + # Batched masks no channel dim + else: + mask = mask.unsqueeze(1) + + assert ( + image.ndim == 4 and mask.ndim == 4 + ), "Image and Mask must have 4 dimensions" + assert ( + image.shape[-2:] == mask.shape[-2:] + ), "Image and Mask must have the same spatial dimensions" + assert ( + image.shape[0] == mask.shape[0] + ), "Image and Mask must have the same batch size" + + # Check image is in [-1, 1] + if image.min() < -1 or image.max() > 1: + raise ValueError("Image should be in [-1, 1] range") + + # Check mask is in [0, 1] + if mask.min() < 0 or mask.max() > 1: + raise ValueError("Mask should be in [0, 1] range") + + # Binarize mask + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + + # Image as float32 + image = image.to(dtype=torch.float32) + elif isinstance(mask, torch.Tensor): + raise TypeError( + f"`mask` is a torch.Tensor but `image` (type: {type(image)} is not" + ) + else: + # preprocess image + if isinstance(image, (PIL.Image.Image, np.ndarray)): + image = [image] + if isinstance(image, list) and isinstance(image[0], PIL.Image.Image): + # resize all images w.r.t passed height an width + image = [ + i.resize((width, height), resample=PIL.Image.LANCZOS) for i in image + ] + image = [np.array(i.convert("RGB"))[None, :] for i in image] + image = np.concatenate(image, axis=0) + elif isinstance(image, list) and isinstance(image[0], np.ndarray): + image = np.concatenate([i[None, :] for i in image], axis=0) + + image = image.transpose(0, 3, 1, 2) + image = torch.from_numpy(image).to(dtype=torch.float32) / 127.5 - 1.0 + + # preprocess mask + if 
isinstance(mask, (PIL.Image.Image, np.ndarray)): + mask = [mask] + + if isinstance(mask, list) and isinstance(mask[0], PIL.Image.Image): + mask = [i.resize((width, height), resample=PIL.Image.LANCZOS) for i in mask] + mask = np.concatenate( + [np.array(m.convert("L"))[None, None, :] for m in mask], axis=0 + ) + mask = mask.astype(np.float32) / 255.0 + elif isinstance(mask, list) and isinstance(mask[0], np.ndarray): + mask = np.concatenate([m[None, None, :] for m in mask], axis=0) + + mask[mask < 0.5] = 0 + mask[mask >= 0.5] = 1 + mask = torch.from_numpy(mask) + + masked_image = image * (mask < 0.5) + + # n.b. ensure backwards compatibility as old function does not return image + if return_image: + return mask, masked_image, image + + return mask, masked_image + + +class StableDiffusionInpaintPipeline( + DiffusionPipeline, TextualInversionLoaderMixin, LoraLoaderMixin, FromSingleFileMixin +): + r""" + Pipeline for text-guided image inpainting using Stable Diffusion. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + + Args: + vae ([`AutoencoderKL`, `AsymmetricAutoencoderKL`]): + Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations. + text_encoder ([`CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + _optional_components = ["safety_checker", "feature_extractor"] + + def __init__( + self, + vae: Union[AutoencoderKL, AsymmetricAutoencoderKL], + text_encoder: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + requires_safety_checker: bool = True, + ): + super().__init__() + + if ( + hasattr(scheduler.config, "steps_offset") + and scheduler.config.steps_offset != 1 + ): + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`" + f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure " + "to update the config accordingly as leaving `steps_offset` might led to incorrect results" + " in future versions. 
If you have downloaded this checkpoint from the Hugging Face Hub," + " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`" + " file" + ) + deprecate( + "steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False + ) + new_config = dict(scheduler.config) + new_config["steps_offset"] = 1 + scheduler._internal_dict = FrozenDict(new_config) + + if ( + hasattr(scheduler.config, "skip_prk_steps") + and scheduler.config.skip_prk_steps is False + ): + deprecation_message = ( + f"The configuration file of this scheduler: {scheduler} has not set the configuration" + " `skip_prk_steps`. `skip_prk_steps` should be set to True in the configuration file. Please make" + " sure to update the config accordingly as not setting `skip_prk_steps` in the config might lead to" + " incorrect results in future versions. If you have downloaded this checkpoint from the Hugging Face" + " Hub, it would be very nice if you could open a Pull request for the" + " `scheduler/scheduler_config.json` file" + ) + deprecate( + "skip_prk_steps not set", + "1.0.0", + deprecation_message, + standard_warn=False, + ) + new_config = dict(scheduler.config) + new_config["skip_prk_steps"] = True + scheduler._internal_dict = FrozenDict(new_config) + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + is_unet_version_less_0_9_0 = hasattr( + unet.config, "_diffusers_version" + ) and version.parse( + version.parse(unet.config._diffusers_version).base_version + ) < version.parse( + "0.9.0.dev0" + ) + is_unet_sample_size_less_64 = ( + hasattr(unet.config, "sample_size") and unet.config.sample_size < 64 + ) + if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64: + deprecation_message = ( + "The configuration file of the unet has set the default `sample_size` to smaller than" + " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the" + " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-" + " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5" + " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the" + " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`" + " in the config might lead to incorrect results in future versions. 
If you have downloaded this" + " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for" + " the `unet/config.json` file" + ) + deprecate( + "sample_size<64", "1.0.0", deprecation_message, standard_warn=False + ) + new_config = dict(unet.config) + new_config["sample_size"] = 64 + unet._internal_dict = FrozenDict(new_config) + + # Check shapes, assume num_channels_latents == 4, num_channels_mask == 1, num_channels_masked == 4 + if unet.config.in_channels != 9: + logger.info( + f"You have loaded a UNet with {unet.config.in_channels} input channels which." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + tokenizer=tokenizer, + unet=unet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.enable_model_cpu_offload + def enable_model_cpu_offload(self, gpu_id=0): + r""" + Offload all models to CPU to reduce memory usage with a low impact on performance. Moves one whole model at a + time to the GPU when its `forward` method is called, and the model remains in GPU until the next model runs. + Memory savings are lower than using `enable_sequential_cpu_offload`, but performance is much better due to the + iterative execution of the `unet`. + """ + if is_accelerate_available() and is_accelerate_version(">=", "0.17.0.dev0"): + from accelerate import cpu_offload_with_hook + else: + raise ImportError( + "`enable_model_cpu_offload` requires `accelerate v0.17.0` or higher." + ) + + device = torch.device(f"cuda:{gpu_id}") + + if self.device.type != "cpu": + self.to("cpu", silence_dtype_warnings=True) + torch.cuda.empty_cache() # otherwise we don't see the memory savings (but they probably exist) + + hook = None + for cpu_offloaded_model in [self.text_encoder, self.unet, self.vae]: + _, hook = cpu_offload_with_hook( + cpu_offloaded_model, device, prev_module_hook=hook + ) + + if self.safety_checker is not None: + _, hook = cpu_offload_with_hook( + self.safety_checker, device, prev_module_hook=hook + ) + + # We'll offload the last model manually. + self.final_offload_hook = hook + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + promptA, + promptB, + t, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_promptA=None, + negative_promptB=None, + t_nag=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). 
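+ t (_type_):
+ Interpolation weight used to blend the embeddings of `promptA` and `promptB`:
+ `prompt_embeds = t * emb_A + (1 - t) * emb_B`.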
+ prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + prompt = promptA + negative_prompt = negative_promptA + + if promptA is not None and isinstance(promptA, str): + batch_size = 1 + elif promptA is not None and isinstance(promptA, list): + batch_size = len(promptA) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + promptA = self.maybe_convert_prompt(promptA, self.tokenizer) + + text_inputsA = self.tokenizer( + promptA, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_inputsB = self.tokenizer( + promptB, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_idsA = text_inputsA.input_ids + text_input_idsB = text_inputsB.input_ids + untruncated_ids = self.tokenizer( + promptA, padding="longest", return_tensors="pt" + ).input_ids + + if untruncated_ids.shape[-1] >= text_input_idsA.shape[ + -1 + ] and not torch.equal(text_input_idsA, untruncated_ids): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = text_inputsA.attention_mask.to(device) + else: + attention_mask = None + + # print("text_input_idsA: ",text_input_idsA) + # print("text_input_idsB: ",text_input_idsB) + # print('t: ',t) + + prompt_embedsA = self.text_encoder( + text_input_idsA.to(device), + attention_mask=attention_mask, + ) + prompt_embedsA = prompt_embedsA[0] + + prompt_embedsB = self.text_encoder( + text_input_idsB.to(device), + attention_mask=attention_mask, + ) + prompt_embedsB = prompt_embedsB[0] + prompt_embeds = prompt_embedsA * (t) + (1 - t) * prompt_embedsB + # print("prompt_embeds: ",prompt_embeds) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view( + bs_embed * num_images_per_prompt, seq_len, -1 + ) + + # 
get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokensA: List[str] + uncond_tokensB: List[str] + if negative_prompt is None: + uncond_tokensA = [""] * batch_size + uncond_tokensB = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokensA = [negative_promptA] + uncond_tokensB = [negative_promptB] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokensA = negative_promptA + uncond_tokensB = negative_promptB + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokensA = self.maybe_convert_prompt( + uncond_tokensA, self.tokenizer + ) + uncond_tokensB = self.maybe_convert_prompt( + uncond_tokensB, self.tokenizer + ) + + max_length = prompt_embeds.shape[1] + uncond_inputA = self.tokenizer( + uncond_tokensA, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + uncond_inputB = self.tokenizer( + uncond_tokensB, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = uncond_inputA.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embedsA = self.text_encoder( + uncond_inputA.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embedsB = self.text_encoder( + uncond_inputB.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = ( + negative_prompt_embedsA[0] * (t_nag) + + (1 - t_nag) * negative_prompt_embedsB[0] + ) + + # negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to( + dtype=prompt_embeds_dtype, device=device + ) + + negative_prompt_embeds = negative_prompt_embeds.repeat( + 1, num_images_per_prompt, 1 + ) + negative_prompt_embeds = negative_prompt_embeds.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + + # For classifier free guidance, we need to do two forward passes. 
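+            # (both `prompt_embeds` and `negative_prompt_embeds` at this point are already the
+            # `t` / `t_nag` weighted blends of the A and B prompt variants computed above.)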
+ # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + # print("prompt_embeds: ",prompt_embeds) + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess( + image, output_type="pil" + ) + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor( + feature_extractor_input, return_tensors="pt" + ).to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. + # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set( + inspect.signature(self.scheduler.step).parameters.keys() + ) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set( + inspect.signature(self.scheduler.step).parameters.keys() + ) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + height, + width, + strength, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ): + if strength < 0 or strength > 1: + raise ValueError( + f"The value of strength should in [0.0, 1.0] but is {strength}" + ) + + if height % 8 != 0 or width % 8 != 0: + raise ValueError( + f"`height` and `width` have to be divisible by 8 but are {height} and {width}." + ) + + if (callback_steps is None) or ( + callback_steps is not None + and (not isinstance(callback_steps, int) or callback_steps <= 0) + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and ( + not isinstance(prompt, str) and not isinstance(prompt, list) + ): + raise ValueError( + f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" + ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." 
+ ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + image=None, + timestep=None, + is_strength_max=True, + return_noise=False, + return_image_latents=False, + ): + shape = ( + batch_size, + num_channels_latents, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if (image is None or timestep is None) and not is_strength_max: + raise ValueError( + "Since strength < 1. initial latents are to be initialised as a combination of Image + Noise." + "However, either the image or the noise timestep has not been provided." + ) + + if return_image_latents or (latents is None and not is_strength_max): + image = image.to(device=device, dtype=dtype) + image_latents = self._encode_vae_image(image=image, generator=generator) + + if latents is None: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + # if strength is 1. then initialise the latents to noise, else initial to image + noise + latents = ( + noise + if is_strength_max + else self.scheduler.add_noise(image_latents, noise, timestep) + ) + # if pure noise then scale the initial latents by the Scheduler's init sigma + latents = ( + latents * self.scheduler.init_noise_sigma + if is_strength_max + else latents + ) + else: + noise = latents.to(device) + latents = noise * self.scheduler.init_noise_sigma + + outputs = (latents,) + + if return_noise: + outputs += (noise,) + + if return_image_latents: + outputs += (image_latents,) + + return outputs + + def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator): + if isinstance(generator, list): + image_latents = [ + self.vae.encode(image[i : i + 1]).latent_dist.sample( + generator=generator[i] + ) + for i in range(image.shape[0]) + ] + image_latents = torch.cat(image_latents, dim=0) + else: + image_latents = self.vae.encode(image).latent_dist.sample( + generator=generator + ) + + image_latents = self.vae.config.scaling_factor * image_latents + + return image_latents + + def prepare_mask_latents( + self, + mask, + masked_image, + batch_size, + height, + width, + dtype, + device, + generator, + do_classifier_free_guidance, + ): + # resize the mask to latents shape as we concatenate the mask to the latents + # we do that before converting to dtype to avoid breaking in case we're using cpu_offload + # and half precision + mask = torch.nn.functional.interpolate( + mask, size=(height // self.vae_scale_factor, width // self.vae_scale_factor) + ) + mask = mask.to(device=device, dtype=dtype) + + masked_image = masked_image.to(device=device, dtype=dtype) + masked_image_latents = self._encode_vae_image(masked_image, generator=generator) + + # duplicate mask and masked_image_latents for each generation per prompt, using mps friendly method + if mask.shape[0] < batch_size: + if not batch_size % mask.shape[0] == 0: 
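+                # masks can only be broadcast to the requested batch size by whole-number
+                # repetition (see the `repeat` call below), so the number of masks passed in
+                # must evenly divide the batch size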
+ raise ValueError( + "The passed mask and the required batch size don't match. Masks are supposed to be duplicated to" + f" a total batch size of {batch_size}, but {mask.shape[0]} masks were passed. Make sure the number" + " of masks that you pass is divisible by the total requested batch size." + ) + mask = mask.repeat(batch_size // mask.shape[0], 1, 1, 1) + if masked_image_latents.shape[0] < batch_size: + if not batch_size % masked_image_latents.shape[0] == 0: + raise ValueError( + "The passed images and the required batch size don't match. Images are supposed to be duplicated" + f" to a total batch size of {batch_size}, but {masked_image_latents.shape[0]} images were passed." + " Make sure the number of images that you pass is divisible by the total requested batch size." + ) + masked_image_latents = masked_image_latents.repeat( + batch_size // masked_image_latents.shape[0], 1, 1, 1 + ) + + mask = torch.cat([mask] * 2) if do_classifier_free_guidance else mask + masked_image_latents = ( + torch.cat([masked_image_latents] * 2) + if do_classifier_free_guidance + else masked_image_latents + ) + + # aligning device to prevent device errors when concating it with the latent model input + masked_image_latents = masked_image_latents.to(device=device, dtype=dtype) + return mask, masked_image_latents + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.StableDiffusionImg2ImgPipeline.get_timesteps + def get_timesteps(self, num_inference_steps, strength, device): + # get the original timestep using init_timestep + init_timestep = min(int(num_inference_steps * strength), num_inference_steps) + + t_start = max(num_inference_steps - init_timestep, 0) + timesteps = self.scheduler.timesteps[t_start * self.scheduler.order :] + + return timesteps, num_inference_steps - t_start + + @torch.no_grad() + def __call__( + self, + promptA: Union[str, List[str]] = None, + promptB: Union[str, List[str]] = None, + image: Union[torch.FloatTensor, PIL.Image.Image] = None, + mask_image: Union[torch.FloatTensor, PIL.Image.Image] = None, + height: Optional[int] = None, + width: Optional[int] = None, + strength: float = 1.0, + tradoff: float = 1.0, + tradoff_nag: float = 1.0, + num_inference_steps: int = 50, + guidance_scale: float = 7.5, + negative_promptA: Optional[Union[str, List[str]]] = None, + negative_promptB: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None, + callback_steps: int = 1, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + task_class: Union[torch.Tensor, float, int] = None, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`PIL.Image.Image`): + `Image` or tensor representing an image batch to be inpainted (which parts of the image to be masked + out with `mask_image` and repainted according to `prompt`). + mask_image (`PIL.Image.Image`): + `Image` or tensor representing an image batch to mask `image`. 
White pixels in the mask are repainted + while black pixels are preserved. If `mask_image` is a PIL image, it is converted to a single channel + (luminance) before use. If it's a tensor, it should contain one color channel (L) instead of 3, so the + expected shape would be `(B, H, W, 1)`. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + strength (`float`, *optional*, defaults to 1.0): + Indicates extent to transform the reference `image`. Must be between 0 and 1. `image` is used as a + starting point and more noise is added the higher the `strength`. The number of denoising steps depends + on the amount of noise initially added. When `strength` is 1, added noise is maximum and the denoising + process runs for the full number of iterations specified in `num_inference_steps`. A value of 1 + essentially ignores `image`. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. This parameter is modulated by `strength`. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + output_type (`str`, *optional*, defaults to `"pil"`): + The output format of the generated image. Choose between `PIL.Image` or `np.array`. + return_dict (`bool`, *optional*, defaults to `True`): + Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a + plain tuple. 
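+            promptA (`str` or `List[str]`, *optional*):
+                First task-specific prompt variant produced by the PowerPaint prompt template.
+            promptB (`str` or `List[str]`, *optional*):
+                Second task-specific prompt variant; its embedding is blended with that of `promptA`.
+            tradoff (`float`, *optional*, defaults to 1.0):
+                Blend factor between the embeddings of `promptA` and `promptB` (the "fitting degree").
+            tradoff_nag (`float`, *optional*, defaults to 1.0):
+                Blend factor between the embeddings of `negative_promptA` and `negative_promptB`.
+            task_class (`torch.Tensor`, `float` or `int`, *optional*):
+                Optional task identifier forwarded to the UNet's forward call when provided.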
+ callback (`Callable`, *optional*): + A function that calls every `callback_steps` steps during inference. The function is called with the + following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`. + callback_steps (`int`, *optional*, defaults to 1): + The frequency at which the `callback` function is called. If not specified, the callback is called at + every step. + cross_attention_kwargs (`dict`, *optional*): + A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in + [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py). + + Examples: + + ```py + >>> import PIL + >>> import requests + >>> import torch + >>> from io import BytesIO + + >>> from diffusers import StableDiffusionInpaintPipeline + + + >>> def download_image(url): + ... response = requests.get(url) + ... return PIL.Image.open(BytesIO(response.content)).convert("RGB") + + + >>> img_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo.png" + >>> mask_url = "https://raw.githubusercontent.com/CompVis/latent-diffusion/main/data/inpainting_examples/overture-creations-5sI6fQgYIuo_mask.png" + + >>> init_image = download_image(img_url).resize((512, 512)) + >>> mask_image = download_image(mask_url).resize((512, 512)) + + >>> pipe = StableDiffusionInpaintPipeline.from_pretrained( + ... "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16 + ... ) + >>> pipe = pipe.to("cuda") + + >>> prompt = "Face of a yellow cat, high resolution, sitting on a park bench" + >>> image = pipe(prompt=prompt, image=init_image, mask_image=mask_image).images[0] + ``` + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + # 0. Default height and width to unet + height = height or self.unet.config.sample_size * self.vae_scale_factor + width = width or self.unet.config.sample_size * self.vae_scale_factor + prompt = promptA + negative_prompt = negative_promptA + # 1. Check inputs + self.check_inputs( + prompt, + height, + width, + strength, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ) + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. + do_classifier_free_guidance = guidance_scale > 1.0 + + # 3. 
Encode input prompt + text_encoder_lora_scale = ( + cross_attention_kwargs.get("scale", None) + if cross_attention_kwargs is not None + else None + ) + prompt_embeds = self._encode_prompt( + promptA, + promptB, + tradoff, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_promptA, + negative_promptB, + tradoff_nag, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + ) + + # 4. set timesteps + self.scheduler.set_timesteps(num_inference_steps, device=device) + timesteps, num_inference_steps = self.get_timesteps( + num_inference_steps=num_inference_steps, strength=strength, device=device + ) + # check that number of inference steps is not < 1 - as this doesn't make sense + if num_inference_steps < 1: + raise ValueError( + f"After adjusting the num_inference_steps by strength parameter: {strength}, the number of pipeline" + f"steps is {num_inference_steps} which is < 1 and not appropriate for this pipeline." + ) + # at which timestep to set the initial noise (n.b. 50% if strength is 0.5) + latent_timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt) + # create a boolean to check if the strength is set to 1. if so then initialise the latents with pure noise + is_strength_max = strength == 1.0 + + # 5. Preprocess mask and image + mask, masked_image, init_image = prepare_mask_and_masked_image( + image, mask_image, height, width, return_image=True + ) + mask_condition = mask.clone() + + # 6. Prepare latent variables + num_channels_latents = self.vae.config.latent_channels + num_channels_unet = self.unet.config.in_channels + return_image_latents = num_channels_unet == 4 + + latents_outputs = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + image=init_image, + timestep=latent_timestep, + is_strength_max=is_strength_max, + return_noise=True, + return_image_latents=return_image_latents, + ) + + if return_image_latents: + latents, noise, image_latents = latents_outputs + else: + latents, noise = latents_outputs + + # 7. Prepare mask latent variables + mask, masked_image_latents = self.prepare_mask_latents( + mask, + masked_image, + batch_size * num_images_per_prompt, + height, + width, + prompt_embeds.dtype, + device, + generator, + do_classifier_free_guidance, + ) + + # 8. Check that sizes of mask, masked image and latents match + if num_channels_unet == 9: + # default case for runwayml/stable-diffusion-inpainting + num_channels_mask = mask.shape[1] + num_channels_masked_image = masked_image_latents.shape[1] + if ( + num_channels_latents + num_channels_mask + num_channels_masked_image + != self.unet.config.in_channels + ): + raise ValueError( + f"Incorrect configuration settings! The config of `pipeline.unet`: {self.unet.config} expects" + f" {self.unet.config.in_channels} but received `num_channels_latents`: {num_channels_latents} +" + f" `num_channels_mask`: {num_channels_mask} + `num_channels_masked_image`: {num_channels_masked_image}" + f" = {num_channels_latents+num_channels_masked_image+num_channels_mask}. Please verify the config of" + " `pipeline.unet` or your `mask_image` or `image` input." + ) + elif num_channels_unet != 4: + raise ValueError( + f"The unet {self.unet.__class__} should have either 4 or 9 input channels, not {self.unet.config.in_channels}." + ) + + # 9. Prepare extra step kwargs. 
TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 10. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # expand the latents if we are doing classifier free guidance + latent_model_input = ( + torch.cat([latents] * 2) if do_classifier_free_guidance else latents + ) + + # concat latents, mask, masked_image_latents in the channel dimension + latent_model_input = self.scheduler.scale_model_input( + latent_model_input, t + ) + + if num_channels_unet == 9: + latent_model_input = torch.cat( + [latent_model_input, mask, masked_image_latents], dim=1 + ) + + # predict the noise residual + if task_class is not None: + noise_pred = self.unet( + sample=latent_model_input, + timestep=t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + task_class=task_class, + )[0] + else: + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embeds, + cross_attention_kwargs=cross_attention_kwargs, + return_dict=False, + )[0] + + # perform guidance + if do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, t, latents, **extra_step_kwargs, return_dict=False + )[0] + + if num_channels_unet == 4: + init_latents_proper = image_latents[:1] + init_mask = mask[:1] + + if i < len(timesteps) - 1: + noise_timestep = timesteps[i + 1] + init_latents_proper = self.scheduler.add_noise( + init_latents_proper, noise, torch.tensor([noise_timestep]) + ) + + latents = ( + 1 - init_mask + ) * init_latents_proper + init_mask * latents + + # call the callback, if provided + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + callback(self, i, t, {}) + + if not output_type == "latent": + condition_kwargs = {} + if isinstance(self.vae, AsymmetricAutoencoderKL): + init_image = init_image.to( + device=device, dtype=masked_image_latents.dtype + ) + init_image_condition = init_image.clone() + init_image = self._encode_vae_image(init_image, generator=generator) + mask_condition = mask_condition.to( + device=device, dtype=masked_image_latents.dtype + ) + condition_kwargs = { + "image": init_image_condition, + "mask": mask_condition, + } + image = self.vae.decode( + latents / self.vae.config.scaling_factor, + return_dict=False, + **condition_kwargs, + )[0] + image, has_nsfw_concept = self.run_safety_checker( + image, device, prompt_embeds.dtype + ) + else: + image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess( + image, output_type=output_type, do_denormalize=do_denormalize + ) + + # Offload last model to CPU + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.final_offload_hook.offload() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput( + images=image, nsfw_content_detected=has_nsfw_concept + ) diff 
--git a/inpaint/model/power_paint/power_paint.py b/inpaint/model/power_paint/power_paint.py new file mode 100644 index 0000000..f17a5a3 --- /dev/null +++ b/inpaint/model/power_paint/power_paint.py @@ -0,0 +1,101 @@ +from PIL import Image +import PIL.Image +import cv2 +import torch +from loguru import logger + +from ..base import DiffusionInpaintModel +from ..helper.cpu_text_encoder import CPUTextEncoderWrapper +from ..utils import ( + handle_from_pretrained_exceptions, + get_torch_dtype, + enable_low_mem, + is_local_files_only, +) +from iopaint.schema import InpaintRequest +from .powerpaint_tokenizer import add_task_to_prompt +from ...const import POWERPAINT_NAME + + +class PowerPaint(DiffusionInpaintModel): + name = POWERPAINT_NAME + pad_mod = 8 + min_size = 512 + lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5" + + def init_model(self, device: torch.device, **kwargs): + from .pipeline_powerpaint import StableDiffusionInpaintPipeline + from .powerpaint_tokenizer import PowerPaintTokenizer + + use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False)) + model_kwargs = {"local_files_only": is_local_files_only(**kwargs)} + if kwargs["disable_nsfw"] or kwargs.get("cpu_offload", False): + logger.info("Disable Stable Diffusion Model NSFW checker") + model_kwargs.update( + dict( + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) + ) + + self.model = handle_from_pretrained_exceptions( + StableDiffusionInpaintPipeline.from_pretrained, + pretrained_model_name_or_path=self.name, + variant="fp16", + torch_dtype=torch_dtype, + **model_kwargs, + ) + self.model.tokenizer = PowerPaintTokenizer(self.model.tokenizer) + + enable_low_mem(self.model, kwargs.get("low_mem", False)) + + if kwargs.get("cpu_offload", False) and use_gpu: + logger.info("Enable sequential cpu offload") + self.model.enable_sequential_cpu_offload(gpu_id=0) + else: + self.model = self.model.to(device) + if kwargs["sd_cpu_textencoder"]: + logger.info("Run Stable Diffusion TextEncoder on CPU") + self.model.text_encoder = CPUTextEncoderWrapper( + self.model.text_encoder, torch_dtype + ) + + self.callback = kwargs.pop("callback", None) + + def forward(self, image, mask, config: InpaintRequest): + """Input image and output image have same size + image: [H, W, C] RGB + mask: [H, W, 1] 255 means area to repaint + return: BGR IMAGE + """ + self.set_scheduler(config) + + img_h, img_w = image.shape[:2] + promptA, promptB, negative_promptA, negative_promptB = add_task_to_prompt( + config.prompt, config.negative_prompt, config.powerpaint_task + ) + + output = self.model( + image=PIL.Image.fromarray(image), + promptA=promptA, + promptB=promptB, + tradoff=config.fitting_degree, + tradoff_nag=config.fitting_degree, + negative_promptA=negative_promptA, + negative_promptB=negative_promptB, + mask_image=PIL.Image.fromarray(mask[:, :, -1], mode="L"), + num_inference_steps=config.sd_steps, + strength=config.sd_strength, + guidance_scale=config.sd_guidance_scale, + output_type="np", + callback=self.callback, + height=img_h, + width=img_w, + generator=torch.manual_seed(config.sd_seed), + callback_steps=1, + ).images[0] + + output = (output * 255).round().astype("uint8") + output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) + return output diff --git a/inpaint/model/power_paint/power_paint_v2.py b/inpaint/model/power_paint/power_paint_v2.py new file mode 100644 index 0000000..1a27f65 --- /dev/null +++ b/inpaint/model/power_paint/power_paint_v2.py @@ -0,0 +1,186 @@ +from itertools import chain + +import 
PIL.Image +import cv2 +import torch +from iopaint.model.original_sd_configs import get_config_files +from loguru import logger +from transformers import CLIPTextModel, CLIPTokenizer +import numpy as np + +from ..base import DiffusionInpaintModel +from ..helper.cpu_text_encoder import CPUTextEncoderWrapper +from ..utils import ( + get_torch_dtype, + enable_low_mem, + is_local_files_only, + handle_from_pretrained_exceptions, +) +from .powerpaint_tokenizer import task_to_prompt +from iopaint.schema import InpaintRequest, ModelType +from .v2.BrushNet_CA import BrushNetModel +from .v2.unet_2d_condition import UNet2DConditionModel_forward +from .v2.unet_2d_blocks import ( + CrossAttnDownBlock2D_forward, + DownBlock2D_forward, + CrossAttnUpBlock2D_forward, + UpBlock2D_forward, +) + + +class PowerPaintV2(DiffusionInpaintModel): + pad_mod = 8 + min_size = 512 + lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5" + hf_model_id = "Sanster/PowerPaint_v2" + + def init_model(self, device: torch.device, **kwargs): + from .v2.pipeline_PowerPaint_Brushnet_CA import ( + StableDiffusionPowerPaintBrushNetPipeline, + ) + from .powerpaint_tokenizer import PowerPaintTokenizer + + use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False)) + model_kwargs = {"local_files_only": is_local_files_only(**kwargs)} + if kwargs["disable_nsfw"] or kwargs.get("cpu_offload", False): + logger.info("Disable Stable Diffusion Model NSFW checker") + model_kwargs.update( + dict( + safety_checker=None, + feature_extractor=None, + requires_safety_checker=False, + ) + ) + + text_encoder_brushnet = CLIPTextModel.from_pretrained( + self.hf_model_id, + subfolder="text_encoder_brushnet", + variant="fp16", + torch_dtype=torch_dtype, + local_files_only=model_kwargs["local_files_only"], + ) + + brushnet = BrushNetModel.from_pretrained( + self.hf_model_id, + subfolder="PowerPaint_Brushnet", + variant="fp16", + torch_dtype=torch_dtype, + local_files_only=model_kwargs["local_files_only"], + ) + + if self.model_info.is_single_file_diffusers: + if self.model_info.model_type == ModelType.DIFFUSERS_SD: + model_kwargs["num_in_channels"] = 4 + else: + model_kwargs["num_in_channels"] = 9 + + pipe = StableDiffusionPowerPaintBrushNetPipeline.from_single_file( + self.model_id_or_path, + torch_dtype=torch_dtype, + load_safety_checker=False, + original_config_file=get_config_files()["v1"], + brushnet=brushnet, + text_encoder_brushnet=text_encoder_brushnet, + **model_kwargs, + ) + else: + pipe = handle_from_pretrained_exceptions( + StableDiffusionPowerPaintBrushNetPipeline.from_pretrained, + pretrained_model_name_or_path=self.model_id_or_path, + torch_dtype=torch_dtype, + brushnet=brushnet, + text_encoder_brushnet=text_encoder_brushnet, + variant="fp16", + **model_kwargs, + ) + pipe.tokenizer = PowerPaintTokenizer( + CLIPTokenizer.from_pretrained(self.hf_model_id, subfolder="tokenizer") + ) + self.model = pipe + + enable_low_mem(self.model, kwargs.get("low_mem", False)) + + if kwargs.get("cpu_offload", False) and use_gpu: + logger.info("Enable sequential cpu offload") + self.model.enable_sequential_cpu_offload(gpu_id=0) + else: + self.model = self.model.to(device) + if kwargs["sd_cpu_textencoder"]: + logger.info("Run Stable Diffusion TextEncoder on CPU") + self.model.text_encoder = CPUTextEncoderWrapper( + self.model.text_encoder, torch_dtype + ) + + self.callback = kwargs.pop("callback", None) + + # Monkey patch the forward method of the UNet to use the brushnet_unet_forward method + self.model.unet.forward = 
UNet2DConditionModel_forward.__get__( + self.model.unet, self.model.unet.__class__ + ) + + # Monkey patch unet down_blocks to use CrossAttnDownBlock2D_forward + for down_block in chain( + self.model.unet.down_blocks, self.model.brushnet.down_blocks + ): + if down_block.__class__.__name__ == "CrossAttnDownBlock2D": + down_block.forward = CrossAttnDownBlock2D_forward.__get__( + down_block, down_block.__class__ + ) + else: + down_block.forward = DownBlock2D_forward.__get__( + down_block, down_block.__class__ + ) + + for up_block in chain(self.model.unet.up_blocks, self.model.brushnet.up_blocks): + if up_block.__class__.__name__ == "CrossAttnUpBlock2D": + up_block.forward = CrossAttnUpBlock2D_forward.__get__( + up_block, up_block.__class__ + ) + else: + up_block.forward = UpBlock2D_forward.__get__( + up_block, up_block.__class__ + ) + + def forward(self, image, mask, config: InpaintRequest): + """Input image and output image have same size + image: [H, W, C] RGB + mask: [H, W, 1] 255 means area to repaint + return: BGR IMAGE + """ + self.set_scheduler(config) + + image = image * (1 - mask / 255.0) + img_h, img_w = image.shape[:2] + + image = PIL.Image.fromarray(image.astype(np.uint8)) + mask = PIL.Image.fromarray(mask[:, :, -1], mode="L").convert("RGB") + + promptA, promptB, negative_promptA, negative_promptB = task_to_prompt( + config.powerpaint_task + ) + + output = self.model( + image=image, + mask=mask, + promptA=promptA, + promptB=promptB, + promptU=config.prompt, + tradoff=config.fitting_degree, + tradoff_nag=config.fitting_degree, + negative_promptA=negative_promptA, + negative_promptB=negative_promptB, + negative_promptU=config.negative_prompt, + num_inference_steps=config.sd_steps, + # strength=config.sd_strength, + brushnet_conditioning_scale=1.0, + guidance_scale=config.sd_guidance_scale, + output_type="np", + callback_on_step_end=self.callback, + height=img_h, + width=img_w, + generator=torch.manual_seed(config.sd_seed), + ).images[0] + + output = (output * 255).round().astype("uint8") + output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) + return output diff --git a/inpaint/model/power_paint/powerpaint_tokenizer.py b/inpaint/model/power_paint/powerpaint_tokenizer.py new file mode 100644 index 0000000..53a68c9 --- /dev/null +++ b/inpaint/model/power_paint/powerpaint_tokenizer.py @@ -0,0 +1,254 @@ +import copy +import random +from typing import Any, List, Union +from transformers import CLIPTokenizer + +from iopaint.schema import PowerPaintTask + + +def add_task_to_prompt(prompt, negative_prompt, task: PowerPaintTask): + if task == PowerPaintTask.object_remove: + promptA = prompt + " P_ctxt" + promptB = prompt + " P_ctxt" + negative_promptA = negative_prompt + " P_obj" + negative_promptB = negative_prompt + " P_obj" + elif task == PowerPaintTask.context_aware: + promptA = prompt + " P_ctxt" + promptB = prompt + " P_ctxt" + negative_promptA = negative_prompt + negative_promptB = negative_prompt + elif task == PowerPaintTask.shape_guided: + promptA = prompt + " P_shape" + promptB = prompt + " P_ctxt" + negative_promptA = negative_prompt + negative_promptB = negative_prompt + elif task == PowerPaintTask.outpainting: + promptA = prompt + " P_ctxt" + promptB = prompt + " P_ctxt" + negative_promptA = negative_prompt + " P_obj" + negative_promptB = negative_prompt + " P_obj" + else: + promptA = prompt + " P_obj" + promptB = prompt + " P_obj" + negative_promptA = negative_prompt + negative_promptB = negative_prompt + + return promptA, promptB, negative_promptA, negative_promptB + + +def 
task_to_prompt(task: PowerPaintTask): + promptA, promptB, negative_promptA, negative_promptB = add_task_to_prompt( + "", "", task + ) + return ( + promptA.strip(), + promptB.strip(), + negative_promptA.strip(), + negative_promptB.strip(), + ) + + +class PowerPaintTokenizer: + def __init__(self, tokenizer: CLIPTokenizer): + self.wrapped = tokenizer + self.token_map = {} + placeholder_tokens = ["P_ctxt", "P_shape", "P_obj"] + num_vec_per_token = 10 + for placeholder_token in placeholder_tokens: + output = [] + for i in range(num_vec_per_token): + ith_token = placeholder_token + f"_{i}" + output.append(ith_token) + self.token_map[placeholder_token] = output + + def __getattr__(self, name: str) -> Any: + if name == "wrapped": + return super().__getattr__("wrapped") + + try: + return getattr(self.wrapped, name) + except AttributeError: + try: + return super().__getattr__(name) + except AttributeError: + raise AttributeError( + "'name' cannot be found in both " + f"'{self.__class__.__name__}' and " + f"'{self.__class__.__name__}.tokenizer'." + ) + + def try_adding_tokens(self, tokens: Union[str, List[str]], *args, **kwargs): + """Attempt to add tokens to the tokenizer. + + Args: + tokens (Union[str, List[str]]): The tokens to be added. + """ + num_added_tokens = self.wrapped.add_tokens(tokens, *args, **kwargs) + assert num_added_tokens != 0, ( + f"The tokenizer already contains the token {tokens}. Please pass " + "a different `placeholder_token` that is not already in the " + "tokenizer." + ) + + def get_token_info(self, token: str) -> dict: + """Get the information of a token, including its start and end index in + the current tokenizer. + + Args: + token (str): The token to be queried. + + Returns: + dict: The information of the token, including its start and end + index in current tokenizer. + """ + token_ids = self.__call__(token).input_ids + start, end = token_ids[1], token_ids[-2] + 1 + return {"name": token, "start": start, "end": end} + + def add_placeholder_token( + self, placeholder_token: str, *args, num_vec_per_token: int = 1, **kwargs + ): + """Add placeholder tokens to the tokenizer. + + Args: + placeholder_token (str): The placeholder token to be added. + num_vec_per_token (int, optional): The number of vectors of + the added placeholder token. + *args, **kwargs: The arguments for `self.wrapped.add_tokens`. + """ + output = [] + if num_vec_per_token == 1: + self.try_adding_tokens(placeholder_token, *args, **kwargs) + output.append(placeholder_token) + else: + output = [] + for i in range(num_vec_per_token): + ith_token = placeholder_token + f"_{i}" + self.try_adding_tokens(ith_token, *args, **kwargs) + output.append(ith_token) + + for token in self.token_map: + if token in placeholder_token: + raise ValueError( + f"The tokenizer already has placeholder token {token} " + f"that can get confused with {placeholder_token} " + "keep placeholder tokens independent" + ) + self.token_map[placeholder_token] = output + + def replace_placeholder_tokens_in_text( + self, + text: Union[str, List[str]], + vector_shuffle: bool = False, + prop_tokens_to_load: float = 1.0, + ) -> Union[str, List[str]]: + """Replace the keywords in text with placeholder tokens. This function + will be called in `self.__call__` and `self.encode`. + + Args: + text (Union[str, List[str]]): The text to be processed. + vector_shuffle (bool, optional): Whether to shuffle the vectors. + Defaults to False. + prop_tokens_to_load (float, optional): The proportion of tokens to + be loaded. If 1.0, all tokens will be loaded. 
Defaults to 1.0. + + Returns: + Union[str, List[str]]: The processed text. + """ + if isinstance(text, list): + output = [] + for i in range(len(text)): + output.append( + self.replace_placeholder_tokens_in_text( + text[i], vector_shuffle=vector_shuffle + ) + ) + return output + + for placeholder_token in self.token_map: + if placeholder_token in text: + tokens = self.token_map[placeholder_token] + tokens = tokens[: 1 + int(len(tokens) * prop_tokens_to_load)] + if vector_shuffle: + tokens = copy.copy(tokens) + random.shuffle(tokens) + text = text.replace(placeholder_token, " ".join(tokens)) + return text + + def replace_text_with_placeholder_tokens( + self, text: Union[str, List[str]] + ) -> Union[str, List[str]]: + """Replace the placeholder tokens in text with the original keywords. + This function will be called in `self.decode`. + + Args: + text (Union[str, List[str]]): The text to be processed. + + Returns: + Union[str, List[str]]: The processed text. + """ + if isinstance(text, list): + output = [] + for i in range(len(text)): + output.append(self.replace_text_with_placeholder_tokens(text[i])) + return output + + for placeholder_token, tokens in self.token_map.items(): + merged_tokens = " ".join(tokens) + if merged_tokens in text: + text = text.replace(merged_tokens, placeholder_token) + return text + + def __call__( + self, + text: Union[str, List[str]], + *args, + vector_shuffle: bool = False, + prop_tokens_to_load: float = 1.0, + **kwargs, + ): + """The call function of the wrapper. + + Args: + text (Union[str, List[str]]): The text to be tokenized. + vector_shuffle (bool, optional): Whether to shuffle the vectors. + Defaults to False. + prop_tokens_to_load (float, optional): The proportion of tokens to + be loaded. If 1.0, all tokens will be loaded. Defaults to 1.0 + *args, **kwargs: The arguments for `self.wrapped.__call__`. + """ + replaced_text = self.replace_placeholder_tokens_in_text( + text, vector_shuffle=vector_shuffle, prop_tokens_to_load=prop_tokens_to_load + ) + + return self.wrapped.__call__(replaced_text, *args, **kwargs) + + def encode(self, text: Union[str, List[str]], *args, **kwargs): + """Encode the passed text to token index. + + Args: + text (Union[str, List[str]]): The text to be encode. + *args, **kwargs: The arguments for `self.wrapped.__call__`. + """ + replaced_text = self.replace_placeholder_tokens_in_text(text) + return self.wrapped(replaced_text, *args, **kwargs) + + def decode( + self, token_ids, return_raw: bool = False, *args, **kwargs + ) -> Union[str, List[str]]: + """Decode the token index to text. + + Args: + token_ids: The token index to be decoded. + return_raw: Whether keep the placeholder token in the text. + Defaults to False. + *args, **kwargs: The arguments for `self.wrapped.decode`. + + Returns: + Union[str, List[str]]: The decoded text. 
+ """ + text = self.wrapped.decode(token_ids, *args, **kwargs) + if return_raw: + return text + replaced_text = self.replace_text_with_placeholder_tokens(text) + return replaced_text diff --git a/inpaint/model/power_paint/v2/BrushNet_CA.py b/inpaint/model/power_paint/v2/BrushNet_CA.py new file mode 100644 index 0000000..b892c84 --- /dev/null +++ b/inpaint/model/power_paint/v2/BrushNet_CA.py @@ -0,0 +1,1094 @@ +from dataclasses import dataclass +from typing import Any, Dict, List, Optional, Tuple, Union + +import torch +from diffusers import UNet2DConditionModel +from diffusers.models.unet_2d_blocks import ( + get_down_block, + get_mid_block, + get_up_block, + CrossAttnDownBlock2D, + DownBlock2D, +) +from torch import nn + +from diffusers.configuration_utils import ConfigMixin, register_to_config +from diffusers.utils import BaseOutput, logging +from diffusers.models.attention_processor import ( + ADDED_KV_ATTENTION_PROCESSORS, + CROSS_ATTENTION_PROCESSORS, + AttentionProcessor, + AttnAddedKVProcessor, + AttnProcessor, +) +from diffusers.models.embeddings import ( + TextImageProjection, + TextImageTimeEmbedding, + TextTimeEmbedding, + TimestepEmbedding, + Timesteps, +) +from diffusers.models.modeling_utils import ModelMixin + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +@dataclass +class BrushNetOutput(BaseOutput): + """ + The output of [`BrushNetModel`]. + + Args: + up_block_res_samples (`tuple[torch.Tensor]`): + A tuple of upsample activations at different resolutions for each upsampling block. Each tensor should + be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be + used to condition the original UNet's upsampling activations. + down_block_res_samples (`tuple[torch.Tensor]`): + A tuple of downsample activations at different resolutions for each downsampling block. Each tensor should + be of shape `(batch_size, channel * resolution, height //resolution, width // resolution)`. Output can be + used to condition the original UNet's downsampling activations. + mid_down_block_re_sample (`torch.Tensor`): + The activation of the midde block (the lowest sample resolution). Each tensor should be of shape + `(batch_size, channel * lowest_resolution, height // lowest_resolution, width // lowest_resolution)`. + Output can be used to condition the original UNet's middle block activation. + """ + + up_block_res_samples: Tuple[torch.Tensor] + down_block_res_samples: Tuple[torch.Tensor] + mid_block_res_sample: torch.Tensor + + +class BrushNetModel(ModelMixin, ConfigMixin): + """ + A BrushNet model. + + Args: + in_channels (`int`, defaults to 4): + The number of channels in the input sample. + flip_sin_to_cos (`bool`, defaults to `True`): + Whether to flip the sin to cos in the time embedding. + freq_shift (`int`, defaults to 0): + The frequency shift to apply to the time embedding. + down_block_types (`tuple[str]`, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`): + The tuple of downsample blocks to use. + mid_block_type (`str`, *optional*, defaults to `"UNetMidBlock2DCrossAttn"`): + Block type for middle of UNet, it can be one of `UNetMidBlock2DCrossAttn`, `UNetMidBlock2D`, or + `UNetMidBlock2DSimpleCrossAttn`. If `None`, the mid block layer is skipped. + up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")`): + The tuple of upsample blocks to use. 
+ only_cross_attention (`Union[bool, Tuple[bool]]`, defaults to `False`): + block_out_channels (`tuple[int]`, defaults to `(320, 640, 1280, 1280)`): + The tuple of output channels for each block. + layers_per_block (`int`, defaults to 2): + The number of layers per block. + downsample_padding (`int`, defaults to 1): + The padding to use for the downsampling convolution. + mid_block_scale_factor (`float`, defaults to 1): + The scale factor to use for the mid block. + act_fn (`str`, defaults to "silu"): + The activation function to use. + norm_num_groups (`int`, *optional*, defaults to 32): + The number of groups to use for the normalization. If None, normalization and activation layers is skipped + in post-processing. + norm_eps (`float`, defaults to 1e-5): + The epsilon to use for the normalization. + cross_attention_dim (`int`, defaults to 1280): + The dimension of the cross attention features. + transformer_layers_per_block (`int` or `Tuple[int]`, *optional*, defaults to 1): + The number of transformer blocks of type [`~models.attention.BasicTransformerBlock`]. Only relevant for + [`~models.unet_2d_blocks.CrossAttnDownBlock2D`], [`~models.unet_2d_blocks.CrossAttnUpBlock2D`], + [`~models.unet_2d_blocks.UNetMidBlock2DCrossAttn`]. + encoder_hid_dim (`int`, *optional*, defaults to None): + If `encoder_hid_dim_type` is defined, `encoder_hidden_states` will be projected from `encoder_hid_dim` + dimension to `cross_attention_dim`. + encoder_hid_dim_type (`str`, *optional*, defaults to `None`): + If given, the `encoder_hidden_states` and potentially other embeddings are down-projected to text + embeddings of dimension `cross_attention` according to `encoder_hid_dim_type`. + attention_head_dim (`Union[int, Tuple[int]]`, defaults to 8): + The dimension of the attention heads. + use_linear_projection (`bool`, defaults to `False`): + class_embed_type (`str`, *optional*, defaults to `None`): + The type of class embedding to use which is ultimately summed with the time embeddings. Choose from None, + `"timestep"`, `"identity"`, `"projection"`, or `"simple_projection"`. + addition_embed_type (`str`, *optional*, defaults to `None`): + Configures an optional embedding which will be summed with the time embeddings. Choose from `None` or + "text". "text" will use the `TextTimeEmbedding` layer. + num_class_embeds (`int`, *optional*, defaults to 0): + Input dimension of the learnable embedding matrix to be projected to `time_embed_dim`, when performing + class conditioning with `class_embed_type` equal to `None`. + upcast_attention (`bool`, defaults to `False`): + resnet_time_scale_shift (`str`, defaults to `"default"`): + Time scale shift config for ResNet blocks (see `ResnetBlock2D`). Choose from `default` or `scale_shift`. + projection_class_embeddings_input_dim (`int`, *optional*, defaults to `None`): + The dimension of the `class_labels` input when `class_embed_type="projection"`. Required when + `class_embed_type="projection"`. + brushnet_conditioning_channel_order (`str`, defaults to `"rgb"`): + The channel order of conditional image. Will convert to `rgb` if it's `bgr`. + conditioning_embedding_out_channels (`tuple[int]`, *optional*, defaults to `(16, 32, 96, 256)`): + The tuple of output channel for each block in the `conditioning_embedding` layer. + global_pool_conditions (`bool`, defaults to `False`): + TODO(Patrick) - unused parameter. + addition_embed_type_num_heads (`int`, defaults to 64): + The number of heads to use for the `TextTimeEmbedding` layer. 
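+        conditioning_channels (`int`, defaults to 5):
+            The number of channels of the conditioning input that is concatenated with the noisy latents before
+            the first convolution (`conv_in_condition`); presumably the 4 masked-image latent channels plus 1
+            mask channel.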
+ """ + + _supports_gradient_checkpointing = True + + @register_to_config + def __init__( + self, + in_channels: int = 4, + conditioning_channels: int = 5, + flip_sin_to_cos: bool = True, + freq_shift: int = 0, + down_block_types: Tuple[str, ...] = ( + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ), + mid_block_type: Optional[str] = "UNetMidBlock2DCrossAttn", + up_block_types: Tuple[str, ...] = ( + "UpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + ), + only_cross_attention: Union[bool, Tuple[bool]] = False, + block_out_channels: Tuple[int, ...] = (320, 640, 1280, 1280), + layers_per_block: int = 2, + downsample_padding: int = 1, + mid_block_scale_factor: float = 1, + act_fn: str = "silu", + norm_num_groups: Optional[int] = 32, + norm_eps: float = 1e-5, + cross_attention_dim: int = 1280, + transformer_layers_per_block: Union[int, Tuple[int, ...]] = 1, + encoder_hid_dim: Optional[int] = None, + encoder_hid_dim_type: Optional[str] = None, + attention_head_dim: Union[int, Tuple[int, ...]] = 8, + num_attention_heads: Optional[Union[int, Tuple[int, ...]]] = None, + use_linear_projection: bool = False, + class_embed_type: Optional[str] = None, + addition_embed_type: Optional[str] = None, + addition_time_embed_dim: Optional[int] = None, + num_class_embeds: Optional[int] = None, + upcast_attention: bool = False, + resnet_time_scale_shift: str = "default", + projection_class_embeddings_input_dim: Optional[int] = None, + brushnet_conditioning_channel_order: str = "rgb", + conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = ( + 16, + 32, + 96, + 256, + ), + global_pool_conditions: bool = False, + addition_embed_type_num_heads: int = 64, + ): + super().__init__() + + # If `num_attention_heads` is not defined (which is the case for most models) + # it will default to `attention_head_dim`. This looks weird upon first reading it and it is. + # The reason for this behavior is to correct for incorrectly named variables that were introduced + # when this library was created. The incorrect naming was only discovered much later in https://github.com/huggingface/diffusers/issues/2011#issuecomment-1547958131 + # Changing `attention_head_dim` to `num_attention_heads` for 40,000+ configurations is too backwards breaking + # which is why we correct for the naming here. + num_attention_heads = num_attention_heads or attention_head_dim + + # Check inputs + if len(down_block_types) != len(up_block_types): + raise ValueError( + f"Must provide the same number of `down_block_types` as `up_block_types`. `down_block_types`: {down_block_types}. `up_block_types`: {up_block_types}." + ) + + if len(block_out_channels) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `block_out_channels` as `down_block_types`. `block_out_channels`: {block_out_channels}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(only_cross_attention, bool) and len( + only_cross_attention + ) != len(down_block_types): + raise ValueError( + f"Must provide the same number of `only_cross_attention` as `down_block_types`. `only_cross_attention`: {only_cross_attention}. `down_block_types`: {down_block_types}." + ) + + if not isinstance(num_attention_heads, int) and len(num_attention_heads) != len( + down_block_types + ): + raise ValueError( + f"Must provide the same number of `num_attention_heads` as `down_block_types`. `num_attention_heads`: {num_attention_heads}. `down_block_types`: {down_block_types}." 
+ ) + + if isinstance(transformer_layers_per_block, int): + transformer_layers_per_block = [transformer_layers_per_block] * len( + down_block_types + ) + + # input + conv_in_kernel = 3 + conv_in_padding = (conv_in_kernel - 1) // 2 + self.conv_in_condition = nn.Conv2d( + in_channels + conditioning_channels, + block_out_channels[0], + kernel_size=conv_in_kernel, + padding=conv_in_padding, + ) + + # time + time_embed_dim = block_out_channels[0] * 4 + self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift) + timestep_input_dim = block_out_channels[0] + self.time_embedding = TimestepEmbedding( + timestep_input_dim, + time_embed_dim, + act_fn=act_fn, + ) + + if encoder_hid_dim_type is None and encoder_hid_dim is not None: + encoder_hid_dim_type = "text_proj" + self.register_to_config(encoder_hid_dim_type=encoder_hid_dim_type) + logger.info( + "encoder_hid_dim_type defaults to 'text_proj' as `encoder_hid_dim` is defined." + ) + + if encoder_hid_dim is None and encoder_hid_dim_type is not None: + raise ValueError( + f"`encoder_hid_dim` has to be defined when `encoder_hid_dim_type` is set to {encoder_hid_dim_type}." + ) + + if encoder_hid_dim_type == "text_proj": + self.encoder_hid_proj = nn.Linear(encoder_hid_dim, cross_attention_dim) + elif encoder_hid_dim_type == "text_image_proj": + # image_embed_dim DOESN'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image_proj"` (Kadinsky 2.1)` + self.encoder_hid_proj = TextImageProjection( + text_embed_dim=encoder_hid_dim, + image_embed_dim=cross_attention_dim, + cross_attention_dim=cross_attention_dim, + ) + + elif encoder_hid_dim_type is not None: + raise ValueError( + f"encoder_hid_dim_type: {encoder_hid_dim_type} must be None, 'text_proj' or 'text_image_proj'." + ) + else: + self.encoder_hid_proj = None + + # class embedding + if class_embed_type is None and num_class_embeds is not None: + self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim) + elif class_embed_type == "timestep": + self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim) + elif class_embed_type == "identity": + self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim) + elif class_embed_type == "projection": + if projection_class_embeddings_input_dim is None: + raise ValueError( + "`class_embed_type`: 'projection' requires `projection_class_embeddings_input_dim` be set" + ) + # The projection `class_embed_type` is the same as the timestep `class_embed_type` except + # 1. the `class_labels` inputs are not first converted to sinusoidal embeddings + # 2. it projects from an arbitrary input dimension. + # + # Note that `TimestepEmbedding` is quite general, being mainly linear layers and activations. + # When used for embedding actual timesteps, the timesteps are first converted to sinusoidal embeddings. + # As a result, `TimestepEmbedding` can be passed arbitrary vectors. 
+ self.class_embedding = TimestepEmbedding( + projection_class_embeddings_input_dim, time_embed_dim + ) + else: + self.class_embedding = None + + if addition_embed_type == "text": + if encoder_hid_dim is not None: + text_time_embedding_from_dim = encoder_hid_dim + else: + text_time_embedding_from_dim = cross_attention_dim + + self.add_embedding = TextTimeEmbedding( + text_time_embedding_from_dim, + time_embed_dim, + num_heads=addition_embed_type_num_heads, + ) + elif addition_embed_type == "text_image": + # text_embed_dim and image_embed_dim DON'T have to be `cross_attention_dim`. To not clutter the __init__ too much + # they are set to `cross_attention_dim` here as this is exactly the required dimension for the currently only use + # case when `addition_embed_type == "text_image"` (Kadinsky 2.1)` + self.add_embedding = TextImageTimeEmbedding( + text_embed_dim=cross_attention_dim, + image_embed_dim=cross_attention_dim, + time_embed_dim=time_embed_dim, + ) + elif addition_embed_type == "text_time": + self.add_time_proj = Timesteps( + addition_time_embed_dim, flip_sin_to_cos, freq_shift + ) + self.add_embedding = TimestepEmbedding( + projection_class_embeddings_input_dim, time_embed_dim + ) + + elif addition_embed_type is not None: + raise ValueError( + f"addition_embed_type: {addition_embed_type} must be None, 'text' or 'text_image'." + ) + + self.down_blocks = nn.ModuleList([]) + self.brushnet_down_blocks = nn.ModuleList([]) + + if isinstance(only_cross_attention, bool): + only_cross_attention = [only_cross_attention] * len(down_block_types) + + if isinstance(attention_head_dim, int): + attention_head_dim = (attention_head_dim,) * len(down_block_types) + + if isinstance(num_attention_heads, int): + num_attention_heads = (num_attention_heads,) * len(down_block_types) + + # down + output_channel = block_out_channels[0] + + brushnet_block = nn.Conv2d(output_channel, output_channel, kernel_size=1) + brushnet_block = zero_module(brushnet_block) + self.brushnet_down_blocks.append(brushnet_block) + + for i, down_block_type in enumerate(down_block_types): + input_channel = output_channel + output_channel = block_out_channels[i] + is_final_block = i == len(block_out_channels) - 1 + + down_block = get_down_block( + down_block_type, + num_layers=layers_per_block, + transformer_layers_per_block=transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + temb_channels=time_embed_dim, + add_downsample=not is_final_block, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads[i], + attention_head_dim=attention_head_dim[i] + if attention_head_dim[i] is not None + else output_channel, + downsample_padding=downsample_padding, + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + ) + self.down_blocks.append(down_block) + + for _ in range(layers_per_block): + brushnet_block = nn.Conv2d( + output_channel, output_channel, kernel_size=1 + ) + brushnet_block = zero_module(brushnet_block) + self.brushnet_down_blocks.append(brushnet_block) + + if not is_final_block: + brushnet_block = nn.Conv2d( + output_channel, output_channel, kernel_size=1 + ) + brushnet_block = zero_module(brushnet_block) + self.brushnet_down_blocks.append(brushnet_block) + + # mid + mid_block_channel = block_out_channels[-1] + + brushnet_block = 
nn.Conv2d(mid_block_channel, mid_block_channel, kernel_size=1) + brushnet_block = zero_module(brushnet_block) + self.brushnet_mid_block = brushnet_block + + self.mid_block = get_mid_block( + mid_block_type, + transformer_layers_per_block=transformer_layers_per_block[-1], + in_channels=mid_block_channel, + temb_channels=time_embed_dim, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + output_scale_factor=mid_block_scale_factor, + resnet_time_scale_shift=resnet_time_scale_shift, + cross_attention_dim=cross_attention_dim, + num_attention_heads=num_attention_heads[-1], + resnet_groups=norm_num_groups, + use_linear_projection=use_linear_projection, + upcast_attention=upcast_attention, + ) + + # count how many layers upsample the images + self.num_upsamplers = 0 + + # up + reversed_block_out_channels = list(reversed(block_out_channels)) + reversed_num_attention_heads = list(reversed(num_attention_heads)) + reversed_transformer_layers_per_block = list( + reversed(transformer_layers_per_block) + ) + only_cross_attention = list(reversed(only_cross_attention)) + + output_channel = reversed_block_out_channels[0] + + self.up_blocks = nn.ModuleList([]) + self.brushnet_up_blocks = nn.ModuleList([]) + + for i, up_block_type in enumerate(up_block_types): + is_final_block = i == len(block_out_channels) - 1 + + prev_output_channel = output_channel + output_channel = reversed_block_out_channels[i] + input_channel = reversed_block_out_channels[ + min(i + 1, len(block_out_channels) - 1) + ] + + # add upsample block for all BUT final layer + if not is_final_block: + add_upsample = True + self.num_upsamplers += 1 + else: + add_upsample = False + + up_block = get_up_block( + up_block_type, + num_layers=layers_per_block + 1, + transformer_layers_per_block=reversed_transformer_layers_per_block[i], + in_channels=input_channel, + out_channels=output_channel, + prev_output_channel=prev_output_channel, + temb_channels=time_embed_dim, + add_upsample=add_upsample, + resnet_eps=norm_eps, + resnet_act_fn=act_fn, + resolution_idx=i, + resnet_groups=norm_num_groups, + cross_attention_dim=cross_attention_dim, + num_attention_heads=reversed_num_attention_heads[i], + use_linear_projection=use_linear_projection, + only_cross_attention=only_cross_attention[i], + upcast_attention=upcast_attention, + resnet_time_scale_shift=resnet_time_scale_shift, + attention_head_dim=attention_head_dim[i] + if attention_head_dim[i] is not None + else output_channel, + ) + self.up_blocks.append(up_block) + prev_output_channel = output_channel + + for _ in range(layers_per_block + 1): + brushnet_block = nn.Conv2d( + output_channel, output_channel, kernel_size=1 + ) + brushnet_block = zero_module(brushnet_block) + self.brushnet_up_blocks.append(brushnet_block) + + if not is_final_block: + brushnet_block = nn.Conv2d( + output_channel, output_channel, kernel_size=1 + ) + brushnet_block = zero_module(brushnet_block) + self.brushnet_up_blocks.append(brushnet_block) + + @classmethod + def from_unet( + cls, + unet: UNet2DConditionModel, + brushnet_conditioning_channel_order: str = "rgb", + conditioning_embedding_out_channels: Optional[Tuple[int, ...]] = ( + 16, + 32, + 96, + 256, + ), + load_weights_from_unet: bool = True, + conditioning_channels: int = 5, + ): + r""" + Instantiate a [`BrushNetModel`] from [`UNet2DConditionModel`]. + + Parameters: + unet (`UNet2DConditionModel`): + The UNet model weights to copy to the [`BrushNetModel`]. All configuration options are also copied + where applicable. 
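+
+ Example:
+ A minimal usage sketch; the checkpoint id mirrors the example elsewhere in this file and is only a
+ placeholder:
+
+ ```py
+ from diffusers import UNet2DConditionModel
+
+ unet = UNet2DConditionModel.from_pretrained(
+ "runwayml/stable-diffusion-v1-5", subfolder="unet"
+ )
+ # copy the UNet configuration and (optionally) its weights into a BrushNetModel
+ brushnet = BrushNetModel.from_unet(unet, load_weights_from_unet=True)
+ ```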
+ """ + transformer_layers_per_block = ( + unet.config.transformer_layers_per_block + if "transformer_layers_per_block" in unet.config + else 1 + ) + encoder_hid_dim = ( + unet.config.encoder_hid_dim if "encoder_hid_dim" in unet.config else None + ) + encoder_hid_dim_type = ( + unet.config.encoder_hid_dim_type + if "encoder_hid_dim_type" in unet.config + else None + ) + addition_embed_type = ( + unet.config.addition_embed_type + if "addition_embed_type" in unet.config + else None + ) + addition_time_embed_dim = ( + unet.config.addition_time_embed_dim + if "addition_time_embed_dim" in unet.config + else None + ) + + brushnet = cls( + in_channels=unet.config.in_channels, + conditioning_channels=conditioning_channels, + flip_sin_to_cos=unet.config.flip_sin_to_cos, + freq_shift=unet.config.freq_shift, + # down_block_types=['DownBlock2D','DownBlock2D','DownBlock2D','DownBlock2D'], + down_block_types=[ + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "CrossAttnDownBlock2D", + "DownBlock2D", + ], + # mid_block_type='MidBlock2D', + mid_block_type="UNetMidBlock2DCrossAttn", + # up_block_types=['UpBlock2D','UpBlock2D','UpBlock2D','UpBlock2D'], + up_block_types=[ + "UpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + "CrossAttnUpBlock2D", + ], + only_cross_attention=unet.config.only_cross_attention, + block_out_channels=unet.config.block_out_channels, + layers_per_block=unet.config.layers_per_block, + downsample_padding=unet.config.downsample_padding, + mid_block_scale_factor=unet.config.mid_block_scale_factor, + act_fn=unet.config.act_fn, + norm_num_groups=unet.config.norm_num_groups, + norm_eps=unet.config.norm_eps, + cross_attention_dim=unet.config.cross_attention_dim, + transformer_layers_per_block=transformer_layers_per_block, + encoder_hid_dim=encoder_hid_dim, + encoder_hid_dim_type=encoder_hid_dim_type, + attention_head_dim=unet.config.attention_head_dim, + num_attention_heads=unet.config.num_attention_heads, + use_linear_projection=unet.config.use_linear_projection, + class_embed_type=unet.config.class_embed_type, + addition_embed_type=addition_embed_type, + addition_time_embed_dim=addition_time_embed_dim, + num_class_embeds=unet.config.num_class_embeds, + upcast_attention=unet.config.upcast_attention, + resnet_time_scale_shift=unet.config.resnet_time_scale_shift, + projection_class_embeddings_input_dim=unet.config.projection_class_embeddings_input_dim, + brushnet_conditioning_channel_order=brushnet_conditioning_channel_order, + conditioning_embedding_out_channels=conditioning_embedding_out_channels, + ) + + if load_weights_from_unet: + conv_in_condition_weight = torch.zeros_like( + brushnet.conv_in_condition.weight + ) + conv_in_condition_weight[:, :4, ...] = unet.conv_in.weight + conv_in_condition_weight[:, 4:8, ...] 
= unet.conv_in.weight + brushnet.conv_in_condition.weight = torch.nn.Parameter( + conv_in_condition_weight + ) + brushnet.conv_in_condition.bias = unet.conv_in.bias + + brushnet.time_proj.load_state_dict(unet.time_proj.state_dict()) + brushnet.time_embedding.load_state_dict(unet.time_embedding.state_dict()) + + if brushnet.class_embedding: + brushnet.class_embedding.load_state_dict( + unet.class_embedding.state_dict() + ) + + brushnet.down_blocks.load_state_dict( + unet.down_blocks.state_dict(), strict=False + ) + brushnet.mid_block.load_state_dict( + unet.mid_block.state_dict(), strict=False + ) + brushnet.up_blocks.load_state_dict( + unet.up_blocks.state_dict(), strict=False + ) + + return brushnet.to(unet.dtype) + + @property + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors + def attn_processors(self) -> Dict[str, AttentionProcessor]: + r""" + Returns: + `dict` of attention processors: A dictionary containing all attention processors used in the model with + indexed by its weight name. + """ + # set recursively + processors = {} + + def fn_recursive_add_processors( + name: str, + module: torch.nn.Module, + processors: Dict[str, AttentionProcessor], + ): + if hasattr(module, "get_processor"): + processors[f"{name}.processor"] = module.get_processor( + return_deprecated_lora=True + ) + + for sub_name, child in module.named_children(): + fn_recursive_add_processors(f"{name}.{sub_name}", child, processors) + + return processors + + for name, module in self.named_children(): + fn_recursive_add_processors(name, module, processors) + + return processors + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor + def set_attn_processor( + self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]] + ): + r""" + Sets the attention processor to use to compute attention. + + Parameters: + processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`): + The instantiated processor class or a dictionary of processor classes that will be set as the processor + for **all** `Attention` layers. + + If `processor` is a dict, the key needs to define the path to the corresponding cross attention + processor. This is strongly recommended when setting trainable attention processors. + + """ + count = len(self.attn_processors.keys()) + + if isinstance(processor, dict) and len(processor) != count: + raise ValueError( + f"A dict of processors was passed, but the number of processors {len(processor)} does not match the" + f" number of attention layers: {count}. Please make sure to pass {count} processor classes." + ) + + def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor): + if hasattr(module, "set_processor"): + if not isinstance(processor, dict): + module.set_processor(processor) + else: + module.set_processor(processor.pop(f"{name}.processor")) + + for sub_name, child in module.named_children(): + fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor) + + for name, module in self.named_children(): + fn_recursive_attn_processor(name, module, processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_default_attn_processor + def set_default_attn_processor(self): + """ + Disables custom attention processors and sets the default attention implementation. 
+ """ + if all( + proc.__class__ in ADDED_KV_ATTENTION_PROCESSORS + for proc in self.attn_processors.values() + ): + processor = AttnAddedKVProcessor() + elif all( + proc.__class__ in CROSS_ATTENTION_PROCESSORS + for proc in self.attn_processors.values() + ): + processor = AttnProcessor() + else: + raise ValueError( + f"Cannot call `set_default_attn_processor` when attention processors are of type {next(iter(self.attn_processors.values()))}" + ) + + self.set_attn_processor(processor) + + # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attention_slice + def set_attention_slice(self, slice_size: Union[str, int, List[int]]) -> None: + r""" + Enable sliced attention computation. + + When this option is enabled, the attention module splits the input tensor in slices to compute attention in + several steps. This is useful for saving some memory in exchange for a small decrease in speed. + + Args: + slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`): + When `"auto"`, input to the attention heads is halved, so attention is computed in two steps. If + `"max"`, maximum amount of memory is saved by running only one slice at a time. If a number is + provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim` + must be a multiple of `slice_size`. + """ + sliceable_head_dims = [] + + def fn_recursive_retrieve_sliceable_dims(module: torch.nn.Module): + if hasattr(module, "set_attention_slice"): + sliceable_head_dims.append(module.sliceable_head_dim) + + for child in module.children(): + fn_recursive_retrieve_sliceable_dims(child) + + # retrieve number of attention layers + for module in self.children(): + fn_recursive_retrieve_sliceable_dims(module) + + num_sliceable_layers = len(sliceable_head_dims) + + if slice_size == "auto": + # half the attention head size is usually a good trade-off between + # speed and memory + slice_size = [dim // 2 for dim in sliceable_head_dims] + elif slice_size == "max": + # make smallest slice possible + slice_size = num_sliceable_layers * [1] + + slice_size = ( + num_sliceable_layers * [slice_size] + if not isinstance(slice_size, list) + else slice_size + ) + + if len(slice_size) != len(sliceable_head_dims): + raise ValueError( + f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different" + f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}." + ) + + for i in range(len(slice_size)): + size = slice_size[i] + dim = sliceable_head_dims[i] + if size is not None and size > dim: + raise ValueError(f"size {size} has to be smaller or equal to {dim}.") + + # Recursively walk through all the children. 
+ # Any children which exposes the set_attention_slice method + # gets the message + def fn_recursive_set_attention_slice( + module: torch.nn.Module, slice_size: List[int] + ): + if hasattr(module, "set_attention_slice"): + module.set_attention_slice(slice_size.pop()) + + for child in module.children(): + fn_recursive_set_attention_slice(child, slice_size) + + reversed_slice_size = list(reversed(slice_size)) + for module in self.children(): + fn_recursive_set_attention_slice(module, reversed_slice_size) + + def _set_gradient_checkpointing(self, module, value: bool = False) -> None: + if isinstance(module, (CrossAttnDownBlock2D, DownBlock2D)): + module.gradient_checkpointing = value + + def forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + brushnet_cond: torch.FloatTensor, + conditioning_scale: float = 1.0, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + guess_mode: bool = False, + return_dict: bool = True, + ) -> Union[BrushNetOutput, Tuple[Tuple[torch.FloatTensor, ...], torch.FloatTensor]]: + """ + The [`BrushNetModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor. + timestep (`Union[torch.Tensor, float, int]`): + The number of timesteps to denoise an input. + encoder_hidden_states (`torch.Tensor`): + The encoder hidden states. + brushnet_cond (`torch.FloatTensor`): + The conditional input tensor of shape `(batch_size, sequence_length, hidden_size)`. + conditioning_scale (`float`, defaults to `1.0`): + The scale factor for BrushNet outputs. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): + Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. + timestep_cond (`torch.Tensor`, *optional*, defaults to `None`): + Additional conditional embeddings for timestep. If provided, the embeddings will be summed with the + timestep_embedding passed through the `self.time_embedding` layer to obtain the final timestep + embeddings. + attention_mask (`torch.Tensor`, *optional*, defaults to `None`): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask + is kept, otherwise if `0` it is discarded. Mask will be converted into a bias, which adds large + negative values to the attention scores corresponding to "discard" tokens. + added_cond_kwargs (`dict`): + Additional conditions for the Stable Diffusion XL UNet. + cross_attention_kwargs (`dict[str]`, *optional*, defaults to `None`): + A kwargs dictionary that if specified is passed along to the `AttnProcessor`. + guess_mode (`bool`, defaults to `False`): + In this mode, the BrushNet encoder tries its best to recognize the input content of the input even if + you remove all prompts. A `guidance_scale` between 3.0 and 5.0 is recommended. + return_dict (`bool`, defaults to `True`): + Whether or not to return a [`~models.brushnet.BrushNetOutput`] instead of a plain tuple. + + Returns: + [`~models.brushnet.BrushNetOutput`] **or** `tuple`: + If `return_dict` is `True`, a [`~models.brushnet.BrushNetOutput`] is returned, otherwise a tuple is + returned where the first element is the sample tensor. 
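+
+ Note:
+ In this implementation `brushnet_cond` is concatenated with `sample` along the channel dimension and
+ passed through `conv_in_condition`, so it is expected to be an image-like latent tensor of shape
+ `(batch_size, conditioning_channels, height, width)` rather than a sequence of hidden states.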
+ """ + # check channel order + channel_order = self.config.brushnet_conditioning_channel_order + + if channel_order == "rgb": + # in rgb order by default + ... + elif channel_order == "bgr": + brushnet_cond = torch.flip(brushnet_cond, dims=[1]) + else: + raise ValueError( + f"unknown `brushnet_conditioning_channel_order`: {channel_order}" + ) + + # prepare attention_mask + if attention_mask is not None: + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # 1. time + timesteps = timestep + if not torch.is_tensor(timesteps): + # TODO: this requires sync between CPU and GPU. So try to pass timesteps as tensors if you can + # This would be a good case for the `match` statement (Python 3.10+) + is_mps = sample.device.type == "mps" + if isinstance(timestep, float): + dtype = torch.float32 if is_mps else torch.float64 + else: + dtype = torch.int32 if is_mps else torch.int64 + timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device) + elif len(timesteps.shape) == 0: + timesteps = timesteps[None].to(sample.device) + + # broadcast to batch dimension in a way that's compatible with ONNX/Core ML + timesteps = timesteps.expand(sample.shape[0]) + + t_emb = self.time_proj(timesteps) + + # timesteps does not contain any weights and will always return f32 tensors + # but time_embedding might actually be running in fp16. so we need to cast here. + # there might be better ways to encapsulate this. + t_emb = t_emb.to(dtype=sample.dtype) + + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + if self.class_embedding is not None: + if class_labels is None: + raise ValueError( + "class_labels should be provided when num_class_embeds > 0" + ) + + if self.config.class_embed_type == "timestep": + class_labels = self.time_proj(class_labels) + + class_emb = self.class_embedding(class_labels).to(dtype=self.dtype) + emb = emb + class_emb + + if self.config.addition_embed_type is not None: + if self.config.addition_embed_type == "text": + aug_emb = self.add_embedding(encoder_hidden_states) + + elif self.config.addition_embed_type == "text_time": + if "text_embeds" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `text_embeds` to be passed in `added_cond_kwargs`" + ) + text_embeds = added_cond_kwargs.get("text_embeds") + if "time_ids" not in added_cond_kwargs: + raise ValueError( + f"{self.__class__} has the config param `addition_embed_type` set to 'text_time' which requires the keyword argument `time_ids` to be passed in `added_cond_kwargs`" + ) + time_ids = added_cond_kwargs.get("time_ids") + time_embeds = self.add_time_proj(time_ids.flatten()) + time_embeds = time_embeds.reshape((text_embeds.shape[0], -1)) + + add_embeds = torch.concat([text_embeds, time_embeds], dim=-1) + add_embeds = add_embeds.to(emb.dtype) + aug_emb = self.add_embedding(add_embeds) + + emb = emb + aug_emb if aug_emb is not None else emb + + # 2. pre-process + brushnet_cond = torch.concat([sample, brushnet_cond], 1) + sample = self.conv_in_condition(brushnet_cond) + + # 3. 
down + down_block_res_samples = (sample,) + for downsample_block in self.down_blocks: + if ( + hasattr(downsample_block, "has_cross_attention") + and downsample_block.has_cross_attention + ): + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample, res_samples = downsample_block(hidden_states=sample, temb=emb) + + down_block_res_samples += res_samples + + # 4. PaintingNet down blocks + brushnet_down_block_res_samples = () + for down_block_res_sample, brushnet_down_block in zip( + down_block_res_samples, self.brushnet_down_blocks + ): + down_block_res_sample = brushnet_down_block(down_block_res_sample) + brushnet_down_block_res_samples = brushnet_down_block_res_samples + ( + down_block_res_sample, + ) + + # 5. mid + if self.mid_block is not None: + if ( + hasattr(self.mid_block, "has_cross_attention") + and self.mid_block.has_cross_attention + ): + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + ) + else: + sample = self.mid_block(sample, emb) + + # 6. BrushNet mid blocks + brushnet_mid_block_res_sample = self.brushnet_mid_block(sample) + + # 7. up + up_block_res_samples = () + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[ + : -len(upsample_block.resnets) + ] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block: + upsample_size = down_block_res_samples[-1].shape[2:] + + if ( + hasattr(upsample_block, "has_cross_attention") + and upsample_block.has_cross_attention + ): + sample, up_res_samples = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + return_res_samples=True, + ) + else: + sample, up_res_samples = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size, + return_res_samples=True, + ) + + up_block_res_samples += up_res_samples + + # 8. BrushNet up blocks + brushnet_up_block_res_samples = () + for up_block_res_sample, brushnet_up_block in zip( + up_block_res_samples, self.brushnet_up_blocks + ): + up_block_res_sample = brushnet_up_block(up_block_res_sample) + brushnet_up_block_res_samples = brushnet_up_block_res_samples + ( + up_block_res_sample, + ) + + # 6. 
scaling + if guess_mode and not self.config.global_pool_conditions: + scales = torch.logspace( + -1, + 0, + len(brushnet_down_block_res_samples) + + 1 + + len(brushnet_up_block_res_samples), + device=sample.device, + ) # 0.1 to 1.0 + scales = scales * conditioning_scale + + brushnet_down_block_res_samples = [ + sample * scale + for sample, scale in zip( + brushnet_down_block_res_samples, + scales[: len(brushnet_down_block_res_samples)], + ) + ] + brushnet_mid_block_res_sample = ( + brushnet_mid_block_res_sample + * scales[len(brushnet_down_block_res_samples)] + ) + brushnet_up_block_res_samples = [ + sample * scale + for sample, scale in zip( + brushnet_up_block_res_samples, + scales[len(brushnet_down_block_res_samples) + 1 :], + ) + ] + else: + brushnet_down_block_res_samples = [ + sample * conditioning_scale + for sample in brushnet_down_block_res_samples + ] + brushnet_mid_block_res_sample = ( + brushnet_mid_block_res_sample * conditioning_scale + ) + brushnet_up_block_res_samples = [ + sample * conditioning_scale for sample in brushnet_up_block_res_samples + ] + + if self.config.global_pool_conditions: + brushnet_down_block_res_samples = [ + torch.mean(sample, dim=(2, 3), keepdim=True) + for sample in brushnet_down_block_res_samples + ] + brushnet_mid_block_res_sample = torch.mean( + brushnet_mid_block_res_sample, dim=(2, 3), keepdim=True + ) + brushnet_up_block_res_samples = [ + torch.mean(sample, dim=(2, 3), keepdim=True) + for sample in brushnet_up_block_res_samples + ] + + if not return_dict: + return ( + brushnet_down_block_res_samples, + brushnet_mid_block_res_sample, + brushnet_up_block_res_samples, + ) + + return BrushNetOutput( + down_block_res_samples=brushnet_down_block_res_samples, + mid_block_res_sample=brushnet_mid_block_res_sample, + up_block_res_samples=brushnet_up_block_res_samples, + ) + + +def zero_module(module): + for p in module.parameters(): + nn.init.zeros_(p) + return module diff --git a/inpaint/model/power_paint/v2/__init__.py b/inpaint/model/power_paint/v2/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/model/power_paint/v2/pipeline_PowerPaint_Brushnet_CA.py b/inpaint/model/power_paint/v2/pipeline_PowerPaint_Brushnet_CA.py new file mode 100644 index 0000000..c1892e6 --- /dev/null +++ b/inpaint/model/power_paint/v2/pipeline_PowerPaint_Brushnet_CA.py @@ -0,0 +1,1690 @@ +import inspect +from typing import Any, Callable, Dict, List, Optional, Union + +import numpy as np +import PIL.Image +import torch +import torch.nn.functional as F +from diffusers import StableDiffusionMixin, UNet2DConditionModel +from transformers import ( + CLIPImageProcessor, + CLIPTextModel, + CLIPTokenizer, + CLIPVisionModelWithProjection, +) + +from diffusers.image_processor import PipelineImageInput, VaeImageProcessor +from diffusers.loaders import ( + FromSingleFileMixin, + IPAdapterMixin, + LoraLoaderMixin, + TextualInversionLoaderMixin, +) +from diffusers.models import AutoencoderKL, ImageProjection +from diffusers.models.lora import adjust_lora_scale_text_encoder +from diffusers.schedulers import KarrasDiffusionSchedulers +from diffusers.utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + replace_example_docstring, + scale_lora_layers, + unscale_lora_layers, +) +from diffusers.utils.torch_utils import ( + is_compiled_module, + is_torch_version, + randn_tensor, +) +from diffusers.pipelines.pipeline_utils import DiffusionPipeline +from diffusers.pipelines.stable_diffusion.pipeline_output import ( + StableDiffusionPipelineOutput, +) +from 
diffusers.pipelines.stable_diffusion.safety_checker import ( + StableDiffusionSafetyChecker, +) + +from .BrushNet_CA import BrushNetModel + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + +EXAMPLE_DOC_STRING = """ + Examples: + ```py + from diffusers import StableDiffusionBrushNetPipeline, BrushNetModel, UniPCMultistepScheduler + from diffusers.utils import load_image + import torch + import cv2 + import numpy as np + from PIL import Image + + base_model_path = "runwayml/stable-diffusion-v1-5" + brushnet_path = "ckpt_path" + + brushnet = BrushNetModel.from_pretrained(brushnet_path, torch_dtype=torch.float16) + pipe = StableDiffusionBrushNetPipeline.from_pretrained( + base_model_path, brushnet=brushnet, torch_dtype=torch.float16, low_cpu_mem_usage=False + ) + + # speed up diffusion process with faster scheduler and memory optimization + pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config) + # remove following line if xformers is not installed or when using Torch 2.0. + # pipe.enable_xformers_memory_efficient_attention() + # memory optimization. + pipe.enable_model_cpu_offload() + + image_path="examples/brushnet/src/test_image.jpg" + mask_path="examples/brushnet/src/test_mask.jpg" + caption="A cake on the table." + + init_image = cv2.imread(image_path) + mask_image = 1.*(cv2.imread(mask_path).sum(-1)>255)[:,:,np.newaxis] + init_image = init_image * (1-mask_image) + + init_image = Image.fromarray(init_image.astype(np.uint8)).convert("RGB") + mask_image = Image.fromarray(mask_image.astype(np.uint8).repeat(3,-1)*255).convert("RGB") + + generator = torch.Generator("cuda").manual_seed(1234) + + image = pipe( + caption, + init_image, + mask_image, + num_inference_steps=50, + generator=generator, + paintingnet_conditioning_scale=1.0 + ).images[0] + image.save("output.png") + ``` +""" + + +# Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps +def retrieve_timesteps( + scheduler, + num_inference_steps: Optional[int] = None, + device: Optional[Union[str, torch.device]] = None, + timesteps: Optional[List[int]] = None, + **kwargs, +): + """ + Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles + custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`. + + Args: + scheduler (`SchedulerMixin`): + The scheduler to get timesteps from. + num_inference_steps (`int`): + The number of diffusion steps used when generating samples with a pre-trained model. If used, + `timesteps` must be `None`. + device (`str` or `torch.device`, *optional*): + The device to which the timesteps should be moved to. If `None`, the timesteps are not moved. + timesteps (`List[int]`, *optional*): + Custom timesteps used to support arbitrary spacing between timesteps. If `None`, then the default + timestep spacing strategy of the scheduler is used. If `timesteps` is passed, `num_inference_steps` + must be `None`. + + Returns: + `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the + second element is the number of inference steps. + """ + if timesteps is not None: + accepts_timesteps = "timesteps" in set( + inspect.signature(scheduler.set_timesteps).parameters.keys() + ) + if not accepts_timesteps: + raise ValueError( + f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom" + f" timestep schedules. Please check whether you are using the correct scheduler." 
+ ) + scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs) + timesteps = scheduler.timesteps + num_inference_steps = len(timesteps) + else: + scheduler.set_timesteps(num_inference_steps, device=device, **kwargs) + timesteps = scheduler.timesteps + return timesteps, num_inference_steps + + +class StableDiffusionPowerPaintBrushNetPipeline( + DiffusionPipeline, + StableDiffusionMixin, + TextualInversionLoaderMixin, + LoraLoaderMixin, + IPAdapterMixin, + FromSingleFileMixin, +): + r""" + Pipeline for text-to-image generation using Stable Diffusion with BrushNet guidance. + + This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods + implemented for all pipelines (downloading, saving, running on a particular device, etc.). + + The pipeline also inherits the following loading methods: + - [`~loaders.TextualInversionLoaderMixin.load_textual_inversion`] for loading textual inversion embeddings + - [`~loaders.LoraLoaderMixin.load_lora_weights`] for loading LoRA weights + - [`~loaders.LoraLoaderMixin.save_lora_weights`] for saving LoRA weights + - [`~loaders.FromSingleFileMixin.from_single_file`] for loading `.ckpt` files + - [`~loaders.IPAdapterMixin.load_ip_adapter`] for loading IP Adapters + + Args: + vae ([`AutoencoderKL`]): + Variational Auto-Encoder (VAE) model to encode and decode images to and from latent representations. + text_encoder ([`~transformers.CLIPTextModel`]): + Frozen text-encoder ([clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14)). + tokenizer ([`~transformers.CLIPTokenizer`]): + A `CLIPTokenizer` to tokenize text. + unet ([`UNet2DConditionModel`]): + A `UNet2DConditionModel` to denoise the encoded image latents. + brushnet ([`BrushNetModel`]`): + Provides additional conditioning to the `unet` during the denoising process. + scheduler ([`SchedulerMixin`]): + A scheduler to be used in combination with `unet` to denoise the encoded image latents. Can be one of + [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`]. + safety_checker ([`StableDiffusionSafetyChecker`]): + Classification module that estimates whether generated images could be considered offensive or harmful. + Please refer to the [model card](https://huggingface.co/runwayml/stable-diffusion-v1-5) for more details + about a model's potential harms. + feature_extractor ([`~transformers.CLIPImageProcessor`]): + A `CLIPImageProcessor` to extract features from generated images; used as inputs to the `safety_checker`. + """ + + model_cpu_offload_seq = "text_encoder->image_encoder->unet->vae" + _optional_components = ["safety_checker", "feature_extractor", "image_encoder"] + _exclude_from_cpu_offload = ["safety_checker"] + _callback_tensor_inputs = ["latents", "prompt_embeds", "negative_prompt_embeds"] + + def __init__( + self, + vae: AutoencoderKL, + text_encoder: CLIPTextModel, + text_encoder_brushnet: CLIPTextModel, + tokenizer: CLIPTokenizer, + unet: UNet2DConditionModel, + brushnet: BrushNetModel, + scheduler: KarrasDiffusionSchedulers, + safety_checker: StableDiffusionSafetyChecker, + feature_extractor: CLIPImageProcessor, + image_encoder: CLIPVisionModelWithProjection = None, + requires_safety_checker: bool = True, + ): + super().__init__() + + if safety_checker is None and requires_safety_checker: + logger.warning( + f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. 
Ensure" + " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered" + " results in services or applications open to the public. Both the diffusers team and Hugging Face" + " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling" + " it only for use-cases that involve analyzing network behavior or auditing its results. For more" + " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ." + ) + + if safety_checker is not None and feature_extractor is None: + raise ValueError( + "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety" + " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead." + ) + + self.register_modules( + vae=vae, + text_encoder=text_encoder, + text_encoder_brushnet=text_encoder_brushnet, + tokenizer=tokenizer, + unet=unet, + brushnet=brushnet, + scheduler=scheduler, + safety_checker=safety_checker, + feature_extractor=feature_extractor, + image_encoder=image_encoder, + ) + self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1) + self.image_processor = VaeImageProcessor( + vae_scale_factor=self.vae_scale_factor, do_convert_rgb=True + ) + self.register_to_config(requires_safety_checker=requires_safety_checker) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline._encode_prompt + def _encode_prompt( + self, + promptA, + promptB, + t, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_promptA=None, + negative_promptB=None, + t_nag=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. 
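+
+ Note:
+ Unlike the upstream method, this variant receives two prompts per branch. The `text_encoder_brushnet`
+ embeddings of `promptA` and `promptB` are linearly blended with the weight `t` (and those of
+ `negative_promptA`/`negative_promptB` with `t_nag`), i.e. roughly
+ `prompt_embeds = t * embeds(promptA) + (1 - t) * embeds(promptB)`.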
+ """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + prompt = promptA + negative_prompt = negative_promptA + + if promptA is not None and isinstance(promptA, str): + batch_size = 1 + elif promptA is not None and isinstance(promptA, list): + batch_size = len(promptA) + else: + batch_size = prompt_embeds.shape[0] + + if prompt_embeds is None: + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + promptA = self.maybe_convert_prompt(promptA, self.tokenizer) + + text_inputsA = self.tokenizer( + promptA, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_inputsB = self.tokenizer( + promptB, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_idsA = text_inputsA.input_ids + text_input_idsB = text_inputsB.input_ids + untruncated_ids = self.tokenizer( + promptA, padding="longest", return_tensors="pt" + ).input_ids + + if untruncated_ids.shape[-1] >= text_input_idsA.shape[ + -1 + ] and not torch.equal(text_input_idsA, untruncated_ids): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if ( + hasattr(self.text_encoder_brushnet.config, "use_attention_mask") + and self.text_encoder_brushnet.config.use_attention_mask + ): + attention_mask = text_inputsA.attention_mask.to(device) + else: + attention_mask = None + + # print("text_input_idsA: ",text_input_idsA) + # print("text_input_idsB: ",text_input_idsB) + # print('t: ',t) + + prompt_embedsA = self.text_encoder_brushnet( + text_input_idsA.to(device), + attention_mask=attention_mask, + ) + prompt_embedsA = prompt_embedsA[0] + + prompt_embedsB = self.text_encoder_brushnet( + text_input_idsB.to(device), + attention_mask=attention_mask, + ) + prompt_embedsB = prompt_embedsB[0] + prompt_embeds = prompt_embedsA * (t) + (1 - t) * prompt_embedsB + # print("prompt_embeds: ",prompt_embeds) + + if self.text_encoder_brushnet is not None: + prompt_embeds_dtype = self.text_encoder_brushnet.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view( + bs_embed * num_images_per_prompt, seq_len, -1 + ) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokensA: List[str] + uncond_tokensB: List[str] + if negative_prompt is None: + uncond_tokensA = [""] * batch_size + uncond_tokensB = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." 
+ ) + elif isinstance(negative_prompt, str): + uncond_tokensA = [negative_promptA] + uncond_tokensB = [negative_promptB] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokensA = negative_promptA + uncond_tokensB = negative_promptB + + # textual inversion: procecss multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokensA = self.maybe_convert_prompt( + uncond_tokensA, self.tokenizer + ) + uncond_tokensB = self.maybe_convert_prompt( + uncond_tokensB, self.tokenizer + ) + + max_length = prompt_embeds.shape[1] + uncond_inputA = self.tokenizer( + uncond_tokensA, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + uncond_inputB = self.tokenizer( + uncond_tokensB, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + + if ( + hasattr(self.text_encoder_brushnet.config, "use_attention_mask") + and self.text_encoder_brushnet.config.use_attention_mask + ): + attention_mask = uncond_inputA.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embedsA = self.text_encoder_brushnet( + uncond_inputA.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embedsB = self.text_encoder_brushnet( + uncond_inputB.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = ( + negative_prompt_embedsA[0] * (t_nag) + + (1 - t_nag) * negative_prompt_embedsB[0] + ) + + # negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to( + dtype=prompt_embeds_dtype, device=device + ) + + negative_prompt_embeds = negative_prompt_embeds.repeat( + 1, num_images_per_prompt, 1 + ) + negative_prompt_embeds = negative_prompt_embeds.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + + # For classifier free guidance, we need to do two forward passes. + # Here we concatenate the unconditional and text embeddings into a single batch + # to avoid doing two forward passes + # print("prompt_embeds: ",prompt_embeds) + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_prompt + def encode_prompt( + self, + prompt, + device, + num_images_per_prompt, + do_classifier_free_guidance, + negative_prompt=None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + lora_scale: Optional[float] = None, + clip_skip: Optional[int] = None, + ): + r""" + Encodes the prompt into text encoder hidden states. + + Args: + prompt (`str` or `List[str]`, *optional*): + prompt to be encoded + device: (`torch.device`): + torch device + num_images_per_prompt (`int`): + number of images that should be generated per prompt + do_classifier_free_guidance (`bool`): + whether to use classifier free guidance or not + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts not to guide the image generation. 
If not defined, one has to pass + `negative_prompt_embeds` instead. Ignored when not using guidance (i.e., ignored if `guidance_scale` is + less than `1`). + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not + provided, text embeddings will be generated from `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt + weighting. If not provided, negative_prompt_embeds will be generated from `negative_prompt` input + argument. + lora_scale (`float`, *optional*): + A LoRA scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded. + clip_skip (`int`, *optional*): + Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means that + the output of the pre-final layer will be used for computing the prompt embeddings. + """ + # set lora scale so that monkey patched LoRA + # function of text encoder can correctly access it + # print('1 ',prompt,negative_prompt) + if lora_scale is not None and isinstance(self, LoraLoaderMixin): + self._lora_scale = lora_scale + + # dynamically adjust the LoRA scale + if not USE_PEFT_BACKEND: + adjust_lora_scale_text_encoder(self.text_encoder, lora_scale) + else: + scale_lora_layers(self.text_encoder, lora_scale) + # print('2 ',prompt,negative_prompt) + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + # print('3 ',prompt,negative_prompt) + if prompt_embeds is None: + # textual inversion: process multi-vector tokens if necessary + # print('4 ',prompt,negative_prompt) + if isinstance(self, TextualInversionLoaderMixin): + prompt = self.maybe_convert_prompt(prompt, self.tokenizer) + + # print('5 ',prompt,negative_prompt) + + text_inputs = self.tokenizer( + prompt, + padding="max_length", + max_length=self.tokenizer.model_max_length, + truncation=True, + return_tensors="pt", + ) + text_input_ids = text_inputs.input_ids + # print(prompt, text_input_ids) + untruncated_ids = self.tokenizer( + prompt, padding="longest", return_tensors="pt" + ).input_ids + + if untruncated_ids.shape[-1] >= text_input_ids.shape[ + -1 + ] and not torch.equal(text_input_ids, untruncated_ids): + removed_text = self.tokenizer.batch_decode( + untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1] + ) + logger.warning( + "The following part of your input was truncated because CLIP can only handle sequences up to" + f" {self.tokenizer.model_max_length} tokens: {removed_text}" + ) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = text_inputs.attention_mask.to(device) + else: + attention_mask = None + + if clip_skip is None: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), attention_mask=attention_mask + ) + prompt_embeds = prompt_embeds[0] + else: + prompt_embeds = self.text_encoder( + text_input_ids.to(device), + attention_mask=attention_mask, + output_hidden_states=True, + ) + # Access the `hidden_states` first, that contains a tuple of + # all the hidden states from the encoder layers. Then index into + # the tuple to access the hidden states from the desired layer. 
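+ # With `output_hidden_states=True`, `hidden_states` is the last element of the encoder output, and
+ # `-(clip_skip + 1)` selects the layer `clip_skip` layers before the last one (clip_skip=1 -> the
+ # pre-final layer), matching the `clip_skip` docstring above.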
+ prompt_embeds = prompt_embeds[-1][-(clip_skip + 1)] + # We also need to apply the final LayerNorm here to not mess with the + # representations. The `last_hidden_states` that we typically use for + # obtaining the final prompt representations passes through the LayerNorm + # layer. + prompt_embeds = self.text_encoder.text_model.final_layer_norm( + prompt_embeds + ) + + if self.text_encoder is not None: + prompt_embeds_dtype = self.text_encoder.dtype + elif self.unet is not None: + prompt_embeds_dtype = self.unet.dtype + else: + prompt_embeds_dtype = prompt_embeds.dtype + + prompt_embeds = prompt_embeds.to(dtype=prompt_embeds_dtype, device=device) + + bs_embed, seq_len, _ = prompt_embeds.shape + # duplicate text embeddings for each generation per prompt, using mps friendly method + prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1) + prompt_embeds = prompt_embeds.view( + bs_embed * num_images_per_prompt, seq_len, -1 + ) + + # get unconditional embeddings for classifier free guidance + if do_classifier_free_guidance and negative_prompt_embeds is None: + uncond_tokens: List[str] + if negative_prompt is None: + uncond_tokens = [""] * batch_size + elif prompt is not None and type(prompt) is not type(negative_prompt): + raise TypeError( + f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !=" + f" {type(prompt)}." + ) + elif isinstance(negative_prompt, str): + uncond_tokens = [negative_prompt] + elif batch_size != len(negative_prompt): + raise ValueError( + f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:" + f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches" + " the batch size of `prompt`." + ) + else: + uncond_tokens = negative_prompt + + # textual inversion: process multi-vector tokens if necessary + if isinstance(self, TextualInversionLoaderMixin): + uncond_tokens = self.maybe_convert_prompt(uncond_tokens, self.tokenizer) + + max_length = prompt_embeds.shape[1] + uncond_input = self.tokenizer( + uncond_tokens, + padding="max_length", + max_length=max_length, + truncation=True, + return_tensors="pt", + ) + # print("neg: ", uncond_input.input_ids) + + if ( + hasattr(self.text_encoder.config, "use_attention_mask") + and self.text_encoder.config.use_attention_mask + ): + attention_mask = uncond_input.attention_mask.to(device) + else: + attention_mask = None + + negative_prompt_embeds = self.text_encoder( + uncond_input.input_ids.to(device), + attention_mask=attention_mask, + ) + negative_prompt_embeds = negative_prompt_embeds[0] + + if do_classifier_free_guidance: + # duplicate unconditional embeddings for each generation per prompt, using mps friendly method + seq_len = negative_prompt_embeds.shape[1] + + negative_prompt_embeds = negative_prompt_embeds.to( + dtype=prompt_embeds_dtype, device=device + ) + + negative_prompt_embeds = negative_prompt_embeds.repeat( + 1, num_images_per_prompt, 1 + ) + negative_prompt_embeds = negative_prompt_embeds.view( + batch_size * num_images_per_prompt, seq_len, -1 + ) + + if isinstance(self, LoraLoaderMixin) and USE_PEFT_BACKEND: + # Retrieve the original scale by scaling back the LoRA layers + unscale_lora_layers(self.text_encoder, lora_scale) + + prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds]) + + return prompt_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.encode_image + def encode_image( + self, image, device, num_images_per_prompt, 
output_hidden_states=None + ): + dtype = next(self.image_encoder.parameters()).dtype + + if not isinstance(image, torch.Tensor): + image = self.feature_extractor(image, return_tensors="pt").pixel_values + + image = image.to(device=device, dtype=dtype) + if output_hidden_states: + image_enc_hidden_states = self.image_encoder( + image, output_hidden_states=True + ).hidden_states[-2] + image_enc_hidden_states = image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + uncond_image_enc_hidden_states = self.image_encoder( + torch.zeros_like(image), output_hidden_states=True + ).hidden_states[-2] + uncond_image_enc_hidden_states = ( + uncond_image_enc_hidden_states.repeat_interleave( + num_images_per_prompt, dim=0 + ) + ) + return image_enc_hidden_states, uncond_image_enc_hidden_states + else: + image_embeds = self.image_encoder(image).image_embeds + image_embeds = image_embeds.repeat_interleave(num_images_per_prompt, dim=0) + uncond_image_embeds = torch.zeros_like(image_embeds) + + return image_embeds, uncond_image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_ip_adapter_image_embeds + def prepare_ip_adapter_image_embeds( + self, + ip_adapter_image, + ip_adapter_image_embeds, + device, + num_images_per_prompt, + do_classifier_free_guidance, + ): + if ip_adapter_image_embeds is None: + if not isinstance(ip_adapter_image, list): + ip_adapter_image = [ip_adapter_image] + + if len(ip_adapter_image) != len( + self.unet.encoder_hid_proj.image_projection_layers + ): + raise ValueError( + f"`ip_adapter_image` must have same length as the number of IP Adapters. Got {len(ip_adapter_image)} images and {len(self.unet.encoder_hid_proj.image_projection_layers)} IP Adapters." 
+ ) + + image_embeds = [] + for single_ip_adapter_image, image_proj_layer in zip( + ip_adapter_image, self.unet.encoder_hid_proj.image_projection_layers + ): + output_hidden_state = not isinstance(image_proj_layer, ImageProjection) + single_image_embeds, single_negative_image_embeds = self.encode_image( + single_ip_adapter_image, device, 1, output_hidden_state + ) + single_image_embeds = torch.stack( + [single_image_embeds] * num_images_per_prompt, dim=0 + ) + single_negative_image_embeds = torch.stack( + [single_negative_image_embeds] * num_images_per_prompt, dim=0 + ) + + if do_classifier_free_guidance: + single_image_embeds = torch.cat( + [single_negative_image_embeds, single_image_embeds] + ) + single_image_embeds = single_image_embeds.to(device) + + image_embeds.append(single_image_embeds) + else: + repeat_dims = [1] + image_embeds = [] + for single_image_embeds in ip_adapter_image_embeds: + if do_classifier_free_guidance: + single_negative_image_embeds, single_image_embeds = ( + single_image_embeds.chunk(2) + ) + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, + *(repeat_dims * len(single_image_embeds.shape[1:])), + ) + single_negative_image_embeds = single_negative_image_embeds.repeat( + num_images_per_prompt, + *(repeat_dims * len(single_negative_image_embeds.shape[1:])), + ) + single_image_embeds = torch.cat( + [single_negative_image_embeds, single_image_embeds] + ) + else: + single_image_embeds = single_image_embeds.repeat( + num_images_per_prompt, + *(repeat_dims * len(single_image_embeds.shape[1:])), + ) + image_embeds.append(single_image_embeds) + + return image_embeds + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.run_safety_checker + def run_safety_checker(self, image, device, dtype): + if self.safety_checker is None: + has_nsfw_concept = None + else: + if torch.is_tensor(image): + feature_extractor_input = self.image_processor.postprocess( + image, output_type="pil" + ) + else: + feature_extractor_input = self.image_processor.numpy_to_pil(image) + safety_checker_input = self.feature_extractor( + feature_extractor_input, return_tensors="pt" + ).to(device) + image, has_nsfw_concept = self.safety_checker( + images=image, clip_input=safety_checker_input.pixel_values.to(dtype) + ) + return image, has_nsfw_concept + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.decode_latents + def decode_latents(self, latents): + deprecation_message = "The decode_latents method is deprecated and will be removed in 1.0.0. Please use VaeImageProcessor.postprocess(...) instead" + deprecate("decode_latents", "1.0.0", deprecation_message, standard_warn=False) + + latents = 1 / self.vae.config.scaling_factor * latents + image = self.vae.decode(latents, return_dict=False)[0] + image = (image / 2 + 0.5).clamp(0, 1) + # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16 + image = image.cpu().permute(0, 2, 3, 1).float().numpy() + return image + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_extra_step_kwargs + def prepare_extra_step_kwargs(self, generator, eta): + # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature + # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers. 
+ # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502 + # and should be between [0, 1] + + accepts_eta = "eta" in set( + inspect.signature(self.scheduler.step).parameters.keys() + ) + extra_step_kwargs = {} + if accepts_eta: + extra_step_kwargs["eta"] = eta + + # check if the scheduler accepts generator + accepts_generator = "generator" in set( + inspect.signature(self.scheduler.step).parameters.keys() + ) + if accepts_generator: + extra_step_kwargs["generator"] = generator + return extra_step_kwargs + + def check_inputs( + self, + prompt, + image, + mask, + callback_steps, + negative_prompt=None, + prompt_embeds=None, + negative_prompt_embeds=None, + ip_adapter_image=None, + ip_adapter_image_embeds=None, + brushnet_conditioning_scale=1.0, + control_guidance_start=0.0, + control_guidance_end=1.0, + callback_on_step_end_tensor_inputs=None, + ): + if callback_steps is not None and ( + not isinstance(callback_steps, int) or callback_steps <= 0 + ): + raise ValueError( + f"`callback_steps` has to be a positive integer but is {callback_steps} of type" + f" {type(callback_steps)}." + ) + + if callback_on_step_end_tensor_inputs is not None and not all( + k in self._callback_tensor_inputs + for k in callback_on_step_end_tensor_inputs + ): + raise ValueError( + f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}" + ) + + if prompt is not None and prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to" + " only forward one of the two." + ) + elif prompt is None and prompt_embeds is None: + raise ValueError( + "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined." + ) + elif prompt is not None and ( + not isinstance(prompt, str) and not isinstance(prompt, list) + ): + raise ValueError( + f"`prompt` has to be of type `str` or `list` but is {type(prompt)}" + ) + + if negative_prompt is not None and negative_prompt_embeds is not None: + raise ValueError( + f"Cannot forward both `negative_prompt`: {negative_prompt} and `negative_prompt_embeds`:" + f" {negative_prompt_embeds}. Please make sure to only forward one of the two." + ) + + if prompt_embeds is not None and negative_prompt_embeds is not None: + if prompt_embeds.shape != negative_prompt_embeds.shape: + raise ValueError( + "`prompt_embeds` and `negative_prompt_embeds` must have the same shape when passed directly, but" + f" got: `prompt_embeds` {prompt_embeds.shape} != `negative_prompt_embeds`" + f" {negative_prompt_embeds.shape}." + ) + + # Check `image` + is_compiled = hasattr(F, "scaled_dot_product_attention") and isinstance( + self.brushnet, torch._dynamo.eval_frame.OptimizedModule + ) + if ( + isinstance(self.brushnet, BrushNetModel) + or is_compiled + and isinstance(self.brushnet._orig_mod, BrushNetModel) + ): + self.check_image(image, mask, prompt, prompt_embeds) + else: + assert False + + # Check `brushnet_conditioning_scale` + if ( + isinstance(self.brushnet, BrushNetModel) + or is_compiled + and isinstance(self.brushnet._orig_mod, BrushNetModel) + ): + if not isinstance(brushnet_conditioning_scale, float): + raise TypeError( + "For single brushnet: `brushnet_conditioning_scale` must be type `float`." 
+ ) + else: + assert False + + if not isinstance(control_guidance_start, (tuple, list)): + control_guidance_start = [control_guidance_start] + + if not isinstance(control_guidance_end, (tuple, list)): + control_guidance_end = [control_guidance_end] + + if len(control_guidance_start) != len(control_guidance_end): + raise ValueError( + f"`control_guidance_start` has {len(control_guidance_start)} elements, but `control_guidance_end` has {len(control_guidance_end)} elements. Make sure to provide the same number of elements to each list." + ) + + for start, end in zip(control_guidance_start, control_guidance_end): + if start >= end: + raise ValueError( + f"control guidance start: {start} cannot be larger or equal to control guidance end: {end}." + ) + if start < 0.0: + raise ValueError( + f"control guidance start: {start} can't be smaller than 0." + ) + if end > 1.0: + raise ValueError( + f"control guidance end: {end} can't be larger than 1.0." + ) + + if ip_adapter_image is not None and ip_adapter_image_embeds is not None: + raise ValueError( + "Provide either `ip_adapter_image` or `ip_adapter_image_embeds`. Cannot leave both `ip_adapter_image` and `ip_adapter_image_embeds` defined." + ) + + if ip_adapter_image_embeds is not None: + if not isinstance(ip_adapter_image_embeds, list): + raise ValueError( + f"`ip_adapter_image_embeds` has to be of type `list` but is {type(ip_adapter_image_embeds)}" + ) + elif ip_adapter_image_embeds[0].ndim not in [3, 4]: + raise ValueError( + f"`ip_adapter_image_embeds` has to be a list of 3D or 4D tensors but is {ip_adapter_image_embeds[0].ndim}D" + ) + + def check_image(self, image, mask, prompt, prompt_embeds): + image_is_pil = isinstance(image, PIL.Image.Image) + image_is_tensor = isinstance(image, torch.Tensor) + image_is_np = isinstance(image, np.ndarray) + image_is_pil_list = isinstance(image, list) and isinstance( + image[0], PIL.Image.Image + ) + image_is_tensor_list = isinstance(image, list) and isinstance( + image[0], torch.Tensor + ) + image_is_np_list = isinstance(image, list) and isinstance(image[0], np.ndarray) + + if ( + not image_is_pil + and not image_is_tensor + and not image_is_np + and not image_is_pil_list + and not image_is_tensor_list + and not image_is_np_list + ): + raise TypeError( + f"image must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(image)}" + ) + + mask_is_pil = isinstance(mask, PIL.Image.Image) + mask_is_tensor = isinstance(mask, torch.Tensor) + mask_is_np = isinstance(mask, np.ndarray) + mask_is_pil_list = isinstance(mask, list) and isinstance( + mask[0], PIL.Image.Image + ) + mask_is_tensor_list = isinstance(mask, list) and isinstance( + mask[0], torch.Tensor + ) + mask_is_np_list = isinstance(mask, list) and isinstance(mask[0], np.ndarray) + + if ( + not mask_is_pil + and not mask_is_tensor + and not mask_is_np + and not mask_is_pil_list + and not mask_is_tensor_list + and not mask_is_np_list + ): + raise TypeError( + f"mask must be passed and be one of PIL image, numpy array, torch tensor, list of PIL images, list of numpy arrays or list of torch tensors, but is {type(mask)}" + ) + + if image_is_pil: + image_batch_size = 1 + else: + image_batch_size = len(image) + + if prompt is not None and isinstance(prompt, str): + prompt_batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + prompt_batch_size = len(prompt) + elif prompt_embeds is not None: + prompt_batch_size = prompt_embeds.shape[0] + + if 
image_batch_size != 1 and image_batch_size != prompt_batch_size: + raise ValueError( + f"If image batch size is not 1, image batch size must be same as prompt batch size. image batch size: {image_batch_size}, prompt batch size: {prompt_batch_size}" + ) + + def prepare_image( + self, + image, + width, + height, + batch_size, + num_images_per_prompt, + device, + dtype, + do_classifier_free_guidance=False, + guess_mode=False, + ): + image = self.image_processor.preprocess(image, height=height, width=width).to( + dtype=torch.float32 + ) + image_batch_size = image.shape[0] + + if image_batch_size == 1: + repeat_by = batch_size + else: + # image batch size is the same as prompt batch size + repeat_by = num_images_per_prompt + + image = image.repeat_interleave(repeat_by, dim=0) + + image = image.to(device=device, dtype=dtype) + + if do_classifier_free_guidance and not guess_mode: + image = torch.cat([image] * 2) + + return image.to(device=device, dtype=dtype) + + # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.StableDiffusionPipeline.prepare_latents + def prepare_latents( + self, + batch_size, + num_channels_latents, + height, + width, + dtype, + device, + generator, + latents=None, + ): + shape = ( + batch_size, + num_channels_latents, + height // self.vae_scale_factor, + width // self.vae_scale_factor, + ) + if isinstance(generator, list) and len(generator) != batch_size: + raise ValueError( + f"You have passed a list of generators of length {len(generator)}, but requested an effective batch" + f" size of {batch_size}. Make sure the batch size matches the length of the generators." + ) + + if latents is None: + noise = randn_tensor(shape, generator=generator, device=device, dtype=dtype) + else: + noise = latents.to(device) + + # scale the initial noise by the standard deviation required by the scheduler + latents = noise * self.scheduler.init_noise_sigma + return latents, noise + + # Copied from diffusers.pipelines.latent_consistency_models.pipeline_latent_consistency_text2img.LatentConsistencyModelPipeline.get_guidance_scale_embedding + def get_guidance_scale_embedding(self, w, embedding_dim=512, dtype=torch.float32): + """ + See https://github.com/google-research/vdm/blob/dc27b98a554f65cdc654b800da5aa1846545d41b/model_vdm.py#L298 + + Args: + timesteps (`torch.Tensor`): + generate embedding vectors at these timesteps + embedding_dim (`int`, *optional*, defaults to 512): + dimension of the embeddings to generate + dtype: + data type of the generated embeddings + + Returns: + `torch.FloatTensor`: Embedding vectors with shape `(len(timesteps), embedding_dim)` + """ + assert len(w.shape) == 1 + w = w * 1000.0 + + half_dim = embedding_dim // 2 + emb = torch.log(torch.tensor(10000.0)) / (half_dim - 1) + emb = torch.exp(torch.arange(half_dim, dtype=dtype) * -emb) + emb = w.to(dtype)[:, None] * emb[None, :] + emb = torch.cat([torch.sin(emb), torch.cos(emb)], dim=1) + if embedding_dim % 2 == 1: # zero pad + emb = torch.nn.functional.pad(emb, (0, 1)) + assert emb.shape == (w.shape[0], embedding_dim) + return emb + + @property + def guidance_scale(self): + return self._guidance_scale + + @property + def clip_skip(self): + return self._clip_skip + + # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2) + # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1` + # corresponds to doing no classifier free guidance. 
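+    # CFG is also skipped when the UNet takes the guidance scale as a direct
+    # conditioning input (`time_cond_proj_dim` is set, e.g. LCM-distilled UNets),
+    # since the guidance weight is then fed in via `timestep_cond` below.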
+ @property + def do_classifier_free_guidance(self): + return self._guidance_scale > 1 and self.unet.config.time_cond_proj_dim is None + + @property + def cross_attention_kwargs(self): + return self._cross_attention_kwargs + + @property + def num_timesteps(self): + return self._num_timesteps + + @torch.no_grad() + @replace_example_docstring(EXAMPLE_DOC_STRING) + def __call__( + self, + promptA: Union[str, List[str]] = None, + promptB: Union[str, List[str]] = None, + promptU: Union[str, List[str]] = None, + tradoff: float = 1.0, + tradoff_nag: float = 1.0, + image: PipelineImageInput = None, + mask: PipelineImageInput = None, + height: Optional[int] = None, + width: Optional[int] = None, + num_inference_steps: int = 50, + timesteps: List[int] = None, + guidance_scale: float = 7.5, + negative_promptA: Optional[Union[str, List[str]]] = None, + negative_promptB: Optional[Union[str, List[str]]] = None, + negative_promptU: Optional[Union[str, List[str]]] = None, + num_images_per_prompt: Optional[int] = 1, + eta: float = 0.0, + generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None, + latents: Optional[torch.FloatTensor] = None, + prompt_embeds: Optional[torch.FloatTensor] = None, + negative_prompt_embeds: Optional[torch.FloatTensor] = None, + ip_adapter_image: Optional[PipelineImageInput] = None, + ip_adapter_image_embeds: Optional[List[torch.FloatTensor]] = None, + output_type: Optional[str] = "pil", + return_dict: bool = True, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + brushnet_conditioning_scale: Union[float, List[float]] = 1.0, + guess_mode: bool = False, + control_guidance_start: Union[float, List[float]] = 0.0, + control_guidance_end: Union[float, List[float]] = 1.0, + clip_skip: Optional[int] = None, + callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None, + callback_on_step_end_tensor_inputs: List[str] = ["latents"], + **kwargs, + ): + r""" + The call function to the pipeline for generation. + + Args: + prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide image generation. If not defined, you need to pass `prompt_embeds`. + image (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The BrushNet input condition to provide guidance to the `unet` for generation. If the type is + specified as `torch.FloatTensor`, it is passed to BrushNet as is. `PIL.Image.Image` can also be + accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height + and/or width are passed, `image` is resized accordingly. If multiple BrushNets are specified in + `init`, images must be passed as a list such that each element of the list can be correctly batched for + input to a single BrushNet. When `prompt` is a list, and if a list of images is passed for a single BrushNet, + each will be paired with each prompt in the `prompt` list. This also applies to multiple BrushNets, + where a list of image lists can be passed to batch for each prompt and each BrushNet. + mask (`torch.FloatTensor`, `PIL.Image.Image`, `np.ndarray`, `List[torch.FloatTensor]`, `List[PIL.Image.Image]`, `List[np.ndarray]`,: + `List[List[torch.FloatTensor]]`, `List[List[np.ndarray]]` or `List[List[PIL.Image.Image]]`): + The BrushNet input condition to provide guidance to the `unet` for generation. 
If the type is + specified as `torch.FloatTensor`, it is passed to BrushNet as is. `PIL.Image.Image` can also be + accepted as an image. The dimensions of the output image defaults to `image`'s dimensions. If height + and/or width are passed, `image` is resized accordingly. If multiple BrushNets are specified in + `init`, images must be passed as a list such that each element of the list can be correctly batched for + input to a single BrushNet. When `prompt` is a list, and if a list of images is passed for a single BrushNet, + each will be paired with each prompt in the `prompt` list. This also applies to multiple BrushNets, + where a list of image lists can be passed to batch for each prompt and each BrushNet. + height (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The height in pixels of the generated image. + width (`int`, *optional*, defaults to `self.unet.config.sample_size * self.vae_scale_factor`): + The width in pixels of the generated image. + num_inference_steps (`int`, *optional*, defaults to 50): + The number of denoising steps. More denoising steps usually lead to a higher quality image at the + expense of slower inference. + timesteps (`List[int]`, *optional*): + Custom timesteps to use for the denoising process with schedulers which support a `timesteps` argument + in their `set_timesteps` method. If not defined, the default behavior when `num_inference_steps` is + passed will be used. Must be in descending order. + guidance_scale (`float`, *optional*, defaults to 7.5): + A higher guidance scale value encourages the model to generate images closely linked to the text + `prompt` at the expense of lower image quality. Guidance scale is enabled when `guidance_scale > 1`. + negative_prompt (`str` or `List[str]`, *optional*): + The prompt or prompts to guide what to not include in image generation. If not defined, you need to + pass `negative_prompt_embeds` instead. Ignored when not using guidance (`guidance_scale < 1`). + num_images_per_prompt (`int`, *optional*, defaults to 1): + The number of images to generate per prompt. + eta (`float`, *optional*, defaults to 0.0): + Corresponds to parameter eta (η) from the [DDIM](https://arxiv.org/abs/2010.02502) paper. Only applies + to the [`~schedulers.DDIMScheduler`], and is ignored in other schedulers. + generator (`torch.Generator` or `List[torch.Generator]`, *optional*): + A [`torch.Generator`](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make + generation deterministic. + latents (`torch.FloatTensor`, *optional*): + Pre-generated noisy latents sampled from a Gaussian distribution, to be used as inputs for image + generation. Can be used to tweak the same generation with different prompts. If not provided, a latents + tensor is generated by sampling using the supplied random `generator`. + prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated text embeddings. Can be used to easily tweak text inputs (prompt weighting). If not + provided, text embeddings are generated from the `prompt` input argument. + negative_prompt_embeds (`torch.FloatTensor`, *optional*): + Pre-generated negative text embeddings. Can be used to easily tweak text inputs (prompt weighting). If + not provided, `negative_prompt_embeds` are generated from the `negative_prompt` input argument. + ip_adapter_image: (`PipelineImageInput`, *optional*): Optional image input to work with IP Adapters. 
+            ip_adapter_image_embeds (`List[torch.FloatTensor]`, *optional*):
+                Pre-generated image embeddings for IP-Adapter. It should be a list with the same length as the
+                number of IP-Adapters. Each element should be a tensor of shape `(batch_size, num_images, emb_dim)`.
+                It should contain the negative image embedding if `do_classifier_free_guidance` is set to `True`.
+                If not provided, embeddings are computed from the `ip_adapter_image` input argument.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generated image. Choose between `PIL.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that is called every `callback_steps` steps during inference. The function is called with
+                the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function is called. If not specified, the callback is called at
+                every step.
+            cross_attention_kwargs (`dict`, *optional*):
+                A kwargs dictionary that if specified is passed along to the [`AttentionProcessor`] as defined in
+                [`self.processor`](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+            brushnet_conditioning_scale (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The outputs of the BrushNet are multiplied by `brushnet_conditioning_scale` before they are added
+                to the residual in the original `unet`. If multiple BrushNets are specified in `init`, you can set
+                the corresponding scale as a list.
+            guess_mode (`bool`, *optional*, defaults to `False`):
+                The BrushNet encoder tries to recognize the content of the input image even if you remove all
+                prompts. A `guidance_scale` value between 3.0 and 5.0 is recommended.
+            control_guidance_start (`float` or `List[float]`, *optional*, defaults to 0.0):
+                The percentage of total steps at which the BrushNet starts applying.
+            control_guidance_end (`float` or `List[float]`, *optional*, defaults to 1.0):
+                The percentage of total steps at which the BrushNet stops applying.
+            clip_skip (`int`, *optional*):
+                Number of layers to be skipped from CLIP while computing the prompt embeddings. A value of 1 means
+                that the output of the pre-final layer will be used for computing the prompt embeddings.
+            callback_on_step_end (`Callable`, *optional*):
+                A function that is called at the end of each denoising step during inference. The function is called
+                with the following arguments: `callback_on_step_end(self: DiffusionPipeline, step: int, timestep: int,
+                callback_kwargs: Dict)`. `callback_kwargs` will include a list of all tensors as specified by
+                `callback_on_step_end_tensor_inputs`.
+            callback_on_step_end_tensor_inputs (`List`, *optional*):
+                The list of tensor inputs for the `callback_on_step_end` function. The tensors specified in the list
+                will be passed as `callback_kwargs` argument. You will only be able to include variables listed in the
+                `._callback_tensor_inputs` attribute of your pipeline class.
+ + Examples: + + Returns: + [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`: + If `return_dict` is `True`, [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] is returned, + otherwise a `tuple` is returned where the first element is a list with the generated images and the + second element is a list of `bool`s indicating whether the corresponding generated image contains + "not-safe-for-work" (nsfw) content. + """ + + callback = kwargs.pop("callback", None) + callback_steps = kwargs.pop("callback_steps", None) + + if callback is not None: + deprecate( + "callback", + "1.0.0", + "Passing `callback` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + if callback_steps is not None: + deprecate( + "callback_steps", + "1.0.0", + "Passing `callback_steps` as an input argument to `__call__` is deprecated, consider using `callback_on_step_end`", + ) + + brushnet = ( + self.brushnet._orig_mod + if is_compiled_module(self.brushnet) + else self.brushnet + ) + + # align format for control guidance + if not isinstance(control_guidance_start, list) and isinstance( + control_guidance_end, list + ): + control_guidance_start = len(control_guidance_end) * [ + control_guidance_start + ] + elif not isinstance(control_guidance_end, list) and isinstance( + control_guidance_start, list + ): + control_guidance_end = len(control_guidance_start) * [control_guidance_end] + elif not isinstance(control_guidance_start, list) and not isinstance( + control_guidance_end, list + ): + control_guidance_start, control_guidance_end = ( + [control_guidance_start], + [control_guidance_end], + ) + + # 1. Check inputs. Raise error if not correct + prompt = promptA + negative_prompt = negative_promptA + self.check_inputs( + prompt, + image, + mask, + callback_steps, + negative_prompt, + prompt_embeds, + negative_prompt_embeds, + ip_adapter_image, + ip_adapter_image_embeds, + brushnet_conditioning_scale, + control_guidance_start, + control_guidance_end, + callback_on_step_end_tensor_inputs, + ) + + self._guidance_scale = guidance_scale + self._clip_skip = clip_skip + self._cross_attention_kwargs = cross_attention_kwargs + + # 2. Define call parameters + if prompt is not None and isinstance(prompt, str): + batch_size = 1 + elif prompt is not None and isinstance(prompt, list): + batch_size = len(prompt) + else: + batch_size = prompt_embeds.shape[0] + + device = self._execution_device + + global_pool_conditions = ( + brushnet.config.global_pool_conditions + if isinstance(brushnet, BrushNetModel) + else brushnet.nets[0].config.global_pool_conditions + ) + guess_mode = guess_mode or global_pool_conditions + + # 3. 
Encode input prompt + text_encoder_lora_scale = ( + self.cross_attention_kwargs.get("scale", None) + if self.cross_attention_kwargs is not None + else None + ) + + prompt_embeds = self._encode_prompt( + promptA, + promptB, + tradoff, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_promptA, + negative_promptB, + tradoff_nag, + prompt_embeds=prompt_embeds, + negative_prompt_embeds=negative_prompt_embeds, + lora_scale=text_encoder_lora_scale, + ) + prompt_embedsU = None + negative_prompt_embedsU = None + prompt_embedsU = self.encode_prompt( + promptU, + device, + num_images_per_prompt, + self.do_classifier_free_guidance, + negative_promptU, + prompt_embeds=prompt_embedsU, + negative_prompt_embeds=negative_prompt_embedsU, + lora_scale=text_encoder_lora_scale, + ) + + if ip_adapter_image is not None or ip_adapter_image_embeds is not None: + image_embeds = self.prepare_ip_adapter_image_embeds( + ip_adapter_image, + ip_adapter_image_embeds, + device, + batch_size * num_images_per_prompt, + self.do_classifier_free_guidance, + ) + + # 4. Prepare image + if isinstance(brushnet, BrushNetModel): + image = self.prepare_image( + image=image, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=brushnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + original_mask = self.prepare_image( + image=mask, + width=width, + height=height, + batch_size=batch_size * num_images_per_prompt, + num_images_per_prompt=num_images_per_prompt, + device=device, + dtype=brushnet.dtype, + do_classifier_free_guidance=self.do_classifier_free_guidance, + guess_mode=guess_mode, + ) + original_mask = (original_mask.sum(1)[:, None, :, :] < 0).to(image.dtype) + height, width = image.shape[-2:] + else: + assert False + + # 5. Prepare timesteps + timesteps, num_inference_steps = retrieve_timesteps( + self.scheduler, num_inference_steps, device, timesteps + ) + self._num_timesteps = len(timesteps) + + # 6. 
Prepare latent variables + num_channels_latents = self.unet.config.in_channels + latents, noise = self.prepare_latents( + batch_size * num_images_per_prompt, + num_channels_latents, + height, + width, + prompt_embeds.dtype, + device, + generator, + latents, + ) + + # 6.1 prepare condition latents + # mask_i = transforms.ToPILImage()(image[0:1,:,:,:].squeeze(0)) + # mask_i.save('_mask.png') + # print(brushnet.dtype) + conditioning_latents = ( + self.vae.encode( + image.to(device=device, dtype=brushnet.dtype) + ).latent_dist.sample() + * self.vae.config.scaling_factor + ) + mask = torch.nn.functional.interpolate( + original_mask, + size=(conditioning_latents.shape[-2], conditioning_latents.shape[-1]), + ) + conditioning_latents = torch.concat([conditioning_latents, mask], 1) + # image = self.vae.decode(conditioning_latents[:1,:4,:,:] / self.vae.config.scaling_factor, return_dict=False, generator=generator)[0] + # from torchvision import transforms + # mask_i = transforms.ToPILImage()(image[0:1,:,:,:].squeeze(0)/2+0.5) + # mask_i.save(str(timesteps[0]) +'_C.png') + + # 6.5 Optionally get Guidance Scale Embedding + timestep_cond = None + if self.unet.config.time_cond_proj_dim is not None: + guidance_scale_tensor = torch.tensor(self.guidance_scale - 1).repeat( + batch_size * num_images_per_prompt + ) + timestep_cond = self.get_guidance_scale_embedding( + guidance_scale_tensor, embedding_dim=self.unet.config.time_cond_proj_dim + ).to(device=device, dtype=latents.dtype) + + # 7. Prepare extra step kwargs. TODO: Logic should ideally just be moved out of the pipeline + extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta) + + # 7.1 Add image embeds for IP-Adapter + added_cond_kwargs = ( + {"image_embeds": image_embeds} + if ip_adapter_image is not None or ip_adapter_image_embeds is not None + else None + ) + + # 7.2 Create tensor stating which brushnets to keep + brushnet_keep = [] + for i in range(len(timesteps)): + keeps = [ + 1.0 - float(i / len(timesteps) < s or (i + 1) / len(timesteps) > e) + for s, e in zip(control_guidance_start, control_guidance_end) + ] + brushnet_keep.append( + keeps[0] if isinstance(brushnet, BrushNetModel) else keeps + ) + + # 8. Denoising loop + num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order + is_unet_compiled = is_compiled_module(self.unet) + is_brushnet_compiled = is_compiled_module(self.brushnet) + is_torch_higher_equal_2_1 = is_torch_version(">=", "2.1") + with self.progress_bar(total=num_inference_steps) as progress_bar: + for i, t in enumerate(timesteps): + # Relevant thread: + # https://dev-discuss.pytorch.org/t/cudagraphs-in-pytorch-2-0/1428 + if ( + is_unet_compiled and is_brushnet_compiled + ) and is_torch_higher_equal_2_1: + torch._inductor.cudagraph_mark_step_begin() + # expand the latents if we are doing classifier free guidance + latent_model_input = ( + torch.cat([latents] * 2) + if self.do_classifier_free_guidance + else latents + ) + latent_model_input = self.scheduler.scale_model_input( + latent_model_input, t + ) + + # brushnet(s) inference + if guess_mode and self.do_classifier_free_guidance: + # Infer BrushNet only for the conditional batch. 
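+                    # The unconditional half is padded with zero residuals further down,
+                    # so skipping it here halves the BrushNet compute without changing
+                    # the unconditional branch.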
+ control_model_input = latents + control_model_input = self.scheduler.scale_model_input( + control_model_input, t + ) + brushnet_prompt_embeds = prompt_embeds.chunk(2)[1] + else: + control_model_input = latent_model_input + brushnet_prompt_embeds = prompt_embeds + + if isinstance(brushnet_keep[i], list): + cond_scale = [ + c * s + for c, s in zip(brushnet_conditioning_scale, brushnet_keep[i]) + ] + else: + brushnet_cond_scale = brushnet_conditioning_scale + if isinstance(brushnet_cond_scale, list): + brushnet_cond_scale = brushnet_cond_scale[0] + cond_scale = brushnet_cond_scale * brushnet_keep[i] + + down_block_res_samples, mid_block_res_sample, up_block_res_samples = ( + self.brushnet( + control_model_input, + t, + encoder_hidden_states=brushnet_prompt_embeds, + brushnet_cond=conditioning_latents, + conditioning_scale=cond_scale, + guess_mode=guess_mode, + return_dict=False, + ) + ) + + if guess_mode and self.do_classifier_free_guidance: + # Infered BrushNet only for the conditional batch. + # To apply the output of BrushNet to both the unconditional and conditional batches, + # add 0 to the unconditional batch to keep it unchanged. + down_block_res_samples = [ + torch.cat([torch.zeros_like(d), d]) + for d in down_block_res_samples + ] + mid_block_res_sample = torch.cat( + [torch.zeros_like(mid_block_res_sample), mid_block_res_sample] + ) + up_block_res_samples = [ + torch.cat([torch.zeros_like(d), d]) + for d in up_block_res_samples + ] + + # predict the noise residual + noise_pred = self.unet( + latent_model_input, + t, + encoder_hidden_states=prompt_embedsU, + timestep_cond=timestep_cond, + cross_attention_kwargs=self.cross_attention_kwargs, + down_block_add_samples=down_block_res_samples, + mid_block_add_sample=mid_block_res_sample, + up_block_add_samples=up_block_res_samples, + added_cond_kwargs=added_cond_kwargs, + return_dict=False, + )[0] + + # perform guidance + if self.do_classifier_free_guidance: + noise_pred_uncond, noise_pred_text = noise_pred.chunk(2) + noise_pred = noise_pred_uncond + self.guidance_scale * ( + noise_pred_text - noise_pred_uncond + ) + + # compute the previous noisy sample x_t -> x_t-1 + latents = self.scheduler.step( + noise_pred, t, latents, **extra_step_kwargs, return_dict=False + )[0] + + if callback_on_step_end is not None: + callback_kwargs = {} + for k in callback_on_step_end_tensor_inputs: + callback_kwargs[k] = locals()[k] + callback_outputs = callback_on_step_end(self, i, t, callback_kwargs) + + latents = callback_outputs.pop("latents", latents) + prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds) + negative_prompt_embeds = callback_outputs.pop( + "negative_prompt_embeds", negative_prompt_embeds + ) + + # call the callback, if provided + if i == len(timesteps) - 1 or ( + (i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0 + ): + progress_bar.update() + if callback is not None and i % callback_steps == 0: + step_idx = i // getattr(self.scheduler, "order", 1) + callback(step_idx, t, latents) + + # If we do sequential model offloading, let's offload unet and brushnet + # manually for max memory savings + if hasattr(self, "final_offload_hook") and self.final_offload_hook is not None: + self.unet.to("cpu") + self.brushnet.to("cpu") + torch.cuda.empty_cache() + + if not output_type == "latent": + image = self.vae.decode( + latents / self.vae.config.scaling_factor, + return_dict=False, + generator=generator, + )[0] + image, has_nsfw_concept = self.run_safety_checker( + image, device, prompt_embeds.dtype + ) + else: + 
image = latents + has_nsfw_concept = None + + if has_nsfw_concept is None: + do_denormalize = [True] * image.shape[0] + else: + do_denormalize = [not has_nsfw for has_nsfw in has_nsfw_concept] + + image = self.image_processor.postprocess( + image, output_type=output_type, do_denormalize=do_denormalize + ) + + # Offload all models + self.maybe_free_model_hooks() + + if not return_dict: + return (image, has_nsfw_concept) + + return StableDiffusionPipelineOutput( + images=image, nsfw_content_detected=has_nsfw_concept + ) diff --git a/inpaint/model/power_paint/v2/unet_2d_blocks.py b/inpaint/model/power_paint/v2/unet_2d_blocks.py new file mode 100644 index 0000000..000d24f --- /dev/null +++ b/inpaint/model/power_paint/v2/unet_2d_blocks.py @@ -0,0 +1,342 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Dict, Optional, Tuple + +import torch +from diffusers.utils import is_torch_version, logging +from diffusers.utils.torch_utils import apply_freeu + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def CrossAttnDownBlock2D_forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + attention_mask: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + additional_residuals: Optional[torch.FloatTensor] = None, + down_block_add_samples: Optional[torch.FloatTensor] = None, +) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + output_states = () + + lora_scale = ( + cross_attention_kwargs.get("scale", 1.0) + if cross_attention_kwargs is not None + else 1.0 + ) + + blocks = list(zip(self.resnets, self.attentions)) + + for i, (resnet, attn) in enumerate(blocks): + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = ( + {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + else: + hidden_states = resnet(hidden_states, temb, scale=lora_scale) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + + # apply additional residuals to the output of the last 
pair of resnet and attention blocks + if i == len(blocks) - 1 and additional_residuals is not None: + hidden_states = hidden_states + additional_residuals + + if down_block_add_samples is not None: + hidden_states = hidden_states + down_block_add_samples.pop(0) + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states, scale=lora_scale) + + if down_block_add_samples is not None: + hidden_states = hidden_states + down_block_add_samples.pop( + 0 + ) # todo: add before or after + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +def DownBlock2D_forward( + self, + hidden_states: torch.FloatTensor, + temb: Optional[torch.FloatTensor] = None, + scale: float = 1.0, + down_block_add_samples: Optional[torch.FloatTensor] = None, +) -> Tuple[torch.FloatTensor, Tuple[torch.FloatTensor, ...]]: + output_states = () + + for resnet in self.resnets: + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + use_reentrant=False, + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb, scale=scale) + + if down_block_add_samples is not None: + hidden_states = hidden_states + down_block_add_samples.pop(0) + + output_states = output_states + (hidden_states,) + + if self.downsamplers is not None: + for downsampler in self.downsamplers: + hidden_states = downsampler(hidden_states, scale=scale) + + if down_block_add_samples is not None: + hidden_states = hidden_states + down_block_add_samples.pop( + 0 + ) # todo: add before or after + + output_states = output_states + (hidden_states,) + + return hidden_states, output_states + + +def CrossAttnUpBlock2D_forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + encoder_hidden_states: Optional[torch.FloatTensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + upsample_size: Optional[int] = None, + attention_mask: Optional[torch.FloatTensor] = None, + encoder_attention_mask: Optional[torch.FloatTensor] = None, + return_res_samples: Optional[bool] = False, + up_block_add_samples: Optional[torch.FloatTensor] = None, +) -> torch.FloatTensor: + lora_scale = ( + cross_attention_kwargs.get("scale", 1.0) + if cross_attention_kwargs is not None + else 1.0 + ) + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + if return_res_samples: + output_states = () + + for resnet, attn in zip(self.resnets, self.attentions): + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def 
create_custom_forward(module, return_dict=None): + def custom_forward(*inputs): + if return_dict is not None: + return module(*inputs, return_dict=return_dict) + else: + return module(*inputs) + + return custom_forward + + ckpt_kwargs: Dict[str, Any] = ( + {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {} + ) + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + **ckpt_kwargs, + ) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + else: + hidden_states = resnet(hidden_states, temb, scale=lora_scale) + hidden_states = attn( + hidden_states, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + return_dict=False, + )[0] + if return_res_samples: + output_states = output_states + (hidden_states,) + if up_block_add_samples is not None: + hidden_states = hidden_states + up_block_add_samples.pop(0) + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size, scale=lora_scale) + if return_res_samples: + output_states = output_states + (hidden_states,) + if up_block_add_samples is not None: + hidden_states = hidden_states + up_block_add_samples.pop(0) + + if return_res_samples: + return hidden_states, output_states + else: + return hidden_states + + +def UpBlock2D_forward( + self, + hidden_states: torch.FloatTensor, + res_hidden_states_tuple: Tuple[torch.FloatTensor, ...], + temb: Optional[torch.FloatTensor] = None, + upsample_size: Optional[int] = None, + scale: float = 1.0, + return_res_samples: Optional[bool] = False, + up_block_add_samples: Optional[torch.FloatTensor] = None, +) -> torch.FloatTensor: + is_freeu_enabled = ( + getattr(self, "s1", None) + and getattr(self, "s2", None) + and getattr(self, "b1", None) + and getattr(self, "b2", None) + ) + if return_res_samples: + output_states = () + + for resnet in self.resnets: + # pop res hidden states + res_hidden_states = res_hidden_states_tuple[-1] + res_hidden_states_tuple = res_hidden_states_tuple[:-1] + + # FreeU: Only operate on the first two stages + if is_freeu_enabled: + hidden_states, res_hidden_states = apply_freeu( + self.resolution_idx, + hidden_states, + res_hidden_states, + s1=self.s1, + s2=self.s2, + b1=self.b1, + b2=self.b2, + ) + + hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1) + + if self.training and self.gradient_checkpointing: + + def create_custom_forward(module): + def custom_forward(*inputs): + return module(*inputs) + + return custom_forward + + if is_torch_version(">=", "1.11.0"): + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), + hidden_states, + temb, + use_reentrant=False, + ) + else: + hidden_states = torch.utils.checkpoint.checkpoint( + create_custom_forward(resnet), hidden_states, temb + ) + else: + hidden_states = resnet(hidden_states, temb, scale=scale) + + if return_res_samples: + output_states = output_states + (hidden_states,) + if up_block_add_samples is not None: + hidden_states = hidden_states + up_block_add_samples.pop( + 0 + ) # todo: add before or after + + if self.upsamplers is not None: + for upsampler in self.upsamplers: + hidden_states = upsampler(hidden_states, upsample_size, 
scale=scale) + + if return_res_samples: + output_states = output_states + (hidden_states,) + if up_block_add_samples is not None: + hidden_states = hidden_states + up_block_add_samples.pop( + 0 + ) # todo: add before or after + + if return_res_samples: + return hidden_states, output_states + else: + return hidden_states diff --git a/inpaint/model/power_paint/v2/unet_2d_condition.py b/inpaint/model/power_paint/v2/unet_2d_condition.py new file mode 100644 index 0000000..80741de --- /dev/null +++ b/inpaint/model/power_paint/v2/unet_2d_condition.py @@ -0,0 +1,402 @@ +# Copyright 2024 The HuggingFace Team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +from typing import Any, Dict, Optional, Tuple, Union + +import torch +import torch.utils.checkpoint +from diffusers.models.unet_2d_condition import UNet2DConditionOutput +from diffusers.utils import ( + USE_PEFT_BACKEND, + deprecate, + logging, + scale_lora_layers, + unscale_lora_layers, +) + +logger = logging.get_logger(__name__) # pylint: disable=invalid-name + + +def UNet2DConditionModel_forward( + self, + sample: torch.FloatTensor, + timestep: Union[torch.Tensor, float, int], + encoder_hidden_states: torch.Tensor, + class_labels: Optional[torch.Tensor] = None, + timestep_cond: Optional[torch.Tensor] = None, + attention_mask: Optional[torch.Tensor] = None, + cross_attention_kwargs: Optional[Dict[str, Any]] = None, + added_cond_kwargs: Optional[Dict[str, torch.Tensor]] = None, + down_block_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + mid_block_additional_residual: Optional[torch.Tensor] = None, + down_intrablock_additional_residuals: Optional[Tuple[torch.Tensor]] = None, + encoder_attention_mask: Optional[torch.Tensor] = None, + return_dict: bool = True, + down_block_add_samples: Optional[Tuple[torch.Tensor]] = None, + mid_block_add_sample: Optional[Tuple[torch.Tensor]] = None, + up_block_add_samples: Optional[Tuple[torch.Tensor]] = None, +) -> Union[UNet2DConditionOutput, Tuple]: + r""" + The [`UNet2DConditionModel`] forward method. + + Args: + sample (`torch.FloatTensor`): + The noisy input tensor with the following shape `(batch, channel, height, width)`. + timestep (`torch.FloatTensor` or `float` or `int`): The number of timesteps to denoise an input. + encoder_hidden_states (`torch.FloatTensor`): + The encoder hidden states with shape `(batch, sequence_length, feature_dim)`. + class_labels (`torch.Tensor`, *optional*, defaults to `None`): + Optional class labels for conditioning. Their embeddings will be summed with the timestep embeddings. + timestep_cond: (`torch.Tensor`, *optional*, defaults to `None`): + Conditional embeddings for timestep. If provided, the embeddings will be summed with the samples passed + through the `self.time_embedding` layer to obtain the timestep embeddings. + attention_mask (`torch.Tensor`, *optional*, defaults to `None`): + An attention mask of shape `(batch, key_tokens)` is applied to `encoder_hidden_states`. If `1` the mask + is kept, otherwise if `0` it is discarded. 
Mask will be converted into a bias, which adds large
+            negative values to the attention scores corresponding to "discard" tokens.
+        cross_attention_kwargs (`dict`, *optional*):
+            A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
+            `self.processor` in
+            [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
+        added_cond_kwargs: (`dict`, *optional*):
+            A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+            are passed along to the UNet blocks.
+        down_block_additional_residuals: (`tuple` of `torch.Tensor`, *optional*):
+            A tuple of tensors that if specified are added to the residuals of down unet blocks.
+        mid_block_additional_residual: (`torch.Tensor`, *optional*):
+            A tensor that if specified is added to the residual of the middle unet block.
+        encoder_attention_mask (`torch.Tensor`):
+            A cross-attention mask of shape `(batch, sequence_length)` is applied to `encoder_hidden_states`. If
+            `True` the mask is kept, otherwise if `False` it is discarded. Mask will be converted into a bias,
+            which adds large negative values to the attention scores corresponding to "discard" tokens.
+        return_dict (`bool`, *optional*, defaults to `True`):
+            Whether or not to return a [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] instead of a plain
+            tuple.
+        cross_attention_kwargs (`dict`, *optional*):
+            A kwargs dictionary that if specified is passed along to the [`AttnProcessor`].
+        added_cond_kwargs: (`dict`, *optional*):
+            A kwargs dictionary containing additional embeddings that if specified are added to the embeddings that
+            are passed along to the UNet blocks.
+        down_block_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+            additional residuals to be added to UNet long skip connections from down blocks to up blocks for
+            example from ControlNet side model(s)
+        mid_block_additional_residual (`torch.Tensor`, *optional*):
+            additional residual to be added to UNet mid block output, for example from ControlNet side model
+        down_intrablock_additional_residuals (`tuple` of `torch.Tensor`, *optional*):
+            additional residuals to be added within UNet down blocks, for example from T2I-Adapter side model(s)
+
+    Returns:
+        [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+            If `return_dict` is True, an [`~models.unets.unet_2d_condition.UNet2DConditionOutput`] is returned, otherwise
+            a `tuple` is returned where the first element is the sample tensor.
+    """
+    # By default samples have to be at least a multiple of the overall upsampling factor.
+    # The overall upsampling factor is equal to 2 ** (# num of upsampling layers).
+    # However, the upsampling interpolation output size can be forced to fit any upsampling size
+    # on the fly if necessary.
+    default_overall_up_factor = 2**self.num_upsamplers
+
+    # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+    forward_upsample_size = False
+    upsample_size = None
+
+    for dim in sample.shape[-2:]:
+        if dim % default_overall_up_factor != 0:
+            # Forward upsample size to force interpolation output size.
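+            # (this happens whenever the input spatial size is not divisible by
+            # 2 ** num_upsamplers, e.g. for non-standard aspect ratios)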
+ forward_upsample_size = True + break + + # ensure attention_mask is a bias, and give it a singleton query_tokens dimension + # expects mask of shape: + # [batch, key_tokens] + # adds singleton query_tokens dimension: + # [batch, 1, key_tokens] + # this helps to broadcast it as a bias over attention scores, which will be in one of the following shapes: + # [batch, heads, query_tokens, key_tokens] (e.g. torch sdp attn) + # [batch * heads, query_tokens, key_tokens] (e.g. xformers or classic attn) + if attention_mask is not None: + # assume that mask is expressed as: + # (1 = keep, 0 = discard) + # convert mask into a bias that can be added to attention scores: + # (keep = +0, discard = -10000.0) + attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0 + attention_mask = attention_mask.unsqueeze(1) + + # convert encoder_attention_mask to a bias the same way we do for attention_mask + if encoder_attention_mask is not None: + encoder_attention_mask = ( + 1 - encoder_attention_mask.to(sample.dtype) + ) * -10000.0 + encoder_attention_mask = encoder_attention_mask.unsqueeze(1) + + # 0. center input if necessary + if self.config.center_input_sample: + sample = 2 * sample - 1.0 + + # 1. time + t_emb = self.get_time_embed(sample=sample, timestep=timestep) + emb = self.time_embedding(t_emb, timestep_cond) + aug_emb = None + + class_emb = self.get_class_embed(sample=sample, class_labels=class_labels) + if class_emb is not None: + if self.config.class_embeddings_concat: + emb = torch.cat([emb, class_emb], dim=-1) + else: + emb = emb + class_emb + + aug_emb = self.get_aug_embed( + emb=emb, + encoder_hidden_states=encoder_hidden_states, + added_cond_kwargs=added_cond_kwargs, + ) + if self.config.addition_embed_type == "image_hint": + aug_emb, hint = aug_emb + sample = torch.cat([sample, hint], dim=1) + + emb = emb + aug_emb if aug_emb is not None else emb + + if self.time_embed_act is not None: + emb = self.time_embed_act(emb) + + encoder_hidden_states = self.process_encoder_hidden_states( + encoder_hidden_states=encoder_hidden_states, + added_cond_kwargs=added_cond_kwargs, + ) + + # 2. pre-process + sample = self.conv_in(sample) + + # 2.5 GLIGEN position net + if ( + cross_attention_kwargs is not None + and cross_attention_kwargs.get("gligen", None) is not None + ): + cross_attention_kwargs = cross_attention_kwargs.copy() + gligen_args = cross_attention_kwargs.pop("gligen") + cross_attention_kwargs["gligen"] = {"objs": self.position_net(**gligen_args)} + + # 3. 
down + lora_scale = ( + cross_attention_kwargs.get("scale", 1.0) + if cross_attention_kwargs is not None + else 1.0 + ) + if USE_PEFT_BACKEND: + # weight the lora layers by setting `lora_scale` for each PEFT layer + scale_lora_layers(self, lora_scale) + + is_controlnet = ( + mid_block_additional_residual is not None + and down_block_additional_residuals is not None + ) + # using new arg down_intrablock_additional_residuals for T2I-Adapters, to distinguish from controlnets + is_adapter = down_intrablock_additional_residuals is not None + # maintain backward compatibility for legacy usage, where + # T2I-Adapter and ControlNet both use down_block_additional_residuals arg + # but can only use one or the other + is_brushnet = ( + down_block_add_samples is not None + and mid_block_add_sample is not None + and up_block_add_samples is not None + ) + if ( + not is_adapter + and mid_block_additional_residual is None + and down_block_additional_residuals is not None + ): + deprecate( + "T2I should not use down_block_additional_residuals", + "1.3.0", + "Passing intrablock residual connections with `down_block_additional_residuals` is deprecated \ + and will be removed in diffusers 1.3.0. `down_block_additional_residuals` should only be used \ + for ControlNet. Please make sure use `down_intrablock_additional_residuals` instead. ", + standard_warn=False, + ) + down_intrablock_additional_residuals = down_block_additional_residuals + is_adapter = True + + down_block_res_samples = (sample,) + + if is_brushnet: + sample = sample + down_block_add_samples.pop(0) + + for downsample_block in self.down_blocks: + if ( + hasattr(downsample_block, "has_cross_attention") + and downsample_block.has_cross_attention + ): + # For t2i-adapter CrossAttnDownBlock2D + additional_residuals = {} + if is_adapter and len(down_intrablock_additional_residuals) > 0: + additional_residuals["additional_residuals"] = ( + down_intrablock_additional_residuals.pop(0) + ) + + if is_brushnet and len(down_block_add_samples) > 0: + additional_residuals["down_block_add_samples"] = [ + down_block_add_samples.pop(0) + for _ in range( + len(downsample_block.resnets) + + (downsample_block.downsamplers != None) + ) + ] + + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + **additional_residuals, + ) + else: + additional_residuals = {} + if is_brushnet and len(down_block_add_samples) > 0: + additional_residuals["down_block_add_samples"] = [ + down_block_add_samples.pop(0) + for _ in range( + len(downsample_block.resnets) + + (downsample_block.downsamplers != None) + ) + ] + + sample, res_samples = downsample_block( + hidden_states=sample, + temb=emb, + scale=lora_scale, + **additional_residuals, + ) + if is_adapter and len(down_intrablock_additional_residuals) > 0: + sample += down_intrablock_additional_residuals.pop(0) + + down_block_res_samples += res_samples + + if is_controlnet: + new_down_block_res_samples = () + + for down_block_res_sample, down_block_additional_residual in zip( + down_block_res_samples, down_block_additional_residuals + ): + down_block_res_sample = ( + down_block_res_sample + down_block_additional_residual + ) + new_down_block_res_samples = new_down_block_res_samples + ( + down_block_res_sample, + ) + + down_block_res_samples = new_down_block_res_samples + + # 4. 
mid + if self.mid_block is not None: + if ( + hasattr(self.mid_block, "has_cross_attention") + and self.mid_block.has_cross_attention + ): + sample = self.mid_block( + sample, + emb, + encoder_hidden_states=encoder_hidden_states, + attention_mask=attention_mask, + cross_attention_kwargs=cross_attention_kwargs, + encoder_attention_mask=encoder_attention_mask, + ) + else: + sample = self.mid_block(sample, emb) + + # To support T2I-Adapter-XL + if ( + is_adapter + and len(down_intrablock_additional_residuals) > 0 + and sample.shape == down_intrablock_additional_residuals[0].shape + ): + sample += down_intrablock_additional_residuals.pop(0) + + if is_controlnet: + sample = sample + mid_block_additional_residual + + if is_brushnet: + sample = sample + mid_block_add_sample + + # 5. up + for i, upsample_block in enumerate(self.up_blocks): + is_final_block = i == len(self.up_blocks) - 1 + + res_samples = down_block_res_samples[-len(upsample_block.resnets) :] + down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)] + + # if we have not reached the final block and need to forward the + # upsample size, we do it here + if not is_final_block and forward_upsample_size: + upsample_size = down_block_res_samples[-1].shape[2:] + + if ( + hasattr(upsample_block, "has_cross_attention") + and upsample_block.has_cross_attention + ): + additional_residuals = {} + if is_brushnet and len(up_block_add_samples) > 0: + additional_residuals["up_block_add_samples"] = [ + up_block_add_samples.pop(0) + for _ in range( + len(upsample_block.resnets) + + (upsample_block.upsamplers != None) + ) + ] + + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + encoder_hidden_states=encoder_hidden_states, + cross_attention_kwargs=cross_attention_kwargs, + upsample_size=upsample_size, + attention_mask=attention_mask, + encoder_attention_mask=encoder_attention_mask, + **additional_residuals, + ) + else: + additional_residuals = {} + if is_brushnet and len(up_block_add_samples) > 0: + additional_residuals["up_block_add_samples"] = [ + up_block_add_samples.pop(0) + for _ in range( + len(upsample_block.resnets) + + (upsample_block.upsamplers != None) + ) + ] + + sample = upsample_block( + hidden_states=sample, + temb=emb, + res_hidden_states_tuple=res_samples, + upsample_size=upsample_size, + scale=lora_scale, + **additional_residuals, + ) + + # 6. 
post-process
+        if self.conv_norm_out:
+            sample = self.conv_norm_out(sample)
+            sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+
+        if USE_PEFT_BACKEND:
+            # remove `lora_scale` from each PEFT layer
+            unscale_lora_layers(self, lora_scale)
+
+        if not return_dict:
+            return (sample,)
+
+        return UNet2DConditionOutput(sample=sample)
diff --git a/inpaint/model/sd.py b/inpaint/model/sd.py
new file mode 100644
index 0000000..2f6698c
--- /dev/null
+++ b/inpaint/model/sd.py
@@ -0,0 +1,129 @@
+import PIL.Image
+import cv2
+import torch
+from loguru import logger
+
+from .base import DiffusionInpaintModel
+from .helper.cpu_text_encoder import CPUTextEncoderWrapper
+from .original_sd_configs import get_config_files
+from .utils import (
+    handle_from_pretrained_exceptions,
+    get_torch_dtype,
+    enable_low_mem,
+    is_local_files_only,
+)
+from inpaint.schema import InpaintRequest, ModelType
+
+
+class SD(DiffusionInpaintModel):
+    pad_mod = 8
+    min_size = 512
+    lcm_lora_id = "latent-consistency/lcm-lora-sdv1-5"
+
+    def init_model(self, device: torch.device, **kwargs):
+        from diffusers.pipelines.stable_diffusion import StableDiffusionInpaintPipeline
+
+        use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False))
+
+        model_kwargs = {
+            **kwargs.get("pipe_components", {}),
+            "local_files_only": is_local_files_only(**kwargs),
+        }
+        disable_nsfw_checker = kwargs["disable_nsfw"] or kwargs.get(
+            "cpu_offload", False
+        )
+        if disable_nsfw_checker:
+            logger.info("Disable Stable Diffusion Model NSFW checker")
+            model_kwargs.update(
+                dict(
+                    safety_checker=None,
+                    feature_extractor=None,
+                    requires_safety_checker=False,
+                )
+            )
+
+        if self.model_info.is_single_file_diffusers:
+            if self.model_info.model_type == ModelType.DIFFUSERS_SD:
+                model_kwargs["num_in_channels"] = 4
+            else:
+                model_kwargs["num_in_channels"] = 9
+
+            self.model = StableDiffusionInpaintPipeline.from_single_file(
+                self.model_id_or_path,
+                torch_dtype=torch_dtype,
+                load_safety_checker=not disable_nsfw_checker,
+                original_config_file=get_config_files()['v1'],
+                **model_kwargs,
+            )
+        else:
+            self.model = handle_from_pretrained_exceptions(
+                StableDiffusionInpaintPipeline.from_pretrained,
+                pretrained_model_name_or_path=self.model_id_or_path,
+                variant="fp16",
+                torch_dtype=torch_dtype,
+                **model_kwargs,
+            )
+
+        enable_low_mem(self.model, kwargs.get("low_mem", False))
+
+        if kwargs.get("cpu_offload", False) and use_gpu:
+            logger.info("Enable sequential cpu offload")
+            self.model.enable_sequential_cpu_offload(gpu_id=0)
+        else:
+            self.model = self.model.to(device)
+            if kwargs["sd_cpu_textencoder"]:
+                logger.info("Run Stable Diffusion TextEncoder on CPU")
+                self.model.text_encoder = CPUTextEncoderWrapper(
+                    self.model.text_encoder, torch_dtype
+                )
+
+        self.callback = kwargs.pop("callback", None)
+
+    def forward(self, image, mask, config: InpaintRequest):
+        """Input image and output image have same size
+        image: [H, W, C] RGB
+        mask: [H, W, 1] 255 means area to repaint
+        return: BGR IMAGE
+        """
+        self.set_scheduler(config)
+
+        img_h, img_w = image.shape[:2]
+
+        output = self.model(
+            image=PIL.Image.fromarray(image),
+            prompt=config.prompt,
+            negative_prompt=config.negative_prompt,
+            mask_image=PIL.Image.fromarray(mask[:, :, -1], mode="L"),
+            num_inference_steps=config.sd_steps,
+            strength=config.sd_strength,
+            guidance_scale=config.sd_guidance_scale,
+            output_type="np",
+            callback_on_step_end=self.callback,
+            height=img_h,
+            width=img_w,
+            generator=torch.manual_seed(config.sd_seed),
+        ).images[0]
+
+        output = (output * 
255).round().astype("uint8") + output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) + return output + + +class SD15(SD): + name = "runwayml/stable-diffusion-inpainting" + model_id_or_path = "runwayml/stable-diffusion-inpainting" + + +class Anything4(SD): + name = "Sanster/anything-4.0-inpainting" + model_id_or_path = "Sanster/anything-4.0-inpainting" + + +class RealisticVision14(SD): + name = "Sanster/Realistic_Vision_V1.4-inpainting" + model_id_or_path = "Sanster/Realistic_Vision_V1.4-inpainting" + + +class SD2(SD): + name = "stabilityai/stable-diffusion-2-inpainting" + model_id_or_path = "stabilityai/stable-diffusion-2-inpainting" diff --git a/inpaint/model/sdxl.py b/inpaint/model/sdxl.py new file mode 100644 index 0000000..b7099e8 --- /dev/null +++ b/inpaint/model/sdxl.py @@ -0,0 +1,110 @@ +import os + +import PIL.Image +import cv2 +import torch +from diffusers import AutoencoderKL +from loguru import logger + +from inpaint.schema import InpaintRequest, ModelType + +from .base import DiffusionInpaintModel +from .helper.cpu_text_encoder import CPUTextEncoderWrapper +from .original_sd_configs import get_config_files +from .utils import ( + handle_from_pretrained_exceptions, + get_torch_dtype, + enable_low_mem, + is_local_files_only, +) + + +class SDXL(DiffusionInpaintModel): + name = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1" + pad_mod = 8 + min_size = 512 + lcm_lora_id = "latent-consistency/lcm-lora-sdxl" + model_id_or_path = "diffusers/stable-diffusion-xl-1.0-inpainting-0.1" + + def init_model(self, device: torch.device, **kwargs): + from diffusers.pipelines import StableDiffusionXLInpaintPipeline + + use_gpu, torch_dtype = get_torch_dtype(device, kwargs.get("no_half", False)) + + if self.model_info.model_type == ModelType.DIFFUSERS_SDXL: + num_in_channels = 4 + else: + num_in_channels = 9 + + if os.path.isfile(self.model_id_or_path): + self.model = StableDiffusionXLInpaintPipeline.from_single_file( + self.model_id_or_path, + torch_dtype=torch_dtype, + num_in_channels=num_in_channels, + load_safety_checker=False, + original_config_file=get_config_files()['xl'], + ) + else: + model_kwargs = { + **kwargs.get("pipe_components", {}), + "local_files_only": is_local_files_only(**kwargs), + } + if "vae" not in model_kwargs: + vae = AutoencoderKL.from_pretrained( + "madebyollin/sdxl-vae-fp16-fix", torch_dtype=torch_dtype + ) + model_kwargs["vae"] = vae + self.model = handle_from_pretrained_exceptions( + StableDiffusionXLInpaintPipeline.from_pretrained, + pretrained_model_name_or_path=self.model_id_or_path, + torch_dtype=torch_dtype, + variant="fp16", + **model_kwargs + ) + + enable_low_mem(self.model, kwargs.get("low_mem", False)) + + if kwargs.get("cpu_offload", False) and use_gpu: + logger.info("Enable sequential cpu offload") + self.model.enable_sequential_cpu_offload(gpu_id=0) + else: + self.model = self.model.to(device) + if kwargs["sd_cpu_textencoder"]: + logger.info("Run Stable Diffusion TextEncoder on CPU") + self.model.text_encoder = CPUTextEncoderWrapper( + self.model.text_encoder, torch_dtype + ) + self.model.text_encoder_2 = CPUTextEncoderWrapper( + self.model.text_encoder_2, torch_dtype + ) + + self.callback = kwargs.pop("callback", None) + + def forward(self, image, mask, config: InpaintRequest): + """Input image and output image have same size + image: [H, W, C] RGB + mask: [H, W, 1] 255 means area to repaint + return: BGR IMAGE + """ + self.set_scheduler(config) + + img_h, img_w = image.shape[:2] + + output = self.model( + image=PIL.Image.fromarray(image), + 
prompt=config.prompt, + negative_prompt=config.negative_prompt, + mask_image=PIL.Image.fromarray(mask[:, :, -1], mode="L"), + num_inference_steps=config.sd_steps, + strength=0.999 if config.sd_strength == 1.0 else config.sd_strength, + guidance_scale=config.sd_guidance_scale, + output_type="np", + callback_on_step_end=self.callback, + height=img_h, + width=img_w, + generator=torch.manual_seed(config.sd_seed), + ).images[0] + + output = (output * 255).round().astype("uint8") + output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) + return output diff --git a/inpaint/model/utils.py b/inpaint/model/utils.py new file mode 100644 index 0000000..2278817 --- /dev/null +++ b/inpaint/model/utils.py @@ -0,0 +1,1033 @@ +import gc +import math +import random +import traceback +from typing import Any + +import torch +import numpy as np +import collections +from itertools import repeat + +from diffusers import ( + DDIMScheduler, + PNDMScheduler, + LMSDiscreteScheduler, + EulerDiscreteScheduler, + EulerAncestralDiscreteScheduler, + DPMSolverMultistepScheduler, + UniPCMultistepScheduler, + LCMScheduler, + DPMSolverSinglestepScheduler, + KDPM2DiscreteScheduler, + KDPM2AncestralDiscreteScheduler, + HeunDiscreteScheduler, +) +from loguru import logger + +from inpaint.schema import SDSampler +from torch import conv2d, conv_transpose2d + + +def make_beta_schedule( + device, schedule, n_timestep, linear_start=1e-4, linear_end=2e-2, cosine_s=8e-3 +): + if schedule == "linear": + betas = ( + torch.linspace( + linear_start**0.5, linear_end**0.5, n_timestep, dtype=torch.float64 + ) + ** 2 + ) + + elif schedule == "cosine": + timesteps = ( + torch.arange(n_timestep + 1, dtype=torch.float64) / n_timestep + cosine_s + ).to(device) + alphas = timesteps / (1 + cosine_s) * np.pi / 2 + alphas = torch.cos(alphas).pow(2).to(device) + alphas = alphas / alphas[0] + betas = 1 - alphas[1:] / alphas[:-1] + betas = np.clip(betas, a_min=0, a_max=0.999) + + elif schedule == "sqrt_linear": + betas = torch.linspace( + linear_start, linear_end, n_timestep, dtype=torch.float64 + ) + elif schedule == "sqrt": + betas = ( + torch.linspace(linear_start, linear_end, n_timestep, dtype=torch.float64) + ** 0.5 + ) + else: + raise ValueError(f"schedule '{schedule}' unknown.") + return betas.numpy() + + +def make_ddim_sampling_parameters(alphacums, ddim_timesteps, eta, verbose=True): + # select alphas for computing the variance schedule + alphas = alphacums[ddim_timesteps] + alphas_prev = np.asarray([alphacums[0]] + alphacums[ddim_timesteps[:-1]].tolist()) + + # according the the formula provided in https://arxiv.org/abs/2010.02502 + sigmas = eta * np.sqrt( + (1 - alphas_prev) / (1 - alphas) * (1 - alphas / alphas_prev) + ) + if verbose: + print( + f"Selected alphas for ddim sampler: a_t: {alphas}; a_(t-1): {alphas_prev}" + ) + print( + f"For the chosen value of eta, which is {eta}, " + f"this results in the following sigma_t schedule for ddim sampler {sigmas}" + ) + return sigmas, alphas, alphas_prev + + +def make_ddim_timesteps( + ddim_discr_method, num_ddim_timesteps, num_ddpm_timesteps, verbose=True +): + if ddim_discr_method == "uniform": + c = num_ddpm_timesteps // num_ddim_timesteps + ddim_timesteps = np.asarray(list(range(0, num_ddpm_timesteps, c))) + elif ddim_discr_method == "quad": + ddim_timesteps = ( + (np.linspace(0, np.sqrt(num_ddpm_timesteps * 0.8), num_ddim_timesteps)) ** 2 + ).astype(int) + else: + raise NotImplementedError( + f'There is no ddim discretization method called "{ddim_discr_method}"' + ) + + # assert 
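+    # Worked example for the "uniform" method: with num_ddpm_timesteps=1000 and
+    # num_ddim_timesteps=50, c=20 and ddim_timesteps=[0, 20, ..., 980]; the +1 below
+    # shifts this to [1, 21, ..., 981] so the final alpha values are picked correctly.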
ddim_timesteps.shape[0] == num_ddim_timesteps + # add one to get the final alpha values right (the ones from first scale to data during sampling) + steps_out = ddim_timesteps + 1 + if verbose: + print(f"Selected timesteps for ddim sampler: {steps_out}") + return steps_out + + +def noise_like(shape, device, repeat=False): + repeat_noise = lambda: torch.randn((1, *shape[1:]), device=device).repeat( + shape[0], *((1,) * (len(shape) - 1)) + ) + noise = lambda: torch.randn(shape, device=device) + return repeat_noise() if repeat else noise() + + +def timestep_embedding(device, timesteps, dim, max_period=10000, repeat_only=False): + """ + Create sinusoidal timestep embeddings. + :param timesteps: a 1-D Tensor of N indices, one per batch element. + These may be fractional. + :param dim: the dimension of the output. + :param max_period: controls the minimum frequency of the embeddings. + :return: an [N x dim] Tensor of positional embeddings. + """ + half = dim // 2 + freqs = torch.exp( + -math.log(max_period) + * torch.arange(start=0, end=half, dtype=torch.float32) + / half + ).to(device=device) + + args = timesteps[:, None].float() * freqs[None] + + embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1) + if dim % 2: + embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1) + return embedding + + +###### MAT and FcF ####### + + +def normalize_2nd_moment(x, dim=1): + return ( + x * (x.square().mean(dim=dim, keepdim=True) + torch.finfo(x.dtype).eps).rsqrt() + ) + + +class EasyDict(dict): + """Convenience class that behaves like a dict but allows access with the attribute syntax.""" + + def __getattr__(self, name: str) -> Any: + try: + return self[name] + except KeyError: + raise AttributeError(name) + + def __setattr__(self, name: str, value: Any) -> None: + self[name] = value + + def __delattr__(self, name: str) -> None: + del self[name] + + +def _bias_act_ref(x, b=None, dim=1, act="linear", alpha=None, gain=None, clamp=None): + """Slow reference implementation of `bias_act()` using standard TensorFlow ops.""" + assert isinstance(x, torch.Tensor) + assert clamp is None or clamp >= 0 + spec = activation_funcs[act] + alpha = float(alpha if alpha is not None else spec.def_alpha) + gain = float(gain if gain is not None else spec.def_gain) + clamp = float(clamp if clamp is not None else -1) + + # Add bias. + if b is not None: + assert isinstance(b, torch.Tensor) and b.ndim == 1 + assert 0 <= dim < x.ndim + assert b.shape[0] == x.shape[dim] + x = x + b.reshape([-1 if i == dim else 1 for i in range(x.ndim)]) + + # Evaluate activation function. + alpha = float(alpha) + x = spec.func(x, alpha=alpha) + + # Scale by gain. + gain = float(gain) + if gain != 1: + x = x * gain + + # Clamp. + if clamp >= 0: + x = x.clamp(-clamp, clamp) # pylint: disable=invalid-unary-operand-type + return x + + +def bias_act( + x, b=None, dim=1, act="linear", alpha=None, gain=None, clamp=None, impl="ref" +): + r"""Fused bias and activation function. + + Adds bias `b` to activation tensor `x`, evaluates activation function `act`, + and scales the result by `gain`. Each of the steps is optional. In most cases, + the fused op is considerably more efficient than performing the same calculation + using standard PyTorch ops. It supports first and second order gradients, + but not third order gradients. + + Args: + x: Input activation tensor. Can be of any shape. + b: Bias vector, or `None` to disable. Must be a 1D tensor of the same type + as `x`. 
The shape must be known, and it must match the dimension of `x` + corresponding to `dim`. + dim: The dimension in `x` corresponding to the elements of `b`. + The value of `dim` is ignored if `b` is not specified. + act: Name of the activation function to evaluate, or `"linear"` to disable. + Can be e.g. `"relu"`, `"lrelu"`, `"tanh"`, `"sigmoid"`, `"swish"`, etc. + See `activation_funcs` for a full list. `None` is not allowed. + alpha: Shape parameter for the activation function, or `None` to use the default. + gain: Scaling factor for the output tensor, or `None` to use default. + See `activation_funcs` for the default scaling of each activation function. + If unsure, consider specifying 1. + clamp: Clamp the output values to `[-clamp, +clamp]`, or `None` to disable + the clamping (default). + impl: Name of the implementation to use. Can be `"ref"` or `"cuda"` (default). + + Returns: + Tensor of the same shape and datatype as `x`. + """ + assert isinstance(x, torch.Tensor) + assert impl in ["ref", "cuda"] + return _bias_act_ref( + x=x, b=b, dim=dim, act=act, alpha=alpha, gain=gain, clamp=clamp + ) + + +def _get_filter_size(f): + if f is None: + return 1, 1 + + assert isinstance(f, torch.Tensor) and f.ndim in [1, 2] + fw = f.shape[-1] + fh = f.shape[0] + + fw = int(fw) + fh = int(fh) + assert fw >= 1 and fh >= 1 + return fw, fh + + +def _get_weight_shape(w): + shape = [int(sz) for sz in w.shape] + return shape + + +def _parse_scaling(scaling): + if isinstance(scaling, int): + scaling = [scaling, scaling] + assert isinstance(scaling, (list, tuple)) + assert all(isinstance(x, int) for x in scaling) + sx, sy = scaling + assert sx >= 1 and sy >= 1 + return sx, sy + + +def _parse_padding(padding): + if isinstance(padding, int): + padding = [padding, padding] + assert isinstance(padding, (list, tuple)) + assert all(isinstance(x, int) for x in padding) + if len(padding) == 2: + padx, pady = padding + padding = [padx, padx, pady, pady] + padx0, padx1, pady0, pady1 = padding + return padx0, padx1, pady0, pady1 + + +def setup_filter( + f, + device=torch.device("cpu"), + normalize=True, + flip_filter=False, + gain=1, + separable=None, +): + r"""Convenience function to setup 2D FIR filter for `upfirdn2d()`. + + Args: + f: Torch tensor, numpy array, or python list of the shape + `[filter_height, filter_width]` (non-separable), + `[filter_taps]` (separable), + `[]` (impulse), or + `None` (identity). + device: Result device (default: cpu). + normalize: Normalize the filter so that it retains the magnitude + for constant input signal (DC)? (default: True). + flip_filter: Flip the filter? (default: False). + gain: Overall scaling factor for signal magnitude (default: 1). + separable: Return a separable filter? (default: select automatically). + + Returns: + Float32 tensor of the shape + `[filter_height, filter_width]` (non-separable) or + `[filter_taps]` (separable). + """ + # Validate. + if f is None: + f = 1 + f = torch.as_tensor(f, dtype=torch.float32) + assert f.ndim in [0, 1, 2] + assert f.numel() > 0 + if f.ndim == 0: + f = f[np.newaxis] + + # Separable? + if separable is None: + separable = f.ndim == 1 and f.numel() >= 8 + if f.ndim == 1 and not separable: + f = f.ger(f) + assert f.ndim == (1 if separable else 2) + + # Apply normalize, flip, gain, and device. 
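+    # For example, setup_filter([1, 3, 3, 1]) (the default resample filter of
+    # Conv2dLayer below) returns a non-separable 4x4 outer-product kernel that is then
+    # normalized to sum to 1, i.e. a small smoothing low-pass filter.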
+ if normalize: + f /= f.sum() + if flip_filter: + f = f.flip(list(range(f.ndim))) + f = f * (gain ** (f.ndim / 2)) + f = f.to(device=device) + return f + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable): + return x + return tuple(repeat(x, n)) + + return parse + + +to_2tuple = _ntuple(2) + +activation_funcs = { + "linear": EasyDict( + func=lambda x, **_: x, + def_alpha=0, + def_gain=1, + cuda_idx=1, + ref="", + has_2nd_grad=False, + ), + "relu": EasyDict( + func=lambda x, **_: torch.nn.functional.relu(x), + def_alpha=0, + def_gain=np.sqrt(2), + cuda_idx=2, + ref="y", + has_2nd_grad=False, + ), + "lrelu": EasyDict( + func=lambda x, alpha, **_: torch.nn.functional.leaky_relu(x, alpha), + def_alpha=0.2, + def_gain=np.sqrt(2), + cuda_idx=3, + ref="y", + has_2nd_grad=False, + ), + "tanh": EasyDict( + func=lambda x, **_: torch.tanh(x), + def_alpha=0, + def_gain=1, + cuda_idx=4, + ref="y", + has_2nd_grad=True, + ), + "sigmoid": EasyDict( + func=lambda x, **_: torch.sigmoid(x), + def_alpha=0, + def_gain=1, + cuda_idx=5, + ref="y", + has_2nd_grad=True, + ), + "elu": EasyDict( + func=lambda x, **_: torch.nn.functional.elu(x), + def_alpha=0, + def_gain=1, + cuda_idx=6, + ref="y", + has_2nd_grad=True, + ), + "selu": EasyDict( + func=lambda x, **_: torch.nn.functional.selu(x), + def_alpha=0, + def_gain=1, + cuda_idx=7, + ref="y", + has_2nd_grad=True, + ), + "softplus": EasyDict( + func=lambda x, **_: torch.nn.functional.softplus(x), + def_alpha=0, + def_gain=1, + cuda_idx=8, + ref="y", + has_2nd_grad=True, + ), + "swish": EasyDict( + func=lambda x, **_: torch.sigmoid(x) * x, + def_alpha=0, + def_gain=np.sqrt(2), + cuda_idx=9, + ref="x", + has_2nd_grad=True, + ), +} + + +def upfirdn2d(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1, impl="cuda"): + r"""Pad, upsample, filter, and downsample a batch of 2D images. + + Performs the following sequence of operations for each channel: + + 1. Upsample the image by inserting N-1 zeros after each pixel (`up`). + + 2. Pad the image with the specified number of zeros on each side (`padding`). + Negative padding corresponds to cropping the image. + + 3. Convolve the image with the specified 2D FIR filter (`f`), shrinking it + so that the footprint of all output pixels lies within the input image. + + 4. Downsample the image by keeping every Nth pixel (`down`). + + This sequence of operations bears close resemblance to scipy.signal.upfirdn(). + The fused op is considerably more efficient than performing the same calculation + using standard PyTorch ops. It supports gradients of arbitrary order. + + Args: + x: Float32/float64/float16 input tensor of the shape + `[batch_size, num_channels, in_height, in_width]`. + f: Float32 FIR filter of the shape + `[filter_height, filter_width]` (non-separable), + `[filter_taps]` (separable), or + `None` (identity). + up: Integer upsampling factor. Can be a single int or a list/tuple + `[x, y]` (default: 1). + down: Integer downsampling factor. Can be a single int or a list/tuple + `[x, y]` (default: 1). + padding: Padding with respect to the upsampled image. Can be a single number + or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]` + (default: 0). + flip_filter: False = convolution, True = correlation (default: False). + gain: Overall scaling factor for signal magnitude (default: 1). + impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`). + + Returns: + Tensor of the shape `[batch_size, num_channels, out_height, out_width]`. 
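+
+    Example (a sketch; mirrors what `upsample2d()` does for `up=2`):
+        f = setup_filter([1, 3, 3, 1])
+        y = upfirdn2d(x, f, up=2, padding=[2, 1, 2, 1], gain=4)
+        # y.shape == [batch_size, num_channels, 2 * in_height, 2 * in_width]
+    Note that this trimmed-down version indexes `padding` directly, so pass a
+    4-element list `[x_before, x_after, y_before, y_after]`, and `up`/`down` as ints.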
+ """ + # assert isinstance(x, torch.Tensor) + # assert impl in ['ref', 'cuda'] + return _upfirdn2d_ref( + x, f, up=up, down=down, padding=padding, flip_filter=flip_filter, gain=gain + ) + + +def _upfirdn2d_ref(x, f, up=1, down=1, padding=0, flip_filter=False, gain=1): + """Slow reference implementation of `upfirdn2d()` using standard PyTorch ops.""" + # Validate arguments. + assert isinstance(x, torch.Tensor) and x.ndim == 4 + if f is None: + f = torch.ones([1, 1], dtype=torch.float32, device=x.device) + assert isinstance(f, torch.Tensor) and f.ndim in [1, 2] + assert not f.requires_grad + batch_size, num_channels, in_height, in_width = x.shape + # upx, upy = _parse_scaling(up) + # downx, downy = _parse_scaling(down) + + upx, upy = up, up + downx, downy = down, down + + # padx0, padx1, pady0, pady1 = _parse_padding(padding) + padx0, padx1, pady0, pady1 = padding[0], padding[1], padding[2], padding[3] + + # Upsample by inserting zeros. + x = x.reshape([batch_size, num_channels, in_height, 1, in_width, 1]) + x = torch.nn.functional.pad(x, [0, upx - 1, 0, 0, 0, upy - 1]) + x = x.reshape([batch_size, num_channels, in_height * upy, in_width * upx]) + + # Pad or crop. + x = torch.nn.functional.pad( + x, [max(padx0, 0), max(padx1, 0), max(pady0, 0), max(pady1, 0)] + ) + x = x[ + :, + :, + max(-pady0, 0) : x.shape[2] - max(-pady1, 0), + max(-padx0, 0) : x.shape[3] - max(-padx1, 0), + ] + + # Setup filter. + f = f * (gain ** (f.ndim / 2)) + f = f.to(x.dtype) + if not flip_filter: + f = f.flip(list(range(f.ndim))) + + # Convolve with the filter. + f = f[np.newaxis, np.newaxis].repeat([num_channels, 1] + [1] * f.ndim) + if f.ndim == 4: + x = conv2d(input=x, weight=f, groups=num_channels) + else: + x = conv2d(input=x, weight=f.unsqueeze(2), groups=num_channels) + x = conv2d(input=x, weight=f.unsqueeze(3), groups=num_channels) + + # Downsample by throwing away pixels. + x = x[:, :, ::downy, ::downx] + return x + + +def downsample2d(x, f, down=2, padding=0, flip_filter=False, gain=1, impl="cuda"): + r"""Downsample a batch of 2D images using the given 2D FIR filter. + + By default, the result is padded so that its shape is a fraction of the input. + User-specified padding is applied on top of that, with negative values + indicating cropping. Pixels outside the image are assumed to be zero. + + Args: + x: Float32/float64/float16 input tensor of the shape + `[batch_size, num_channels, in_height, in_width]`. + f: Float32 FIR filter of the shape + `[filter_height, filter_width]` (non-separable), + `[filter_taps]` (separable), or + `None` (identity). + down: Integer downsampling factor. Can be a single int or a list/tuple + `[x, y]` (default: 1). + padding: Padding with respect to the input. Can be a single number or a + list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]` + (default: 0). + flip_filter: False = convolution, True = correlation (default: False). + gain: Overall scaling factor for signal magnitude (default: 1). + impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`). + + Returns: + Tensor of the shape `[batch_size, num_channels, out_height, out_width]`. 
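+
+    Example (a sketch; halves the resolution for even-sized inputs):
+        f = setup_filter([1, 3, 3, 1])
+        y = downsample2d(x, f, down=2)
+        # y.shape == [batch_size, num_channels, in_height // 2, in_width // 2]
+    In this version `padding` must be a single int; it is broadcast to all four sides
+    before being handed to upfirdn2d().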
+ """ + downx, downy = _parse_scaling(down) + # padx0, padx1, pady0, pady1 = _parse_padding(padding) + padx0, padx1, pady0, pady1 = padding, padding, padding, padding + + fw, fh = _get_filter_size(f) + p = [ + padx0 + (fw - downx + 1) // 2, + padx1 + (fw - downx) // 2, + pady0 + (fh - downy + 1) // 2, + pady1 + (fh - downy) // 2, + ] + return upfirdn2d( + x, f, down=down, padding=p, flip_filter=flip_filter, gain=gain, impl=impl + ) + + +def upsample2d(x, f, up=2, padding=0, flip_filter=False, gain=1, impl="cuda"): + r"""Upsample a batch of 2D images using the given 2D FIR filter. + + By default, the result is padded so that its shape is a multiple of the input. + User-specified padding is applied on top of that, with negative values + indicating cropping. Pixels outside the image are assumed to be zero. + + Args: + x: Float32/float64/float16 input tensor of the shape + `[batch_size, num_channels, in_height, in_width]`. + f: Float32 FIR filter of the shape + `[filter_height, filter_width]` (non-separable), + `[filter_taps]` (separable), or + `None` (identity). + up: Integer upsampling factor. Can be a single int or a list/tuple + `[x, y]` (default: 1). + padding: Padding with respect to the output. Can be a single number or a + list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]` + (default: 0). + flip_filter: False = convolution, True = correlation (default: False). + gain: Overall scaling factor for signal magnitude (default: 1). + impl: Implementation to use. Can be `'ref'` or `'cuda'` (default: `'cuda'`). + + Returns: + Tensor of the shape `[batch_size, num_channels, out_height, out_width]`. + """ + upx, upy = _parse_scaling(up) + # upx, upy = up, up + padx0, padx1, pady0, pady1 = _parse_padding(padding) + # padx0, padx1, pady0, pady1 = padding, padding, padding, padding + fw, fh = _get_filter_size(f) + p = [ + padx0 + (fw + upx - 1) // 2, + padx1 + (fw - upx) // 2, + pady0 + (fh + upy - 1) // 2, + pady1 + (fh - upy) // 2, + ] + return upfirdn2d( + x, + f, + up=up, + padding=p, + flip_filter=flip_filter, + gain=gain * upx * upy, + impl=impl, + ) + + +class MinibatchStdLayer(torch.nn.Module): + def __init__(self, group_size, num_channels=1): + super().__init__() + self.group_size = group_size + self.num_channels = num_channels + + def forward(self, x): + N, C, H, W = x.shape + G = ( + torch.min(torch.as_tensor(self.group_size), torch.as_tensor(N)) + if self.group_size is not None + else N + ) + F = self.num_channels + c = C // F + + y = x.reshape( + G, -1, F, c, H, W + ) # [GnFcHW] Split minibatch N into n groups of size G, and channels C into F groups of size c. + y = y - y.mean(dim=0) # [GnFcHW] Subtract mean over group. + y = y.square().mean(dim=0) # [nFcHW] Calc variance over group. + y = (y + 1e-8).sqrt() # [nFcHW] Calc stddev over group. + y = y.mean(dim=[2, 3, 4]) # [nF] Take average over channels and pixels. + y = y.reshape(-1, F, 1, 1) # [nF11] Add missing dimensions. + y = y.repeat(G, 1, H, W) # [NFHW] Replicate over group and pixels. + x = torch.cat([x, y], dim=1) # [NCHW] Append to input as new channels. + return x + + +class FullyConnectedLayer(torch.nn.Module): + def __init__( + self, + in_features, # Number of input features. + out_features, # Number of output features. + bias=True, # Apply additive bias before the activation function? + activation="linear", # Activation function: 'relu', 'lrelu', etc. + lr_multiplier=1, # Learning rate multiplier. + bias_init=0, # Initial value for the additive bias. 
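+        # Equalized-learning-rate trick: weights are stored at unit scale and multiplied
+        # by weight_gain = lr_multiplier / sqrt(in_features) in forward(); the bias is
+        # likewise rescaled by lr_multiplier.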
+ ): + super().__init__() + self.weight = torch.nn.Parameter( + torch.randn([out_features, in_features]) / lr_multiplier + ) + self.bias = ( + torch.nn.Parameter(torch.full([out_features], np.float32(bias_init))) + if bias + else None + ) + self.activation = activation + + self.weight_gain = lr_multiplier / np.sqrt(in_features) + self.bias_gain = lr_multiplier + + def forward(self, x): + w = self.weight * self.weight_gain + b = self.bias + if b is not None and self.bias_gain != 1: + b = b * self.bias_gain + + if self.activation == "linear" and b is not None: + # out = torch.addmm(b.unsqueeze(0), x, w.t()) + x = x.matmul(w.t()) + out = x + b.reshape([-1 if i == x.ndim - 1 else 1 for i in range(x.ndim)]) + else: + x = x.matmul(w.t()) + out = bias_act(x, b, act=self.activation, dim=x.ndim - 1) + return out + + +def _conv2d_wrapper( + x, w, stride=1, padding=0, groups=1, transpose=False, flip_weight=True +): + """Wrapper for the underlying `conv2d()` and `conv_transpose2d()` implementations.""" + out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w) + + # Flip weight if requested. + if ( + not flip_weight + ): # conv2d() actually performs correlation (flip_weight=True) not convolution (flip_weight=False). + w = w.flip([2, 3]) + + # Workaround performance pitfall in cuDNN 8.0.5, triggered when using + # 1x1 kernel + memory_format=channels_last + less than 64 channels. + if ( + kw == 1 + and kh == 1 + and stride == 1 + and padding in [0, [0, 0], (0, 0)] + and not transpose + ): + if x.stride()[1] == 1 and min(out_channels, in_channels_per_group) < 64: + if out_channels <= 4 and groups == 1: + in_shape = x.shape + x = w.squeeze(3).squeeze(2) @ x.reshape( + [in_shape[0], in_channels_per_group, -1] + ) + x = x.reshape([in_shape[0], out_channels, in_shape[2], in_shape[3]]) + else: + x = x.to(memory_format=torch.contiguous_format) + w = w.to(memory_format=torch.contiguous_format) + x = conv2d(x, w, groups=groups) + return x.to(memory_format=torch.channels_last) + + # Otherwise => execute using conv2d_gradfix. + op = conv_transpose2d if transpose else conv2d + return op(x, w, stride=stride, padding=padding, groups=groups) + + +def conv2d_resample( + x, w, f=None, up=1, down=1, padding=0, groups=1, flip_weight=True, flip_filter=False +): + r"""2D convolution with optional up/downsampling. + + Padding is performed only once at the beginning, not between the operations. + + Args: + x: Input tensor of shape + `[batch_size, in_channels, in_height, in_width]`. + w: Weight tensor of shape + `[out_channels, in_channels//groups, kernel_height, kernel_width]`. + f: Low-pass filter for up/downsampling. Must be prepared beforehand by + calling setup_filter(). None = identity (default). + up: Integer upsampling factor (default: 1). + down: Integer downsampling factor (default: 1). + padding: Padding with respect to the upsampled image. Can be a single number + or a list/tuple `[x, y]` or `[x_before, x_after, y_before, y_after]` + (default: 0). + groups: Split input channels into N groups (default: 1). + flip_weight: False = convolution, True = correlation (default: True). + flip_filter: False = convolution, True = correlation (default: False). + + Returns: + Tensor of the shape `[batch_size, num_channels, out_height, out_width]`. + """ + # Validate arguments. 
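+    # Unlike the docstring above, this trimmed-down version expects `padding` to be a
+    # single int applied to all four sides (the _parse_padding call is bypassed), and
+    # `up`/`down` to be plain ints.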
+ assert isinstance(x, torch.Tensor) and (x.ndim == 4) + assert isinstance(w, torch.Tensor) and (w.ndim == 4) and (w.dtype == x.dtype) + assert f is None or (isinstance(f, torch.Tensor) and f.ndim in [1, 2]) + assert isinstance(up, int) and (up >= 1) + assert isinstance(down, int) and (down >= 1) + # assert isinstance(groups, int) and (groups >= 1), f"!!!!!! groups: {groups} isinstance(groups, int) {isinstance(groups, int)} {type(groups)}" + out_channels, in_channels_per_group, kh, kw = _get_weight_shape(w) + fw, fh = _get_filter_size(f) + # px0, px1, py0, py1 = _parse_padding(padding) + px0, px1, py0, py1 = padding, padding, padding, padding + + # Adjust padding to account for up/downsampling. + if up > 1: + px0 += (fw + up - 1) // 2 + px1 += (fw - up) // 2 + py0 += (fh + up - 1) // 2 + py1 += (fh - up) // 2 + if down > 1: + px0 += (fw - down + 1) // 2 + px1 += (fw - down) // 2 + py0 += (fh - down + 1) // 2 + py1 += (fh - down) // 2 + + # Fast path: 1x1 convolution with downsampling only => downsample first, then convolve. + if kw == 1 and kh == 1 and (down > 1 and up == 1): + x = upfirdn2d( + x=x, f=f, down=down, padding=[px0, px1, py0, py1], flip_filter=flip_filter + ) + x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight) + return x + + # Fast path: 1x1 convolution with upsampling only => convolve first, then upsample. + if kw == 1 and kh == 1 and (up > 1 and down == 1): + x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight) + x = upfirdn2d( + x=x, + f=f, + up=up, + padding=[px0, px1, py0, py1], + gain=up**2, + flip_filter=flip_filter, + ) + return x + + # Fast path: downsampling only => use strided convolution. + if down > 1 and up == 1: + x = upfirdn2d(x=x, f=f, padding=[px0, px1, py0, py1], flip_filter=flip_filter) + x = _conv2d_wrapper( + x=x, w=w, stride=down, groups=groups, flip_weight=flip_weight + ) + return x + + # Fast path: upsampling with optional downsampling => use transpose strided convolution. + if up > 1: + if groups == 1: + w = w.transpose(0, 1) + else: + w = w.reshape(groups, out_channels // groups, in_channels_per_group, kh, kw) + w = w.transpose(1, 2) + w = w.reshape( + groups * in_channels_per_group, out_channels // groups, kh, kw + ) + px0 -= kw - 1 + px1 -= kw - up + py0 -= kh - 1 + py1 -= kh - up + pxt = max(min(-px0, -px1), 0) + pyt = max(min(-py0, -py1), 0) + x = _conv2d_wrapper( + x=x, + w=w, + stride=up, + padding=[pyt, pxt], + groups=groups, + transpose=True, + flip_weight=(not flip_weight), + ) + x = upfirdn2d( + x=x, + f=f, + padding=[px0 + pxt, px1 + pxt, py0 + pyt, py1 + pyt], + gain=up**2, + flip_filter=flip_filter, + ) + if down > 1: + x = upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter) + return x + + # Fast path: no up/downsampling, padding supported by the underlying implementation => use plain conv2d. + if up == 1 and down == 1: + if px0 == px1 and py0 == py1 and px0 >= 0 and py0 >= 0: + return _conv2d_wrapper( + x=x, w=w, padding=[py0, px0], groups=groups, flip_weight=flip_weight + ) + + # Fallback: Generic reference implementation. + x = upfirdn2d( + x=x, + f=(f if up > 1 else None), + up=up, + padding=[px0, px1, py0, py1], + gain=up**2, + flip_filter=flip_filter, + ) + x = _conv2d_wrapper(x=x, w=w, groups=groups, flip_weight=flip_weight) + if down > 1: + x = upfirdn2d(x=x, f=f, down=down, flip_filter=flip_filter) + return x + + +class Conv2dLayer(torch.nn.Module): + def __init__( + self, + in_channels, # Number of input channels. + out_channels, # Number of output channels. 
+ kernel_size, # Width and height of the convolution kernel. + bias=True, # Apply additive bias before the activation function? + activation="linear", # Activation function: 'relu', 'lrelu', etc. + up=1, # Integer upsampling factor. + down=1, # Integer downsampling factor. + resample_filter=[ + 1, + 3, + 3, + 1, + ], # Low-pass filter to apply when resampling activations. + conv_clamp=None, # Clamp the output to +-X, None = disable clamping. + channels_last=False, # Expect the input to have memory_format=channels_last? + trainable=True, # Update the weights of this layer during training? + ): + super().__init__() + self.activation = activation + self.up = up + self.down = down + self.register_buffer("resample_filter", setup_filter(resample_filter)) + self.conv_clamp = conv_clamp + self.padding = kernel_size // 2 + self.weight_gain = 1 / np.sqrt(in_channels * (kernel_size**2)) + self.act_gain = activation_funcs[activation].def_gain + + memory_format = ( + torch.channels_last if channels_last else torch.contiguous_format + ) + weight = torch.randn([out_channels, in_channels, kernel_size, kernel_size]).to( + memory_format=memory_format + ) + bias = torch.zeros([out_channels]) if bias else None + if trainable: + self.weight = torch.nn.Parameter(weight) + self.bias = torch.nn.Parameter(bias) if bias is not None else None + else: + self.register_buffer("weight", weight) + if bias is not None: + self.register_buffer("bias", bias) + else: + self.bias = None + + def forward(self, x, gain=1): + w = self.weight * self.weight_gain + x = conv2d_resample( + x=x, + w=w, + f=self.resample_filter, + up=self.up, + down=self.down, + padding=self.padding, + ) + + act_gain = self.act_gain * gain + act_clamp = self.conv_clamp * gain if self.conv_clamp is not None else None + out = bias_act( + x, self.bias, act=self.activation, gain=act_gain, clamp=act_clamp + ) + return out + + +def torch_gc(): + if torch.cuda.is_available(): + torch.cuda.empty_cache() + torch.cuda.ipc_collect() + gc.collect() + + +def set_seed(seed: int): + random.seed(seed) + np.random.seed(seed) + torch.manual_seed(seed) + torch.cuda.manual_seed_all(seed) + + +def get_scheduler(sd_sampler, scheduler_config): + # https://github.com/huggingface/diffusers/issues/4167 + keys_to_pop = ["use_karras_sigmas", "algorithm_type"] + scheduler_config = dict(scheduler_config) + for it in keys_to_pop: + scheduler_config.pop(it, None) + + # fmt: off + samplers = { + SDSampler.dpm_plus_plus_2m: [DPMSolverMultistepScheduler], + SDSampler.dpm_plus_plus_2m_karras: [DPMSolverMultistepScheduler, dict(use_karras_sigmas=True)], + SDSampler.dpm_plus_plus_2m_sde: [DPMSolverMultistepScheduler, dict(algorithm_type="sde-dpmsolver++")], + SDSampler.dpm_plus_plus_2m_sde_karras: [DPMSolverMultistepScheduler, dict(algorithm_type="sde-dpmsolver++", use_karras_sigmas=True)], + SDSampler.dpm_plus_plus_sde: [DPMSolverSinglestepScheduler], + SDSampler.dpm_plus_plus_sde_karras: [DPMSolverSinglestepScheduler, dict(use_karras_sigmas=True)], + SDSampler.dpm2: [KDPM2DiscreteScheduler], + SDSampler.dpm2_karras: [KDPM2DiscreteScheduler, dict(use_karras_sigmas=True)], + SDSampler.dpm2_a: [KDPM2AncestralDiscreteScheduler], + SDSampler.dpm2_a_karras: [KDPM2AncestralDiscreteScheduler, dict(use_karras_sigmas=True)], + SDSampler.euler: [EulerDiscreteScheduler], + SDSampler.euler_a: [EulerAncestralDiscreteScheduler], + SDSampler.heun: [HeunDiscreteScheduler], + SDSampler.lms: [LMSDiscreteScheduler], + SDSampler.lms_karras: [LMSDiscreteScheduler, dict(use_karras_sigmas=True)], + 
SDSampler.ddim: [DDIMScheduler], + SDSampler.pndm: [PNDMScheduler], + SDSampler.uni_pc: [UniPCMultistepScheduler], + SDSampler.lcm: [LCMScheduler], + } + # fmt: on + if sd_sampler in samplers: + if len(samplers[sd_sampler]) == 2: + scheduler_cls, kwargs = samplers[sd_sampler] + else: + scheduler_cls, kwargs = samplers[sd_sampler][0], {} + return scheduler_cls.from_config(scheduler_config, **kwargs) + else: + raise ValueError(sd_sampler) + + +def is_local_files_only(**kwargs) -> bool: + from huggingface_hub.constants import HF_HUB_OFFLINE + + return HF_HUB_OFFLINE or kwargs.get("local_files_only", False) + + +def handle_from_pretrained_exceptions(func, **kwargs): + try: + return func(**kwargs) + except ValueError as e: + if "You are trying to load the model files of the `variant=fp16`" in str(e): + logger.info("variant=fp16 not found, try revision=fp16") + try: + return func(**{**kwargs, "variant": None, "revision": "fp16"}) + except Exception as e: + logger.info("revision=fp16 not found, try revision=main") + return func(**{**kwargs, "variant": None, "revision": "main"}) + raise e + except OSError as e: + previous_traceback = traceback.format_exc() + if "RevisionNotFoundError: 404 Client Error." in previous_traceback: + logger.info("revision=fp16 not found, try revision=main") + return func(**{**kwargs, "variant": None, "revision": "main"}) + elif "Max retries exceeded" in previous_traceback: + logger.exception( + "Fetching model from HuggingFace failed. " + "If this is your first time downloading the model, you may need to set up proxy in terminal." + "If the model has already been downloaded, you can add --local-files-only when starting." + ) + exit(-1) + raise e + except Exception as e: + raise e + + +def get_torch_dtype(device, no_half: bool): + device = str(device) + use_fp16 = not no_half + use_gpu = device == "cuda" + # https://github.com/huggingface/diffusers/issues/4480 + # pipe.enable_attention_slicing and float16 will cause black output on mps + # if device in ["cuda", "mps"] and use_fp16: + if device in ["cuda"] and use_fp16: + return use_gpu, torch.float16 + return use_gpu, torch.float32 + + +def enable_low_mem(pipe, enable: bool): + if torch.backends.mps.is_available(): + # https://huggingface.co/docs/diffusers/v0.25.0/en/api/pipelines/stable_diffusion/image_variation#diffusers.StableDiffusionImageVariationPipeline.enable_attention_slicing + # CUDA: Don't enable attention slicing if you're already using `scaled_dot_product_attention` (SDPA) from PyTorch 2.0 or xFormers. 
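+        # "max" computes attention one slice at a time (smallest peak memory, slowest);
+        # the no-argument form lets diffusers pick a balanced slice size. The VAE tiling
+        # enabled below trades a little quality (possible faint seams) for the ability
+        # to decode large images in limited memory.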
+        if enable:
+            pipe.enable_attention_slicing("max")
+        else:
+            # https://huggingface.co/docs/diffusers/optimization/mps
+            # Devices with less than 64GB of memory are recommended to use enable_attention_slicing
+            pipe.enable_attention_slicing()
+
+    if enable:
+        pipe.vae.enable_tiling()
diff --git a/inpaint/model/zits.py b/inpaint/model/zits.py
new file mode 100644
index 0000000..d58ac01
--- /dev/null
+++ b/inpaint/model/zits.py
@@ -0,0 +1,476 @@
+import os
+import time
+
+import cv2
+import torch
+import torch.nn.functional as F
+
+from inpaint.helper import get_cache_path_by_url, load_jit_model, download_model
+from inpaint.schema import InpaintRequest
+import numpy as np
+
+from .base import InpaintModel
+
+ZITS_INPAINT_MODEL_URL = os.environ.get(
+    "ZITS_INPAINT_MODEL_URL",
+    "https://github.com/Sanster/models/releases/download/add_zits/zits-inpaint-0717.pt",
+)
+ZITS_INPAINT_MODEL_MD5 = os.environ.get(
+    "ZITS_INPAINT_MODEL_MD5", "9978cc7157dc29699e42308d675b2154"
+)
+
+ZITS_EDGE_LINE_MODEL_URL = os.environ.get(
+    "ZITS_EDGE_LINE_MODEL_URL",
+    "https://github.com/Sanster/models/releases/download/add_zits/zits-edge-line-0717.pt",
+)
+ZITS_EDGE_LINE_MODEL_MD5 = os.environ.get(
+    "ZITS_EDGE_LINE_MODEL_MD5", "55e31af21ba96bbf0c80603c76ea8c5f"
+)
+
+ZITS_STRUCTURE_UPSAMPLE_MODEL_URL = os.environ.get(
+    "ZITS_STRUCTURE_UPSAMPLE_MODEL_URL",
+    "https://github.com/Sanster/models/releases/download/add_zits/zits-structure-upsample-0717.pt",
+)
+ZITS_STRUCTURE_UPSAMPLE_MODEL_MD5 = os.environ.get(
+    "ZITS_STRUCTURE_UPSAMPLE_MODEL_MD5", "3d88a07211bd41b2ec8cc0d999f29927"
+)
+
+ZITS_WIRE_FRAME_MODEL_URL = os.environ.get(
+    "ZITS_WIRE_FRAME_MODEL_URL",
+    "https://github.com/Sanster/models/releases/download/add_zits/zits-wireframe-0717.pt",
+)
+ZITS_WIRE_FRAME_MODEL_MD5 = os.environ.get(
+    "ZITS_WIRE_FRAME_MODEL_MD5", "a9727c63a8b48b65c905d351b21ce46b"
+)
+
+
+def resize(img, height, width, center_crop=False):
+    imgh, imgw = img.shape[0:2]
+
+    if center_crop and imgh != imgw:
+        # center crop
+        side = np.minimum(imgh, imgw)
+        j = (imgh - side) // 2
+        i = (imgw - side) // 2
+        img = img[j : j + side, i : i + side, ...]
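+    # INTER_AREA is used when shrinking and INTER_LINEAR when enlarging. Note that
+    # cv2.resize takes dsize as (width, height); callers in this file only request
+    # square outputs (256 or 512), so passing (height, width) here is harmless.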
+ + if imgh > height and imgw > width: + inter = cv2.INTER_AREA + else: + inter = cv2.INTER_LINEAR + img = cv2.resize(img, (height, width), interpolation=inter) + + return img + + +def to_tensor(img, scale=True, norm=False): + if img.ndim == 2: + img = img[:, :, np.newaxis] + c = img.shape[-1] + + if scale: + img_t = torch.from_numpy(img).permute(2, 0, 1).float().div(255) + else: + img_t = torch.from_numpy(img).permute(2, 0, 1).float() + + if norm: + mean = torch.tensor([0.5, 0.5, 0.5]).reshape(c, 1, 1) + std = torch.tensor([0.5, 0.5, 0.5]).reshape(c, 1, 1) + img_t = (img_t - mean) / std + return img_t + + +def load_masked_position_encoding(mask): + ones_filter = np.ones((3, 3), dtype=np.float32) + d_filter1 = np.array([[1, 1, 0], [1, 1, 0], [0, 0, 0]], dtype=np.float32) + d_filter2 = np.array([[0, 0, 0], [1, 1, 0], [1, 1, 0]], dtype=np.float32) + d_filter3 = np.array([[0, 1, 1], [0, 1, 1], [0, 0, 0]], dtype=np.float32) + d_filter4 = np.array([[0, 0, 0], [0, 1, 1], [0, 1, 1]], dtype=np.float32) + str_size = 256 + pos_num = 128 + + ori_mask = mask.copy() + ori_h, ori_w = ori_mask.shape[0:2] + ori_mask = ori_mask / 255 + mask = cv2.resize(mask, (str_size, str_size), interpolation=cv2.INTER_AREA) + mask[mask > 0] = 255 + h, w = mask.shape[0:2] + mask3 = mask.copy() + mask3 = 1.0 - (mask3 / 255.0) + pos = np.zeros((h, w), dtype=np.int32) + direct = np.zeros((h, w, 4), dtype=np.int32) + i = 0 + while np.sum(1 - mask3) > 0: + i += 1 + mask3_ = cv2.filter2D(mask3, -1, ones_filter) + mask3_[mask3_ > 0] = 1 + sub_mask = mask3_ - mask3 + pos[sub_mask == 1] = i + + m = cv2.filter2D(mask3, -1, d_filter1) + m[m > 0] = 1 + m = m - mask3 + direct[m == 1, 0] = 1 + + m = cv2.filter2D(mask3, -1, d_filter2) + m[m > 0] = 1 + m = m - mask3 + direct[m == 1, 1] = 1 + + m = cv2.filter2D(mask3, -1, d_filter3) + m[m > 0] = 1 + m = m - mask3 + direct[m == 1, 2] = 1 + + m = cv2.filter2D(mask3, -1, d_filter4) + m[m > 0] = 1 + m = m - mask3 + direct[m == 1, 3] = 1 + + mask3 = mask3_ + + abs_pos = pos.copy() + rel_pos = pos / (str_size / 2) # to 0~1 maybe larger than 1 + rel_pos = (rel_pos * pos_num).astype(np.int32) + rel_pos = np.clip(rel_pos, 0, pos_num - 1) + + if ori_w != w or ori_h != h: + rel_pos = cv2.resize(rel_pos, (ori_w, ori_h), interpolation=cv2.INTER_NEAREST) + rel_pos[ori_mask == 0] = 0 + direct = cv2.resize(direct, (ori_w, ori_h), interpolation=cv2.INTER_NEAREST) + direct[ori_mask == 0, :] = 0 + + return rel_pos, abs_pos, direct + + +def load_image(img, mask, device, sigma256=3.0): + """ + Args: + img: [H, W, C] RGB + mask: [H, W] 255 为 masks 区域 + sigma256: + + Returns: + + """ + h, w, _ = img.shape + imgh, imgw = img.shape[0:2] + img_256 = resize(img, 256, 256) + + mask = (mask > 127).astype(np.uint8) * 255 + mask_256 = cv2.resize(mask, (256, 256), interpolation=cv2.INTER_AREA) + mask_256[mask_256 > 0] = 255 + + mask_512 = cv2.resize(mask, (512, 512), interpolation=cv2.INTER_AREA) + mask_512[mask_512 > 0] = 255 + + # original skimage implemention + # https://scikit-image.org/docs/stable/api/skimage.feature.html#skimage.feature.canny + # low_threshold: Lower bound for hysteresis thresholding (linking edges). If None, low_threshold is set to 10% of dtype’s max. + # high_threshold: Upper bound for hysteresis thresholding (linking edges). If None, high_threshold is set to 20% of dtype’s max. 
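+    # skimage.feature.canny is preferred; when it is unavailable the OpenCV fallback
+    # below (GaussianBlur + Canny with thresholds at roughly 10%/20% of 255) is used.
+    # The two paths differ in scale (float 0/1 vs uint8 0/255) and edge_256 is later
+    # converted with to_tensor(scale=False), so the fallback yields edges scaled by 255.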
+ + try: + import skimage + + gray_256 = skimage.color.rgb2gray(img_256) + edge_256 = skimage.feature.canny(gray_256, sigma=3.0, mask=None).astype(float) + # cv2.imwrite("skimage_gray.jpg", (gray_256*255).astype(np.uint8)) + # cv2.imwrite("skimage_edge.jpg", (edge_256*255).astype(np.uint8)) + except: + gray_256 = cv2.cvtColor(img_256, cv2.COLOR_RGB2GRAY) + gray_256_blured = cv2.GaussianBlur( + gray_256, ksize=(7, 7), sigmaX=sigma256, sigmaY=sigma256 + ) + edge_256 = cv2.Canny( + gray_256_blured, threshold1=int(255 * 0.1), threshold2=int(255 * 0.2) + ) + + # cv2.imwrite("opencv_edge.jpg", edge_256) + + # line + img_512 = resize(img, 512, 512) + + rel_pos, abs_pos, direct = load_masked_position_encoding(mask) + + batch = dict() + batch["images"] = to_tensor(img.copy()).unsqueeze(0).to(device) + batch["img_256"] = to_tensor(img_256, norm=True).unsqueeze(0).to(device) + batch["masks"] = to_tensor(mask).unsqueeze(0).to(device) + batch["mask_256"] = to_tensor(mask_256).unsqueeze(0).to(device) + batch["mask_512"] = to_tensor(mask_512).unsqueeze(0).to(device) + batch["edge_256"] = to_tensor(edge_256, scale=False).unsqueeze(0).to(device) + batch["img_512"] = to_tensor(img_512).unsqueeze(0).to(device) + batch["rel_pos"] = torch.LongTensor(rel_pos).unsqueeze(0).to(device) + batch["abs_pos"] = torch.LongTensor(abs_pos).unsqueeze(0).to(device) + batch["direct"] = torch.LongTensor(direct).unsqueeze(0).to(device) + batch["h"] = imgh + batch["w"] = imgw + + return batch + + +def to_device(data, device): + if isinstance(data, torch.Tensor): + return data.to(device) + if isinstance(data, dict): + for key in data: + if isinstance(data[key], torch.Tensor): + data[key] = data[key].to(device) + return data + if isinstance(data, list): + return [to_device(d, device) for d in data] + + +class ZITS(InpaintModel): + name = "zits" + min_size = 256 + pad_mod = 32 + pad_to_square = True + is_erase_model = True + + def __init__(self, device, **kwargs): + """ + + Args: + device: + """ + super().__init__(device) + self.device = device + self.sample_edge_line_iterations = 1 + + def init_model(self, device, **kwargs): + self.wireframe = load_jit_model( + ZITS_WIRE_FRAME_MODEL_URL, device, ZITS_WIRE_FRAME_MODEL_MD5 + ) + self.edge_line = load_jit_model( + ZITS_EDGE_LINE_MODEL_URL, device, ZITS_EDGE_LINE_MODEL_MD5 + ) + self.structure_upsample = load_jit_model( + ZITS_STRUCTURE_UPSAMPLE_MODEL_URL, device, ZITS_STRUCTURE_UPSAMPLE_MODEL_MD5 + ) + self.inpaint = load_jit_model( + ZITS_INPAINT_MODEL_URL, device, ZITS_INPAINT_MODEL_MD5 + ) + + @staticmethod + def download(): + download_model(ZITS_WIRE_FRAME_MODEL_URL, ZITS_WIRE_FRAME_MODEL_MD5) + download_model(ZITS_EDGE_LINE_MODEL_URL, ZITS_EDGE_LINE_MODEL_MD5) + download_model( + ZITS_STRUCTURE_UPSAMPLE_MODEL_URL, ZITS_STRUCTURE_UPSAMPLE_MODEL_MD5 + ) + download_model(ZITS_INPAINT_MODEL_URL, ZITS_INPAINT_MODEL_MD5) + + @staticmethod + def is_downloaded() -> bool: + model_paths = [ + get_cache_path_by_url(ZITS_WIRE_FRAME_MODEL_URL), + get_cache_path_by_url(ZITS_EDGE_LINE_MODEL_URL), + get_cache_path_by_url(ZITS_STRUCTURE_UPSAMPLE_MODEL_URL), + get_cache_path_by_url(ZITS_INPAINT_MODEL_URL), + ] + return all([os.path.exists(it) for it in model_paths]) + + def wireframe_edge_and_line(self, items, enable: bool): + # 最终向 items 中添加 edge 和 line key + if not enable: + items["edge"] = torch.zeros_like(items["masks"]) + items["line"] = torch.zeros_like(items["masks"]) + return + + start = time.time() + try: + line_256 = self.wireframe_forward( + items["img_512"], + h=256, + w=256, + 
masks=items["mask_512"], + mask_th=0.85, + ) + except: + line_256 = torch.zeros_like(items["mask_256"]) + + print(f"wireframe_forward time: {(time.time() - start) * 1000:.2f}ms") + + # np_line = (line[0][0].numpy() * 255).astype(np.uint8) + # cv2.imwrite("line.jpg", np_line) + + start = time.time() + edge_pred, line_pred = self.sample_edge_line_logits( + context=[items["img_256"], items["edge_256"], line_256], + mask=items["mask_256"].clone(), + iterations=self.sample_edge_line_iterations, + add_v=0.05, + mul_v=4, + ) + print(f"sample_edge_line_logits time: {(time.time() - start) * 1000:.2f}ms") + + # np_edge_pred = (edge_pred[0][0].numpy() * 255).astype(np.uint8) + # cv2.imwrite("edge_pred.jpg", np_edge_pred) + # np_line_pred = (line_pred[0][0].numpy() * 255).astype(np.uint8) + # cv2.imwrite("line_pred.jpg", np_line_pred) + # exit() + + input_size = min(items["h"], items["w"]) + if input_size != 256 and input_size > 256: + while edge_pred.shape[2] < input_size: + edge_pred = self.structure_upsample(edge_pred) + edge_pred = torch.sigmoid((edge_pred + 2) * 2) + + line_pred = self.structure_upsample(line_pred) + line_pred = torch.sigmoid((line_pred + 2) * 2) + + edge_pred = F.interpolate( + edge_pred, + size=(input_size, input_size), + mode="bilinear", + align_corners=False, + ) + line_pred = F.interpolate( + line_pred, + size=(input_size, input_size), + mode="bilinear", + align_corners=False, + ) + + # np_edge_pred = (edge_pred[0][0].numpy() * 255).astype(np.uint8) + # cv2.imwrite("edge_pred_upsample.jpg", np_edge_pred) + # np_line_pred = (line_pred[0][0].numpy() * 255).astype(np.uint8) + # cv2.imwrite("line_pred_upsample.jpg", np_line_pred) + # exit() + + items["edge"] = edge_pred.detach() + items["line"] = line_pred.detach() + + @torch.no_grad() + def forward(self, image, mask, config: InpaintRequest): + """Input images and output images have same size + images: [H, W, C] RGB + masks: [H, W] + return: BGR IMAGE + """ + mask = mask[:, :, 0] + items = load_image(image, mask, device=self.device) + + self.wireframe_edge_and_line(items, config.zits_wireframe) + + inpainted_image = self.inpaint( + items["images"], + items["masks"], + items["edge"], + items["line"], + items["rel_pos"], + items["direct"], + ) + + inpainted_image = inpainted_image * 255.0 + inpainted_image = ( + inpainted_image.cpu().permute(0, 2, 3, 1)[0].numpy().astype(np.uint8) + ) + inpainted_image = inpainted_image[:, :, ::-1] + + # cv2.imwrite("inpainted.jpg", inpainted_image) + # exit() + + return inpainted_image + + def wireframe_forward(self, images, h, w, masks, mask_th=0.925): + lcnn_mean = torch.tensor([109.730, 103.832, 98.681]).reshape(1, 3, 1, 1) + lcnn_std = torch.tensor([22.275, 22.124, 23.229]).reshape(1, 3, 1, 1) + images = images * 255.0 + # the masks value of lcnn is 127.5 + masked_images = images * (1 - masks) + torch.ones_like(images) * masks * 127.5 + masked_images = (masked_images - lcnn_mean) / lcnn_std + + def to_int(x): + return tuple(map(int, x)) + + lines_tensor = [] + lmap = np.zeros((h, w)) + + output_masked = self.wireframe(masked_images) + + output_masked = to_device(output_masked, "cpu") + if output_masked["num_proposals"] == 0: + lines_masked = [] + scores_masked = [] + else: + lines_masked = output_masked["lines_pred"].numpy() + lines_masked = [ + [line[1] * h, line[0] * w, line[3] * h, line[2] * w] + for line in lines_masked + ] + scores_masked = output_masked["lines_score"].numpy() + + for line, score in zip(lines_masked, scores_masked): + if score > mask_th: + try: + import skimage + + rr, 
cc, value = skimage.draw.line_aa( + *to_int(line[0:2]), *to_int(line[2:4]) + ) + lmap[rr, cc] = np.maximum(lmap[rr, cc], value) + except: + cv2.line( + lmap, + to_int(line[0:2][::-1]), + to_int(line[2:4][::-1]), + (1, 1, 1), + 1, + cv2.LINE_AA, + ) + + lmap = np.clip(lmap * 255, 0, 255).astype(np.uint8) + lines_tensor.append(to_tensor(lmap).unsqueeze(0)) + + lines_tensor = torch.cat(lines_tensor, dim=0) + return lines_tensor.detach().to(self.device) + + def sample_edge_line_logits( + self, context, mask=None, iterations=1, add_v=0, mul_v=4 + ): + [img, edge, line] = context + + img = img * (1 - mask) + edge = edge * (1 - mask) + line = line * (1 - mask) + + for i in range(iterations): + edge_logits, line_logits = self.edge_line(img, edge, line, masks=mask) + + edge_pred = torch.sigmoid(edge_logits) + line_pred = torch.sigmoid((line_logits + add_v) * mul_v) + edge = edge + edge_pred * mask + edge[edge >= 0.25] = 1 + edge[edge < 0.25] = 0 + line = line + line_pred * mask + + b, _, h, w = edge_pred.shape + edge_pred = edge_pred.reshape(b, -1, 1) + line_pred = line_pred.reshape(b, -1, 1) + mask = mask.reshape(b, -1) + + edge_probs = torch.cat([1 - edge_pred, edge_pred], dim=-1) + line_probs = torch.cat([1 - line_pred, line_pred], dim=-1) + edge_probs[:, :, 1] += 0.5 + line_probs[:, :, 1] += 0.5 + edge_max_probs = edge_probs.max(dim=-1)[0] + (1 - mask) * (-100) + line_max_probs = line_probs.max(dim=-1)[0] + (1 - mask) * (-100) + + indices = torch.sort( + edge_max_probs + line_max_probs, dim=-1, descending=True + )[1] + + for ii in range(b): + keep = int((i + 1) / iterations * torch.sum(mask[ii, ...])) + + assert torch.sum(mask[ii][indices[ii, :keep]]) == keep, "Error!!!" + mask[ii][indices[ii, :keep]] = 0 + + mask = mask.reshape(b, 1, h, w) + edge = edge * (1 - mask) + line = line * (1 - mask) + + edge, line = edge.to(torch.float32), line.to(torch.float32) + return edge, line diff --git a/inpaint/model_manager.py b/inpaint/model_manager.py new file mode 100644 index 0000000..dae37d3 --- /dev/null +++ b/inpaint/model_manager.py @@ -0,0 +1,260 @@ +from typing import List, Dict + +import torch +from loguru import logger +import numpy as np + +from inpaint.download import scan_models +from inpaint.helper import switch_mps_device +from inpaint.model import models, ControlNet, SD, SDXL +from inpaint.model.brushnet.brushnet_wrapper import BrushNetWrapper +from inpaint.model.power_paint.power_paint_v2 import PowerPaintV2 +from inpaint.model.utils import torch_gc, is_local_files_only +from inpaint.schema import InpaintRequest, ModelInfo, ModelType + + +class ModelManager: + def __init__(self, name: str, device: torch.device, **kwargs): + self.name = name + self.device = device + self.kwargs = kwargs + self.available_models: Dict[str, ModelInfo] = {} + self.scan_models() + + self.enable_controlnet = kwargs.get("enable_controlnet", False) + controlnet_method = kwargs.get("controlnet_method", None) + if ( + controlnet_method is None + and name in self.available_models + and self.available_models[name].support_controlnet + ): + controlnet_method = self.available_models[name].controlnets[0] + self.controlnet_method = controlnet_method + + self.enable_brushnet = kwargs.get("enable_brushnet", False) + self.brushnet_method = kwargs.get("brushnet_method", None) + + self.enable_powerpaint_v2 = kwargs.get("enable_powerpaint_v2", False) + + self.model = self.init_model(name, device, **kwargs) + + @property + def current_model(self) -> ModelInfo: + return self.available_models[self.name] + + def init_model(self, 
name: str, device, **kwargs): + logger.info(f"Loading model: {name}") + if name not in self.available_models: + raise NotImplementedError( + f"Unsupported model: {name}. Available models: {list(self.available_models.keys())}" + ) + + model_info = self.available_models[name] + kwargs = { + **kwargs, + "model_info": model_info, + "enable_controlnet": self.enable_controlnet, + "controlnet_method": self.controlnet_method, + "enable_brushnet": self.enable_brushnet, + "brushnet_method": self.brushnet_method, + } + + if model_info.support_controlnet and self.enable_controlnet: + return ControlNet(device, **kwargs) + + if model_info.support_brushnet and self.enable_brushnet: + return BrushNetWrapper(device, **kwargs) + + if model_info.support_powerpaint_v2 and self.enable_powerpaint_v2: + return PowerPaintV2(device, **kwargs) + + if model_info.name in models: + return models[name](device, **kwargs) + + if model_info.model_type in [ + ModelType.DIFFUSERS_SD_INPAINT, + ModelType.DIFFUSERS_SD, + ]: + return SD(device, **kwargs) + + if model_info.model_type in [ + ModelType.DIFFUSERS_SDXL_INPAINT, + ModelType.DIFFUSERS_SDXL, + ]: + return SDXL(device, **kwargs) + + raise NotImplementedError(f"Unsupported model: {name}") + + @torch.inference_mode() + def __call__(self, image, mask, config: InpaintRequest): + """ + + Args: + image: [H, W, C] RGB + mask: [H, W, 1] 255 means area to repaint + config: + + Returns: + BGR image + """ + if config.enable_controlnet: + self.switch_controlnet_method(config) + if config.enable_brushnet: + self.switch_brushnet_method(config) + + self.enable_disable_powerpaint_v2(config) + self.enable_disable_lcm_lora(config) + return self.model(image, mask, config).astype(np.uint8) + + def scan_models(self) -> List[ModelInfo]: + available_models = scan_models() + self.available_models = {it.name: it for it in available_models} + return available_models + + def switch(self, new_name: str): + if new_name == self.name: + return + + old_name = self.name + old_controlnet_method = self.controlnet_method + self.name = new_name + + if ( + self.available_models[new_name].support_controlnet + and self.controlnet_method + not in self.available_models[new_name].controlnets + ): + self.controlnet_method = self.available_models[new_name].controlnets[0] + try: + # TODO: enable/disable controlnet without reload model + del self.model + torch_gc() + + self.model = self.init_model( + new_name, switch_mps_device(new_name, self.device), **self.kwargs + ) + except Exception as e: + self.name = old_name + self.controlnet_method = old_controlnet_method + logger.info(f"Switch model from {old_name} to {new_name} failed, rollback") + self.model = self.init_model( + old_name, switch_mps_device(old_name, self.device), **self.kwargs + ) + raise e + + def switch_brushnet_method(self, config): + if not self.available_models[self.name].support_brushnet: + return + + if ( + self.enable_brushnet + and config.brushnet_method + and self.brushnet_method != config.brushnet_method + ): + old_brushnet_method = self.brushnet_method + self.brushnet_method = config.brushnet_method + self.model.switch_brushnet_method(config.brushnet_method) + logger.info( + f"Switch Brushnet method from {old_brushnet_method} to {config.brushnet_method}" + ) + + elif self.enable_brushnet != config.enable_brushnet: + self.enable_brushnet = config.enable_brushnet + self.brushnet_method = config.brushnet_method + + pipe_components = { + "vae": self.model.model.vae, + "text_encoder": self.model.model.text_encoder, + "unet": 
self.model.model.unet, + } + if hasattr(self.model.model, "text_encoder_2"): + pipe_components["text_encoder_2"] = self.model.model.text_encoder_2 + + self.model = self.init_model( + self.name, + switch_mps_device(self.name, self.device), + pipe_components=pipe_components, + **self.kwargs, + ) + + if not config.enable_brushnet: + logger.info("BrushNet Disabled") + else: + logger.info("BrushNet Enabled") + + def switch_controlnet_method(self, config): + if not self.available_models[self.name].support_controlnet: + return + + if ( + self.enable_controlnet + and config.controlnet_method + and self.controlnet_method != config.controlnet_method + ): + old_controlnet_method = self.controlnet_method + self.controlnet_method = config.controlnet_method + self.model.switch_controlnet_method(config.controlnet_method) + logger.info( + f"Switch Controlnet method from {old_controlnet_method} to {config.controlnet_method}" + ) + elif self.enable_controlnet != config.enable_controlnet: + self.enable_controlnet = config.enable_controlnet + self.controlnet_method = config.controlnet_method + + pipe_components = { + "vae": self.model.model.vae, + "text_encoder": self.model.model.text_encoder, + "unet": self.model.model.unet, + } + if hasattr(self.model.model, "text_encoder_2"): + pipe_components["text_encoder_2"] = self.model.model.text_encoder_2 + + self.model = self.init_model( + self.name, + switch_mps_device(self.name, self.device), + pipe_components=pipe_components, + **self.kwargs, + ) + if not config.enable_controlnet: + logger.info("Disable controlnet") + else: + logger.info(f"Enable controlnet: {config.controlnet_method}") + + def enable_disable_powerpaint_v2(self, config: InpaintRequest): + if not self.available_models[self.name].support_powerpaint_v2: + return + + if self.enable_powerpaint_v2 != config.enable_powerpaint_v2: + self.enable_powerpaint_v2 = config.enable_powerpaint_v2 + pipe_components = {"vae": self.model.model.vae} + + self.model = self.init_model( + self.name, + switch_mps_device(self.name, self.device), + pipe_components=pipe_components, + **self.kwargs, + ) + if config.enable_powerpaint_v2: + logger.info("Enable PowerPaintV2") + else: + logger.info("Disable PowerPaintV2") + + def enable_disable_lcm_lora(self, config: InpaintRequest): + if self.available_models[self.name].support_lcm_lora: + # TODO: change this if load other lora is supported + lcm_lora_loaded = bool(self.model.model.get_list_adapters()) + if config.sd_lcm_lora: + if not lcm_lora_loaded: + logger.info("Load LCM LORA") + self.model.model.load_lora_weights( + self.model.lcm_lora_id, + weight_name="pytorch_lora_weights.safetensors", + local_files_only=is_local_files_only(), + ) + else: + logger.info("Enable LCM LORA") + self.model.model.enable_lora() + else: + if lcm_lora_loaded: + logger.info("Disable LCM LORA") + self.model.model.disable_lora() diff --git a/inpaint/plugins/__init__.py b/inpaint/plugins/__init__.py new file mode 100644 index 0000000..8128025 --- /dev/null +++ b/inpaint/plugins/__init__.py @@ -0,0 +1,74 @@ +from typing import Dict + +from loguru import logger + +from .anime_seg import AnimeSeg +from .gfpgan_plugin import GFPGANPlugin +from .interactive_seg import InteractiveSeg +from .realesrgan import RealESRGANUpscaler +from .remove_bg import RemoveBG +from .restoreformer import RestoreFormerPlugin +from ..schema import InteractiveSegModel, Device, RealESRGANModel + + +def build_plugins( + enable_interactive_seg: bool, + interactive_seg_model: InteractiveSegModel, + interactive_seg_device: Device, 
+    enable_remove_bg: bool,
+    remove_bg_model: str,
+    enable_anime_seg: bool,
+    enable_realesrgan: bool,
+    realesrgan_device: Device,
+    realesrgan_model: RealESRGANModel,
+    enable_gfpgan: bool,
+    gfpgan_device: Device,
+    enable_restoreformer: bool,
+    restoreformer_device: Device,
+    no_half: bool,
+) -> Dict:
+    plugins = {}
+    if enable_interactive_seg:
+        logger.info(f"Initialize {InteractiveSeg.name} plugin")
+        plugins[InteractiveSeg.name] = InteractiveSeg(
+            interactive_seg_model, interactive_seg_device
+        )
+
+    if enable_remove_bg:
+        logger.info(f"Initialize {RemoveBG.name} plugin")
+        plugins[RemoveBG.name] = RemoveBG(remove_bg_model)
+
+    if enable_anime_seg:
+        logger.info(f"Initialize {AnimeSeg.name} plugin")
+        plugins[AnimeSeg.name] = AnimeSeg()
+
+    if enable_realesrgan:
+        logger.info(
+            f"Initialize {RealESRGANUpscaler.name} plugin: {realesrgan_model}, {realesrgan_device}"
+        )
+        plugins[RealESRGANUpscaler.name] = RealESRGANUpscaler(
+            realesrgan_model,
+            realesrgan_device,
+            no_half=no_half,
+        )
+
+    if enable_gfpgan:
+        logger.info(f"Initialize {GFPGANPlugin.name} plugin")
+        if enable_realesrgan:
+            logger.info("Use realesrgan as GFPGAN background upscaler")
+        else:
+            logger.info(
+                "GFPGAN has no background upscaler; use --enable-realesrgan to enable one"
+            )
+        plugins[GFPGANPlugin.name] = GFPGANPlugin(
+            gfpgan_device,
+            upscaler=plugins.get(RealESRGANUpscaler.name, None),
+        )
+
+    if enable_restoreformer:
+        logger.info(f"Initialize {RestoreFormerPlugin.name} plugin")
+        plugins[RestoreFormerPlugin.name] = RestoreFormerPlugin(
+            restoreformer_device,
+            upscaler=plugins.get(RealESRGANUpscaler.name, None),
+        )
+    return plugins
diff --git a/inpaint/plugins/anime_seg.py b/inpaint/plugins/anime_seg.py
new file mode 100644
index 0000000..286564b
--- /dev/null
+++ b/inpaint/plugins/anime_seg.py
@@ -0,0 +1,462 @@
+import cv2
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import numpy as np
+from PIL import Image
+
+from inpaint.helper import load_model
+from inpaint.plugins.base_plugin import BasePlugin
+from inpaint.schema import RunPluginRequest
+
+
+class REBNCONV(nn.Module):
+    def __init__(self, in_ch=3, out_ch=3, dirate=1, stride=1):
+        super(REBNCONV, self).__init__()
+
+        self.conv_s1 = nn.Conv2d(
+            in_ch, out_ch, 3, padding=1 * dirate, dilation=1 * dirate, stride=stride
+        )
+        self.bn_s1 = nn.BatchNorm2d(out_ch)
+        self.relu_s1 = nn.ReLU(inplace=True)
+
+    def forward(self, x):
+        hx = x
+        xout = self.relu_s1(self.bn_s1(self.conv_s1(hx)))
+
+        return xout
+
+
+## upsample tensor 'src' to have the same spatial size with tensor 'tar'
+def _upsample_like(src, tar):
+    src = F.interpolate(src, size=tar.shape[2:], mode="bilinear", align_corners=False)
+
+    return src
+
+
+### RSU-7 ###
+class RSU7(nn.Module):
+    def __init__(self, in_ch=3, mid_ch=12, out_ch=3, img_size=512):
+        super(RSU7, self).__init__()
+
+        self.in_ch = in_ch
+        self.mid_ch = mid_ch
+        self.out_ch = out_ch
+
+        self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1)  ## 1 -> 1/2
+
+        self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1)
+        self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True)
+
+        self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1)
+        self.pool5 = nn.MaxPool2d(2, stride=2,
ceil_mode=True) + + self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=1) + + self.rebnconv7 = REBNCONV(mid_ch, mid_ch, dirate=2) + + self.rebnconv6d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + b, c, h, w = x.shape + + hx = x + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + + hx3 = self.rebnconv3(hx) + hx = self.pool3(hx3) + + hx4 = self.rebnconv4(hx) + hx = self.pool4(hx4) + + hx5 = self.rebnconv5(hx) + hx = self.pool5(hx5) + + hx6 = self.rebnconv6(hx) + + hx7 = self.rebnconv7(hx6) + + hx6d = self.rebnconv6d(torch.cat((hx7, hx6), 1)) + hx6dup = _upsample_like(hx6d, hx5) + + hx5d = self.rebnconv5d(torch.cat((hx6dup, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + + hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + + hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + + return hx1d + hxin + + +### RSU-6 ### +class RSU6(nn.Module): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU6, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1) + + self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=2) + + self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + + hx3 = self.rebnconv3(hx) + hx = self.pool3(hx3) + + hx4 = self.rebnconv4(hx) + hx = self.pool4(hx4) + + hx5 = self.rebnconv5(hx) + + hx6 = self.rebnconv6(hx5) + + hx5d = self.rebnconv5d(torch.cat((hx6, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + + hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + + hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + + return hx1d + hxin + + +### RSU-5 ### +class RSU5(nn.Module): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU5, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, 
ceil_mode=True) + + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) + + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=2) + + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + + hx3 = self.rebnconv3(hx) + hx = self.pool3(hx3) + + hx4 = self.rebnconv4(hx) + + hx5 = self.rebnconv5(hx4) + + hx4d = self.rebnconv4d(torch.cat((hx5, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + + hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + + return hx1d + hxin + + +### RSU-4 ### +class RSU4(nn.Module): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU4, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=2) + + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + + hx3 = self.rebnconv3(hx) + + hx4 = self.rebnconv4(hx3) + + hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + + return hx1d + hxin + + +### RSU-4F ### +class RSU4F(nn.Module): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU4F, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=2) + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=4) + + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=8) + + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=4) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=2) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx2 = self.rebnconv2(hx1) + hx3 = self.rebnconv3(hx2) + + hx4 = self.rebnconv4(hx3) + + hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1)) + hx2d = self.rebnconv2d(torch.cat((hx3d, hx2), 1)) + hx1d = self.rebnconv1d(torch.cat((hx2d, hx1), 1)) + + return hx1d + hxin + + +class ISNetDIS(nn.Module): + def __init__(self, in_ch=3, out_ch=1): + super(ISNetDIS, self).__init__() + + self.conv_in = nn.Conv2d(in_ch, 64, 3, stride=2, padding=1) + 
self.pool_in = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.stage1 = RSU7(64, 32, 64) + self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.stage2 = RSU6(64, 32, 128) + self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.stage3 = RSU5(128, 64, 256) + self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.stage4 = RSU4(256, 128, 512) + self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.stage5 = RSU4F(512, 256, 512) + self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.stage6 = RSU4F(512, 256, 512) + + # decoder + self.stage5d = RSU4F(1024, 256, 512) + self.stage4d = RSU4(1024, 128, 256) + self.stage3d = RSU5(512, 64, 128) + self.stage2d = RSU6(256, 32, 64) + self.stage1d = RSU7(128, 16, 64) + + self.side1 = nn.Conv2d(64, out_ch, 3, padding=1) + + def forward(self, x): + hx = x + + hxin = self.conv_in(hx) + hx = self.pool_in(hxin) + + # stage 1 + hx1 = self.stage1(hxin) + hx = self.pool12(hx1) + + # stage 2 + hx2 = self.stage2(hx) + hx = self.pool23(hx2) + + # stage 3 + hx3 = self.stage3(hx) + hx = self.pool34(hx3) + + # stage 4 + hx4 = self.stage4(hx) + hx = self.pool45(hx4) + + # stage 5 + hx5 = self.stage5(hx) + hx = self.pool56(hx5) + + # stage 6 + hx6 = self.stage6(hx) + hx6up = _upsample_like(hx6, hx5) + + # -------------------- decoder -------------------- + hx5d = self.stage5d(torch.cat((hx6up, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + + hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + + hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1)) + + # side output + d1 = self.side1(hx1d) + d1 = _upsample_like(d1, x) + return d1.sigmoid() + + +# 从小到大 +ANIME_SEG_MODELS = { + "url": "https://github.com/Sanster/models/releases/download/isnetis/isnetis.pth", + "md5": "5f25479076b73074730ab8de9e8f2051", +} + + +class AnimeSeg(BasePlugin): + # Model from: https://github.com/SkyTNT/anime-segmentation + name = "AnimeSeg" + support_gen_image = True + support_gen_mask = True + + def __init__(self): + super().__init__() + self.model = load_model( + ISNetDIS(), + ANIME_SEG_MODELS["url"], + "cpu", + ANIME_SEG_MODELS["md5"], + ) + + def gen_image(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray: + mask = self.forward(rgb_np_img) + mask = Image.fromarray(mask, mode="L") + h0, w0 = rgb_np_img.shape[0], rgb_np_img.shape[1] + empty = Image.new("RGBA", (w0, h0), 0) + img = Image.fromarray(rgb_np_img) + cutout = Image.composite(img, empty, mask) + return np.asarray(cutout) + + def gen_mask(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray: + return self.forward(rgb_np_img) + + @torch.inference_mode() + def forward(self, rgb_np_img): + s = 1024 + + h0, w0 = h, w = rgb_np_img.shape[0], rgb_np_img.shape[1] + if h > w: + h, w = s, int(s * w / h) + else: + h, w = int(s * h / w), s + ph, pw = s - h, s - w + tmpImg = np.zeros([s, s, 3], dtype=np.float32) + tmpImg[ph // 2 : ph // 2 + h, pw // 2 : pw // 2 + w] = ( + cv2.resize(rgb_np_img, (w, h)) / 255 + ) + tmpImg = tmpImg.transpose((2, 0, 1)) + tmpImg = torch.from_numpy(tmpImg).unsqueeze(0).type(torch.FloatTensor) + mask = self.model(tmpImg) + mask = mask[0, :, ph // 2 : ph // 2 + h, pw // 2 : pw // 2 + w] + mask = cv2.resize(mask.cpu().numpy().transpose((1, 2, 0)), (w0, h0)) + return (mask * 255).astype("uint8") diff --git a/inpaint/plugins/base_plugin.py 
b/inpaint/plugins/base_plugin.py
new file mode 100644
index 0000000..1f8bddc
--- /dev/null
+++ b/inpaint/plugins/base_plugin.py
@@ -0,0 +1,30 @@
+from loguru import logger
+import numpy as np
+
+from inpaint.schema import RunPluginRequest
+
+
+class BasePlugin:
+    name: str
+    support_gen_image: bool = False
+    support_gen_mask: bool = False
+
+    def __init__(self):
+        err_msg = self.check_dep()
+        if err_msg:
+            logger.error(err_msg)
+            exit(-1)
+
+    def gen_image(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray:
+        # return RGBA np image or BGR np image
+        ...
+
+    def gen_mask(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray:
+        # return GRAY or BGR np image, 255 means foreground, 0 means background
+        ...
+
+    def check_dep(self):
+        ...
+
+    def switch_model(self, new_model_name: str):
+        ...
diff --git a/inpaint/plugins/basicsr/LICENSE b/inpaint/plugins/basicsr/LICENSE
new file mode 100644
index 0000000..1c9b5b8
--- /dev/null
+++ b/inpaint/plugins/basicsr/LICENSE
@@ -0,0 +1,201 @@
+                                 Apache License
+                           Version 2.0, January 2004
+                        http://www.apache.org/licenses/
+
+   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
+
+   1. Definitions.
+
+      "License" shall mean the terms and conditions for use, reproduction,
+      and distribution as defined by Sections 1 through 9 of this document.
+
+      "Licensor" shall mean the copyright owner or entity authorized by
+      the copyright owner that is granting the License.
+
+      "Legal Entity" shall mean the union of the acting entity and all
+      other entities that control, are controlled by, or are under common
+      control with that entity. For the purposes of this definition,
+      "control" means (i) the power, direct or indirect, to cause the
+      direction or management of such entity, whether by contract or
+      otherwise, or (ii) ownership of fifty percent (50%) or more of the
+      outstanding shares, or (iii) beneficial ownership of such entity.
+
+      "You" (or "Your") shall mean an individual or Legal Entity
+      exercising permissions granted by this License.
+
+      "Source" form shall mean the preferred form for making modifications,
+      including but not limited to software source code, documentation
+      source, and configuration files.
+
+      "Object" form shall mean any form resulting from mechanical
+      transformation or translation of a Source form, including but
+      not limited to compiled object code, generated documentation,
+      and conversions to other media types.
+
+      "Work" shall mean the work of authorship, whether in Source or
+      Object form, made available under the License, as indicated by a
+      copyright notice that is included in or attached to the work
+      (an example is provided in the Appendix below).
+
+      "Derivative Works" shall mean any work, whether in Source or Object
+      form, that is based on (or derived from) the Work and for which the
+      editorial revisions, annotations, elaborations, or other modifications
+      represent, as a whole, an original work of authorship. For the purposes
+      of this License, Derivative Works shall not include works that remain
+      separable from, or merely link (or bind by name) to the interfaces of,
+      the Work and Derivative Works thereof.
+
+      "Contribution" shall mean any work of authorship, including
+      the original version of the Work and any modifications or additions
+      to that Work or Derivative Works thereof, that is intentionally
+      submitted to Licensor for inclusion in the Work by the copyright owner
+      or by an individual or Legal Entity authorized to submit on behalf of
+      the copyright owner.
For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. 
The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. 
+ + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright 2018-2022 BasicSR Authors + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. \ No newline at end of file diff --git a/inpaint/plugins/basicsr/__init__.py b/inpaint/plugins/basicsr/__init__.py new file mode 100644 index 0000000..6bd8efd --- /dev/null +++ b/inpaint/plugins/basicsr/__init__.py @@ -0,0 +1,22 @@ +""" +Adapted from https://github.com/XPixelGroup/BasicSR +License: Apache-2.0 + +As of Feb 2024, `basicsr` appears to be unmaintained. It imports a function from `torchvision` that is removed in +`torchvision` 0.17. Here is the deprecation warning: + + UserWarning: The torchvision.transforms.functional_tensor module is deprecated in 0.15 and will be **removed in + 0.17**. Please don't rely on it. You probably just need to use APIs in torchvision.transforms.functional or in + torchvision.transforms.v2.functional. + +As a result, a dependency on `basicsr` means we cannot keep our `torchvision` dependency up to date. + +Because we only rely on a single class `RRDBNet` from `basicsr`, we've copied the relevant code here and removed the +dependency on `basicsr`. + +The code is almost unchanged, only a few type annotations have been added. The license is also copied. + +Copy From InvokeAI +""" + +from .rrdbnet_arch import RRDBNet diff --git a/inpaint/plugins/basicsr/arch_util.py b/inpaint/plugins/basicsr/arch_util.py new file mode 100644 index 0000000..befe76a --- /dev/null +++ b/inpaint/plugins/basicsr/arch_util.py @@ -0,0 +1,80 @@ +from typing import Type, List, Union + +import torch +from torch import nn as nn +from torch.nn import init as init +from torch.nn.modules.batchnorm import _BatchNorm + + +@torch.no_grad() +def default_init_weights( + module_list: Union[List[nn.Module], nn.Module], + scale: float = 1, + bias_fill: float = 0, + **kwargs, +) -> None: + """Initialize network weights. + + Args: + module_list (list[nn.Module] | nn.Module): Modules to be initialized. + scale (float): Scale initialized weights, especially for residual + blocks. Default: 1. + bias_fill (float): The value to fill bias. Default: 0 + kwargs (dict): Other arguments for initialization function. 
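+
+    Example (illustrative only; the module and scale value here are made up for demonstration):
+        >>> conv = nn.Conv2d(3, 64, 3, 1, 1)
+        >>> default_init_weights(conv, scale=0.1)  # Kaiming init, weights scaled by 0.1, biases zeroed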
+ """ + if not isinstance(module_list, list): + module_list = [module_list] + for module in module_list: + for m in module.modules(): + if isinstance(m, nn.Conv2d): + init.kaiming_normal_(m.weight, **kwargs) + m.weight.data *= scale + if m.bias is not None: + m.bias.data.fill_(bias_fill) + elif isinstance(m, nn.Linear): + init.kaiming_normal_(m.weight, **kwargs) + m.weight.data *= scale + if m.bias is not None: + m.bias.data.fill_(bias_fill) + elif isinstance(m, _BatchNorm): + init.constant_(m.weight, 1) + if m.bias is not None: + m.bias.data.fill_(bias_fill) + + +def make_layer( + basic_block: Type[nn.Module], num_basic_block: int, **kwarg +) -> nn.Sequential: + """Make layers by stacking the same blocks. + + Args: + basic_block (Type[nn.Module]): nn.Module class for basic block. + num_basic_block (int): number of blocks. + + Returns: + nn.Sequential: Stacked blocks in nn.Sequential. + """ + layers = [] + for _ in range(num_basic_block): + layers.append(basic_block(**kwarg)) + return nn.Sequential(*layers) + + +# TODO: may write a cpp file +def pixel_unshuffle(x: torch.Tensor, scale: int) -> torch.Tensor: + """Pixel unshuffle. + + Args: + x (Tensor): Input feature with shape (b, c, hh, hw). + scale (int): Downsample ratio. + + Returns: + Tensor: the pixel unshuffled feature. + """ + b, c, hh, hw = x.size() + out_channel = c * (scale**2) + assert hh % scale == 0 and hw % scale == 0 + h = hh // scale + w = hw // scale + x_view = x.view(b, c, h, scale, w, scale) + return x_view.permute(0, 1, 3, 5, 2, 4).reshape(b, out_channel, h, w) diff --git a/inpaint/plugins/basicsr/img_util.py b/inpaint/plugins/basicsr/img_util.py new file mode 100644 index 0000000..3a5f1da --- /dev/null +++ b/inpaint/plugins/basicsr/img_util.py @@ -0,0 +1,172 @@ +import cv2 +import math +import numpy as np +import os +import torch +from torchvision.utils import make_grid + + +def img2tensor(imgs, bgr2rgb=True, float32=True): + """Numpy array to tensor. + + Args: + imgs (list[ndarray] | ndarray): Input images. + bgr2rgb (bool): Whether to change bgr to rgb. + float32 (bool): Whether to change to float32. + + Returns: + list[tensor] | tensor: Tensor images. If returned results only have + one element, just return tensor. + """ + + def _totensor(img, bgr2rgb, float32): + if img.shape[2] == 3 and bgr2rgb: + if img.dtype == 'float64': + img = img.astype('float32') + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = torch.from_numpy(img.transpose(2, 0, 1)) + if float32: + img = img.float() + return img + + if isinstance(imgs, list): + return [_totensor(img, bgr2rgb, float32) for img in imgs] + else: + return _totensor(imgs, bgr2rgb, float32) + + +def tensor2img(tensor, rgb2bgr=True, out_type=np.uint8, min_max=(0, 1)): + """Convert torch Tensors into image numpy arrays. + + After clamping to [min, max], values will be normalized to [0, 1]. + + Args: + tensor (Tensor or list[Tensor]): Accept shapes: + 1) 4D mini-batch Tensor of shape (B x 3/1 x H x W); + 2) 3D Tensor of shape (3/1 x H x W); + 3) 2D Tensor of shape (H x W). + Tensor channel should be in RGB order. + rgb2bgr (bool): Whether to change rgb to bgr. + out_type (numpy type): output types. If ``np.uint8``, transform outputs + to uint8 type with range [0, 255]; otherwise, float type with + range [0, 1]. Default: ``np.uint8``. + min_max (tuple[int]): min and max values for clamp. + + Returns: + (Tensor or list): 3D ndarray of shape (H x W x C) OR 2D ndarray of + shape (H x W). The channel order is BGR. 
+ """ + if not (torch.is_tensor(tensor) or (isinstance(tensor, list) and all(torch.is_tensor(t) for t in tensor))): + raise TypeError(f'tensor or list of tensors expected, got {type(tensor)}') + + if torch.is_tensor(tensor): + tensor = [tensor] + result = [] + for _tensor in tensor: + _tensor = _tensor.squeeze(0).float().detach().cpu().clamp_(*min_max) + _tensor = (_tensor - min_max[0]) / (min_max[1] - min_max[0]) + + n_dim = _tensor.dim() + if n_dim == 4: + img_np = make_grid(_tensor, nrow=int(math.sqrt(_tensor.size(0))), normalize=False).numpy() + img_np = img_np.transpose(1, 2, 0) + if rgb2bgr: + img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR) + elif n_dim == 3: + img_np = _tensor.numpy() + img_np = img_np.transpose(1, 2, 0) + if img_np.shape[2] == 1: # gray image + img_np = np.squeeze(img_np, axis=2) + else: + if rgb2bgr: + img_np = cv2.cvtColor(img_np, cv2.COLOR_RGB2BGR) + elif n_dim == 2: + img_np = _tensor.numpy() + else: + raise TypeError(f'Only support 4D, 3D or 2D tensor. But received with dimension: {n_dim}') + if out_type == np.uint8: + # Unlike MATLAB, numpy.unit8() WILL NOT round by default. + img_np = (img_np * 255.0).round() + img_np = img_np.astype(out_type) + result.append(img_np) + if len(result) == 1: + result = result[0] + return result + + +def tensor2img_fast(tensor, rgb2bgr=True, min_max=(0, 1)): + """This implementation is slightly faster than tensor2img. + It now only supports torch tensor with shape (1, c, h, w). + + Args: + tensor (Tensor): Now only support torch tensor with (1, c, h, w). + rgb2bgr (bool): Whether to change rgb to bgr. Default: True. + min_max (tuple[int]): min and max values for clamp. + """ + output = tensor.squeeze(0).detach().clamp_(*min_max).permute(1, 2, 0) + output = (output - min_max[0]) / (min_max[1] - min_max[0]) * 255 + output = output.type(torch.uint8).cpu().numpy() + if rgb2bgr: + output = cv2.cvtColor(output, cv2.COLOR_RGB2BGR) + return output + + +def imfrombytes(content, flag='color', float32=False): + """Read an image from bytes. + + Args: + content (bytes): Image bytes got from files or other streams. + flag (str): Flags specifying the color type of a loaded image, + candidates are `color`, `grayscale` and `unchanged`. + float32 (bool): Whether to change to float32., If True, will also norm + to [0, 1]. Default: False. + + Returns: + ndarray: Loaded image array. + """ + img_np = np.frombuffer(content, np.uint8) + imread_flags = {'color': cv2.IMREAD_COLOR, 'grayscale': cv2.IMREAD_GRAYSCALE, 'unchanged': cv2.IMREAD_UNCHANGED} + img = cv2.imdecode(img_np, imread_flags[flag]) + if float32: + img = img.astype(np.float32) / 255. + return img + + +def imwrite(img, file_path, params=None, auto_mkdir=True): + """Write image to file. + + Args: + img (ndarray): Image array to be written. + file_path (str): Image file path. + params (None or list): Same as opencv's :func:`imwrite` interface. + auto_mkdir (bool): If the parent folder of `file_path` does not exist, + whether to create it automatically. + + Returns: + bool: Successful or not. + """ + if auto_mkdir: + dir_name = os.path.abspath(os.path.dirname(file_path)) + os.makedirs(dir_name, exist_ok=True) + ok = cv2.imwrite(file_path, img, params) + if not ok: + raise IOError('Failed in writing images.') + + +def crop_border(imgs, crop_border): + """Crop borders of images. + + Args: + imgs (list[ndarray] | ndarray): Images with shape (h, w, c). + crop_border (int): Crop border for each end of height and weight. + + Returns: + list[ndarray]: Cropped images. 
+ """ + if crop_border == 0: + return imgs + else: + if isinstance(imgs, list): + return [v[crop_border:-crop_border, crop_border:-crop_border, ...] for v in imgs] + else: + return imgs[crop_border:-crop_border, crop_border:-crop_border, ...] diff --git a/inpaint/plugins/basicsr/rrdbnet_arch.py b/inpaint/plugins/basicsr/rrdbnet_arch.py new file mode 100644 index 0000000..31c08eb --- /dev/null +++ b/inpaint/plugins/basicsr/rrdbnet_arch.py @@ -0,0 +1,133 @@ +import torch +from torch import nn as nn +from torch.nn import functional as F + +from .arch_util import default_init_weights, make_layer, pixel_unshuffle + + +class ResidualDenseBlock(nn.Module): + """Residual Dense Block. + + Used in RRDB block in ESRGAN. + + Args: + num_feat (int): Channel number of intermediate features. + num_grow_ch (int): Channels for each growth. + """ + + def __init__(self, num_feat: int = 64, num_grow_ch: int = 32) -> None: + super(ResidualDenseBlock, self).__init__() + self.conv1 = nn.Conv2d(num_feat, num_grow_ch, 3, 1, 1) + self.conv2 = nn.Conv2d(num_feat + num_grow_ch, num_grow_ch, 3, 1, 1) + self.conv3 = nn.Conv2d(num_feat + 2 * num_grow_ch, num_grow_ch, 3, 1, 1) + self.conv4 = nn.Conv2d(num_feat + 3 * num_grow_ch, num_grow_ch, 3, 1, 1) + self.conv5 = nn.Conv2d(num_feat + 4 * num_grow_ch, num_feat, 3, 1, 1) + + self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True) + + # initialization + default_init_weights( + [self.conv1, self.conv2, self.conv3, self.conv4, self.conv5], 0.1 + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x1 = self.lrelu(self.conv1(x)) + x2 = self.lrelu(self.conv2(torch.cat((x, x1), 1))) + x3 = self.lrelu(self.conv3(torch.cat((x, x1, x2), 1))) + x4 = self.lrelu(self.conv4(torch.cat((x, x1, x2, x3), 1))) + x5 = self.conv5(torch.cat((x, x1, x2, x3, x4), 1)) + # Empirically, we use 0.2 to scale the residual for better performance + return x5 * 0.2 + x + + +class RRDB(nn.Module): + """Residual in Residual Dense Block. + + Used in RRDB-Net in ESRGAN. + + Args: + num_feat (int): Channel number of intermediate features. + num_grow_ch (int): Channels for each growth. + """ + + def __init__(self, num_feat: int, num_grow_ch: int = 32) -> None: + super(RRDB, self).__init__() + self.rdb1 = ResidualDenseBlock(num_feat, num_grow_ch) + self.rdb2 = ResidualDenseBlock(num_feat, num_grow_ch) + self.rdb3 = ResidualDenseBlock(num_feat, num_grow_ch) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + out = self.rdb1(x) + out = self.rdb2(out) + out = self.rdb3(out) + # Empirically, we use 0.2 to scale the residual for better performance + return out * 0.2 + x + + +class RRDBNet(nn.Module): + """Networks consisting of Residual in Residual Dense Block, which is used + in ESRGAN. + + ESRGAN: Enhanced Super-Resolution Generative Adversarial Networks. + + We extend ESRGAN for scale x2 and scale x1. + Note: This is one option for scale 1, scale 2 in RRDBNet. + We first employ the pixel-unshuffle (an inverse operation of pixelshuffle to reduce the spatial size + and enlarge the channel size before feeding inputs into the main ESRGAN architecture. + + Args: + num_in_ch (int): Channel number of inputs. + num_out_ch (int): Channel number of outputs. + num_feat (int): Channel number of intermediate features. + Default: 64 + num_block (int): Block number in the trunk network. Defaults: 23 + num_grow_ch (int): Channels for each growth. Default: 32. 
+ """ + + def __init__( + self, + num_in_ch: int, + num_out_ch: int, + scale: int = 4, + num_feat: int = 64, + num_block: int = 23, + num_grow_ch: int = 32, + ) -> None: + super(RRDBNet, self).__init__() + self.scale = scale + if scale == 2: + num_in_ch = num_in_ch * 4 + elif scale == 1: + num_in_ch = num_in_ch * 16 + self.conv_first = nn.Conv2d(num_in_ch, num_feat, 3, 1, 1) + self.body = make_layer( + RRDB, num_block, num_feat=num_feat, num_grow_ch=num_grow_ch + ) + self.conv_body = nn.Conv2d(num_feat, num_feat, 3, 1, 1) + # upsample + self.conv_up1 = nn.Conv2d(num_feat, num_feat, 3, 1, 1) + self.conv_up2 = nn.Conv2d(num_feat, num_feat, 3, 1, 1) + self.conv_hr = nn.Conv2d(num_feat, num_feat, 3, 1, 1) + self.conv_last = nn.Conv2d(num_feat, num_out_ch, 3, 1, 1) + + self.lrelu = nn.LeakyReLU(negative_slope=0.2, inplace=True) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + if self.scale == 2: + feat = pixel_unshuffle(x, scale=2) + elif self.scale == 1: + feat = pixel_unshuffle(x, scale=4) + else: + feat = x + feat = self.conv_first(feat) + body_feat = self.conv_body(self.body(feat)) + feat = feat + body_feat + # upsample + feat = self.lrelu( + self.conv_up1(F.interpolate(feat, scale_factor=2, mode="nearest")) + ) + feat = self.lrelu( + self.conv_up2(F.interpolate(feat, scale_factor=2, mode="nearest")) + ) + out = self.conv_last(self.lrelu(self.conv_hr(feat))) + return out diff --git a/inpaint/plugins/briarmbg.py b/inpaint/plugins/briarmbg.py new file mode 100644 index 0000000..880f530 --- /dev/null +++ b/inpaint/plugins/briarmbg.py @@ -0,0 +1,512 @@ +# copy from: https://huggingface.co/spaces/briaai/BRIA-RMBG-1.4/blob/main/briarmbg.py +import cv2 +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image +import numpy as np +from torchvision.transforms.functional import normalize + + +class REBNCONV(nn.Module): + def __init__(self, in_ch=3, out_ch=3, dirate=1, stride=1): + super(REBNCONV, self).__init__() + + self.conv_s1 = nn.Conv2d( + in_ch, out_ch, 3, padding=1 * dirate, dilation=1 * dirate, stride=stride + ) + self.bn_s1 = nn.BatchNorm2d(out_ch) + self.relu_s1 = nn.ReLU(inplace=True) + + def forward(self, x): + hx = x + xout = self.relu_s1(self.bn_s1(self.conv_s1(hx))) + + return xout + + +## upsample tensor 'src' to have the same spatial size with tensor 'tar' +def _upsample_like(src, tar): + src = F.interpolate(src, size=tar.shape[2:], mode="bilinear") + + return src + + +### RSU-7 ### +class RSU7(nn.Module): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3, img_size=512): + super(RSU7, self).__init__() + + self.in_ch = in_ch + self.mid_ch = mid_ch + self.out_ch = out_ch + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) ## 1 -> 1/2 + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool5 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=1) + + self.rebnconv7 = REBNCONV(mid_ch, mid_ch, dirate=2) + + self.rebnconv6d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv4d = 
REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + b, c, h, w = x.shape + + hx = x + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + + hx3 = self.rebnconv3(hx) + hx = self.pool3(hx3) + + hx4 = self.rebnconv4(hx) + hx = self.pool4(hx4) + + hx5 = self.rebnconv5(hx) + hx = self.pool5(hx5) + + hx6 = self.rebnconv6(hx) + + hx7 = self.rebnconv7(hx6) + + hx6d = self.rebnconv6d(torch.cat((hx7, hx6), 1)) + hx6dup = _upsample_like(hx6d, hx5) + + hx5d = self.rebnconv5d(torch.cat((hx6dup, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + + hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + + hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + + return hx1d + hxin + + +### RSU-6 ### +class RSU6(nn.Module): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU6, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool4 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=1) + + self.rebnconv6 = REBNCONV(mid_ch, mid_ch, dirate=2) + + self.rebnconv5d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + + hx3 = self.rebnconv3(hx) + hx = self.pool3(hx3) + + hx4 = self.rebnconv4(hx) + hx = self.pool4(hx4) + + hx5 = self.rebnconv5(hx) + + hx6 = self.rebnconv6(hx5) + + hx5d = self.rebnconv5d(torch.cat((hx6, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + + hx4d = self.rebnconv4d(torch.cat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + + hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + + return hx1d + hxin + + +### RSU-5 ### +class RSU5(nn.Module): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU5, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool3 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv4 = REBNCONV(mid_ch, 
mid_ch, dirate=1) + + self.rebnconv5 = REBNCONV(mid_ch, mid_ch, dirate=2) + + self.rebnconv4d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + + hx3 = self.rebnconv3(hx) + hx = self.pool3(hx3) + + hx4 = self.rebnconv4(hx) + + hx5 = self.rebnconv5(hx4) + + hx4d = self.rebnconv4d(torch.cat((hx5, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + + hx3d = self.rebnconv3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + + return hx1d + hxin + + +### RSU-4 ### +class RSU4(nn.Module): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU4, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.pool1 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=1) + self.pool2 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=1) + + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=2) + + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=1) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx = self.pool1(hx1) + + hx2 = self.rebnconv2(hx) + hx = self.pool2(hx2) + + hx3 = self.rebnconv3(hx) + + hx4 = self.rebnconv4(hx3) + + hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.rebnconv2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.rebnconv1d(torch.cat((hx2dup, hx1), 1)) + + return hx1d + hxin + + +### RSU-4F ### +class RSU4F(nn.Module): + def __init__(self, in_ch=3, mid_ch=12, out_ch=3): + super(RSU4F, self).__init__() + + self.rebnconvin = REBNCONV(in_ch, out_ch, dirate=1) + + self.rebnconv1 = REBNCONV(out_ch, mid_ch, dirate=1) + self.rebnconv2 = REBNCONV(mid_ch, mid_ch, dirate=2) + self.rebnconv3 = REBNCONV(mid_ch, mid_ch, dirate=4) + + self.rebnconv4 = REBNCONV(mid_ch, mid_ch, dirate=8) + + self.rebnconv3d = REBNCONV(mid_ch * 2, mid_ch, dirate=4) + self.rebnconv2d = REBNCONV(mid_ch * 2, mid_ch, dirate=2) + self.rebnconv1d = REBNCONV(mid_ch * 2, out_ch, dirate=1) + + def forward(self, x): + hx = x + + hxin = self.rebnconvin(hx) + + hx1 = self.rebnconv1(hxin) + hx2 = self.rebnconv2(hx1) + hx3 = self.rebnconv3(hx2) + + hx4 = self.rebnconv4(hx3) + + hx3d = self.rebnconv3d(torch.cat((hx4, hx3), 1)) + hx2d = self.rebnconv2d(torch.cat((hx3d, hx2), 1)) + hx1d = self.rebnconv1d(torch.cat((hx2d, hx1), 1)) + + return hx1d + hxin + + +class myrebnconv(nn.Module): + def __init__( + self, + in_ch=3, + out_ch=1, + kernel_size=3, + stride=1, + padding=1, + dilation=1, + groups=1, + ): + super(myrebnconv, self).__init__() + + self.conv = nn.Conv2d( + in_ch, + out_ch, + kernel_size=kernel_size, + stride=stride, + padding=padding, + dilation=dilation, + groups=groups, + ) + self.bn = nn.BatchNorm2d(out_ch) + self.rl = nn.ReLU(inplace=True) + + def forward(self, x): + return 
self.rl(self.bn(self.conv(x))) + + +class BriaRMBG(nn.Module): + def __init__(self, in_ch=3, out_ch=1): + super(BriaRMBG, self).__init__() + + self.conv_in = nn.Conv2d(in_ch, 64, 3, stride=2, padding=1) + self.pool_in = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.stage1 = RSU7(64, 32, 64) + self.pool12 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.stage2 = RSU6(64, 32, 128) + self.pool23 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.stage3 = RSU5(128, 64, 256) + self.pool34 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.stage4 = RSU4(256, 128, 512) + self.pool45 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.stage5 = RSU4F(512, 256, 512) + self.pool56 = nn.MaxPool2d(2, stride=2, ceil_mode=True) + + self.stage6 = RSU4F(512, 256, 512) + + # decoder + self.stage5d = RSU4F(1024, 256, 512) + self.stage4d = RSU4(1024, 128, 256) + self.stage3d = RSU5(512, 64, 128) + self.stage2d = RSU6(256, 32, 64) + self.stage1d = RSU7(128, 16, 64) + + self.side1 = nn.Conv2d(64, out_ch, 3, padding=1) + self.side2 = nn.Conv2d(64, out_ch, 3, padding=1) + self.side3 = nn.Conv2d(128, out_ch, 3, padding=1) + self.side4 = nn.Conv2d(256, out_ch, 3, padding=1) + self.side5 = nn.Conv2d(512, out_ch, 3, padding=1) + self.side6 = nn.Conv2d(512, out_ch, 3, padding=1) + + # self.outconv = nn.Conv2d(6*out_ch,out_ch,1) + + def forward(self, x): + hx = x + + hxin = self.conv_in(hx) + # hx = self.pool_in(hxin) + + # stage 1 + hx1 = self.stage1(hxin) + hx = self.pool12(hx1) + + # stage 2 + hx2 = self.stage2(hx) + hx = self.pool23(hx2) + + # stage 3 + hx3 = self.stage3(hx) + hx = self.pool34(hx3) + + # stage 4 + hx4 = self.stage4(hx) + hx = self.pool45(hx4) + + # stage 5 + hx5 = self.stage5(hx) + hx = self.pool56(hx5) + + # stage 6 + hx6 = self.stage6(hx) + hx6up = _upsample_like(hx6, hx5) + + # -------------------- decoder -------------------- + hx5d = self.stage5d(torch.cat((hx6up, hx5), 1)) + hx5dup = _upsample_like(hx5d, hx4) + + hx4d = self.stage4d(torch.cat((hx5dup, hx4), 1)) + hx4dup = _upsample_like(hx4d, hx3) + + hx3d = self.stage3d(torch.cat((hx4dup, hx3), 1)) + hx3dup = _upsample_like(hx3d, hx2) + + hx2d = self.stage2d(torch.cat((hx3dup, hx2), 1)) + hx2dup = _upsample_like(hx2d, hx1) + + hx1d = self.stage1d(torch.cat((hx2dup, hx1), 1)) + + # side output + d1 = self.side1(hx1d) + d1 = _upsample_like(d1, x) + + d2 = self.side2(hx2d) + d2 = _upsample_like(d2, x) + + d3 = self.side3(hx3d) + d3 = _upsample_like(d3, x) + + d4 = self.side4(hx4d) + d4 = _upsample_like(d4, x) + + d5 = self.side5(hx5d) + d5 = _upsample_like(d5, x) + + d6 = self.side6(hx6) + d6 = _upsample_like(d6, x) + + return [ + F.sigmoid(d1), + F.sigmoid(d2), + F.sigmoid(d3), + F.sigmoid(d4), + F.sigmoid(d5), + F.sigmoid(d6), + ], [hx1d, hx2d, hx3d, hx4d, hx5d, hx6] + + +def resize_image(image): + image = image.convert("RGB") + model_input_size = (1024, 1024) + image = image.resize(model_input_size, Image.BILINEAR) + return image + + +def create_briarmbg_session(): + from huggingface_hub import hf_hub_download + + net = BriaRMBG() + model_path = hf_hub_download("briaai/RMBG-1.4", "model.pth") + net.load_state_dict(torch.load(model_path, map_location="cpu")) + net.eval() + return net + + +def briarmbg_process(bgr_np_image, session, only_mask=False): + # prepare input + orig_bgr_image = Image.fromarray(bgr_np_image) + w, h = orig_im_size = orig_bgr_image.size + image = resize_image(orig_bgr_image) + im_np = np.array(image) + im_tensor = torch.tensor(im_np, dtype=torch.float32).permute(2, 0, 1) + im_tensor = 
torch.unsqueeze(im_tensor, 0) + im_tensor = torch.divide(im_tensor, 255.0) + im_tensor = normalize(im_tensor, [0.5, 0.5, 0.5], [1.0, 1.0, 1.0]) + # inference + result = session(im_tensor) + # post process + result = torch.squeeze(F.interpolate(result[0][0], size=(h, w), mode="bilinear"), 0) + ma = torch.max(result) + mi = torch.min(result) + result = (result - mi) / (ma - mi) + # image to pil + im_array = (result * 255).cpu().data.numpy().astype(np.uint8) + + mask = np.squeeze(im_array) + if only_mask: + return mask + + pil_im = Image.fromarray(mask) + # paste the mask on the original image + new_im = Image.new("RGBA", pil_im.size, (0, 0, 0, 0)) + new_im.paste(orig_bgr_image, mask=pil_im) + rgba_np_img = np.asarray(new_im) + return rgba_np_img diff --git a/inpaint/plugins/facexlib/.gitignore b/inpaint/plugins/facexlib/.gitignore new file mode 100644 index 0000000..9f69454 --- /dev/null +++ b/inpaint/plugins/facexlib/.gitignore @@ -0,0 +1,135 @@ +.vscode +*.pth +*.png +*.jpg +version.py + +# Byte-compiled / optimized / DLL files +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +pip-wheel-metadata/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +# Usually these files are written by a python script from a template +# before PyInstaller builds the exe, so as to inject date/other infos into it. +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ + +# Translations +*.mo +*.pot + +# Django stuff: +*.log +local_settings.py +db.sqlite3 +db.sqlite3-journal + +# Flask stuff: +instance/ +.webassets-cache + +# Scrapy stuff: +.scrapy + +# Sphinx documentation +docs/_build/ + +# PyBuilder +target/ + +# Jupyter Notebook +.ipynb_checkpoints + +# IPython +profile_default/ +ipython_config.py + +# pyenv +.python-version + +# pipenv +# According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control. +# However, in case of collaboration, if having platform-specific dependencies or dependencies +# having no cross-platform support, pipenv may install dependencies that don't work, or not +# install all needed dependencies. +#Pipfile.lock + +# PEP 582; used by e.g. 
github.com/David-OConnor/pyflow +__pypackages__/ + +# Celery stuff +celerybeat-schedule +celerybeat.pid + +# SageMath parsed files +*.sage.py + +# Environments +.env +.venv +env/ +venv/ +ENV/ +env.bak/ +venv.bak/ + +# Spyder project settings +.spyderproject +.spyproject + +# Rope project settings +.ropeproject + +# mkdocs documentation +/site + +# mypy +.mypy_cache/ +.dmypy.json +dmypy.json + +# Pyre type checker +.pyre/ diff --git a/inpaint/plugins/facexlib/__init__.py b/inpaint/plugins/facexlib/__init__.py new file mode 100644 index 0000000..6494685 --- /dev/null +++ b/inpaint/plugins/facexlib/__init__.py @@ -0,0 +1,3 @@ +# flake8: noqa +from .detection import * +from .utils import * diff --git a/inpaint/plugins/facexlib/detection/__init__.py b/inpaint/plugins/facexlib/detection/__init__.py new file mode 100644 index 0000000..eb3c79c --- /dev/null +++ b/inpaint/plugins/facexlib/detection/__init__.py @@ -0,0 +1,31 @@ +import torch +from copy import deepcopy + +from ..utils import load_file_from_url +from .retinaface import RetinaFace + + +def init_detection_model(model_name, half=False, device='cuda', model_rootpath=None): + if model_name == 'retinaface_resnet50': + model = RetinaFace(network_name='resnet50', half=half, device=device) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.1.0/detection_Resnet50_Final.pth' + elif model_name == 'retinaface_mobile0.25': + model = RetinaFace(network_name='mobile0.25', half=half, device=device) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.1.0/detection_mobilenet0.25_Final.pth' + else: + raise NotImplementedError(f'{model_name} is not implemented.') + + model_path = load_file_from_url( + url=model_url, model_dir='facexlib/weights', progress=True, file_name=None, save_dir=model_rootpath) + + # TODO: clean pretrained model + load_net = torch.load(model_path, map_location=lambda storage, loc: storage) + # remove unnecessary 'module.' + for k, v in deepcopy(load_net).items(): + if k.startswith('module.'): + load_net[k[7:]] = v + load_net.pop(k) + model.load_state_dict(load_net, strict=True) + model.eval() + model = model.to(device) + return model diff --git a/inpaint/plugins/facexlib/detection/align_trans.py b/inpaint/plugins/facexlib/detection/align_trans.py new file mode 100644 index 0000000..07f1eb3 --- /dev/null +++ b/inpaint/plugins/facexlib/detection/align_trans.py @@ -0,0 +1,219 @@ +import cv2 +import numpy as np + +from .matlab_cp2tform import get_similarity_transform_for_cv2 + +# reference facial points, a list of coordinates (x,y) +REFERENCE_FACIAL_POINTS = [[30.29459953, 51.69630051], [65.53179932, 51.50139999], [48.02519989, 71.73660278], + [33.54930115, 92.3655014], [62.72990036, 92.20410156]] + +DEFAULT_CROP_SIZE = (96, 112) + + +class FaceWarpException(Exception): + + def __str__(self): + return 'In File {}:{}'.format(__file__, super.__str__(self)) + + +def get_reference_facial_points(output_size=None, inner_padding_factor=0.0, outer_padding=(0, 0), default_square=False): + """ + Function: + ---------- + get reference 5 key points according to crop settings: + 0. Set default crop_size: + if default_square: + crop_size = (112, 112) + else: + crop_size = (96, 112) + 1. Pad the crop_size by inner_padding_factor in each side; + 2. Resize crop_size into (output_size - outer_padding*2), + pad into output_size with outer_padding; + 3. 
Output reference_5point; + Parameters: + ---------- + @output_size: (w, h) or None + size of aligned face image + @inner_padding_factor: (w_factor, h_factor) + padding factor for inner (w, h) + @outer_padding: (w_pad, h_pad) + each row is a pair of coordinates (x, y) + @default_square: True or False + if True: + default crop_size = (112, 112) + else: + default crop_size = (96, 112); + !!! make sure, if output_size is not None: + (output_size - outer_padding) + = some_scale * (default crop_size * (1.0 + + inner_padding_factor)) + Returns: + ---------- + @reference_5point: 5x2 np.array + each row is a pair of transformed coordinates (x, y) + """ + + tmp_5pts = np.array(REFERENCE_FACIAL_POINTS) + tmp_crop_size = np.array(DEFAULT_CROP_SIZE) + + # 0) make the inner region a square + if default_square: + size_diff = max(tmp_crop_size) - tmp_crop_size + tmp_5pts += size_diff / 2 + tmp_crop_size += size_diff + + if (output_size and output_size[0] == tmp_crop_size[0] and output_size[1] == tmp_crop_size[1]): + + return tmp_5pts + + if (inner_padding_factor == 0 and outer_padding == (0, 0)): + if output_size is None: + return tmp_5pts + else: + raise FaceWarpException('No paddings to do, output_size must be None or {}'.format(tmp_crop_size)) + + # check output size + if not (0 <= inner_padding_factor <= 1.0): + raise FaceWarpException('Not (0 <= inner_padding_factor <= 1.0)') + + if ((inner_padding_factor > 0 or outer_padding[0] > 0 or outer_padding[1] > 0) and output_size is None): + output_size = tmp_crop_size * \ + (1 + inner_padding_factor * 2).astype(np.int32) + output_size += np.array(outer_padding) + if not (outer_padding[0] < output_size[0] and outer_padding[1] < output_size[1]): + raise FaceWarpException('Not (outer_padding[0] < output_size[0] and outer_padding[1] < output_size[1])') + + # 1) pad the inner region according inner_padding_factor + if inner_padding_factor > 0: + size_diff = tmp_crop_size * inner_padding_factor * 2 + tmp_5pts += size_diff / 2 + tmp_crop_size += np.round(size_diff).astype(np.int32) + + # 2) resize the padded inner region + size_bf_outer_pad = np.array(output_size) - np.array(outer_padding) * 2 + + if size_bf_outer_pad[0] * tmp_crop_size[1] != size_bf_outer_pad[1] * tmp_crop_size[0]: + raise FaceWarpException('Must have (output_size - outer_padding)' + '= some_scale * (crop_size * (1.0 + inner_padding_factor)') + + scale_factor = size_bf_outer_pad[0].astype(np.float32) / tmp_crop_size[0] + tmp_5pts = tmp_5pts * scale_factor + # size_diff = tmp_crop_size * (scale_factor - min(scale_factor)) + # tmp_5pts = tmp_5pts + size_diff / 2 + tmp_crop_size = size_bf_outer_pad + + # 3) add outer_padding to make output_size + reference_5point = tmp_5pts + np.array(outer_padding) + tmp_crop_size = output_size + + return reference_5point + + +def get_affine_transform_matrix(src_pts, dst_pts): + """ + Function: + ---------- + get affine transform matrix 'tfm' from src_pts to dst_pts + Parameters: + ---------- + @src_pts: Kx2 np.array + source points matrix, each row is a pair of coordinates (x, y) + @dst_pts: Kx2 np.array + destination points matrix, each row is a pair of coordinates (x, y) + Returns: + ---------- + @tfm: 2x3 np.array + transform matrix from src_pts to dst_pts + """ + + tfm = np.float32([[1, 0, 0], [0, 1, 0]]) + n_pts = src_pts.shape[0] + ones = np.ones((n_pts, 1), src_pts.dtype) + src_pts_ = np.hstack([src_pts, ones]) + dst_pts_ = np.hstack([dst_pts, ones]) + + A, res, rank, s = np.linalg.lstsq(src_pts_, dst_pts_) + + if rank == 3: + tfm = np.float32([[A[0, 0], 
A[1, 0], A[2, 0]], [A[0, 1], A[1, 1], A[2, 1]]]) + elif rank == 2: + tfm = np.float32([[A[0, 0], A[1, 0], 0], [A[0, 1], A[1, 1], 0]]) + + return tfm + + +def warp_and_crop_face(src_img, facial_pts, reference_pts=None, crop_size=(96, 112), align_type='smilarity'): + """ + Function: + ---------- + apply affine transform 'trans' to uv + Parameters: + ---------- + @src_img: 3x3 np.array + input image + @facial_pts: could be + 1)a list of K coordinates (x,y) + or + 2) Kx2 or 2xK np.array + each row or col is a pair of coordinates (x, y) + @reference_pts: could be + 1) a list of K coordinates (x,y) + or + 2) Kx2 or 2xK np.array + each row or col is a pair of coordinates (x, y) + or + 3) None + if None, use default reference facial points + @crop_size: (w, h) + output face image size + @align_type: transform type, could be one of + 1) 'similarity': use similarity transform + 2) 'cv2_affine': use the first 3 points to do affine transform, + by calling cv2.getAffineTransform() + 3) 'affine': use all points to do affine transform + Returns: + ---------- + @face_img: output face image with size (w, h) = @crop_size + """ + + if reference_pts is None: + if crop_size[0] == 96 and crop_size[1] == 112: + reference_pts = REFERENCE_FACIAL_POINTS + else: + default_square = False + inner_padding_factor = 0 + outer_padding = (0, 0) + output_size = crop_size + + reference_pts = get_reference_facial_points(output_size, inner_padding_factor, outer_padding, + default_square) + + ref_pts = np.float32(reference_pts) + ref_pts_shp = ref_pts.shape + if max(ref_pts_shp) < 3 or min(ref_pts_shp) != 2: + raise FaceWarpException('reference_pts.shape must be (K,2) or (2,K) and K>2') + + if ref_pts_shp[0] == 2: + ref_pts = ref_pts.T + + src_pts = np.float32(facial_pts) + src_pts_shp = src_pts.shape + if max(src_pts_shp) < 3 or min(src_pts_shp) != 2: + raise FaceWarpException('facial_pts.shape must be (K,2) or (2,K) and K>2') + + if src_pts_shp[0] == 2: + src_pts = src_pts.T + + if src_pts.shape != ref_pts.shape: + raise FaceWarpException('facial_pts and reference_pts must have the same shape') + + if align_type == 'cv2_affine': + tfm = cv2.getAffineTransform(src_pts[0:3], ref_pts[0:3]) + elif align_type == 'affine': + tfm = get_affine_transform_matrix(src_pts, ref_pts) + else: + tfm = get_similarity_transform_for_cv2(src_pts, ref_pts) + + face_img = cv2.warpAffine(src_img, tfm, (crop_size[0], crop_size[1])) + + return face_img diff --git a/inpaint/plugins/facexlib/detection/matlab_cp2tform.py b/inpaint/plugins/facexlib/detection/matlab_cp2tform.py new file mode 100644 index 0000000..b2a8b54 --- /dev/null +++ b/inpaint/plugins/facexlib/detection/matlab_cp2tform.py @@ -0,0 +1,317 @@ +import numpy as np +from numpy.linalg import inv, lstsq +from numpy.linalg import matrix_rank as rank +from numpy.linalg import norm + + +class MatlabCp2tormException(Exception): + + def __str__(self): + return 'In File {}:{}'.format(__file__, super.__str__(self)) + + +def tformfwd(trans, uv): + """ + Function: + ---------- + apply affine transform 'trans' to uv + + Parameters: + ---------- + @trans: 3x3 np.array + transform matrix + @uv: Kx2 np.array + each row is a pair of coordinates (x, y) + + Returns: + ---------- + @xy: Kx2 np.array + each row is a pair of transformed coordinates (x, y) + """ + uv = np.hstack((uv, np.ones((uv.shape[0], 1)))) + xy = np.dot(uv, trans) + xy = xy[:, 0:-1] + return xy + + +def tforminv(trans, uv): + """ + Function: + ---------- + apply the inverse of affine transform 'trans' to uv + + Parameters: + ---------- + 
@trans: 3x3 np.array + transform matrix + @uv: Kx2 np.array + each row is a pair of coordinates (x, y) + + Returns: + ---------- + @xy: Kx2 np.array + each row is a pair of inverse-transformed coordinates (x, y) + """ + Tinv = inv(trans) + xy = tformfwd(Tinv, uv) + return xy + + +def findNonreflectiveSimilarity(uv, xy, options=None): + options = {'K': 2} + + K = options['K'] + M = xy.shape[0] + x = xy[:, 0].reshape((-1, 1)) # use reshape to keep a column vector + y = xy[:, 1].reshape((-1, 1)) # use reshape to keep a column vector + + tmp1 = np.hstack((x, y, np.ones((M, 1)), np.zeros((M, 1)))) + tmp2 = np.hstack((y, -x, np.zeros((M, 1)), np.ones((M, 1)))) + X = np.vstack((tmp1, tmp2)) + + u = uv[:, 0].reshape((-1, 1)) # use reshape to keep a column vector + v = uv[:, 1].reshape((-1, 1)) # use reshape to keep a column vector + U = np.vstack((u, v)) + + # We know that X * r = U + if rank(X) >= 2 * K: + r, _, _, _ = lstsq(X, U, rcond=-1) + r = np.squeeze(r) + else: + raise Exception('cp2tform:twoUniquePointsReq') + sc = r[0] + ss = r[1] + tx = r[2] + ty = r[3] + + Tinv = np.array([[sc, -ss, 0], [ss, sc, 0], [tx, ty, 1]]) + T = inv(Tinv) + T[:, 2] = np.array([0, 0, 1]) + + return T, Tinv + + +def findSimilarity(uv, xy, options=None): + options = {'K': 2} + + # uv = np.array(uv) + # xy = np.array(xy) + + # Solve for trans1 + trans1, trans1_inv = findNonreflectiveSimilarity(uv, xy, options) + + # Solve for trans2 + + # manually reflect the xy data across the Y-axis + xyR = xy + xyR[:, 0] = -1 * xyR[:, 0] + + trans2r, trans2r_inv = findNonreflectiveSimilarity(uv, xyR, options) + + # manually reflect the tform to undo the reflection done on xyR + TreflectY = np.array([[-1, 0, 0], [0, 1, 0], [0, 0, 1]]) + + trans2 = np.dot(trans2r, TreflectY) + + # Figure out if trans1 or trans2 is better + xy1 = tformfwd(trans1, uv) + norm1 = norm(xy1 - xy) + + xy2 = tformfwd(trans2, uv) + norm2 = norm(xy2 - xy) + + if norm1 <= norm2: + return trans1, trans1_inv + else: + trans2_inv = inv(trans2) + return trans2, trans2_inv + + +def get_similarity_transform(src_pts, dst_pts, reflective=True): + """ + Function: + ---------- + Find Similarity Transform Matrix 'trans': + u = src_pts[:, 0] + v = src_pts[:, 1] + x = dst_pts[:, 0] + y = dst_pts[:, 1] + [x, y, 1] = [u, v, 1] * trans + + Parameters: + ---------- + @src_pts: Kx2 np.array + source points, each row is a pair of coordinates (x, y) + @dst_pts: Kx2 np.array + destination points, each row is a pair of transformed + coordinates (x, y) + @reflective: True or False + if True: + use reflective similarity transform + else: + use non-reflective similarity transform + + Returns: + ---------- + @trans: 3x3 np.array + transform matrix from uv to xy + trans_inv: 3x3 np.array + inverse of trans, transform matrix from xy to uv + """ + + if reflective: + trans, trans_inv = findSimilarity(src_pts, dst_pts) + else: + trans, trans_inv = findNonreflectiveSimilarity(src_pts, dst_pts) + + return trans, trans_inv + + +def cvt_tform_mat_for_cv2(trans): + """ + Function: + ---------- + Convert Transform Matrix 'trans' into 'cv2_trans' which could be + directly used by cv2.warpAffine(): + u = src_pts[:, 0] + v = src_pts[:, 1] + x = dst_pts[:, 0] + y = dst_pts[:, 1] + [x, y].T = cv_trans * [u, v, 1].T + + Parameters: + ---------- + @trans: 3x3 np.array + transform matrix from uv to xy + + Returns: + ---------- + @cv2_trans: 2x3 np.array + transform matrix from src_pts to dst_pts, could be directly used + for cv2.warpAffine() + """ + cv2_trans = trans[:, 0:2].T + + return cv2_trans + + 
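A minimal sketch of how the 2x3 matrix produced here is consumed downstream; this is the path that warp_and_crop_face() in align_trans.py takes for its default align_type. The five landmark coordinates and the blank input image below are placeholder values, not real detections, and the import paths follow the module layout added in this patch:

    import cv2
    import numpy as np

    from inpaint.plugins.facexlib.detection.align_trans import REFERENCE_FACIAL_POINTS
    from inpaint.plugins.facexlib.detection.matlab_cp2tform import get_similarity_transform_for_cv2

    face_img = np.zeros((112, 96, 3), dtype=np.uint8)                  # placeholder face crop (H x W x 3)
    src_pts = np.float32([[38.0, 52.0], [73.0, 50.0], [56.0, 72.0],    # placeholder 5-point landmarks (x, y)
                          [42.0, 92.0], [70.0, 91.0]])
    dst_pts = np.float32(REFERENCE_FACIAL_POINTS)                      # canonical 96x112 template points

    cv2_trans = get_similarity_transform_for_cv2(src_pts, dst_pts)     # 2x3 matrix usable by cv2.warpAffine
    aligned = cv2.warpAffine(face_img, cv2_trans, (96, 112))           # warped 96x112 aligned face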
+def get_similarity_transform_for_cv2(src_pts, dst_pts, reflective=True): + """ + Function: + ---------- + Find Similarity Transform Matrix 'cv2_trans' which could be + directly used by cv2.warpAffine(): + u = src_pts[:, 0] + v = src_pts[:, 1] + x = dst_pts[:, 0] + y = dst_pts[:, 1] + [x, y].T = cv_trans * [u, v, 1].T + + Parameters: + ---------- + @src_pts: Kx2 np.array + source points, each row is a pair of coordinates (x, y) + @dst_pts: Kx2 np.array + destination points, each row is a pair of transformed + coordinates (x, y) + reflective: True or False + if True: + use reflective similarity transform + else: + use non-reflective similarity transform + + Returns: + ---------- + @cv2_trans: 2x3 np.array + transform matrix from src_pts to dst_pts, could be directly used + for cv2.warpAffine() + """ + trans, trans_inv = get_similarity_transform(src_pts, dst_pts, reflective) + cv2_trans = cvt_tform_mat_for_cv2(trans) + + return cv2_trans + + +if __name__ == '__main__': + """ + u = [0, 6, -2] + v = [0, 3, 5] + x = [-1, 0, 4] + y = [-1, -10, 4] + + # In Matlab, run: + # + # uv = [u'; v']; + # xy = [x'; y']; + # tform_sim=cp2tform(uv,xy,'similarity'); + # + # trans = tform_sim.tdata.T + # ans = + # -0.0764 -1.6190 0 + # 1.6190 -0.0764 0 + # -3.2156 0.0290 1.0000 + # trans_inv = tform_sim.tdata.Tinv + # ans = + # + # -0.0291 0.6163 0 + # -0.6163 -0.0291 0 + # -0.0756 1.9826 1.0000 + # xy_m=tformfwd(tform_sim, u,v) + # + # xy_m = + # + # -3.2156 0.0290 + # 1.1833 -9.9143 + # 5.0323 2.8853 + # uv_m=tforminv(tform_sim, x,y) + # + # uv_m = + # + # 0.5698 1.3953 + # 6.0872 2.2733 + # -2.6570 4.3314 + """ + u = [0, 6, -2] + v = [0, 3, 5] + x = [-1, 0, 4] + y = [-1, -10, 4] + + uv = np.array((u, v)).T + xy = np.array((x, y)).T + + print('\n--->uv:') + print(uv) + print('\n--->xy:') + print(xy) + + trans, trans_inv = get_similarity_transform(uv, xy) + + print('\n--->trans matrix:') + print(trans) + + print('\n--->trans_inv matrix:') + print(trans_inv) + + print('\n---> apply transform to uv') + print('\nxy_m = uv_augmented * trans') + uv_aug = np.hstack((uv, np.ones((uv.shape[0], 1)))) + xy_m = np.dot(uv_aug, trans) + print(xy_m) + + print('\nxy_m = tformfwd(trans, uv)') + xy_m = tformfwd(trans, uv) + print(xy_m) + + print('\n---> apply inverse transform to xy') + print('\nuv_m = xy_augmented * trans_inv') + xy_aug = np.hstack((xy, np.ones((xy.shape[0], 1)))) + uv_m = np.dot(xy_aug, trans_inv) + print(uv_m) + + print('\nuv_m = tformfwd(trans_inv, xy)') + uv_m = tformfwd(trans_inv, xy) + print(uv_m) + + uv_m = tforminv(trans, xy) + print('\nuv_m = tforminv(trans, xy)') + print(uv_m) diff --git a/inpaint/plugins/facexlib/detection/retinaface.py b/inpaint/plugins/facexlib/detection/retinaface.py new file mode 100644 index 0000000..6c4a84d --- /dev/null +++ b/inpaint/plugins/facexlib/detection/retinaface.py @@ -0,0 +1,419 @@ +import cv2 +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F +from PIL import Image +from torchvision.models._utils import IntermediateLayerGetter as IntermediateLayerGetter + +from .align_trans import get_reference_facial_points, warp_and_crop_face +from .retinaface_net import ( + FPN, + SSH, + MobileNetV1, + make_bbox_head, + make_class_head, + make_landmark_head, +) +from .retinaface_utils import ( + PriorBox, + batched_decode, + batched_decode_landm, + decode, + decode_landm, + py_cpu_nms, +) + + +def generate_config(network_name): + cfg_mnet = { + "name": "mobilenet0.25", + "min_sizes": [[16, 32], [64, 128], [256, 512]], + "steps": [8, 16, 
32], + "variance": [0.1, 0.2], + "clip": False, + "loc_weight": 2.0, + "gpu_train": True, + "batch_size": 32, + "ngpu": 1, + "epoch": 250, + "decay1": 190, + "decay2": 220, + "image_size": 640, + "return_layers": {"stage1": 1, "stage2": 2, "stage3": 3}, + "in_channel": 32, + "out_channel": 64, + } + + cfg_re50 = { + "name": "Resnet50", + "min_sizes": [[16, 32], [64, 128], [256, 512]], + "steps": [8, 16, 32], + "variance": [0.1, 0.2], + "clip": False, + "loc_weight": 2.0, + "gpu_train": True, + "batch_size": 24, + "ngpu": 4, + "epoch": 100, + "decay1": 70, + "decay2": 90, + "image_size": 840, + "return_layers": {"layer2": 1, "layer3": 2, "layer4": 3}, + "in_channel": 256, + "out_channel": 256, + } + + if network_name == "mobile0.25": + return cfg_mnet + elif network_name == "resnet50": + return cfg_re50 + else: + raise NotImplementedError(f"network_name={network_name}") + + +class RetinaFace(nn.Module): + def __init__(self, network_name="resnet50", half=False, phase="test", device=None): + self.device = ( + torch.device("cuda" if torch.cuda.is_available() else "cpu") + if device is None + else device + ) + + super(RetinaFace, self).__init__() + self.half_inference = half + cfg = generate_config(network_name) + self.backbone = cfg["name"] + + self.model_name = f"retinaface_{network_name}" + self.cfg = cfg + self.phase = phase + self.target_size, self.max_size = 1600, 2150 + self.resize, self.scale, self.scale1 = 1.0, None, None + self.mean_tensor = torch.tensor( + [[[[104.0]], [[117.0]], [[123.0]]]], device=self.device + ) + self.reference = get_reference_facial_points(default_square=True) + # Build network. + backbone = None + if cfg["name"] == "mobilenet0.25": + backbone = MobileNetV1() + self.body = IntermediateLayerGetter(backbone, cfg["return_layers"]) + elif cfg["name"] == "Resnet50": + import torchvision.models as models + + backbone = models.resnet50(pretrained=False) + self.body = IntermediateLayerGetter(backbone, cfg["return_layers"]) + + in_channels_stage2 = cfg["in_channel"] + in_channels_list = [ + in_channels_stage2 * 2, + in_channels_stage2 * 4, + in_channels_stage2 * 8, + ] + + out_channels = cfg["out_channel"] + self.fpn = FPN(in_channels_list, out_channels) + self.ssh1 = SSH(out_channels, out_channels) + self.ssh2 = SSH(out_channels, out_channels) + self.ssh3 = SSH(out_channels, out_channels) + + self.ClassHead = make_class_head(fpn_num=3, inchannels=cfg["out_channel"]) + self.BboxHead = make_bbox_head(fpn_num=3, inchannels=cfg["out_channel"]) + self.LandmarkHead = make_landmark_head(fpn_num=3, inchannels=cfg["out_channel"]) + + self.to(self.device) + self.eval() + if self.half_inference: + self.half() + + def forward(self, inputs): + out = self.body(inputs) + + if self.backbone == "mobilenet0.25" or self.backbone == "Resnet50": + out = list(out.values()) + # FPN + fpn = self.fpn(out) + + # SSH + feature1 = self.ssh1(fpn[0]) + feature2 = self.ssh2(fpn[1]) + feature3 = self.ssh3(fpn[2]) + features = [feature1, feature2, feature3] + + bbox_regressions = torch.cat( + [self.BboxHead[i](feature) for i, feature in enumerate(features)], dim=1 + ) + classifications = torch.cat( + [self.ClassHead[i](feature) for i, feature in enumerate(features)], dim=1 + ) + tmp = [self.LandmarkHead[i](feature) for i, feature in enumerate(features)] + ldm_regressions = torch.cat(tmp, dim=1) + + if self.phase == "train": + output = (bbox_regressions, classifications, ldm_regressions) + else: + output = ( + bbox_regressions, + F.softmax(classifications, dim=-1), + ldm_regressions, + ) + return output 
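For context, a minimal sketch of how this detector is typically driven: it is built through init_detection_model() in detection/__init__.py (which also fetches the pretrained weights on first use) and queried through detect_faces() below, which returns one [n, 15] array per image (4 box coordinates, 1 confidence score, 10 landmark coordinates). The image path and threshold are placeholder values:

    import cv2
    import torch

    from inpaint.plugins.facexlib.detection import init_detection_model

    det_net = init_detection_model("retinaface_resnet50", half=False, device="cpu")
    img = cv2.imread("face.jpg")                               # placeholder path; BGR uint8, as expected
    with torch.no_grad():
        faces = det_net.detect_faces(img, conf_threshold=0.8)  # np.ndarray of shape [n, 15]
    boxes, scores, landmarks = faces[:, :4], faces[:, 4], faces[:, 5:]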
+ + def __detect_faces(self, inputs): + # get scale + height, width = inputs.shape[2:] + self.scale = torch.tensor( + [width, height, width, height], dtype=torch.float32, device=self.device + ) + tmp = [ + width, + height, + width, + height, + width, + height, + width, + height, + width, + height, + ] + self.scale1 = torch.tensor(tmp, dtype=torch.float32, device=self.device) + + # forawrd + inputs = inputs.to(self.device) + if self.half_inference: + inputs = inputs.half() + loc, conf, landmarks = self(inputs) + + # get priorbox + priorbox = PriorBox(self.cfg, image_size=inputs.shape[2:]) + priors = priorbox.forward().to(self.device) + + return loc, conf, landmarks, priors + + # single image detection + def transform(self, image, use_origin_size): + # convert to opencv format + if isinstance(image, Image.Image): + image = cv2.cvtColor(np.asarray(image), cv2.COLOR_RGB2BGR) + image = image.astype(np.float32) + + # testing scale + im_size_min = np.min(image.shape[0:2]) + im_size_max = np.max(image.shape[0:2]) + resize = float(self.target_size) / float(im_size_min) + + # prevent bigger axis from being more than max_size + if np.round(resize * im_size_max) > self.max_size: + resize = float(self.max_size) / float(im_size_max) + resize = 1 if use_origin_size else resize + + # resize + if resize != 1: + image = cv2.resize( + image, None, None, fx=resize, fy=resize, interpolation=cv2.INTER_LINEAR + ) + + # convert to torch.tensor format + # image -= (104, 117, 123) + image = image.transpose(2, 0, 1) + image = torch.from_numpy(image).unsqueeze(0) + + return image, resize + + def detect_faces( + self, + image, + conf_threshold=0.8, + nms_threshold=0.4, + use_origin_size=True, + ): + image, self.resize = self.transform(image, use_origin_size) + image = image.to(self.device) + if self.half_inference: + image = image.half() + image = image - self.mean_tensor + + loc, conf, landmarks, priors = self.__detect_faces(image) + + boxes = decode(loc.data.squeeze(0), priors.data, self.cfg["variance"]) + boxes = boxes * self.scale / self.resize + boxes = boxes.cpu().numpy() + + scores = conf.squeeze(0).data.cpu().numpy()[:, 1] + + landmarks = decode_landm(landmarks.squeeze(0), priors, self.cfg["variance"]) + landmarks = landmarks * self.scale1 / self.resize + landmarks = landmarks.cpu().numpy() + + # ignore low scores + inds = np.where(scores > conf_threshold)[0] + boxes, landmarks, scores = boxes[inds], landmarks[inds], scores[inds] + + # sort + order = scores.argsort()[::-1] + boxes, landmarks, scores = boxes[order], landmarks[order], scores[order] + + # do NMS + bounding_boxes = np.hstack((boxes, scores[:, np.newaxis])).astype( + np.float32, copy=False + ) + keep = py_cpu_nms(bounding_boxes, nms_threshold) + bounding_boxes, landmarks = bounding_boxes[keep, :], landmarks[keep] + # self.t['forward_pass'].toc() + # print(self.t['forward_pass'].average_time) + # import sys + # sys.stdout.flush() + return np.concatenate((bounding_boxes, landmarks), axis=1) + + def __align_multi(self, image, boxes, landmarks, limit=None): + if len(boxes) < 1: + return [], [] + + if limit: + boxes = boxes[:limit] + landmarks = landmarks[:limit] + + faces = [] + for landmark in landmarks: + facial5points = [[landmark[2 * j], landmark[2 * j + 1]] for j in range(5)] + + warped_face = warp_and_crop_face( + np.array(image), facial5points, self.reference, crop_size=(112, 112) + ) + faces.append(warped_face) + + return np.concatenate((boxes, landmarks), axis=1), faces + + def align_multi(self, img, conf_threshold=0.8, limit=None): + rlt = 
self.detect_faces(img, conf_threshold=conf_threshold) + boxes, landmarks = rlt[:, 0:5], rlt[:, 5:] + + return self.__align_multi(img, boxes, landmarks, limit) + + # batched detection + def batched_transform(self, frames, use_origin_size): + """ + Arguments: + frames: a list of PIL.Image, or torch.Tensor(shape=[n, h, w, c], + type=np.float32, BGR format). + use_origin_size: whether to use origin size. + """ + from_PIL = True if isinstance(frames[0], Image.Image) else False + + # convert to opencv format + if from_PIL: + frames = [ + cv2.cvtColor(np.asarray(frame), cv2.COLOR_RGB2BGR) for frame in frames + ] + frames = np.asarray(frames, dtype=np.float32) + + # testing scale + im_size_min = np.min(frames[0].shape[0:2]) + im_size_max = np.max(frames[0].shape[0:2]) + resize = float(self.target_size) / float(im_size_min) + + # prevent bigger axis from being more than max_size + if np.round(resize * im_size_max) > self.max_size: + resize = float(self.max_size) / float(im_size_max) + resize = 1 if use_origin_size else resize + + # resize + if resize != 1: + if not from_PIL: + frames = F.interpolate(frames, scale_factor=resize) + else: + frames = [ + cv2.resize( + frame, + None, + None, + fx=resize, + fy=resize, + interpolation=cv2.INTER_LINEAR, + ) + for frame in frames + ] + + # convert to torch.tensor format + if not from_PIL: + frames = frames.transpose(1, 2).transpose(1, 3).contiguous() + else: + frames = frames.transpose((0, 3, 1, 2)) + frames = torch.from_numpy(frames) + + return frames, resize + + def batched_detect_faces( + self, frames, conf_threshold=0.8, nms_threshold=0.4, use_origin_size=True + ): + """ + Arguments: + frames: a list of PIL.Image, or np.array(shape=[n, h, w, c], + type=np.uint8, BGR format). + conf_threshold: confidence threshold. + nms_threshold: nms threshold. + use_origin_size: whether to use origin size. + Returns: + final_bounding_boxes: list of np.array ([n_boxes, 5], + type=np.float32). + final_landmarks: list of np.array ([n_boxes, 10], type=np.float32). 
+ """ + # self.t['forward_pass'].tic() + frames, self.resize = self.batched_transform(frames, use_origin_size) + frames = frames.to(self.device) + frames = frames - self.mean_tensor + + b_loc, b_conf, b_landmarks, priors = self.__detect_faces(frames) + + final_bounding_boxes, final_landmarks = [], [] + + # decode + priors = priors.unsqueeze(0) + b_loc = ( + batched_decode(b_loc, priors, self.cfg["variance"]) + * self.scale + / self.resize + ) + b_landmarks = ( + batched_decode_landm(b_landmarks, priors, self.cfg["variance"]) + * self.scale1 + / self.resize + ) + b_conf = b_conf[:, :, 1] + + # index for selection + b_indice = b_conf > conf_threshold + + # concat + b_loc_and_conf = torch.cat((b_loc, b_conf.unsqueeze(-1)), dim=2).float() + + for pred, landm, inds in zip(b_loc_and_conf, b_landmarks, b_indice): + # ignore low scores + pred, landm = pred[inds, :], landm[inds, :] + if pred.shape[0] == 0: + final_bounding_boxes.append(np.array([], dtype=np.float32)) + final_landmarks.append(np.array([], dtype=np.float32)) + continue + + # sort + # order = score.argsort(descending=True) + # box, landm, score = box[order], landm[order], score[order] + + # to CPU + bounding_boxes, landm = pred.cpu().numpy(), landm.cpu().numpy() + + # NMS + keep = py_cpu_nms(bounding_boxes, nms_threshold) + bounding_boxes, landmarks = bounding_boxes[keep, :], landm[keep] + + # append + final_bounding_boxes.append(bounding_boxes) + final_landmarks.append(landmarks) + # self.t['forward_pass'].toc(average=True) + # self.batch_time += self.t['forward_pass'].diff + # self.total_frame += len(frames) + # print(self.batch_time / self.total_frame) + + return final_bounding_boxes, final_landmarks diff --git a/inpaint/plugins/facexlib/detection/retinaface_net.py b/inpaint/plugins/facexlib/detection/retinaface_net.py new file mode 100644 index 0000000..ab6aa82 --- /dev/null +++ b/inpaint/plugins/facexlib/detection/retinaface_net.py @@ -0,0 +1,196 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def conv_bn(inp, oup, stride=1, leaky=0): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), nn.BatchNorm2d(oup), + nn.LeakyReLU(negative_slope=leaky, inplace=True)) + + +def conv_bn_no_relu(inp, oup, stride): + return nn.Sequential( + nn.Conv2d(inp, oup, 3, stride, 1, bias=False), + nn.BatchNorm2d(oup), + ) + + +def conv_bn1X1(inp, oup, stride, leaky=0): + return nn.Sequential( + nn.Conv2d(inp, oup, 1, stride, padding=0, bias=False), nn.BatchNorm2d(oup), + nn.LeakyReLU(negative_slope=leaky, inplace=True)) + + +def conv_dw(inp, oup, stride, leaky=0.1): + return nn.Sequential( + nn.Conv2d(inp, inp, 3, stride, 1, groups=inp, bias=False), + nn.BatchNorm2d(inp), + nn.LeakyReLU(negative_slope=leaky, inplace=True), + nn.Conv2d(inp, oup, 1, 1, 0, bias=False), + nn.BatchNorm2d(oup), + nn.LeakyReLU(negative_slope=leaky, inplace=True), + ) + + +class SSH(nn.Module): + + def __init__(self, in_channel, out_channel): + super(SSH, self).__init__() + assert out_channel % 4 == 0 + leaky = 0 + if (out_channel <= 64): + leaky = 0.1 + self.conv3X3 = conv_bn_no_relu(in_channel, out_channel // 2, stride=1) + + self.conv5X5_1 = conv_bn(in_channel, out_channel // 4, stride=1, leaky=leaky) + self.conv5X5_2 = conv_bn_no_relu(out_channel // 4, out_channel // 4, stride=1) + + self.conv7X7_2 = conv_bn(out_channel // 4, out_channel // 4, stride=1, leaky=leaky) + self.conv7x7_3 = conv_bn_no_relu(out_channel // 4, out_channel // 4, stride=1) + + def forward(self, input): + conv3X3 = self.conv3X3(input) + + conv5X5_1 = 
self.conv5X5_1(input) + conv5X5 = self.conv5X5_2(conv5X5_1) + + conv7X7_2 = self.conv7X7_2(conv5X5_1) + conv7X7 = self.conv7x7_3(conv7X7_2) + + out = torch.cat([conv3X3, conv5X5, conv7X7], dim=1) + out = F.relu(out) + return out + + +class FPN(nn.Module): + + def __init__(self, in_channels_list, out_channels): + super(FPN, self).__init__() + leaky = 0 + if (out_channels <= 64): + leaky = 0.1 + self.output1 = conv_bn1X1(in_channels_list[0], out_channels, stride=1, leaky=leaky) + self.output2 = conv_bn1X1(in_channels_list[1], out_channels, stride=1, leaky=leaky) + self.output3 = conv_bn1X1(in_channels_list[2], out_channels, stride=1, leaky=leaky) + + self.merge1 = conv_bn(out_channels, out_channels, leaky=leaky) + self.merge2 = conv_bn(out_channels, out_channels, leaky=leaky) + + def forward(self, input): + # names = list(input.keys()) + # input = list(input.values()) + + output1 = self.output1(input[0]) + output2 = self.output2(input[1]) + output3 = self.output3(input[2]) + + up3 = F.interpolate(output3, size=[output2.size(2), output2.size(3)], mode='nearest') + output2 = output2 + up3 + output2 = self.merge2(output2) + + up2 = F.interpolate(output2, size=[output1.size(2), output1.size(3)], mode='nearest') + output1 = output1 + up2 + output1 = self.merge1(output1) + + out = [output1, output2, output3] + return out + + +class MobileNetV1(nn.Module): + + def __init__(self): + super(MobileNetV1, self).__init__() + self.stage1 = nn.Sequential( + conv_bn(3, 8, 2, leaky=0.1), # 3 + conv_dw(8, 16, 1), # 7 + conv_dw(16, 32, 2), # 11 + conv_dw(32, 32, 1), # 19 + conv_dw(32, 64, 2), # 27 + conv_dw(64, 64, 1), # 43 + ) + self.stage2 = nn.Sequential( + conv_dw(64, 128, 2), # 43 + 16 = 59 + conv_dw(128, 128, 1), # 59 + 32 = 91 + conv_dw(128, 128, 1), # 91 + 32 = 123 + conv_dw(128, 128, 1), # 123 + 32 = 155 + conv_dw(128, 128, 1), # 155 + 32 = 187 + conv_dw(128, 128, 1), # 187 + 32 = 219 + ) + self.stage3 = nn.Sequential( + conv_dw(128, 256, 2), # 219 +3 2 = 241 + conv_dw(256, 256, 1), # 241 + 64 = 301 + ) + self.avg = nn.AdaptiveAvgPool2d((1, 1)) + self.fc = nn.Linear(256, 1000) + + def forward(self, x): + x = self.stage1(x) + x = self.stage2(x) + x = self.stage3(x) + x = self.avg(x) + # x = self.model(x) + x = x.view(-1, 256) + x = self.fc(x) + return x + + +class ClassHead(nn.Module): + + def __init__(self, inchannels=512, num_anchors=3): + super(ClassHead, self).__init__() + self.num_anchors = num_anchors + self.conv1x1 = nn.Conv2d(inchannels, self.num_anchors * 2, kernel_size=(1, 1), stride=1, padding=0) + + def forward(self, x): + out = self.conv1x1(x) + out = out.permute(0, 2, 3, 1).contiguous() + + return out.view(out.shape[0], -1, 2) + + +class BboxHead(nn.Module): + + def __init__(self, inchannels=512, num_anchors=3): + super(BboxHead, self).__init__() + self.conv1x1 = nn.Conv2d(inchannels, num_anchors * 4, kernel_size=(1, 1), stride=1, padding=0) + + def forward(self, x): + out = self.conv1x1(x) + out = out.permute(0, 2, 3, 1).contiguous() + + return out.view(out.shape[0], -1, 4) + + +class LandmarkHead(nn.Module): + + def __init__(self, inchannels=512, num_anchors=3): + super(LandmarkHead, self).__init__() + self.conv1x1 = nn.Conv2d(inchannels, num_anchors * 10, kernel_size=(1, 1), stride=1, padding=0) + + def forward(self, x): + out = self.conv1x1(x) + out = out.permute(0, 2, 3, 1).contiguous() + + return out.view(out.shape[0], -1, 10) + + +def make_class_head(fpn_num=3, inchannels=64, anchor_num=2): + classhead = nn.ModuleList() + for i in range(fpn_num): + 
classhead.append(ClassHead(inchannels, anchor_num)) + return classhead + + +def make_bbox_head(fpn_num=3, inchannels=64, anchor_num=2): + bboxhead = nn.ModuleList() + for i in range(fpn_num): + bboxhead.append(BboxHead(inchannels, anchor_num)) + return bboxhead + + +def make_landmark_head(fpn_num=3, inchannels=64, anchor_num=2): + landmarkhead = nn.ModuleList() + for i in range(fpn_num): + landmarkhead.append(LandmarkHead(inchannels, anchor_num)) + return landmarkhead diff --git a/inpaint/plugins/facexlib/detection/retinaface_utils.py b/inpaint/plugins/facexlib/detection/retinaface_utils.py new file mode 100644 index 0000000..8c35775 --- /dev/null +++ b/inpaint/plugins/facexlib/detection/retinaface_utils.py @@ -0,0 +1,421 @@ +import numpy as np +import torch +import torchvision +from itertools import product as product +from math import ceil + + +class PriorBox(object): + + def __init__(self, cfg, image_size=None, phase='train'): + super(PriorBox, self).__init__() + self.min_sizes = cfg['min_sizes'] + self.steps = cfg['steps'] + self.clip = cfg['clip'] + self.image_size = image_size + self.feature_maps = [[ceil(self.image_size[0] / step), ceil(self.image_size[1] / step)] for step in self.steps] + self.name = 's' + + def forward(self): + anchors = [] + for k, f in enumerate(self.feature_maps): + min_sizes = self.min_sizes[k] + for i, j in product(range(f[0]), range(f[1])): + for min_size in min_sizes: + s_kx = min_size / self.image_size[1] + s_ky = min_size / self.image_size[0] + dense_cx = [x * self.steps[k] / self.image_size[1] for x in [j + 0.5]] + dense_cy = [y * self.steps[k] / self.image_size[0] for y in [i + 0.5]] + for cy, cx in product(dense_cy, dense_cx): + anchors += [cx, cy, s_kx, s_ky] + + # back to torch land + output = torch.Tensor(anchors).view(-1, 4) + if self.clip: + output.clamp_(max=1, min=0) + return output + + +def py_cpu_nms(dets, thresh): + """Pure Python NMS baseline.""" + keep = torchvision.ops.nms( + boxes=torch.Tensor(dets[:, :4]), + scores=torch.Tensor(dets[:, 4]), + iou_threshold=thresh, + ) + + return list(keep) + + +def point_form(boxes): + """ Convert prior_boxes to (xmin, ymin, xmax, ymax) + representation for comparison to point form ground truth data. + Args: + boxes: (tensor) center-size default boxes from priorbox layers. + Return: + boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. + """ + return torch.cat( + ( + boxes[:, :2] - boxes[:, 2:] / 2, # xmin, ymin + boxes[:, :2] + boxes[:, 2:] / 2), + 1) # xmax, ymax + + +def center_size(boxes): + """ Convert prior_boxes to (cx, cy, w, h) + representation for comparison to center-size form ground truth data. + Args: + boxes: (tensor) point_form boxes + Return: + boxes: (tensor) Converted xmin, ymin, xmax, ymax form of boxes. + """ + return torch.cat( + (boxes[:, 2:] + boxes[:, :2]) / 2, # cx, cy + boxes[:, 2:] - boxes[:, :2], + 1) # w, h + + +def intersect(box_a, box_b): + """ We resize both tensors to [A,B,2] without new malloc: + [A,2] -> [A,1,2] -> [A,B,2] + [B,2] -> [1,B,2] -> [A,B,2] + Then we compute the area of intersect between box_a and box_b. + Args: + box_a: (tensor) bounding boxes, Shape: [A,4]. + box_b: (tensor) bounding boxes, Shape: [B,4]. + Return: + (tensor) intersection area, Shape: [A,B]. 
+ """ + A = box_a.size(0) + B = box_b.size(0) + max_xy = torch.min(box_a[:, 2:].unsqueeze(1).expand(A, B, 2), box_b[:, 2:].unsqueeze(0).expand(A, B, 2)) + min_xy = torch.max(box_a[:, :2].unsqueeze(1).expand(A, B, 2), box_b[:, :2].unsqueeze(0).expand(A, B, 2)) + inter = torch.clamp((max_xy - min_xy), min=0) + return inter[:, :, 0] * inter[:, :, 1] + + +def jaccard(box_a, box_b): + """Compute the jaccard overlap of two sets of boxes. The jaccard overlap + is simply the intersection over union of two boxes. Here we operate on + ground truth boxes and default boxes. + E.g.: + A ∩ B / A ∪ B = A ∩ B / (area(A) + area(B) - A ∩ B) + Args: + box_a: (tensor) Ground truth bounding boxes, Shape: [num_objects,4] + box_b: (tensor) Prior boxes from priorbox layers, Shape: [num_priors,4] + Return: + jaccard overlap: (tensor) Shape: [box_a.size(0), box_b.size(0)] + """ + inter = intersect(box_a, box_b) + area_a = ((box_a[:, 2] - box_a[:, 0]) * (box_a[:, 3] - box_a[:, 1])).unsqueeze(1).expand_as(inter) # [A,B] + area_b = ((box_b[:, 2] - box_b[:, 0]) * (box_b[:, 3] - box_b[:, 1])).unsqueeze(0).expand_as(inter) # [A,B] + union = area_a + area_b - inter + return inter / union # [A,B] + + +def matrix_iou(a, b): + """ + return iou of a and b, numpy version for data augenmentation + """ + lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + area_b = np.prod(b[:, 2:] - b[:, :2], axis=1) + return area_i / (area_a[:, np.newaxis] + area_b - area_i) + + +def matrix_iof(a, b): + """ + return iof of a and b, numpy version for data augenmentation + """ + lt = np.maximum(a[:, np.newaxis, :2], b[:, :2]) + rb = np.minimum(a[:, np.newaxis, 2:], b[:, 2:]) + + area_i = np.prod(rb - lt, axis=2) * (lt < rb).all(axis=2) + area_a = np.prod(a[:, 2:] - a[:, :2], axis=1) + return area_i / np.maximum(area_a[:, np.newaxis], 1) + + +def match(threshold, truths, priors, variances, labels, landms, loc_t, conf_t, landm_t, idx): + """Match each prior box with the ground truth box of the highest jaccard + overlap, encode the bounding boxes, then return the matched indices + corresponding to both confidence and location preds. + Args: + threshold: (float) The overlap threshold used when matching boxes. + truths: (tensor) Ground truth boxes, Shape: [num_obj, 4]. + priors: (tensor) Prior boxes from priorbox layers, Shape: [n_priors,4]. + variances: (tensor) Variances corresponding to each prior coord, + Shape: [num_priors, 4]. + labels: (tensor) All the class labels for the image, Shape: [num_obj]. + landms: (tensor) Ground truth landms, Shape [num_obj, 10]. + loc_t: (tensor) Tensor to be filled w/ encoded location targets. + conf_t: (tensor) Tensor to be filled w/ matched indices for conf preds. + landm_t: (tensor) Tensor to be filled w/ encoded landm targets. + idx: (int) current batch index + Return: + The matched indices corresponding to 1)location 2)confidence + 3)landm preds. 
+ """ + # jaccard index + overlaps = jaccard(truths, point_form(priors)) + # (Bipartite Matching) + # [1,num_objects] best prior for each ground truth + best_prior_overlap, best_prior_idx = overlaps.max(1, keepdim=True) + + # ignore hard gt + valid_gt_idx = best_prior_overlap[:, 0] >= 0.2 + best_prior_idx_filter = best_prior_idx[valid_gt_idx, :] + if best_prior_idx_filter.shape[0] <= 0: + loc_t[idx] = 0 + conf_t[idx] = 0 + return + + # [1,num_priors] best ground truth for each prior + best_truth_overlap, best_truth_idx = overlaps.max(0, keepdim=True) + best_truth_idx.squeeze_(0) + best_truth_overlap.squeeze_(0) + best_prior_idx.squeeze_(1) + best_prior_idx_filter.squeeze_(1) + best_prior_overlap.squeeze_(1) + best_truth_overlap.index_fill_(0, best_prior_idx_filter, 2) # ensure best prior + # TODO refactor: index best_prior_idx with long tensor + # ensure every gt matches with its prior of max overlap + for j in range(best_prior_idx.size(0)): # 判别此anchor是预测哪一个boxes + best_truth_idx[best_prior_idx[j]] = j + matches = truths[best_truth_idx] # Shape: [num_priors,4] 此处为每一个anchor对应的bbox取出来 + conf = labels[best_truth_idx] # Shape: [num_priors] 此处为每一个anchor对应的label取出来 + conf[best_truth_overlap < threshold] = 0 # label as background overlap<0.35的全部作为负样本 + loc = encode(matches, priors, variances) + + matches_landm = landms[best_truth_idx] + landm = encode_landm(matches_landm, priors, variances) + loc_t[idx] = loc # [num_priors,4] encoded offsets to learn + conf_t[idx] = conf # [num_priors] top class label for each prior + landm_t[idx] = landm + + +def encode(matched, priors, variances): + """Encode the variances from the priorbox layers into the ground truth boxes + we have matched (based on jaccard overlap) with the prior boxes. + Args: + matched: (tensor) Coords of ground truth for each prior in point-form + Shape: [num_priors, 4]. + priors: (tensor) Prior boxes in center-offset form + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + encoded boxes (tensor), Shape: [num_priors, 4] + """ + + # dist b/t match center and prior's center + g_cxcy = (matched[:, :2] + matched[:, 2:]) / 2 - priors[:, :2] + # encode variance + g_cxcy /= (variances[0] * priors[:, 2:]) + # match wh / prior wh + g_wh = (matched[:, 2:] - matched[:, :2]) / priors[:, 2:] + g_wh = torch.log(g_wh) / variances[1] + # return target for smooth_l1_loss + return torch.cat([g_cxcy, g_wh], 1) # [num_priors,4] + + +def encode_landm(matched, priors, variances): + """Encode the variances from the priorbox layers into the ground truth boxes + we have matched (based on jaccard overlap) with the prior boxes. + Args: + matched: (tensor) Coords of ground truth for each prior in point-form + Shape: [num_priors, 10]. + priors: (tensor) Prior boxes in center-offset form + Shape: [num_priors,4]. 
+ variances: (list[float]) Variances of priorboxes + Return: + encoded landm (tensor), Shape: [num_priors, 10] + """ + + # dist b/t match center and prior's center + matched = torch.reshape(matched, (matched.size(0), 5, 2)) + priors_cx = priors[:, 0].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) + priors_cy = priors[:, 1].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) + priors_w = priors[:, 2].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) + priors_h = priors[:, 3].unsqueeze(1).expand(matched.size(0), 5).unsqueeze(2) + priors = torch.cat([priors_cx, priors_cy, priors_w, priors_h], dim=2) + g_cxcy = matched[:, :, :2] - priors[:, :, :2] + # encode variance + g_cxcy /= (variances[0] * priors[:, :, 2:]) + # g_cxcy /= priors[:, :, 2:] + g_cxcy = g_cxcy.reshape(g_cxcy.size(0), -1) + # return target for smooth_l1_loss + return g_cxcy + + +# Adapted from https://github.com/Hakuyume/chainer-ssd +def decode(loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + loc (tensor): location predictions for loc layers, + Shape: [num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + + boxes = torch.cat((priors[:, :2] + loc[:, :2] * variances[0] * priors[:, 2:], + priors[:, 2:] * torch.exp(loc[:, 2:] * variances[1])), 1) + boxes[:, :2] -= boxes[:, 2:] / 2 + boxes[:, 2:] += boxes[:, :2] + return boxes + + +def decode_landm(pre, priors, variances): + """Decode landm from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + pre (tensor): landm predictions for loc layers, + Shape: [num_priors,10] + priors (tensor): Prior boxes in center-offset form. + Shape: [num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded landm predictions + """ + tmp = ( + priors[:, :2] + pre[:, :2] * variances[0] * priors[:, 2:], + priors[:, :2] + pre[:, 2:4] * variances[0] * priors[:, 2:], + priors[:, :2] + pre[:, 4:6] * variances[0] * priors[:, 2:], + priors[:, :2] + pre[:, 6:8] * variances[0] * priors[:, 2:], + priors[:, :2] + pre[:, 8:10] * variances[0] * priors[:, 2:], + ) + landms = torch.cat(tmp, dim=1) + return landms + + +def batched_decode(b_loc, priors, variances): + """Decode locations from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + b_loc (tensor): location predictions for loc layers, + Shape: [num_batches,num_priors,4] + priors (tensor): Prior boxes in center-offset form. + Shape: [1,num_priors,4]. + variances: (list[float]) Variances of priorboxes + Return: + decoded bounding box predictions + """ + boxes = ( + priors[:, :, :2] + b_loc[:, :, :2] * variances[0] * priors[:, :, 2:], + priors[:, :, 2:] * torch.exp(b_loc[:, :, 2:] * variances[1]), + ) + boxes = torch.cat(boxes, dim=2) + + boxes[:, :, :2] -= boxes[:, :, 2:] / 2 + boxes[:, :, 2:] += boxes[:, :, :2] + return boxes + + +def batched_decode_landm(pre, priors, variances): + """Decode landm from predictions using priors to undo + the encoding we did for offset regression at train time. + Args: + pre (tensor): landm predictions for loc layers, + Shape: [num_batches,num_priors,10] + priors (tensor): Prior boxes in center-offset form. + Shape: [1,num_priors,4]. 
+ variances: (list[float]) Variances of priorboxes + Return: + decoded landm predictions + """ + landms = ( + priors[:, :, :2] + pre[:, :, :2] * variances[0] * priors[:, :, 2:], + priors[:, :, :2] + pre[:, :, 2:4] * variances[0] * priors[:, :, 2:], + priors[:, :, :2] + pre[:, :, 4:6] * variances[0] * priors[:, :, 2:], + priors[:, :, :2] + pre[:, :, 6:8] * variances[0] * priors[:, :, 2:], + priors[:, :, :2] + pre[:, :, 8:10] * variances[0] * priors[:, :, 2:], + ) + landms = torch.cat(landms, dim=2) + return landms + + +def log_sum_exp(x): + """Utility function for computing log_sum_exp while determining + This will be used to determine unaveraged confidence loss across + all examples in a batch. + Args: + x (Variable(tensor)): conf_preds from conf layers + """ + x_max = x.data.max() + return torch.log(torch.sum(torch.exp(x - x_max), 1, keepdim=True)) + x_max + + +# Original author: Francisco Massa: +# https://github.com/fmassa/object-detection.torch +# Ported to PyTorch by Max deGroot (02/01/2017) +def nms(boxes, scores, overlap=0.5, top_k=200): + """Apply non-maximum suppression at test time to avoid detecting too many + overlapping bounding boxes for a given object. + Args: + boxes: (tensor) The location preds for the img, Shape: [num_priors,4]. + scores: (tensor) The class predscores for the img, Shape:[num_priors]. + overlap: (float) The overlap thresh for suppressing unnecessary boxes. + top_k: (int) The Maximum number of box preds to consider. + Return: + The indices of the kept boxes with respect to num_priors. + """ + + keep = torch.Tensor(scores.size(0)).fill_(0).long() + if boxes.numel() == 0: + return keep + x1 = boxes[:, 0] + y1 = boxes[:, 1] + x2 = boxes[:, 2] + y2 = boxes[:, 3] + area = torch.mul(x2 - x1, y2 - y1) + v, idx = scores.sort(0) # sort in ascending order + # I = I[v >= 0.01] + idx = idx[-top_k:] # indices of the top-k largest vals + xx1 = boxes.new() + yy1 = boxes.new() + xx2 = boxes.new() + yy2 = boxes.new() + w = boxes.new() + h = boxes.new() + + # keep = torch.Tensor() + count = 0 + while idx.numel() > 0: + i = idx[-1] # index of current largest val + # keep.append(i) + keep[count] = i + count += 1 + if idx.size(0) == 1: + break + idx = idx[:-1] # remove kept element from view + # load bboxes of next highest vals + torch.index_select(x1, 0, idx, out=xx1) + torch.index_select(y1, 0, idx, out=yy1) + torch.index_select(x2, 0, idx, out=xx2) + torch.index_select(y2, 0, idx, out=yy2) + # store element-wise max with next highest score + xx1 = torch.clamp(xx1, min=x1[i]) + yy1 = torch.clamp(yy1, min=y1[i]) + xx2 = torch.clamp(xx2, max=x2[i]) + yy2 = torch.clamp(yy2, max=y2[i]) + w.resize_as_(xx2) + h.resize_as_(yy2) + w = xx2 - xx1 + h = yy2 - yy1 + # check sizes of xx1 and xx2.. 
after each iteration + w = torch.clamp(w, min=0.0) + h = torch.clamp(h, min=0.0) + inter = w * h + # IoU = i / (area(a) + area(b) - i) + rem_areas = torch.index_select(area, 0, idx) # load remaining areas) + union = (rem_areas - inter) + area[i] + IoU = inter / union # store result in iou + # keep only elements with an IoU <= overlap + idx = idx[IoU.le(overlap)] + return keep, count diff --git a/inpaint/plugins/facexlib/parsing/__init__.py b/inpaint/plugins/facexlib/parsing/__init__.py new file mode 100644 index 0000000..322a87b --- /dev/null +++ b/inpaint/plugins/facexlib/parsing/__init__.py @@ -0,0 +1,24 @@ +import torch + +from ..utils import load_file_from_url +from .bisenet import BiSeNet +from .parsenet import ParseNet + + +def init_parsing_model(model_name='bisenet', half=False, device='cuda', model_rootpath=None): + if model_name == 'bisenet': + model = BiSeNet(num_class=19) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.2.0/parsing_bisenet.pth' + elif model_name == 'parsenet': + model = ParseNet(in_size=512, out_size=512, parsing_ch=19) + model_url = 'https://github.com/xinntao/facexlib/releases/download/v0.2.2/parsing_parsenet.pth' + else: + raise NotImplementedError(f'{model_name} is not implemented.') + + model_path = load_file_from_url( + url=model_url, model_dir='facexlib/weights', progress=True, file_name=None, save_dir=model_rootpath) + load_net = torch.load(model_path, map_location=lambda storage, loc: storage) + model.load_state_dict(load_net, strict=True) + model.eval() + model = model.to(device) + return model diff --git a/inpaint/plugins/facexlib/parsing/bisenet.py b/inpaint/plugins/facexlib/parsing/bisenet.py new file mode 100644 index 0000000..3898cab --- /dev/null +++ b/inpaint/plugins/facexlib/parsing/bisenet.py @@ -0,0 +1,140 @@ +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .resnet import ResNet18 + + +class ConvBNReLU(nn.Module): + + def __init__(self, in_chan, out_chan, ks=3, stride=1, padding=1): + super(ConvBNReLU, self).__init__() + self.conv = nn.Conv2d(in_chan, out_chan, kernel_size=ks, stride=stride, padding=padding, bias=False) + self.bn = nn.BatchNorm2d(out_chan) + + def forward(self, x): + x = self.conv(x) + x = F.relu(self.bn(x)) + return x + + +class BiSeNetOutput(nn.Module): + + def __init__(self, in_chan, mid_chan, num_class): + super(BiSeNetOutput, self).__init__() + self.conv = ConvBNReLU(in_chan, mid_chan, ks=3, stride=1, padding=1) + self.conv_out = nn.Conv2d(mid_chan, num_class, kernel_size=1, bias=False) + + def forward(self, x): + feat = self.conv(x) + out = self.conv_out(feat) + return out, feat + + +class AttentionRefinementModule(nn.Module): + + def __init__(self, in_chan, out_chan): + super(AttentionRefinementModule, self).__init__() + self.conv = ConvBNReLU(in_chan, out_chan, ks=3, stride=1, padding=1) + self.conv_atten = nn.Conv2d(out_chan, out_chan, kernel_size=1, bias=False) + self.bn_atten = nn.BatchNorm2d(out_chan) + self.sigmoid_atten = nn.Sigmoid() + + def forward(self, x): + feat = self.conv(x) + atten = F.avg_pool2d(feat, feat.size()[2:]) + atten = self.conv_atten(atten) + atten = self.bn_atten(atten) + atten = self.sigmoid_atten(atten) + out = torch.mul(feat, atten) + return out + + +class ContextPath(nn.Module): + + def __init__(self): + super(ContextPath, self).__init__() + self.resnet = ResNet18() + self.arm16 = AttentionRefinementModule(256, 128) + self.arm32 = AttentionRefinementModule(512, 128) + self.conv_head32 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1) + 
self.conv_head16 = ConvBNReLU(128, 128, ks=3, stride=1, padding=1) + self.conv_avg = ConvBNReLU(512, 128, ks=1, stride=1, padding=0) + + def forward(self, x): + feat8, feat16, feat32 = self.resnet(x) + h8, w8 = feat8.size()[2:] + h16, w16 = feat16.size()[2:] + h32, w32 = feat32.size()[2:] + + avg = F.avg_pool2d(feat32, feat32.size()[2:]) + avg = self.conv_avg(avg) + avg_up = F.interpolate(avg, (h32, w32), mode='nearest') + + feat32_arm = self.arm32(feat32) + feat32_sum = feat32_arm + avg_up + feat32_up = F.interpolate(feat32_sum, (h16, w16), mode='nearest') + feat32_up = self.conv_head32(feat32_up) + + feat16_arm = self.arm16(feat16) + feat16_sum = feat16_arm + feat32_up + feat16_up = F.interpolate(feat16_sum, (h8, w8), mode='nearest') + feat16_up = self.conv_head16(feat16_up) + + return feat8, feat16_up, feat32_up # x8, x8, x16 + + +class FeatureFusionModule(nn.Module): + + def __init__(self, in_chan, out_chan): + super(FeatureFusionModule, self).__init__() + self.convblk = ConvBNReLU(in_chan, out_chan, ks=1, stride=1, padding=0) + self.conv1 = nn.Conv2d(out_chan, out_chan // 4, kernel_size=1, stride=1, padding=0, bias=False) + self.conv2 = nn.Conv2d(out_chan // 4, out_chan, kernel_size=1, stride=1, padding=0, bias=False) + self.relu = nn.ReLU(inplace=True) + self.sigmoid = nn.Sigmoid() + + def forward(self, fsp, fcp): + fcat = torch.cat([fsp, fcp], dim=1) + feat = self.convblk(fcat) + atten = F.avg_pool2d(feat, feat.size()[2:]) + atten = self.conv1(atten) + atten = self.relu(atten) + atten = self.conv2(atten) + atten = self.sigmoid(atten) + feat_atten = torch.mul(feat, atten) + feat_out = feat_atten + feat + return feat_out + + +class BiSeNet(nn.Module): + + def __init__(self, num_class): + super(BiSeNet, self).__init__() + self.cp = ContextPath() + self.ffm = FeatureFusionModule(256, 256) + self.conv_out = BiSeNetOutput(256, 256, num_class) + self.conv_out16 = BiSeNetOutput(128, 64, num_class) + self.conv_out32 = BiSeNetOutput(128, 64, num_class) + + def forward(self, x, return_feat=False): + h, w = x.size()[2:] + feat_res8, feat_cp8, feat_cp16 = self.cp(x) # return res3b1 feature + feat_sp = feat_res8 # replace spatial path feature with res3b1 feature + feat_fuse = self.ffm(feat_sp, feat_cp8) + + out, feat = self.conv_out(feat_fuse) + out16, feat16 = self.conv_out16(feat_cp8) + out32, feat32 = self.conv_out32(feat_cp16) + + out = F.interpolate(out, (h, w), mode='bilinear', align_corners=True) + out16 = F.interpolate(out16, (h, w), mode='bilinear', align_corners=True) + out32 = F.interpolate(out32, (h, w), mode='bilinear', align_corners=True) + + if return_feat: + feat = F.interpolate(feat, (h, w), mode='bilinear', align_corners=True) + feat16 = F.interpolate(feat16, (h, w), mode='bilinear', align_corners=True) + feat32 = F.interpolate(feat32, (h, w), mode='bilinear', align_corners=True) + return out, out16, out32, feat, feat16, feat32 + else: + return out, out16, out32 diff --git a/inpaint/plugins/facexlib/parsing/parsenet.py b/inpaint/plugins/facexlib/parsing/parsenet.py new file mode 100644 index 0000000..e178ebe --- /dev/null +++ b/inpaint/plugins/facexlib/parsing/parsenet.py @@ -0,0 +1,194 @@ +"""Modified from https://github.com/chaofengc/PSFRGAN +""" +import numpy as np +import torch.nn as nn +from torch.nn import functional as F + + +class NormLayer(nn.Module): + """Normalization Layers. + + Args: + channels: input channels, for batch norm and instance norm. + input_size: input shape without batch size, for layer norm. 
+ """ + + def __init__(self, channels, normalize_shape=None, norm_type='bn'): + super(NormLayer, self).__init__() + norm_type = norm_type.lower() + self.norm_type = norm_type + if norm_type == 'bn': + self.norm = nn.BatchNorm2d(channels, affine=True) + elif norm_type == 'in': + self.norm = nn.InstanceNorm2d(channels, affine=False) + elif norm_type == 'gn': + self.norm = nn.GroupNorm(32, channels, affine=True) + elif norm_type == 'pixel': + self.norm = lambda x: F.normalize(x, p=2, dim=1) + elif norm_type == 'layer': + self.norm = nn.LayerNorm(normalize_shape) + elif norm_type == 'none': + self.norm = lambda x: x * 1.0 + else: + assert 1 == 0, f'Norm type {norm_type} not support.' + + def forward(self, x, ref=None): + if self.norm_type == 'spade': + return self.norm(x, ref) + else: + return self.norm(x) + + +class ReluLayer(nn.Module): + """Relu Layer. + + Args: + relu type: type of relu layer, candidates are + - ReLU + - LeakyReLU: default relu slope 0.2 + - PRelu + - SELU + - none: direct pass + """ + + def __init__(self, channels, relu_type='relu'): + super(ReluLayer, self).__init__() + relu_type = relu_type.lower() + if relu_type == 'relu': + self.func = nn.ReLU(True) + elif relu_type == 'leakyrelu': + self.func = nn.LeakyReLU(0.2, inplace=True) + elif relu_type == 'prelu': + self.func = nn.PReLU(channels) + elif relu_type == 'selu': + self.func = nn.SELU(True) + elif relu_type == 'none': + self.func = lambda x: x * 1.0 + else: + assert 1 == 0, f'Relu type {relu_type} not support.' + + def forward(self, x): + return self.func(x) + + +class ConvLayer(nn.Module): + + def __init__(self, + in_channels, + out_channels, + kernel_size=3, + scale='none', + norm_type='none', + relu_type='none', + use_pad=True, + bias=True): + super(ConvLayer, self).__init__() + self.use_pad = use_pad + self.norm_type = norm_type + if norm_type in ['bn']: + bias = False + + stride = 2 if scale == 'down' else 1 + + self.scale_func = lambda x: x + if scale == 'up': + self.scale_func = lambda x: nn.functional.interpolate(x, scale_factor=2, mode='nearest') + + self.reflection_pad = nn.ReflectionPad2d(int(np.ceil((kernel_size - 1.) 
/ 2))) + self.conv2d = nn.Conv2d(in_channels, out_channels, kernel_size, stride, bias=bias) + + self.relu = ReluLayer(out_channels, relu_type) + self.norm = NormLayer(out_channels, norm_type=norm_type) + + def forward(self, x): + out = self.scale_func(x) + if self.use_pad: + out = self.reflection_pad(out) + out = self.conv2d(out) + out = self.norm(out) + out = self.relu(out) + return out + + +class ResidualBlock(nn.Module): + """ + Residual block recommended in: http://torch.ch/blog/2016/02/04/resnets.html + """ + + def __init__(self, c_in, c_out, relu_type='prelu', norm_type='bn', scale='none'): + super(ResidualBlock, self).__init__() + + if scale == 'none' and c_in == c_out: + self.shortcut_func = lambda x: x + else: + self.shortcut_func = ConvLayer(c_in, c_out, 3, scale) + + scale_config_dict = {'down': ['none', 'down'], 'up': ['up', 'none'], 'none': ['none', 'none']} + scale_conf = scale_config_dict[scale] + + self.conv1 = ConvLayer(c_in, c_out, 3, scale_conf[0], norm_type=norm_type, relu_type=relu_type) + self.conv2 = ConvLayer(c_out, c_out, 3, scale_conf[1], norm_type=norm_type, relu_type='none') + + def forward(self, x): + identity = self.shortcut_func(x) + + res = self.conv1(x) + res = self.conv2(res) + return identity + res + + +class ParseNet(nn.Module): + + def __init__(self, + in_size=128, + out_size=128, + min_feat_size=32, + base_ch=64, + parsing_ch=19, + res_depth=10, + relu_type='LeakyReLU', + norm_type='bn', + ch_range=[32, 256]): + super().__init__() + self.res_depth = res_depth + act_args = {'norm_type': norm_type, 'relu_type': relu_type} + min_ch, max_ch = ch_range + + ch_clip = lambda x: max(min_ch, min(x, max_ch)) # noqa: E731 + min_feat_size = min(in_size, min_feat_size) + + down_steps = int(np.log2(in_size // min_feat_size)) + up_steps = int(np.log2(out_size // min_feat_size)) + + # =============== define encoder-body-decoder ==================== + self.encoder = [] + self.encoder.append(ConvLayer(3, base_ch, 3, 1)) + head_ch = base_ch + for i in range(down_steps): + cin, cout = ch_clip(head_ch), ch_clip(head_ch * 2) + self.encoder.append(ResidualBlock(cin, cout, scale='down', **act_args)) + head_ch = head_ch * 2 + + self.body = [] + for i in range(res_depth): + self.body.append(ResidualBlock(ch_clip(head_ch), ch_clip(head_ch), **act_args)) + + self.decoder = [] + for i in range(up_steps): + cin, cout = ch_clip(head_ch), ch_clip(head_ch // 2) + self.decoder.append(ResidualBlock(cin, cout, scale='up', **act_args)) + head_ch = head_ch // 2 + + self.encoder = nn.Sequential(*self.encoder) + self.body = nn.Sequential(*self.body) + self.decoder = nn.Sequential(*self.decoder) + self.out_img_conv = ConvLayer(ch_clip(head_ch), 3) + self.out_mask_conv = ConvLayer(ch_clip(head_ch), parsing_ch) + + def forward(self, x): + feat = self.encoder(x) + x = feat + self.body(feat) + x = self.decoder(x) + out_img = self.out_img_conv(x) + out_mask = self.out_mask_conv(x) + return out_mask, out_img diff --git a/inpaint/plugins/facexlib/parsing/resnet.py b/inpaint/plugins/facexlib/parsing/resnet.py new file mode 100644 index 0000000..fec8e82 --- /dev/null +++ b/inpaint/plugins/facexlib/parsing/resnet.py @@ -0,0 +1,69 @@ +import torch.nn as nn +import torch.nn.functional as F + + +def conv3x3(in_planes, out_planes, stride=1): + """3x3 convolution with padding""" + return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) + + +class BasicBlock(nn.Module): + + def __init__(self, in_chan, out_chan, stride=1): + super(BasicBlock, self).__init__() + self.conv1 
= conv3x3(in_chan, out_chan, stride) + self.bn1 = nn.BatchNorm2d(out_chan) + self.conv2 = conv3x3(out_chan, out_chan) + self.bn2 = nn.BatchNorm2d(out_chan) + self.relu = nn.ReLU(inplace=True) + self.downsample = None + if in_chan != out_chan or stride != 1: + self.downsample = nn.Sequential( + nn.Conv2d(in_chan, out_chan, kernel_size=1, stride=stride, bias=False), + nn.BatchNorm2d(out_chan), + ) + + def forward(self, x): + residual = self.conv1(x) + residual = F.relu(self.bn1(residual)) + residual = self.conv2(residual) + residual = self.bn2(residual) + + shortcut = x + if self.downsample is not None: + shortcut = self.downsample(x) + + out = shortcut + residual + out = self.relu(out) + return out + + +def create_layer_basic(in_chan, out_chan, bnum, stride=1): + layers = [BasicBlock(in_chan, out_chan, stride=stride)] + for i in range(bnum - 1): + layers.append(BasicBlock(out_chan, out_chan, stride=1)) + return nn.Sequential(*layers) + + +class ResNet18(nn.Module): + + def __init__(self): + super(ResNet18, self).__init__() + self.conv1 = nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False) + self.bn1 = nn.BatchNorm2d(64) + self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1) + self.layer1 = create_layer_basic(64, 64, bnum=2, stride=1) + self.layer2 = create_layer_basic(64, 128, bnum=2, stride=2) + self.layer3 = create_layer_basic(128, 256, bnum=2, stride=2) + self.layer4 = create_layer_basic(256, 512, bnum=2, stride=2) + + def forward(self, x): + x = self.conv1(x) + x = F.relu(self.bn1(x)) + x = self.maxpool(x) + + x = self.layer1(x) + feat8 = self.layer2(x) # 1/8 + feat16 = self.layer3(feat8) # 1/16 + feat32 = self.layer4(feat16) # 1/32 + return feat8, feat16, feat32 diff --git a/inpaint/plugins/facexlib/utils/__init__.py b/inpaint/plugins/facexlib/utils/__init__.py new file mode 100644 index 0000000..706e077 --- /dev/null +++ b/inpaint/plugins/facexlib/utils/__init__.py @@ -0,0 +1,7 @@ +from .face_utils import align_crop_face_landmarks, compute_increased_bbox, get_valid_bboxes, paste_face_back +from .misc import img2tensor, load_file_from_url, scandir + +__all__ = [ + 'align_crop_face_landmarks', 'compute_increased_bbox', 'get_valid_bboxes', 'load_file_from_url', 'paste_face_back', + 'img2tensor', 'scandir' +] diff --git a/inpaint/plugins/facexlib/utils/face_restoration_helper.py b/inpaint/plugins/facexlib/utils/face_restoration_helper.py new file mode 100644 index 0000000..a547cc8 --- /dev/null +++ b/inpaint/plugins/facexlib/utils/face_restoration_helper.py @@ -0,0 +1,473 @@ +import cv2 +import numpy as np +import os +import torch +from torchvision.transforms.functional import normalize + +from ..detection import init_detection_model +from ..parsing import init_parsing_model +from ..utils.misc import img2tensor, imwrite + + +def get_largest_face(det_faces, h, w): + def get_location(val, length): + if val < 0: + return 0 + elif val > length: + return length + else: + return val + + face_areas = [] + for det_face in det_faces: + left = get_location(det_face[0], w) + right = get_location(det_face[2], w) + top = get_location(det_face[1], h) + bottom = get_location(det_face[3], h) + face_area = (right - left) * (bottom - top) + face_areas.append(face_area) + largest_idx = face_areas.index(max(face_areas)) + return det_faces[largest_idx], largest_idx + + +def get_center_face(det_faces, h=0, w=0, center=None): + if center is not None: + center = np.array(center) + else: + center = np.array([w / 2, h / 2]) + center_dist = [] + for det_face in det_faces: + face_center = 
np.array( + [(det_face[0] + det_face[2]) / 2, (det_face[1] + det_face[3]) / 2] + ) + dist = np.linalg.norm(face_center - center) + center_dist.append(dist) + center_idx = center_dist.index(min(center_dist)) + return det_faces[center_idx], center_idx + + +class FaceRestoreHelper(object): + """Helper for the face restoration pipeline (base class).""" + + def __init__( + self, + upscale_factor, + face_size=512, + crop_ratio=(1, 1), + det_model="retinaface_resnet50", + save_ext="png", + template_3points=False, + pad_blur=False, + use_parse=False, + device=None, + model_rootpath=None, + ): + self.template_3points = template_3points # improve robustness + self.upscale_factor = upscale_factor + # the cropped face ratio based on the square face + self.crop_ratio = crop_ratio # (h, w) + assert ( + self.crop_ratio[0] >= 1 and self.crop_ratio[1] >= 1 + ), "crop ration only supports >=1" + self.face_size = ( + int(face_size * self.crop_ratio[1]), + int(face_size * self.crop_ratio[0]), + ) + + if self.template_3points: + self.face_template = np.array([[192, 240], [319, 240], [257, 371]]) + else: + # standard 5 landmarks for FFHQ faces with 512 x 512 + self.face_template = np.array( + [ + [192.98138, 239.94708], + [318.90277, 240.1936], + [256.63416, 314.01935], + [201.26117, 371.41043], + [313.08905, 371.15118], + ] + ) + self.face_template = self.face_template * (face_size / 512.0) + if self.crop_ratio[0] > 1: + self.face_template[:, 1] += face_size * (self.crop_ratio[0] - 1) / 2 + if self.crop_ratio[1] > 1: + self.face_template[:, 0] += face_size * (self.crop_ratio[1] - 1) / 2 + self.save_ext = save_ext + self.pad_blur = pad_blur + if self.pad_blur is True: + self.template_3points = False + + self.all_landmarks_5 = [] + self.det_faces = [] + self.affine_matrices = [] + self.inverse_affine_matrices = [] + self.cropped_faces = [] + self.restored_faces = [] + self.pad_input_imgs = [] + + if device is None: + self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu") + else: + self.device = device + + # init face detection model + self.face_det = init_detection_model( + det_model, half=False, device=self.device, model_rootpath=model_rootpath + ) + + # init face parsing model + self.use_parse = use_parse + self.face_parse = init_parsing_model( + model_name="parsenet", device=self.device, model_rootpath=model_rootpath + ) + + def set_upscale_factor(self, upscale_factor): + self.upscale_factor = upscale_factor + + def read_image(self, img): + """img can be image path or cv2 loaded image.""" + # self.input_img is Numpy array, (h, w, c), BGR, uint8, [0, 255] + if isinstance(img, str): + img = cv2.imread(img) + + if np.max(img) > 256: # 16-bit image + img = img / 65535 * 255 + if len(img.shape) == 2: # gray image + img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR) + elif img.shape[2] == 4: # RGBA image with alpha channel + img = img[:, :, 0:3] + + self.input_img = img + + def get_face_landmarks_5( + self, + only_keep_largest=False, + only_center_face=False, + resize=None, + blur_ratio=0.01, + eye_dist_threshold=None, + ): + if resize is None: + scale = 1 + input_img = self.input_img + else: + h, w = self.input_img.shape[0:2] + scale = min(h, w) / resize + h, w = int(h / scale), int(w / scale) + input_img = cv2.resize( + self.input_img, (w, h), interpolation=cv2.INTER_LANCZOS4 + ) + + with torch.no_grad(): + bboxes = self.face_det.detect_faces(input_img, 0.97) * scale + for bbox in bboxes: + # remove faces with too small eye distance: side faces or too small faces + eye_dist = np.linalg.norm([bbox[5] 
- bbox[7], bbox[6] - bbox[8]]) + if eye_dist_threshold is not None and (eye_dist < eye_dist_threshold): + continue + + if self.template_3points: + landmark = np.array([[bbox[i], bbox[i + 1]] for i in range(5, 11, 2)]) + else: + landmark = np.array([[bbox[i], bbox[i + 1]] for i in range(5, 15, 2)]) + self.all_landmarks_5.append(landmark) + self.det_faces.append(bbox[0:5]) + if len(self.det_faces) == 0: + return 0 + if only_keep_largest: + h, w, _ = self.input_img.shape + self.det_faces, largest_idx = get_largest_face(self.det_faces, h, w) + self.all_landmarks_5 = [self.all_landmarks_5[largest_idx]] + elif only_center_face: + h, w, _ = self.input_img.shape + self.det_faces, center_idx = get_center_face(self.det_faces, h, w) + self.all_landmarks_5 = [self.all_landmarks_5[center_idx]] + + # pad blurry images + if self.pad_blur: + self.pad_input_imgs = [] + for landmarks in self.all_landmarks_5: + # get landmarks + eye_left = landmarks[0, :] + eye_right = landmarks[1, :] + eye_avg = (eye_left + eye_right) * 0.5 + mouth_avg = (landmarks[3, :] + landmarks[4, :]) * 0.5 + eye_to_eye = eye_right - eye_left + eye_to_mouth = mouth_avg - eye_avg + + # Get the oriented crop rectangle + # x: half width of the oriented crop rectangle + x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1] + # - np.flipud(eye_to_mouth) * [-1, 1]: rotate 90 clockwise + # norm with the hypotenuse: get the direction + x /= np.hypot(*x) # get the hypotenuse of a right triangle + rect_scale = 1.5 + x *= max( + np.hypot(*eye_to_eye) * 2.0 * rect_scale, + np.hypot(*eye_to_mouth) * 1.8 * rect_scale, + ) + # y: half height of the oriented crop rectangle + y = np.flipud(x) * [-1, 1] + + # c: center + c = eye_avg + eye_to_mouth * 0.1 + # quad: (left_top, left_bottom, right_bottom, right_top) + quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y]) + # qsize: side length of the square + qsize = np.hypot(*x) * 2 + border = max(int(np.rint(qsize * 0.1)), 3) + + # get pad + # pad: (width_left, height_top, width_right, height_bottom) + pad = ( + int(np.floor(min(quad[:, 0]))), + int(np.floor(min(quad[:, 1]))), + int(np.ceil(max(quad[:, 0]))), + int(np.ceil(max(quad[:, 1]))), + ) + pad = [ + max(-pad[0] + border, 1), + max(-pad[1] + border, 1), + max(pad[2] - self.input_img.shape[0] + border, 1), + max(pad[3] - self.input_img.shape[1] + border, 1), + ] + + if max(pad) > 1: + # pad image + pad_img = np.pad( + self.input_img, + ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), + "reflect", + ) + # modify landmark coords + landmarks[:, 0] += pad[0] + landmarks[:, 1] += pad[1] + # blur pad images + h, w, _ = pad_img.shape + y, x, _ = np.ogrid[:h, :w, :1] + mask = np.maximum( + 1.0 + - np.minimum( + np.float32(x) / pad[0], np.float32(w - 1 - x) / pad[2] + ), + 1.0 + - np.minimum( + np.float32(y) / pad[1], np.float32(h - 1 - y) / pad[3] + ), + ) + blur = int(qsize * blur_ratio) + if blur % 2 == 0: + blur += 1 + blur_img = cv2.boxFilter(pad_img, 0, ksize=(blur, blur)) + # blur_img = cv2.GaussianBlur(pad_img, (blur, blur), 0) + + pad_img = pad_img.astype("float32") + pad_img += (blur_img - pad_img) * np.clip( + mask * 3.0 + 1.0, 0.0, 1.0 + ) + pad_img += (np.median(pad_img, axis=(0, 1)) - pad_img) * np.clip( + mask, 0.0, 1.0 + ) + pad_img = np.clip(pad_img, 0, 255) # float32, [0, 255] + self.pad_input_imgs.append(pad_img) + else: + self.pad_input_imgs.append(np.copy(self.input_img)) + + return len(self.all_landmarks_5) + + def align_warp_face(self, save_cropped_path=None, border_mode="constant"): + """Align and warp faces with face template.""" + 
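+ # For each detected face, estimate a similarity transform from its 5 landmarks to the canonical template and warp the input into a fixed-size aligned crop. +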
if self.pad_blur: + assert ( + len(self.pad_input_imgs) == len(self.all_landmarks_5) + ), f"Mismatched samples: {len(self.pad_input_imgs)} and {len(self.all_landmarks_5)}" + for idx, landmark in enumerate(self.all_landmarks_5): + # use 5 landmarks to get affine matrix + # use cv2.LMEDS method for the equivalence to skimage transform + # ref: https://blog.csdn.net/yichxi/article/details/115827338 + affine_matrix = cv2.estimateAffinePartial2D( + landmark, self.face_template, method=cv2.LMEDS + )[0] + self.affine_matrices.append(affine_matrix) + # warp and crop faces + if border_mode == "constant": + border_mode = cv2.BORDER_CONSTANT + elif border_mode == "reflect101": + border_mode = cv2.BORDER_REFLECT101 + elif border_mode == "reflect": + border_mode = cv2.BORDER_REFLECT + if self.pad_blur: + input_img = self.pad_input_imgs[idx] + else: + input_img = self.input_img + cropped_face = cv2.warpAffine( + input_img, + affine_matrix, + self.face_size, + borderMode=border_mode, + borderValue=(135, 133, 132), + ) # gray + self.cropped_faces.append(cropped_face) + # save the cropped face + if save_cropped_path is not None: + path = os.path.splitext(save_cropped_path)[0] + save_path = f"{path}_{idx:02d}.{self.save_ext}" + imwrite(cropped_face, save_path) + + def get_inverse_affine(self, save_inverse_affine_path=None): + """Get inverse affine matrix.""" + for idx, affine_matrix in enumerate(self.affine_matrices): + inverse_affine = cv2.invertAffineTransform(affine_matrix) + inverse_affine *= self.upscale_factor + self.inverse_affine_matrices.append(inverse_affine) + # save inverse affine matrices + if save_inverse_affine_path is not None: + path, _ = os.path.splitext(save_inverse_affine_path) + save_path = f"{path}_{idx:02d}.pth" + torch.save(inverse_affine, save_path) + + def add_restored_face(self, face): + self.restored_faces.append(face) + + def paste_faces_to_input_image(self, save_path=None, upsample_img=None): + h, w, _ = self.input_img.shape + h_up, w_up = int(h * self.upscale_factor), int(w * self.upscale_factor) + + if upsample_img is None: + # simply resize the background + upsample_img = cv2.resize( + self.input_img, (w_up, h_up), interpolation=cv2.INTER_LANCZOS4 + ) + else: + upsample_img = cv2.resize( + upsample_img, (w_up, h_up), interpolation=cv2.INTER_LANCZOS4 + ) + + assert len(self.restored_faces) == len( + self.inverse_affine_matrices + ), "length of restored_faces and affine_matrices are different." 
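+ # Warp every restored face back with its inverse affine matrix and blend it into the upscaled background along a soft mask (parsing-based when use_parse is set) to hide seams.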
+ for restored_face, inverse_affine in zip( + self.restored_faces, self.inverse_affine_matrices + ): + # Add an offset to inverse affine matrix, for more precise back alignment + if self.upscale_factor > 1: + extra_offset = 0.5 * self.upscale_factor + else: + extra_offset = 0 + inverse_affine[:, 2] += extra_offset + inv_restored = cv2.warpAffine(restored_face, inverse_affine, (w_up, h_up)) + + if self.use_parse: + # inference + face_input = cv2.resize( + restored_face, (512, 512), interpolation=cv2.INTER_LINEAR + ) + face_input = img2tensor( + face_input.astype("float32") / 255.0, bgr2rgb=True, float32=True + ) + normalize(face_input, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True) + face_input = torch.unsqueeze(face_input, 0).to(self.device) + with torch.no_grad(): + out = self.face_parse(face_input)[0] + out = out.argmax(dim=1).squeeze().cpu().numpy() + + mask = np.zeros(out.shape) + MASK_COLORMAP = [ + 0, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 255, + 0, + 255, + 0, + 0, + 0, + ] + for idx, color in enumerate(MASK_COLORMAP): + mask[out == idx] = color + # blur the mask + mask = cv2.GaussianBlur(mask, (101, 101), 11) + mask = cv2.GaussianBlur(mask, (101, 101), 11) + # remove the black borders + thres = 10 + mask[:thres, :] = 0 + mask[-thres:, :] = 0 + mask[:, :thres] = 0 + mask[:, -thres:] = 0 + mask = mask / 255.0 + + mask = cv2.resize(mask, restored_face.shape[:2]) + mask = cv2.warpAffine(mask, inverse_affine, (w_up, h_up), flags=3) + inv_soft_mask = mask[:, :, None] + pasted_face = inv_restored + + else: # use square parse maps + mask = np.ones(self.face_size, dtype=np.float32) + inv_mask = cv2.warpAffine(mask, inverse_affine, (w_up, h_up)) + # remove the black borders + inv_mask_erosion = cv2.erode( + inv_mask, + np.ones( + (int(2 * self.upscale_factor), int(2 * self.upscale_factor)), + np.uint8, + ), + ) + pasted_face = inv_mask_erosion[:, :, None] * inv_restored + total_face_area = np.sum(inv_mask_erosion) # // 3 + # compute the fusion edge based on the area of face + w_edge = int(total_face_area**0.5) // 20 + erosion_radius = w_edge * 2 + inv_mask_center = cv2.erode( + inv_mask_erosion, + np.ones((erosion_radius, erosion_radius), np.uint8), + ) + blur_size = w_edge * 2 + inv_soft_mask = cv2.GaussianBlur( + inv_mask_center, (blur_size + 1, blur_size + 1), 0 + ) + if len(upsample_img.shape) == 2: # upsample_img is gray image + upsample_img = upsample_img[:, :, None] + inv_soft_mask = inv_soft_mask[:, :, None] + + if ( + len(upsample_img.shape) == 3 and upsample_img.shape[2] == 4 + ): # alpha channel + alpha = upsample_img[:, :, 3:] + upsample_img = ( + inv_soft_mask * pasted_face + + (1 - inv_soft_mask) * upsample_img[:, :, 0:3] + ) + upsample_img = np.concatenate((upsample_img, alpha), axis=2) + else: + upsample_img = ( + inv_soft_mask * pasted_face + (1 - inv_soft_mask) * upsample_img + ) + + if np.max(upsample_img) > 256: # 16-bit image + upsample_img = upsample_img.astype(np.uint16) + else: + upsample_img = upsample_img.astype(np.uint8) + if save_path is not None: + path = os.path.splitext(save_path)[0] + save_path = f"{path}.{self.save_ext}" + imwrite(upsample_img, save_path) + return upsample_img + + def clean_all(self): + self.all_landmarks_5 = [] + self.restored_faces = [] + self.affine_matrices = [] + self.cropped_faces = [] + self.inverse_affine_matrices = [] + self.det_faces = [] + self.pad_input_imgs = [] diff --git a/inpaint/plugins/facexlib/utils/face_utils.py b/inpaint/plugins/facexlib/utils/face_utils.py new file mode 
100644 index 0000000..13ff043 --- /dev/null +++ b/inpaint/plugins/facexlib/utils/face_utils.py @@ -0,0 +1,208 @@ +import cv2 +import numpy as np +import torch + + +def compute_increased_bbox(bbox, increase_area, preserve_aspect=True): + left, top, right, bot = bbox + width = right - left + height = bot - top + + if preserve_aspect: + width_increase = max(increase_area, ((1 + 2 * increase_area) * height - width) / (2 * width)) + height_increase = max(increase_area, ((1 + 2 * increase_area) * width - height) / (2 * height)) + else: + width_increase = height_increase = increase_area + left = int(left - width_increase * width) + top = int(top - height_increase * height) + right = int(right + width_increase * width) + bot = int(bot + height_increase * height) + return (left, top, right, bot) + + +def get_valid_bboxes(bboxes, h, w): + left = max(bboxes[0], 0) + top = max(bboxes[1], 0) + right = min(bboxes[2], w) + bottom = min(bboxes[3], h) + return (left, top, right, bottom) + + +def align_crop_face_landmarks(img, + landmarks, + output_size, + transform_size=None, + enable_padding=True, + return_inverse_affine=False, + shrink_ratio=(1, 1)): + """Align and crop face with landmarks. + + The output_size and transform_size are based on width. The height is + adjusted based on shrink_ratio_h/shrink_ratio_w. + + Modified from: + https://github.com/NVlabs/ffhq-dataset/blob/master/download_ffhq.py + + Args: + img (Numpy array): Input image. + landmarks (Numpy array): 5 or 68 or 98 landmarks. + output_size (int): Output face size. + transform_size (int): Transform size. Usually four times the + output_size. + enable_padding (bool): Default: True. + shrink_ratio (float | tuple[float] | list[float]): Shrink the whole + face for height and width (crop larger area). Default: (1, 1). + + Returns: + (Numpy array): Cropped face.
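+ (Numpy array | None): Inverse affine matrix when return_inverse_affine is True, otherwise None.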
+ """ + lm_type = 'retinaface_5' # Options: dlib_5, retinaface_5 + + if isinstance(shrink_ratio, (float, int)): + shrink_ratio = (shrink_ratio, shrink_ratio) + if transform_size is None: + transform_size = output_size * 4 + + # Parse landmarks + lm = np.array(landmarks) + if lm.shape[0] == 5 and lm_type == 'retinaface_5': + eye_left = lm[0] + eye_right = lm[1] + mouth_avg = (lm[3] + lm[4]) * 0.5 + elif lm.shape[0] == 5 and lm_type == 'dlib_5': + lm_eye_left = lm[2:4] + lm_eye_right = lm[0:2] + eye_left = np.mean(lm_eye_left, axis=0) + eye_right = np.mean(lm_eye_right, axis=0) + mouth_avg = lm[4] + elif lm.shape[0] == 68: + lm_eye_left = lm[36:42] + lm_eye_right = lm[42:48] + eye_left = np.mean(lm_eye_left, axis=0) + eye_right = np.mean(lm_eye_right, axis=0) + mouth_avg = (lm[48] + lm[54]) * 0.5 + elif lm.shape[0] == 98: + lm_eye_left = lm[60:68] + lm_eye_right = lm[68:76] + eye_left = np.mean(lm_eye_left, axis=0) + eye_right = np.mean(lm_eye_right, axis=0) + mouth_avg = (lm[76] + lm[82]) * 0.5 + + eye_avg = (eye_left + eye_right) * 0.5 + eye_to_eye = eye_right - eye_left + eye_to_mouth = mouth_avg - eye_avg + + # Get the oriented crop rectangle + # x: half width of the oriented crop rectangle + x = eye_to_eye - np.flipud(eye_to_mouth) * [-1, 1] + # - np.flipud(eye_to_mouth) * [-1, 1]: rotate 90 clockwise + # norm with the hypotenuse: get the direction + x /= np.hypot(*x) # get the hypotenuse of a right triangle + rect_scale = 1 # TODO: you can edit it to get larger rect + x *= max(np.hypot(*eye_to_eye) * 2.0 * rect_scale, np.hypot(*eye_to_mouth) * 1.8 * rect_scale) + # y: half height of the oriented crop rectangle + y = np.flipud(x) * [-1, 1] + + x *= shrink_ratio[1] # width + y *= shrink_ratio[0] # height + + # c: center + c = eye_avg + eye_to_mouth * 0.1 + # quad: (left_top, left_bottom, right_bottom, right_top) + quad = np.stack([c - x - y, c - x + y, c + x + y, c + x - y]) + # qsize: side length of the square + qsize = np.hypot(*x) * 2 + + quad_ori = np.copy(quad) + # Shrink, for large face + # TODO: do we really need shrink + shrink = int(np.floor(qsize / output_size * 0.5)) + if shrink > 1: + h, w = img.shape[0:2] + rsize = (int(np.rint(float(w) / shrink)), int(np.rint(float(h) / shrink))) + img = cv2.resize(img, rsize, interpolation=cv2.INTER_AREA) + quad /= shrink + qsize /= shrink + + # Crop + h, w = img.shape[0:2] + border = max(int(np.rint(qsize * 0.1)), 3) + crop = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))), + int(np.ceil(max(quad[:, 1])))) + crop = (max(crop[0] - border, 0), max(crop[1] - border, 0), min(crop[2] + border, w), min(crop[3] + border, h)) + if crop[2] - crop[0] < w or crop[3] - crop[1] < h: + img = img[crop[1]:crop[3], crop[0]:crop[2], :] + quad -= crop[0:2] + + # Pad + # pad: (width_left, height_top, width_right, height_bottom) + h, w = img.shape[0:2] + pad = (int(np.floor(min(quad[:, 0]))), int(np.floor(min(quad[:, 1]))), int(np.ceil(max(quad[:, 0]))), + int(np.ceil(max(quad[:, 1])))) + pad = (max(-pad[0] + border, 0), max(-pad[1] + border, 0), max(pad[2] - w + border, 0), max(pad[3] - h + border, 0)) + if enable_padding and max(pad) > border - 4: + pad = np.maximum(pad, int(np.rint(qsize * 0.3))) + img = np.pad(img, ((pad[1], pad[3]), (pad[0], pad[2]), (0, 0)), 'reflect') + h, w = img.shape[0:2] + y, x, _ = np.ogrid[:h, :w, :1] + mask = np.maximum(1.0 - np.minimum(np.float32(x) / pad[0], + np.float32(w - 1 - x) / pad[2]), + 1.0 - np.minimum(np.float32(y) / pad[1], + np.float32(h - 1 - y) / pad[3])) + blur = 
int(qsize * 0.02) + if blur % 2 == 0: + blur += 1 + blur_img = cv2.boxFilter(img, 0, ksize=(blur, blur)) + + img = img.astype('float32') + img += (blur_img - img) * np.clip(mask * 3.0 + 1.0, 0.0, 1.0) + img += (np.median(img, axis=(0, 1)) - img) * np.clip(mask, 0.0, 1.0) + img = np.clip(img, 0, 255) # float32, [0, 255] + quad += pad[:2] + + # Transform use cv2 + h_ratio = shrink_ratio[0] / shrink_ratio[1] + dst_h, dst_w = int(transform_size * h_ratio), transform_size + template = np.array([[0, 0], [0, dst_h], [dst_w, dst_h], [dst_w, 0]]) + # use cv2.LMEDS method for the equivalence to skimage transform + # ref: https://blog.csdn.net/yichxi/article/details/115827338 + affine_matrix = cv2.estimateAffinePartial2D(quad, template, method=cv2.LMEDS)[0] + cropped_face = cv2.warpAffine( + img, affine_matrix, (dst_w, dst_h), borderMode=cv2.BORDER_CONSTANT, borderValue=(135, 133, 132)) # gray + + if output_size < transform_size: + cropped_face = cv2.resize( + cropped_face, (output_size, int(output_size * h_ratio)), interpolation=cv2.INTER_LINEAR) + + if return_inverse_affine: + dst_h, dst_w = int(output_size * h_ratio), output_size + template = np.array([[0, 0], [0, dst_h], [dst_w, dst_h], [dst_w, 0]]) + # use cv2.LMEDS method for the equivalence to skimage transform + # ref: https://blog.csdn.net/yichxi/article/details/115827338 + affine_matrix = cv2.estimateAffinePartial2D( + quad_ori, np.array([[0, 0], [0, output_size], [dst_w, dst_h], [dst_w, 0]]), method=cv2.LMEDS)[0] + inverse_affine = cv2.invertAffineTransform(affine_matrix) + else: + inverse_affine = None + return cropped_face, inverse_affine + + +def paste_face_back(img, face, inverse_affine): + h, w = img.shape[0:2] + face_h, face_w = face.shape[0:2] + inv_restored = cv2.warpAffine(face, inverse_affine, (w, h)) + mask = np.ones((face_h, face_w, 3), dtype=np.float32) + inv_mask = cv2.warpAffine(mask, inverse_affine, (w, h)) + # remove the black borders + inv_mask_erosion = cv2.erode(inv_mask, np.ones((2, 2), np.uint8)) + inv_restored_remove_border = inv_mask_erosion * inv_restored + total_face_area = np.sum(inv_mask_erosion) // 3 + # compute the fusion edge based on the area of face + w_edge = int(total_face_area**0.5) // 20 + erosion_radius = w_edge * 2 + inv_mask_center = cv2.erode(inv_mask_erosion, np.ones((erosion_radius, erosion_radius), np.uint8)) + blur_size = w_edge * 2 + inv_soft_mask = cv2.GaussianBlur(inv_mask_center, (blur_size + 1, blur_size + 1), 0) + img = inv_soft_mask * inv_restored_remove_border + (1 - inv_soft_mask) * img + # float32, [0, 255] + return img diff --git a/inpaint/plugins/facexlib/utils/misc.py b/inpaint/plugins/facexlib/utils/misc.py new file mode 100644 index 0000000..b1a597c --- /dev/null +++ b/inpaint/plugins/facexlib/utils/misc.py @@ -0,0 +1,118 @@ +import cv2 +import os +import os.path as osp +import torch +from torch.hub import download_url_to_file, get_dir +from urllib.parse import urlparse + +ROOT_DIR = os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) + + +def imwrite(img, file_path, params=None, auto_mkdir=True): + """Write image to file. + + Args: + img (ndarray): Image array to be written. + file_path (str): Image file path. + params (None or list): Same as opencv's :func:`imwrite` interface. + auto_mkdir (bool): If the parent folder of `file_path` does not exist, + whether to create it automatically. + + Returns: + bool: Successful or not. 
+ """ + if auto_mkdir: + dir_name = os.path.abspath(os.path.dirname(file_path)) + os.makedirs(dir_name, exist_ok=True) + return cv2.imwrite(file_path, img, params) + + +def img2tensor(imgs, bgr2rgb=True, float32=True): + """Numpy array to tensor. + + Args: + imgs (list[ndarray] | ndarray): Input images. + bgr2rgb (bool): Whether to change bgr to rgb. + float32 (bool): Whether to change to float32. + + Returns: + list[tensor] | tensor: Tensor images. If returned results only have + one element, just return tensor. + """ + + def _totensor(img, bgr2rgb, float32): + if img.shape[2] == 3 and bgr2rgb: + if img.dtype == 'float64': + img = img.astype('float32') + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + img = torch.from_numpy(img.transpose(2, 0, 1)) + if float32: + img = img.float() + return img + + if isinstance(imgs, list): + return [_totensor(img, bgr2rgb, float32) for img in imgs] + else: + return _totensor(imgs, bgr2rgb, float32) + + +def load_file_from_url(url, model_dir=None, progress=True, file_name=None, save_dir=None): + """Ref:https://github.com/1adrianb/face-alignment/blob/master/face_alignment/utils.py + """ + if model_dir is None: + hub_dir = get_dir() + model_dir = os.path.join(hub_dir, 'checkpoints') + + if save_dir is None: + save_dir = os.path.join(ROOT_DIR, model_dir) + os.makedirs(save_dir, exist_ok=True) + + parts = urlparse(url) + filename = os.path.basename(parts.path) + if file_name is not None: + filename = file_name + cached_file = os.path.abspath(os.path.join(save_dir, filename)) + if not os.path.exists(cached_file): + print(f'Downloading: "{url}" to {cached_file}\n') + download_url_to_file(url, cached_file, hash_prefix=None, progress=progress) + return cached_file + + +def scandir(dir_path, suffix=None, recursive=False, full_path=False): + """Scan a directory to find the interested files. + Args: + dir_path (str): Path of the directory. + suffix (str | tuple(str), optional): File suffix that we are + interested in. Default: None. + recursive (bool, optional): If set to True, recursively scan the + directory. Default: False. + full_path (bool, optional): If set to True, include the dir_path. + Default: False. + Returns: + A generator for all the interested files with relative paths. + """ + + if (suffix is not None) and not isinstance(suffix, (str, tuple)): + raise TypeError('"suffix" must be a string or tuple of strings') + + root = dir_path + + def _scandir(dir_path, suffix, recursive): + for entry in os.scandir(dir_path): + if not entry.name.startswith('.') and entry.is_file(): + if full_path: + return_path = entry.path + else: + return_path = osp.relpath(entry.path, root) + + if suffix is None: + yield return_path + elif return_path.endswith(suffix): + yield return_path + else: + if recursive: + yield from _scandir(entry.path, suffix=suffix, recursive=recursive) + else: + continue + + return _scandir(dir_path, suffix=suffix, recursive=recursive) diff --git a/inpaint/plugins/gfpgan/archs/gfpganv1_clean_arch.py b/inpaint/plugins/gfpgan/archs/gfpganv1_clean_arch.py new file mode 100644 index 0000000..0733216 --- /dev/null +++ b/inpaint/plugins/gfpgan/archs/gfpganv1_clean_arch.py @@ -0,0 +1,322 @@ +import math +import random +import torch +from torch import nn +from torch.nn import functional as F + +from .stylegan2_clean_arch import StyleGAN2GeneratorClean + + +class StyleGAN2GeneratorCSFT(StyleGAN2GeneratorClean): + """StyleGAN2 Generator with SFT modulation (Spatial Feature Transform). 
+ + It is the clean version without custom compiled CUDA extensions used in StyleGAN2. + + Args: + out_size (int): The spatial size of outputs. + num_style_feat (int): Channel number of style features. Default: 512. + num_mlp (int): Layer number of MLP style layers. Default: 8. + channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2. + narrow (float): The narrow ratio for channels. Default: 1. + sft_half (bool): Whether to apply SFT on half of the input channels. Default: False. + """ + + def __init__(self, out_size, num_style_feat=512, num_mlp=8, channel_multiplier=2, narrow=1, sft_half=False): + super(StyleGAN2GeneratorCSFT, self).__init__( + out_size, + num_style_feat=num_style_feat, + num_mlp=num_mlp, + channel_multiplier=channel_multiplier, + narrow=narrow) + self.sft_half = sft_half + + def forward(self, + styles, + conditions, + input_is_latent=False, + noise=None, + randomize_noise=True, + truncation=1, + truncation_latent=None, + inject_index=None, + return_latents=False): + """Forward function for StyleGAN2GeneratorCSFT. + + Args: + styles (list[Tensor]): Sample codes of styles. + conditions (list[Tensor]): SFT conditions to generators. + input_is_latent (bool): Whether input is latent style. Default: False. + noise (Tensor | None): Input noise or None. Default: None. + randomize_noise (bool): Randomize noise, used when 'noise' is False. Default: True. + truncation (float): The truncation ratio. Default: 1. + truncation_latent (Tensor | None): The truncation latent tensor. Default: None. + inject_index (int | None): The injection index for mixing noise. Default: None. + return_latents (bool): Whether to return style latents. Default: False. + """ + # style codes -> latents with Style MLP layer + if not input_is_latent: + styles = [self.style_mlp(s) for s in styles] + # noises + if noise is None: + if randomize_noise: + noise = [None] * self.num_layers # for each style conv layer + else: # use the stored noise + noise = [getattr(self.noises, f'noise{i}') for i in range(self.num_layers)] + # style truncation + if truncation < 1: + style_truncation = [] + for style in styles: + style_truncation.append(truncation_latent + truncation * (style - truncation_latent)) + styles = style_truncation + # get style latents with injection + if len(styles) == 1: + inject_index = self.num_latent + + if styles[0].ndim < 3: + # repeat latent code for all the layers + latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + else: # used for encoder with different latent code for each layer + latent = styles[0] + elif len(styles) == 2: # mixing noises + if inject_index is None: + inject_index = random.randint(1, self.num_latent - 1) + latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + latent2 = styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1) + latent = torch.cat([latent1, latent2], 1) + + # main generation + out = self.constant_input(latent.shape[0]) + out = self.style_conv1(out, latent[:, 0], noise=noise[0]) + skip = self.to_rgb1(out, latent[:, 1]) + + i = 1 + for conv1, conv2, noise1, noise2, to_rgb in zip(self.style_convs[::2], self.style_convs[1::2], noise[1::2], + noise[2::2], self.to_rgbs): + out = conv1(out, latent[:, i], noise=noise1) + + # the conditions may have fewer levels + if i < len(conditions): + # SFT part to combine the conditions + if self.sft_half: # only apply SFT to half of the channels + out_same, out_sft = torch.split(out, int(out.size(1) // 2), dim=1) + out_sft = out_sft * conditions[i - 1] + conditions[i] + 
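# conditions hold (scale, shift) pairs per resolution; only the SFT half of the channels is modulated, the other half passes through unchanged +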
out = torch.cat([out_same, out_sft], dim=1) + else: # apply SFT to all the channels + out = out * conditions[i - 1] + conditions[i] + + out = conv2(out, latent[:, i + 1], noise=noise2) + skip = to_rgb(out, latent[:, i + 2], skip) # feature back to the rgb space + i += 2 + + image = skip + + if return_latents: + return image, latent + else: + return image, None + + +class ResBlock(nn.Module): + """Residual block with bilinear upsampling/downsampling. + + Args: + in_channels (int): Channel number of the input. + out_channels (int): Channel number of the output. + mode (str): Upsampling/downsampling mode. Options: down | up. Default: down. + """ + + def __init__(self, in_channels, out_channels, mode='down'): + super(ResBlock, self).__init__() + + self.conv1 = nn.Conv2d(in_channels, in_channels, 3, 1, 1) + self.conv2 = nn.Conv2d(in_channels, out_channels, 3, 1, 1) + self.skip = nn.Conv2d(in_channels, out_channels, 1, bias=False) + if mode == 'down': + self.scale_factor = 0.5 + elif mode == 'up': + self.scale_factor = 2 + + def forward(self, x): + out = F.leaky_relu_(self.conv1(x), negative_slope=0.2) + # upsample/downsample + out = F.interpolate(out, scale_factor=self.scale_factor, mode='bilinear', align_corners=False) + out = F.leaky_relu_(self.conv2(out), negative_slope=0.2) + # skip + x = F.interpolate(x, scale_factor=self.scale_factor, mode='bilinear', align_corners=False) + skip = self.skip(x) + out = out + skip + return out + + +class GFPGANv1Clean(nn.Module): + """The GFPGAN architecture: Unet + StyleGAN2 decoder with SFT. + + It is the clean version without custom compiled CUDA extensions used in StyleGAN2. + + Ref: GFP-GAN: Towards Real-World Blind Face Restoration with Generative Facial Prior. + + Args: + out_size (int): The spatial size of outputs. + num_style_feat (int): Channel number of style features. Default: 512. + channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2. + decoder_load_path (str): The path to the pre-trained decoder model (usually, the StyleGAN2). Default: None. + fix_decoder (bool): Whether to fix the decoder. Default: True. + + num_mlp (int): Layer number of MLP style layers. Default: 8. + input_is_latent (bool): Whether input is latent style. Default: False. + different_w (bool): Whether to use different latent w for different layers. Default: False. + narrow (float): The narrow ratio for channels. Default: 1. + sft_half (bool): Whether to apply SFT on half of the input channels. Default: False. 
+ """ + + def __init__( + self, + out_size, + num_style_feat=512, + channel_multiplier=1, + decoder_load_path=None, + fix_decoder=True, + # for stylegan decoder + num_mlp=8, + input_is_latent=False, + different_w=False, + narrow=1, + sft_half=False): + + super(GFPGANv1Clean, self).__init__() + self.input_is_latent = input_is_latent + self.different_w = different_w + self.num_style_feat = num_style_feat + + unet_narrow = narrow * 0.5 # by default, use a half of input channels + channels = { + '4': int(512 * unet_narrow), + '8': int(512 * unet_narrow), + '16': int(512 * unet_narrow), + '32': int(512 * unet_narrow), + '64': int(256 * channel_multiplier * unet_narrow), + '128': int(128 * channel_multiplier * unet_narrow), + '256': int(64 * channel_multiplier * unet_narrow), + '512': int(32 * channel_multiplier * unet_narrow), + '1024': int(16 * channel_multiplier * unet_narrow) + } + + self.log_size = int(math.log(out_size, 2)) + first_out_size = 2**(int(math.log(out_size, 2))) + + self.conv_body_first = nn.Conv2d(3, channels[f'{first_out_size}'], 1) + + # downsample + in_channels = channels[f'{first_out_size}'] + self.conv_body_down = nn.ModuleList() + for i in range(self.log_size, 2, -1): + out_channels = channels[f'{2**(i - 1)}'] + self.conv_body_down.append(ResBlock(in_channels, out_channels, mode='down')) + in_channels = out_channels + + self.final_conv = nn.Conv2d(in_channels, channels['4'], 3, 1, 1) + + # upsample + in_channels = channels['4'] + self.conv_body_up = nn.ModuleList() + for i in range(3, self.log_size + 1): + out_channels = channels[f'{2**i}'] + self.conv_body_up.append(ResBlock(in_channels, out_channels, mode='up')) + in_channels = out_channels + + # to RGB + self.toRGB = nn.ModuleList() + for i in range(3, self.log_size + 1): + self.toRGB.append(nn.Conv2d(channels[f'{2**i}'], 3, 1)) + + if different_w: + linear_out_channel = (int(math.log(out_size, 2)) * 2 - 2) * num_style_feat + else: + linear_out_channel = num_style_feat + + self.final_linear = nn.Linear(channels['4'] * 4 * 4, linear_out_channel) + + # the decoder: stylegan2 generator with SFT modulations + self.stylegan_decoder = StyleGAN2GeneratorCSFT( + out_size=out_size, + num_style_feat=num_style_feat, + num_mlp=num_mlp, + channel_multiplier=channel_multiplier, + narrow=narrow, + sft_half=sft_half) + + # load pre-trained stylegan2 model if necessary + if decoder_load_path: + self.stylegan_decoder.load_state_dict( + torch.load(decoder_load_path, map_location=lambda storage, loc: storage)['params_ema']) + # fix decoder without updating params + if fix_decoder: + for _, param in self.stylegan_decoder.named_parameters(): + param.requires_grad = False + + # for SFT modulations (scale and shift) + self.condition_scale = nn.ModuleList() + self.condition_shift = nn.ModuleList() + for i in range(3, self.log_size + 1): + out_channels = channels[f'{2**i}'] + if sft_half: + sft_out_channels = out_channels + else: + sft_out_channels = out_channels * 2 + self.condition_scale.append( + nn.Sequential( + nn.Conv2d(out_channels, out_channels, 3, 1, 1), nn.LeakyReLU(0.2, True), + nn.Conv2d(out_channels, sft_out_channels, 3, 1, 1))) + self.condition_shift.append( + nn.Sequential( + nn.Conv2d(out_channels, out_channels, 3, 1, 1), nn.LeakyReLU(0.2, True), + nn.Conv2d(out_channels, sft_out_channels, 3, 1, 1))) + + def forward(self, x, return_latents=False, return_rgb=True, randomize_noise=True, **kwargs): + """Forward function for GFPGANv1Clean. + + Args: + x (Tensor): Input images. 
+ return_latents (bool): Whether to return style latents. Default: False. + return_rgb (bool): Whether return intermediate rgb images. Default: True. + randomize_noise (bool): Randomize noise, used when 'noise' is False. Default: True. + """ + conditions = [] + unet_skips = [] + out_rgbs = [] + + # encoder + feat = F.leaky_relu_(self.conv_body_first(x), negative_slope=0.2) + for i in range(self.log_size - 2): + feat = self.conv_body_down[i](feat) + unet_skips.insert(0, feat) + feat = F.leaky_relu_(self.final_conv(feat), negative_slope=0.2) + + # style code + style_code = self.final_linear(feat.view(feat.size(0), -1)) + if self.different_w: + style_code = style_code.view(style_code.size(0), -1, self.num_style_feat) + + # decode + for i in range(self.log_size - 2): + # add unet skip + feat = feat + unet_skips[i] + # ResUpLayer + feat = self.conv_body_up[i](feat) + # generate scale and shift for SFT layers + scale = self.condition_scale[i](feat) + conditions.append(scale.clone()) + shift = self.condition_shift[i](feat) + conditions.append(shift.clone()) + # generate rgb images + if return_rgb: + out_rgbs.append(self.toRGB[i](feat)) + + # decoder + image, _ = self.stylegan_decoder([style_code], + conditions, + return_latents=return_latents, + input_is_latent=self.input_is_latent, + randomize_noise=randomize_noise) + + return image, out_rgbs diff --git a/inpaint/plugins/gfpgan/archs/restoreformer_arch.py b/inpaint/plugins/gfpgan/archs/restoreformer_arch.py new file mode 100644 index 0000000..1485c3e --- /dev/null +++ b/inpaint/plugins/gfpgan/archs/restoreformer_arch.py @@ -0,0 +1,759 @@ +"""Modified from https://github.com/wzhouxiff/RestoreFormer""" + +import numpy as np +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class VectorQuantizer(nn.Module): + """ + see https://github.com/MishaLaskin/vqvae/blob/d761a999e2267766400dc646d82d3ac3657771d4/models/quantizer.py + ____________________________________________ + Discretization bottleneck part of the VQ-VAE. + Inputs: + - n_e : number of embeddings + - e_dim : dimension of embedding + - beta : commitment cost used in loss term, beta * ||z_e(x)-sg[e]||^2 + _____________________________________________ + """ + + def __init__(self, n_e, e_dim, beta): + super(VectorQuantizer, self).__init__() + self.n_e = n_e + self.e_dim = e_dim + self.beta = beta + + self.embedding = nn.Embedding(self.n_e, self.e_dim) + self.embedding.weight.data.uniform_(-1.0 / self.n_e, 1.0 / self.n_e) + + def forward(self, z): + """ + Inputs the output of the encoder network z and maps it to a discrete + one-hot vector that is the index of the closest embedding vector e_j + z (continuous) -> z_q (discrete) + z.shape = (batch, channel, height, width) + quantization pipeline: + 1. get encoder input (B,C,H,W) + 2. flatten input to (B*H*W,C) + """ + # reshape z -> (batch, height, width, channel) and flatten + z = z.permute(0, 2, 3, 1).contiguous() + z_flattened = z.view(-1, self.e_dim) + # distances from z to embeddings e_j (z - e)^2 = z^2 + e^2 - 2 e * z + + d = ( + torch.sum(z_flattened**2, dim=1, keepdim=True) + + torch.sum(self.embedding.weight**2, dim=1) + - 2 * torch.matmul(z_flattened, self.embedding.weight.t()) + ) + + # could possible replace this here + # #\start... 
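+ # nearest-neighbour lookup over the codebook: d holds squared distances from every flattened feature vector to every embedding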
+ # find closest encodings + + min_value, min_encoding_indices = torch.min(d, dim=1) + + min_encoding_indices = min_encoding_indices.unsqueeze(1) + + min_encodings = torch.zeros(min_encoding_indices.shape[0], self.n_e).to(z) + min_encodings.scatter_(1, min_encoding_indices, 1) + + # dtype min encodings: torch.float32 + # min_encodings shape: torch.Size([2048, 512]) + # min_encoding_indices.shape: torch.Size([2048, 1]) + + # get quantized latent vectors + z_q = torch.matmul(min_encodings, self.embedding.weight).view(z.shape) + # .........\end + + # with: + # .........\start + # min_encoding_indices = torch.argmin(d, dim=1) + # z_q = self.embedding(min_encoding_indices) + # ......\end......... (TODO) + + # compute loss for embedding + loss = torch.mean((z_q.detach() - z) ** 2) + self.beta * torch.mean( + (z_q - z.detach()) ** 2 + ) + + # preserve gradients + z_q = z + (z_q - z).detach() + + # perplexity + + e_mean = torch.mean(min_encodings, dim=0) + perplexity = torch.exp(-torch.sum(e_mean * torch.log(e_mean + 1e-10))) + + # reshape back to match original input shape + z_q = z_q.permute(0, 3, 1, 2).contiguous() + + return z_q, loss, (perplexity, min_encodings, min_encoding_indices, d) + + def get_codebook_entry(self, indices, shape): + # shape specifying (batch, height, width, channel) + # TODO: check for more easy handling with nn.Embedding + min_encodings = torch.zeros(indices.shape[0], self.n_e).to(indices) + min_encodings.scatter_(1, indices[:, None], 1) + + # get quantized latent vectors + z_q = torch.matmul(min_encodings.float(), self.embedding.weight) + + if shape is not None: + z_q = z_q.view(shape) + + # reshape back to match original input shape + z_q = z_q.permute(0, 3, 1, 2).contiguous() + + return z_q + + +# pytorch_diffusion + derived encoder decoder +def nonlinearity(x): + # swish + return x * torch.sigmoid(x) + + +def Normalize(in_channels): + return torch.nn.GroupNorm( + num_groups=32, num_channels=in_channels, eps=1e-6, affine=True + ) + + +class Upsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, x): + x = torch.nn.functional.interpolate(x, scale_factor=2.0, mode="nearest") + if self.with_conv: + x = self.conv(x) + return x + + +class Downsample(nn.Module): + def __init__(self, in_channels, with_conv): + super().__init__() + self.with_conv = with_conv + if self.with_conv: + # no asymmetric padding in torch conv, must do it ourselves + self.conv = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=3, stride=2, padding=0 + ) + + def forward(self, x): + if self.with_conv: + pad = (0, 1, 0, 1) + x = torch.nn.functional.pad(x, pad, mode="constant", value=0) + x = self.conv(x) + else: + x = torch.nn.functional.avg_pool2d(x, kernel_size=2, stride=2) + return x + + +class ResnetBlock(nn.Module): + def __init__( + self, + *, + in_channels, + out_channels=None, + conv_shortcut=False, + dropout, + temb_channels=512, + ): + super().__init__() + self.in_channels = in_channels + out_channels = in_channels if out_channels is None else out_channels + self.out_channels = out_channels + self.use_conv_shortcut = conv_shortcut + + self.norm1 = Normalize(in_channels) + self.conv1 = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1 + ) + if temb_channels > 0: + self.temb_proj = torch.nn.Linear(temb_channels, out_channels) + self.norm2 = 
Normalize(out_channels) + self.dropout = torch.nn.Dropout(dropout) + self.conv2 = torch.nn.Conv2d( + out_channels, out_channels, kernel_size=3, stride=1, padding=1 + ) + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + self.conv_shortcut = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=3, stride=1, padding=1 + ) + else: + self.nin_shortcut = torch.nn.Conv2d( + in_channels, out_channels, kernel_size=1, stride=1, padding=0 + ) + + def forward(self, x, temb): + h = x + h = self.norm1(h) + h = nonlinearity(h) + h = self.conv1(h) + + if temb is not None: + h = h + self.temb_proj(nonlinearity(temb))[:, :, None, None] + + h = self.norm2(h) + h = nonlinearity(h) + h = self.dropout(h) + h = self.conv2(h) + + if self.in_channels != self.out_channels: + if self.use_conv_shortcut: + x = self.conv_shortcut(x) + else: + x = self.nin_shortcut(x) + + return x + h + + +class MultiHeadAttnBlock(nn.Module): + def __init__(self, in_channels, head_size=1): + super().__init__() + self.in_channels = in_channels + self.head_size = head_size + self.att_size = in_channels // head_size + assert ( + in_channels % head_size == 0 + ), "The size of head should be divided by the number of channels." + + self.norm1 = Normalize(in_channels) + self.norm2 = Normalize(in_channels) + + self.q = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + self.k = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + self.v = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + self.proj_out = torch.nn.Conv2d( + in_channels, in_channels, kernel_size=1, stride=1, padding=0 + ) + self.num = 0 + + def forward(self, x, y=None): + h_ = x + h_ = self.norm1(h_) + if y is None: + y = h_ + else: + y = self.norm2(y) + + q = self.q(y) + k = self.k(h_) + v = self.v(h_) + + # compute attention + b, c, h, w = q.shape + q = q.reshape(b, self.head_size, self.att_size, h * w) + q = q.permute(0, 3, 1, 2) # b, hw, head, att + + k = k.reshape(b, self.head_size, self.att_size, h * w) + k = k.permute(0, 3, 1, 2) + + v = v.reshape(b, self.head_size, self.att_size, h * w) + v = v.permute(0, 3, 1, 2) + + q = q.transpose(1, 2) + v = v.transpose(1, 2) + k = k.transpose(1, 2).transpose(2, 3) + + scale = int(self.att_size) ** (-0.5) + q.mul_(scale) + w_ = torch.matmul(q, k) + w_ = F.softmax(w_, dim=3) + + w_ = w_.matmul(v) + + w_ = w_.transpose(1, 2).contiguous() # [b, h*w, head, att] + w_ = w_.view(b, h, w, -1) + w_ = w_.permute(0, 3, 1, 2) + + w_ = self.proj_out(w_) + + return x + w_ + + +class MultiHeadEncoder(nn.Module): + def __init__( + self, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks=2, + attn_resolutions=(16,), + dropout=0.0, + resamp_with_conv=True, + in_channels=3, + resolution=512, + z_channels=256, + double_z=True, + enable_mid=True, + head_size=1, + **ignore_kwargs, + ): + super().__init__() + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.enable_mid = enable_mid + + # downsampling + self.conv_in = torch.nn.Conv2d( + in_channels, self.ch, kernel_size=3, stride=1, padding=1 + ) + + curr_res = resolution + in_ch_mult = (1,) + tuple(ch_mult) + self.down = nn.ModuleList() + for i_level in range(self.num_resolutions): + block = nn.ModuleList() + attn = nn.ModuleList() + block_in = ch * in_ch_mult[i_level] + block_out = ch * ch_mult[i_level] + for i_block in 
range(self.num_res_blocks): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(MultiHeadAttnBlock(block_in, head_size)) + down = nn.Module() + down.block = block + down.attn = attn + if i_level != self.num_resolutions - 1: + down.downsample = Downsample(block_in, resamp_with_conv) + curr_res = curr_res // 2 + self.down.append(down) + + # middle + if self.enable_mid: + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + self.mid.attn_1 = MultiHeadAttnBlock(block_in, head_size) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, + 2 * z_channels if double_z else z_channels, + kernel_size=3, + stride=1, + padding=1, + ) + + def forward(self, x): + hs = {} + # timestep embedding + temb = None + + # downsampling + h = self.conv_in(x) + hs["in"] = h + for i_level in range(self.num_resolutions): + for i_block in range(self.num_res_blocks): + h = self.down[i_level].block[i_block](h, temb) + if len(self.down[i_level].attn) > 0: + h = self.down[i_level].attn[i_block](h) + + if i_level != self.num_resolutions - 1: + # hs.append(h) + hs["block_" + str(i_level)] = h + h = self.down[i_level].downsample(h) + + # middle + # h = hs[-1] + if self.enable_mid: + h = self.mid.block_1(h, temb) + hs["block_" + str(i_level) + "_atten"] = h + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + hs["mid_atten"] = h + + # end + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + # hs.append(h) + hs["out"] = h + + return hs + + +class MultiHeadDecoder(nn.Module): + def __init__( + self, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks=2, + attn_resolutions=(16,), + dropout=0.0, + resamp_with_conv=True, + in_channels=3, + resolution=512, + z_channels=256, + give_pre_end=False, + enable_mid=True, + head_size=1, + **ignorekwargs, + ): + super().__init__() + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.enable_mid = enable_mid + + # compute in_ch_mult, block_in and curr_res at lowest res + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2 ** (self.num_resolutions - 1) + self.z_shape = (1, z_channels, curr_res, curr_res) + print( + "Working with z of shape {} = {} dimensions.".format( + self.z_shape, np.prod(self.z_shape) + ) + ) + + # z to block_in + self.conv_in = torch.nn.Conv2d( + z_channels, block_in, kernel_size=3, stride=1, padding=1 + ) + + # middle + if self.enable_mid: + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + self.mid.attn_1 = MultiHeadAttnBlock(block_in, head_size) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): 
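+ # the decoder stacks num_res_blocks + 1 residual blocks per resolution, one more than the encoder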
+ block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(MultiHeadAttnBlock(block_in, head_size)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = torch.nn.Conv2d( + block_in, out_ch, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, z): + # assert z.shape[1:] == self.z_shape[1:] + self.last_z_shape = z.shape + + # timestep embedding + temb = None + + # z to block_in + h = self.conv_in(z) + + # middle + if self.enable_mid: + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h, temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block](h) + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + if self.give_pre_end: + return h + + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class MultiHeadDecoderTransformer(nn.Module): + def __init__( + self, + ch, + out_ch, + ch_mult=(1, 2, 4, 8), + num_res_blocks=2, + attn_resolutions=(16,), + dropout=0.0, + resamp_with_conv=True, + in_channels=3, + resolution=512, + z_channels=256, + give_pre_end=False, + enable_mid=True, + head_size=1, + **ignorekwargs, + ): + super().__init__() + self.ch = ch + self.temb_ch = 0 + self.num_resolutions = len(ch_mult) + self.num_res_blocks = num_res_blocks + self.resolution = resolution + self.in_channels = in_channels + self.give_pre_end = give_pre_end + self.enable_mid = enable_mid + + # compute in_ch_mult, block_in and curr_res at lowest res + block_in = ch * ch_mult[self.num_resolutions - 1] + curr_res = resolution // 2 ** (self.num_resolutions - 1) + self.z_shape = (1, z_channels, curr_res, curr_res) + print( + "Working with z of shape {} = {} dimensions.".format( + self.z_shape, np.prod(self.z_shape) + ) + ) + + # z to block_in + self.conv_in = torch.nn.Conv2d( + z_channels, block_in, kernel_size=3, stride=1, padding=1 + ) + + # middle + if self.enable_mid: + self.mid = nn.Module() + self.mid.block_1 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + self.mid.attn_1 = MultiHeadAttnBlock(block_in, head_size) + self.mid.block_2 = ResnetBlock( + in_channels=block_in, + out_channels=block_in, + temb_channels=self.temb_ch, + dropout=dropout, + ) + + # upsampling + self.up = nn.ModuleList() + for i_level in reversed(range(self.num_resolutions)): + block = nn.ModuleList() + attn = nn.ModuleList() + block_out = ch * ch_mult[i_level] + for i_block in range(self.num_res_blocks + 1): + block.append( + ResnetBlock( + in_channels=block_in, + out_channels=block_out, + temb_channels=self.temb_ch, + dropout=dropout, + ) + ) + block_in = block_out + if curr_res in attn_resolutions: + attn.append(MultiHeadAttnBlock(block_in, head_size)) + up = nn.Module() + up.block = block + up.attn = attn + if i_level != 0: + up.upsample = Upsample(block_in, resamp_with_conv) + curr_res = curr_res * 2 + self.up.insert(0, up) # prepend to get consistent order + + # end + self.norm_out = Normalize(block_in) + self.conv_out = 
torch.nn.Conv2d( + block_in, out_ch, kernel_size=3, stride=1, padding=1 + ) + + def forward(self, z, hs): + # assert z.shape[1:] == self.z_shape[1:] + # self.last_z_shape = z.shape + + # timestep embedding + temb = None + + # z to block_in + h = self.conv_in(z) + + # middle + if self.enable_mid: + h = self.mid.block_1(h, temb) + h = self.mid.attn_1(h, hs["mid_atten"]) + h = self.mid.block_2(h, temb) + + # upsampling + for i_level in reversed(range(self.num_resolutions)): + for i_block in range(self.num_res_blocks + 1): + h = self.up[i_level].block[i_block](h, temb) + if len(self.up[i_level].attn) > 0: + h = self.up[i_level].attn[i_block]( + h, hs["block_" + str(i_level) + "_atten"] + ) + # hfeature = h.clone() + if i_level != 0: + h = self.up[i_level].upsample(h) + + # end + if self.give_pre_end: + return h + + h = self.norm_out(h) + h = nonlinearity(h) + h = self.conv_out(h) + return h + + +class RestoreFormer(nn.Module): + def __init__( + self, + n_embed=1024, + embed_dim=256, + ch=64, + out_ch=3, + ch_mult=(1, 2, 2, 4, 4, 8), + num_res_blocks=2, + attn_resolutions=(16,), + dropout=0.0, + in_channels=3, + resolution=512, + z_channels=256, + double_z=False, + enable_mid=True, + fix_decoder=False, + fix_codebook=True, + fix_encoder=False, + head_size=8, + ): + super(RestoreFormer, self).__init__() + + self.encoder = MultiHeadEncoder( + ch=ch, + out_ch=out_ch, + ch_mult=ch_mult, + num_res_blocks=num_res_blocks, + attn_resolutions=attn_resolutions, + dropout=dropout, + in_channels=in_channels, + resolution=resolution, + z_channels=z_channels, + double_z=double_z, + enable_mid=enable_mid, + head_size=head_size, + ) + self.decoder = MultiHeadDecoderTransformer( + ch=ch, + out_ch=out_ch, + ch_mult=ch_mult, + num_res_blocks=num_res_blocks, + attn_resolutions=attn_resolutions, + dropout=dropout, + in_channels=in_channels, + resolution=resolution, + z_channels=z_channels, + enable_mid=enable_mid, + head_size=head_size, + ) + + self.quantize = VectorQuantizer(n_embed, embed_dim, beta=0.25) + + self.quant_conv = torch.nn.Conv2d(z_channels, embed_dim, 1) + self.post_quant_conv = torch.nn.Conv2d(embed_dim, z_channels, 1) + + if fix_decoder: + for _, param in self.decoder.named_parameters(): + param.requires_grad = False + for _, param in self.post_quant_conv.named_parameters(): + param.requires_grad = False + for _, param in self.quantize.named_parameters(): + param.requires_grad = False + elif fix_codebook: + for _, param in self.quantize.named_parameters(): + param.requires_grad = False + + if fix_encoder: + for _, param in self.encoder.named_parameters(): + param.requires_grad = False + + def encode(self, x): + hs = self.encoder(x) + h = self.quant_conv(hs["out"]) + quant, emb_loss, info = self.quantize(h) + return quant, emb_loss, info, hs + + def decode(self, quant, hs): + quant = self.post_quant_conv(quant) + dec = self.decoder(quant, hs) + + return dec + + def forward(self, input, **kwargs): + quant, diff, info, hs = self.encode(input) + dec = self.decode(quant, hs) + + return dec, None diff --git a/inpaint/plugins/gfpgan/archs/stylegan2_clean_arch.py b/inpaint/plugins/gfpgan/archs/stylegan2_clean_arch.py new file mode 100644 index 0000000..553368a --- /dev/null +++ b/inpaint/plugins/gfpgan/archs/stylegan2_clean_arch.py @@ -0,0 +1,434 @@ +import math +import random +import torch +from torch import nn +from torch.nn import functional as F + +from iopaint.plugins.basicsr.arch_util import default_init_weights + + +class NormStyleCode(nn.Module): + def forward(self, x): + """Normalize the style 
codes. + + Args: + x (Tensor): Style codes with shape (b, c). + + Returns: + Tensor: Normalized tensor. + """ + return x * torch.rsqrt(torch.mean(x**2, dim=1, keepdim=True) + 1e-8) + + +class ModulatedConv2d(nn.Module): + """Modulated Conv2d used in StyleGAN2. + + There is no bias in ModulatedConv2d. + + Args: + in_channels (int): Channel number of the input. + out_channels (int): Channel number of the output. + kernel_size (int): Size of the convolving kernel. + num_style_feat (int): Channel number of style features. + demodulate (bool): Whether to demodulate in the conv layer. Default: True. + sample_mode (str | None): Indicating 'upsample', 'downsample' or None. Default: None. + eps (float): A value added to the denominator for numerical stability. Default: 1e-8. + """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + num_style_feat, + demodulate=True, + sample_mode=None, + eps=1e-8, + ): + super(ModulatedConv2d, self).__init__() + self.in_channels = in_channels + self.out_channels = out_channels + self.kernel_size = kernel_size + self.demodulate = demodulate + self.sample_mode = sample_mode + self.eps = eps + + # modulation inside each modulated conv + self.modulation = nn.Linear(num_style_feat, in_channels, bias=True) + # initialization + default_init_weights( + self.modulation, + scale=1, + bias_fill=1, + a=0, + mode="fan_in", + nonlinearity="linear", + ) + + self.weight = nn.Parameter( + torch.randn(1, out_channels, in_channels, kernel_size, kernel_size) + / math.sqrt(in_channels * kernel_size**2) + ) + self.padding = kernel_size // 2 + + def forward(self, x, style): + """Forward function. + + Args: + x (Tensor): Tensor with shape (b, c, h, w). + style (Tensor): Tensor with shape (b, num_style_feat). + + Returns: + Tensor: Modulated tensor after convolution. + """ + b, c, h, w = x.shape # c = c_in + # weight modulation + style = self.modulation(style).view(b, 1, c, 1, 1) + # self.weight: (1, c_out, c_in, k, k); style: (b, 1, c, 1, 1) + weight = self.weight * style # (b, c_out, c_in, k, k) + + if self.demodulate: + demod = torch.rsqrt(weight.pow(2).sum([2, 3, 4]) + self.eps) + weight = weight * demod.view(b, self.out_channels, 1, 1, 1) + + weight = weight.view( + b * self.out_channels, c, self.kernel_size, self.kernel_size + ) + + # upsample or downsample if necessary + if self.sample_mode == "upsample": + x = F.interpolate(x, scale_factor=2, mode="bilinear", align_corners=False) + elif self.sample_mode == "downsample": + x = F.interpolate(x, scale_factor=0.5, mode="bilinear", align_corners=False) + + b, c, h, w = x.shape + x = x.view(1, b * c, h, w) + # weight: (b*c_out, c_in, k, k), groups=b + out = F.conv2d(x, weight, padding=self.padding, groups=b) + out = out.view(b, self.out_channels, *out.shape[2:4]) + + return out + + def __repr__(self): + return ( + f"{self.__class__.__name__}(in_channels={self.in_channels}, out_channels={self.out_channels}, " + f"kernel_size={self.kernel_size}, demodulate={self.demodulate}, sample_mode={self.sample_mode})" + ) + + +class StyleConv(nn.Module): + """Style conv used in StyleGAN2. + + Args: + in_channels (int): Channel number of the input. + out_channels (int): Channel number of the output. + kernel_size (int): Size of the convolving kernel. + num_style_feat (int): Channel number of style features. + demodulate (bool): Whether demodulate in the conv layer. Default: True. + sample_mode (str | None): Indicating 'upsample', 'downsample' or None. Default: None. 
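+
+    Example (illustrative sketch only; the tensor shapes below are assumptions,
+    not part of the original interface docs):
+
+        conv = StyleConv(512, 512, kernel_size=3, num_style_feat=512)
+        feat = torch.randn(2, 512, 16, 16)   # (b, c_in, h, w)
+        style = torch.randn(2, 512)          # (b, num_style_feat)
+        out = conv(feat, style)              # noise is sampled internally when noise=None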
+ """ + + def __init__( + self, + in_channels, + out_channels, + kernel_size, + num_style_feat, + demodulate=True, + sample_mode=None, + ): + super(StyleConv, self).__init__() + self.modulated_conv = ModulatedConv2d( + in_channels, + out_channels, + kernel_size, + num_style_feat, + demodulate=demodulate, + sample_mode=sample_mode, + ) + self.weight = nn.Parameter(torch.zeros(1)) # for noise injection + self.bias = nn.Parameter(torch.zeros(1, out_channels, 1, 1)) + self.activate = nn.LeakyReLU(negative_slope=0.2, inplace=True) + + def forward(self, x, style, noise=None): + # modulate + out = self.modulated_conv(x, style) * 2**0.5 # for conversion + # noise injection + if noise is None: + b, _, h, w = out.shape + noise = out.new_empty(b, 1, h, w).normal_() + out = out + self.weight * noise + # add bias + out = out + self.bias + # activation + out = self.activate(out) + return out + + +class ToRGB(nn.Module): + """To RGB (image space) from features. + + Args: + in_channels (int): Channel number of input. + num_style_feat (int): Channel number of style features. + upsample (bool): Whether to upsample. Default: True. + """ + + def __init__(self, in_channels, num_style_feat, upsample=True): + super(ToRGB, self).__init__() + self.upsample = upsample + self.modulated_conv = ModulatedConv2d( + in_channels, + 3, + kernel_size=1, + num_style_feat=num_style_feat, + demodulate=False, + sample_mode=None, + ) + self.bias = nn.Parameter(torch.zeros(1, 3, 1, 1)) + + def forward(self, x, style, skip=None): + """Forward function. + + Args: + x (Tensor): Feature tensor with shape (b, c, h, w). + style (Tensor): Tensor with shape (b, num_style_feat). + skip (Tensor): Base/skip tensor. Default: None. + + Returns: + Tensor: RGB images. + """ + out = self.modulated_conv(x, style) + out = out + self.bias + if skip is not None: + if self.upsample: + skip = F.interpolate( + skip, scale_factor=2, mode="bilinear", align_corners=False + ) + out = out + skip + return out + + +class ConstantInput(nn.Module): + """Constant input. + + Args: + num_channel (int): Channel number of constant input. + size (int): Spatial size of constant input. + """ + + def __init__(self, num_channel, size): + super(ConstantInput, self).__init__() + self.weight = nn.Parameter(torch.randn(1, num_channel, size, size)) + + def forward(self, batch): + out = self.weight.repeat(batch, 1, 1, 1) + return out + + +class StyleGAN2GeneratorClean(nn.Module): + """Clean version of StyleGAN2 Generator. + + Args: + out_size (int): The spatial size of outputs. + num_style_feat (int): Channel number of style features. Default: 512. + num_mlp (int): Layer number of MLP style layers. Default: 8. + channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2. + narrow (float): Narrow ratio for channels. Default: 1.0. 
+ """ + + def __init__( + self, out_size, num_style_feat=512, num_mlp=8, channel_multiplier=2, narrow=1 + ): + super(StyleGAN2GeneratorClean, self).__init__() + # Style MLP layers + self.num_style_feat = num_style_feat + style_mlp_layers = [NormStyleCode()] + for i in range(num_mlp): + style_mlp_layers.extend( + [ + nn.Linear(num_style_feat, num_style_feat, bias=True), + nn.LeakyReLU(negative_slope=0.2, inplace=True), + ] + ) + self.style_mlp = nn.Sequential(*style_mlp_layers) + # initialization + default_init_weights( + self.style_mlp, + scale=1, + bias_fill=0, + a=0.2, + mode="fan_in", + nonlinearity="leaky_relu", + ) + + # channel list + channels = { + "4": int(512 * narrow), + "8": int(512 * narrow), + "16": int(512 * narrow), + "32": int(512 * narrow), + "64": int(256 * channel_multiplier * narrow), + "128": int(128 * channel_multiplier * narrow), + "256": int(64 * channel_multiplier * narrow), + "512": int(32 * channel_multiplier * narrow), + "1024": int(16 * channel_multiplier * narrow), + } + self.channels = channels + + self.constant_input = ConstantInput(channels["4"], size=4) + self.style_conv1 = StyleConv( + channels["4"], + channels["4"], + kernel_size=3, + num_style_feat=num_style_feat, + demodulate=True, + sample_mode=None, + ) + self.to_rgb1 = ToRGB(channels["4"], num_style_feat, upsample=False) + + self.log_size = int(math.log(out_size, 2)) + self.num_layers = (self.log_size - 2) * 2 + 1 + self.num_latent = self.log_size * 2 - 2 + + self.style_convs = nn.ModuleList() + self.to_rgbs = nn.ModuleList() + self.noises = nn.Module() + + in_channels = channels["4"] + # noise + for layer_idx in range(self.num_layers): + resolution = 2 ** ((layer_idx + 5) // 2) + shape = [1, 1, resolution, resolution] + self.noises.register_buffer(f"noise{layer_idx}", torch.randn(*shape)) + # style convs and to_rgbs + for i in range(3, self.log_size + 1): + out_channels = channels[f"{2 ** i}"] + self.style_convs.append( + StyleConv( + in_channels, + out_channels, + kernel_size=3, + num_style_feat=num_style_feat, + demodulate=True, + sample_mode="upsample", + ) + ) + self.style_convs.append( + StyleConv( + out_channels, + out_channels, + kernel_size=3, + num_style_feat=num_style_feat, + demodulate=True, + sample_mode=None, + ) + ) + self.to_rgbs.append(ToRGB(out_channels, num_style_feat, upsample=True)) + in_channels = out_channels + + def make_noise(self): + """Make noise for noise injection.""" + device = self.constant_input.weight.device + noises = [torch.randn(1, 1, 4, 4, device=device)] + + for i in range(3, self.log_size + 1): + for _ in range(2): + noises.append(torch.randn(1, 1, 2**i, 2**i, device=device)) + + return noises + + def get_latent(self, x): + return self.style_mlp(x) + + def mean_latent(self, num_latent): + latent_in = torch.randn( + num_latent, self.num_style_feat, device=self.constant_input.weight.device + ) + latent = self.style_mlp(latent_in).mean(0, keepdim=True) + return latent + + def forward( + self, + styles, + input_is_latent=False, + noise=None, + randomize_noise=True, + truncation=1, + truncation_latent=None, + inject_index=None, + return_latents=False, + ): + """Forward function for StyleGAN2GeneratorClean. + + Args: + styles (list[Tensor]): Sample codes of styles. + input_is_latent (bool): Whether input is latent style. Default: False. + noise (Tensor | None): Input noise or None. Default: None. + randomize_noise (bool): Randomize noise, used when 'noise' is False. Default: True. + truncation (float): The truncation ratio. Default: 1. 
+ truncation_latent (Tensor | None): The truncation latent tensor. Default: None. + inject_index (int | None): The injection index for mixing noise. Default: None. + return_latents (bool): Whether to return style latents. Default: False. + """ + # style codes -> latents with Style MLP layer + if not input_is_latent: + styles = [self.style_mlp(s) for s in styles] + # noises + if noise is None: + if randomize_noise: + noise = [None] * self.num_layers # for each style conv layer + else: # use the stored noise + noise = [ + getattr(self.noises, f"noise{i}") for i in range(self.num_layers) + ] + # style truncation + if truncation < 1: + style_truncation = [] + for style in styles: + style_truncation.append( + truncation_latent + truncation * (style - truncation_latent) + ) + styles = style_truncation + # get style latents with injection + if len(styles) == 1: + inject_index = self.num_latent + + if styles[0].ndim < 3: + # repeat latent code for all the layers + latent = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + else: # used for encoder with different latent code for each layer + latent = styles[0] + elif len(styles) == 2: # mixing noises + if inject_index is None: + inject_index = random.randint(1, self.num_latent - 1) + latent1 = styles[0].unsqueeze(1).repeat(1, inject_index, 1) + latent2 = ( + styles[1].unsqueeze(1).repeat(1, self.num_latent - inject_index, 1) + ) + latent = torch.cat([latent1, latent2], 1) + + # main generation + out = self.constant_input(latent.shape[0]) + out = self.style_conv1(out, latent[:, 0], noise=noise[0]) + skip = self.to_rgb1(out, latent[:, 1]) + + i = 1 + for conv1, conv2, noise1, noise2, to_rgb in zip( + self.style_convs[::2], + self.style_convs[1::2], + noise[1::2], + noise[2::2], + self.to_rgbs, + ): + out = conv1(out, latent[:, i], noise=noise1) + out = conv2(out, latent[:, i + 1], noise=noise2) + skip = to_rgb(out, latent[:, i + 2], skip) # feature back to the rgb space + i += 2 + + image = skip + + if return_latents: + return image, latent + else: + return image, None diff --git a/inpaint/plugins/gfpgan_plugin.py b/inpaint/plugins/gfpgan_plugin.py new file mode 100644 index 0000000..760f525 --- /dev/null +++ b/inpaint/plugins/gfpgan_plugin.py @@ -0,0 +1,61 @@ +import cv2 +import numpy as np +from loguru import logger + +from iopaint.helper import download_model +from iopaint.plugins.base_plugin import BasePlugin +from iopaint.schema import RunPluginRequest + + +class GFPGANPlugin(BasePlugin): + name = "GFPGAN" + support_gen_image = True + + def __init__(self, device, upscaler=None): + super().__init__() + from .gfpganer import MyGFPGANer + + url = "https://github.com/TencentARC/GFPGAN/releases/download/v1.3.0/GFPGANv1.4.pth" + model_md5 = "94d735072630ab734561130a47bc44f8" + model_path = download_model(url, model_md5) + logger.info(f"GFPGAN model path: {model_path}") + + # Use GFPGAN for face enhancement + self.face_enhancer = MyGFPGANer( + model_path=model_path, + upscale=1, + arch="clean", + channel_multiplier=2, + device=device, + bg_upsampler=upscaler.model if upscaler is not None else None, + ) + self.face_enhancer.face_helper.face_det.mean_tensor.to(device) + self.face_enhancer.face_helper.face_det = ( + self.face_enhancer.face_helper.face_det.to(device) + ) + + def gen_image(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray: + weight = 0.5 + bgr_np_img = cv2.cvtColor(rgb_np_img, cv2.COLOR_RGB2BGR) + logger.info(f"GFPGAN input shape: {bgr_np_img.shape}") + _, _, bgr_output = self.face_enhancer.enhance( + bgr_np_img, + has_aligned=False, 
+ only_center_face=False, + paste_back=True, + weight=weight, + ) + logger.info(f"GFPGAN output shape: {bgr_output.shape}") + + # try: + # if scale != 2: + # interpolation = cv2.INTER_AREA if scale < 2 else cv2.INTER_LANCZOS4 + # h, w = img.shape[0:2] + # output = cv2.resize( + # output, + # (int(w * scale / 2), int(h * scale / 2)), + # interpolation=interpolation, + # ) + # except Exception as error: + # print("wrong scale input.", error) + return bgr_output diff --git a/inpaint/plugins/gfpganer.py b/inpaint/plugins/gfpganer.py new file mode 100644 index 0000000..26cdb71 --- /dev/null +++ b/inpaint/plugins/gfpganer.py @@ -0,0 +1,156 @@ +import os + +import cv2 +import torch +from torchvision.transforms.functional import normalize +from torch.hub import get_dir + +from .facexlib.utils.face_restoration_helper import FaceRestoreHelper +from .gfpgan.archs.gfpganv1_clean_arch import GFPGANv1Clean +from .basicsr.img_util import img2tensor, tensor2img + + +class MyGFPGANer: + """Helper for restoration with GFPGAN. + + It will detect and crop faces, and then resize the faces to 512x512. + GFPGAN is used to restored the resized faces. + The background is upsampled with the bg_upsampler. + Finally, the faces will be pasted back to the upsample background image. + + Args: + model_path (str): The path to the GFPGAN model. It can be urls (will first download it automatically). + upscale (float): The upscale of the final output. Default: 2. + arch (str): The GFPGAN architecture. Option: clean | original. Default: clean. + channel_multiplier (int): Channel multiplier for large networks of StyleGAN2. Default: 2. + bg_upsampler (nn.Module): The upsampler for the background. Default: None. + """ + + def __init__( + self, + model_path, + upscale=2, + arch="clean", + channel_multiplier=2, + bg_upsampler=None, + device=None, + ): + self.upscale = upscale + self.bg_upsampler = bg_upsampler + + # initialize model + self.device = ( + torch.device("cuda" if torch.cuda.is_available() else "cpu") + if device is None + else device + ) + # initialize the GFP-GAN + if arch == "clean": + self.gfpgan = GFPGANv1Clean( + out_size=512, + num_style_feat=512, + channel_multiplier=channel_multiplier, + decoder_load_path=None, + fix_decoder=False, + num_mlp=8, + input_is_latent=True, + different_w=True, + narrow=1, + sft_half=True, + ) + elif arch == "RestoreFormer": + from .gfpgan.archs.restoreformer_arch import RestoreFormer + + self.gfpgan = RestoreFormer() + + hub_dir = get_dir() + model_dir = os.path.join(hub_dir, "checkpoints") + + # initialize face helper + self.face_helper = FaceRestoreHelper( + upscale, + face_size=512, + crop_ratio=(1, 1), + det_model="retinaface_resnet50", + save_ext="png", + use_parse=True, + device=self.device, + model_rootpath=model_dir, + ) + + loadnet = torch.load(model_path) + if "params_ema" in loadnet: + keyname = "params_ema" + else: + keyname = "params" + self.gfpgan.load_state_dict(loadnet[keyname], strict=True) + self.gfpgan.eval() + self.gfpgan = self.gfpgan.to(self.device) + + @torch.no_grad() + def enhance( + self, + img, + has_aligned=False, + only_center_face=False, + paste_back=True, + weight=0.5, + ): + self.face_helper.clean_all() + + if has_aligned: # the inputs are already aligned + img = cv2.resize(img, (512, 512)) + self.face_helper.cropped_faces = [img] + else: + self.face_helper.read_image(img) + # get face landmarks for each face + self.face_helper.get_face_landmarks_5( + only_center_face=only_center_face, eye_dist_threshold=5 + ) + # eye_dist_threshold=5: skip faces whose 
eye distance is smaller than 5 pixels + # TODO: even with eye_dist_threshold, it will still introduce wrong detections and restorations. + # align and warp each face + self.face_helper.align_warp_face() + + # face restoration + for cropped_face in self.face_helper.cropped_faces: + # prepare data + cropped_face_t = img2tensor( + cropped_face / 255.0, bgr2rgb=True, float32=True + ) + normalize(cropped_face_t, (0.5, 0.5, 0.5), (0.5, 0.5, 0.5), inplace=True) + cropped_face_t = cropped_face_t.unsqueeze(0).to(self.device) + + try: + output = self.gfpgan(cropped_face_t, return_rgb=False, weight=weight)[0] + # convert to image + restored_face = tensor2img( + output.squeeze(0), rgb2bgr=True, min_max=(-1, 1) + ) + except RuntimeError as error: + print(f"\tFailed inference for GFPGAN: {error}.") + restored_face = cropped_face + + restored_face = restored_face.astype("uint8") + self.face_helper.add_restored_face(restored_face) + + if not has_aligned and paste_back: + # upsample the background + if self.bg_upsampler is not None: + # Now only support RealESRGAN for upsampling background + bg_img = self.bg_upsampler.enhance(img, outscale=self.upscale)[0] + else: + bg_img = None + + self.face_helper.get_inverse_affine(None) + # paste each restored face to the input image + restored_img = self.face_helper.paste_faces_to_input_image( + upsample_img=bg_img + ) + return ( + self.face_helper.cropped_faces, + self.face_helper.restored_faces, + restored_img, + ) + else: + return self.face_helper.cropped_faces, self.face_helper.restored_faces, None diff --git a/inpaint/plugins/interactive_seg.py b/inpaint/plugins/interactive_seg.py new file mode 100644 index 0000000..27859fa --- /dev/null +++ b/inpaint/plugins/interactive_seg.py @@ -0,0 +1,130 @@ +import hashlib +from typing import List + +import numpy as np +import torch +from loguru import logger + +from iopaint.helper import download_model +from iopaint.plugins.base_plugin import BasePlugin +from iopaint.plugins.segment_anything import SamPredictor, sam_model_registry +from iopaint.plugins.segment_anything.predictor_hq import SamHQPredictor +from iopaint.plugins.segment_anything2.build_sam import build_sam2 +from iopaint.plugins.segment_anything2.sam2_image_predictor import SAM2ImagePredictor +from iopaint.schema import RunPluginRequest + +# 从小到大 +SEGMENT_ANYTHING_MODELS = { + "vit_b": { + "url": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_b_01ec64.pth", + "md5": "01ec64d29a2fca3f0661936605ae66f8", + }, + "vit_l": { + "url": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_l_0b3195.pth", + "md5": "0b3195507c641ddb6910d2bb5adee89c", + }, + "vit_h": { + "url": "https://dl.fbaipublicfiles.com/segment_anything/sam_vit_h_4b8939.pth", + "md5": "4b8939a88964f0f4ff5f5b2642c598a6", + }, + "mobile_sam": { + "url": "https://github.com/Sanster/models/releases/download/MobileSAM/mobile_sam.pt", + "md5": "f3c0d8cda613564d499310dab6c812cd", + }, + "sam_hq_vit_b": { + "url": "https://huggingface.co/lkeab/hq-sam/resolve/main/sam_hq_vit_b.pth", + "md5": "c6b8953247bcfdc8bb8ef91e36a6cacc", + }, + "sam_hq_vit_l": { + "url": "https://huggingface.co/lkeab/hq-sam/resolve/main/sam_hq_vit_l.pth", + "md5": "08947267966e4264fb39523eccc33f86", + }, + "sam_hq_vit_h": { + "url": "https://huggingface.co/lkeab/hq-sam/resolve/main/sam_hq_vit_h.pth", + "md5": "3560f6b6a5a6edacd814a1325c39640a", + }, + "sam2_tiny": { + "url": "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_tiny.pt", + "md5": "99eacccce4ada0b35153d4fd7af05297", + }, + "sam2_small": { + 
"url": "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_small.pt", + "md5": "7f320dbeb497330a2472da5a16c7324d", + }, + "sam2_base": { + "url": "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_base_plus.pt", + "md5": "09dc5a3d7719f64aaea1d37341ef26f2", + }, + "sam2_large": { + "url": "https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_large.pt", + "md5": "08083462423be3260cd6a5eef94dc01c", + }, +} + + +class InteractiveSeg(BasePlugin): + name = "InteractiveSeg" + support_gen_mask = True + + def __init__(self, model_name, device): + super().__init__() + self.model_name = model_name + self.device = device + self._init_session(model_name) + + def _init_session(self, model_name: str): + model_path = download_model( + SEGMENT_ANYTHING_MODELS[model_name]["url"], + SEGMENT_ANYTHING_MODELS[model_name]["md5"], + ) + logger.info(f"SegmentAnything model path: {model_path}") + if "sam_hq" in model_name: + self.predictor = SamHQPredictor( + sam_model_registry[model_name](checkpoint=model_path).to(self.device) + ) + elif model_name.startswith("sam2"): + sam2_model = build_sam2( + model_name, ckpt_path=model_path, device=self.device + ) + self.predictor = SAM2ImagePredictor(sam2_model) + else: + self.predictor = SamPredictor( + sam_model_registry[model_name](checkpoint=model_path).to(self.device) + ) + self.prev_img_md5 = None + + def switch_model(self, new_model_name): + if self.model_name == new_model_name: + return + + logger.info( + f"Switching InteractiveSeg model from {self.model_name} to {new_model_name}" + ) + self._init_session(new_model_name) + self.model_name = new_model_name + + def gen_mask(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray: + img_md5 = hashlib.md5(req.image.encode("utf-8")).hexdigest() + return self.forward(rgb_np_img, req.clicks, img_md5) + + @torch.inference_mode() + def forward(self, rgb_np_img, clicks: List[List], img_md5: str): + input_point = [] + input_label = [] + for click in clicks: + x = click[0] + y = click[1] + input_point.append([x, y]) + input_label.append(click[2]) + + if img_md5 and img_md5 != self.prev_img_md5: + self.prev_img_md5 = img_md5 + self.predictor.set_image(rgb_np_img) + + masks, _, _ = self.predictor.predict( + point_coords=np.array(input_point), + point_labels=np.array(input_label), + multimask_output=False, + ) + mask = masks[0].astype(np.uint8) * 255 + return mask diff --git a/inpaint/plugins/realesrgan.py b/inpaint/plugins/realesrgan.py new file mode 100644 index 0000000..21e0a8f --- /dev/null +++ b/inpaint/plugins/realesrgan.py @@ -0,0 +1,468 @@ +import math + +import cv2 +import numpy as np +import torch +from torch import nn +import torch.nn.functional as F +from loguru import logger + +from iopaint.helper import download_model +from iopaint.plugins.base_plugin import BasePlugin +from iopaint.schema import RunPluginRequest, RealESRGANModel + + +class RealESRGANer: + """A helper class for upsampling images with RealESRGAN. + + Args: + scale (int): Upsampling scale factor used in the networks. It is usually 2 or 4. + model_path (str): The path to the pretrained model. It can be urls (will first download it automatically). + model (nn.Module): The defined network. Default: None. + tile (int): As too large images result in the out of GPU memory issue, so this tile option will first crop + input images into tiles, and then process each of them. Finally, they will be merged into one image. + 0 denotes for do not use tile. Default: 0. 
+ tile_pad (int): The pad size for each tile, to remove border artifacts. Default: 10. + pre_pad (int): Pad the input images to avoid border artifacts. Default: 10. + half (float): Whether to use half precision during inference. Default: False. + """ + + def __init__( + self, + scale, + model_path, + dni_weight=None, + model=None, + tile=0, + tile_pad=10, + pre_pad=10, + half=False, + device=None, + gpu_id=None, + ): + self.scale = scale + self.tile_size = tile + self.tile_pad = tile_pad + self.pre_pad = pre_pad + self.mod_scale = None + self.half = half + + # initialize model + if gpu_id: + self.device = ( + torch.device(f"cuda:{gpu_id}" if torch.cuda.is_available() else "cpu") + if device is None + else device + ) + else: + self.device = ( + torch.device("cuda" if torch.cuda.is_available() else "cpu") + if device is None + else device + ) + + if isinstance(model_path, list): + # dni + assert len(model_path) == len( + dni_weight + ), "model_path and dni_weight should have the save length." + loadnet = self.dni(model_path[0], model_path[1], dni_weight) + else: + # if the model_path starts with https, it will first download models to the folder: weights + loadnet = torch.load(model_path, map_location=torch.device("cpu")) + + # prefer to use params_ema + if "params_ema" in loadnet: + keyname = "params_ema" + else: + keyname = "params" + model.load_state_dict(loadnet[keyname], strict=True) + + model.eval() + self.model = model.to(self.device) + if self.half: + self.model = self.model.half() + + def dni(self, net_a, net_b, dni_weight, key="params", loc="cpu"): + """Deep network interpolation. + + ``Paper: Deep Network Interpolation for Continuous Imagery Effect Transition`` + """ + net_a = torch.load(net_a, map_location=torch.device(loc)) + net_b = torch.load(net_b, map_location=torch.device(loc)) + for k, v_a in net_a[key].items(): + net_a[key][k] = dni_weight[0] * v_a + dni_weight[1] * net_b[key][k] + return net_a + + def pre_process(self, img): + """Pre-process, such as pre-pad and mod pad, so that the images can be divisible""" + img = torch.from_numpy(np.transpose(img, (2, 0, 1))).float() + self.img = img.unsqueeze(0).to(self.device) + if self.half: + self.img = self.img.half() + + # pre_pad + if self.pre_pad != 0: + self.img = F.pad(self.img, (0, self.pre_pad, 0, self.pre_pad), "reflect") + # mod pad for divisible borders + if self.scale == 2: + self.mod_scale = 2 + elif self.scale == 1: + self.mod_scale = 4 + if self.mod_scale is not None: + self.mod_pad_h, self.mod_pad_w = 0, 0 + _, _, h, w = self.img.size() + if h % self.mod_scale != 0: + self.mod_pad_h = self.mod_scale - h % self.mod_scale + if w % self.mod_scale != 0: + self.mod_pad_w = self.mod_scale - w % self.mod_scale + self.img = F.pad( + self.img, (0, self.mod_pad_w, 0, self.mod_pad_h), "reflect" + ) + + def process(self): + # model inference + self.output = self.model(self.img) + + def tile_process(self): + """It will first crop input images to tiles, and then process each tile. + Finally, all the processed tiles are merged into one images. 
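+
+        For example (illustrative numbers): a 1080x1920 input with tile_size=512 is split
+        into math.ceil(1920/512) * math.ceil(1080/512) = 4 * 3 = 12 tiles, each padded by
+        up to tile_pad pixels on every side before inference.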
+ + Modified from: https://github.com/ata4/esrgan-launcher + """ + batch, channel, height, width = self.img.shape + output_height = height * self.scale + output_width = width * self.scale + output_shape = (batch, channel, output_height, output_width) + + # start with black image + self.output = self.img.new_zeros(output_shape) + tiles_x = math.ceil(width / self.tile_size) + tiles_y = math.ceil(height / self.tile_size) + + # loop over all tiles + for y in range(tiles_y): + for x in range(tiles_x): + # extract tile from input image + ofs_x = x * self.tile_size + ofs_y = y * self.tile_size + # input tile area on total image + input_start_x = ofs_x + input_end_x = min(ofs_x + self.tile_size, width) + input_start_y = ofs_y + input_end_y = min(ofs_y + self.tile_size, height) + + # input tile area on total image with padding + input_start_x_pad = max(input_start_x - self.tile_pad, 0) + input_end_x_pad = min(input_end_x + self.tile_pad, width) + input_start_y_pad = max(input_start_y - self.tile_pad, 0) + input_end_y_pad = min(input_end_y + self.tile_pad, height) + + # input tile dimensions + input_tile_width = input_end_x - input_start_x + input_tile_height = input_end_y - input_start_y + tile_idx = y * tiles_x + x + 1 + input_tile = self.img[ + :, + :, + input_start_y_pad:input_end_y_pad, + input_start_x_pad:input_end_x_pad, + ] + + # upscale tile + try: + with torch.no_grad(): + output_tile = self.model(input_tile) + except RuntimeError as error: + print("Error", error) + print(f"\tTile {tile_idx}/{tiles_x * tiles_y}") + + # output tile area on total image + output_start_x = input_start_x * self.scale + output_end_x = input_end_x * self.scale + output_start_y = input_start_y * self.scale + output_end_y = input_end_y * self.scale + + # output tile area without padding + output_start_x_tile = (input_start_x - input_start_x_pad) * self.scale + output_end_x_tile = output_start_x_tile + input_tile_width * self.scale + output_start_y_tile = (input_start_y - input_start_y_pad) * self.scale + output_end_y_tile = output_start_y_tile + input_tile_height * self.scale + + # put tile into output image + self.output[ + :, :, output_start_y:output_end_y, output_start_x:output_end_x + ] = output_tile[ + :, + :, + output_start_y_tile:output_end_y_tile, + output_start_x_tile:output_end_x_tile, + ] + + def post_process(self): + # remove extra pad + if self.mod_scale is not None: + _, _, h, w = self.output.size() + self.output = self.output[ + :, + :, + 0 : h - self.mod_pad_h * self.scale, + 0 : w - self.mod_pad_w * self.scale, + ] + # remove prepad + if self.pre_pad != 0: + _, _, h, w = self.output.size() + self.output = self.output[ + :, + :, + 0 : h - self.pre_pad * self.scale, + 0 : w - self.pre_pad * self.scale, + ] + return self.output + + @torch.no_grad() + def enhance(self, img, outscale=None, alpha_upsampler="realesrgan"): + h_input, w_input = img.shape[0:2] + # img: numpy + img = img.astype(np.float32) + if np.max(img) > 256: # 16-bit image + max_range = 65535 + print("\tInput is a 16-bit image") + else: + max_range = 255 + img = img / max_range + if len(img.shape) == 2: # gray image + img_mode = "L" + img = cv2.cvtColor(img, cv2.COLOR_GRAY2RGB) + elif img.shape[2] == 4: # RGBA image with alpha channel + img_mode = "RGBA" + alpha = img[:, :, 3] + img = img[:, :, 0:3] + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + if alpha_upsampler == "realesrgan": + alpha = cv2.cvtColor(alpha, cv2.COLOR_GRAY2RGB) + else: + img_mode = "RGB" + img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB) + + # ------------------- process 
image (without the alpha channel) ------------------- # + self.pre_process(img) + if self.tile_size > 0: + self.tile_process() + else: + self.process() + output_img = self.post_process() + output_img = output_img.data.squeeze().float().cpu().clamp_(0, 1).numpy() + output_img = np.transpose(output_img[[2, 1, 0], :, :], (1, 2, 0)) + if img_mode == "L": + output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2GRAY) + + # ------------------- process the alpha channel if necessary ------------------- # + if img_mode == "RGBA": + if alpha_upsampler == "realesrgan": + self.pre_process(alpha) + if self.tile_size > 0: + self.tile_process() + else: + self.process() + output_alpha = self.post_process() + output_alpha = ( + output_alpha.data.squeeze().float().cpu().clamp_(0, 1).numpy() + ) + output_alpha = np.transpose(output_alpha[[2, 1, 0], :, :], (1, 2, 0)) + output_alpha = cv2.cvtColor(output_alpha, cv2.COLOR_BGR2GRAY) + else: # use the cv2 resize for alpha channel + h, w = alpha.shape[0:2] + output_alpha = cv2.resize( + alpha, + (w * self.scale, h * self.scale), + interpolation=cv2.INTER_LINEAR, + ) + + # merge the alpha channel + output_img = cv2.cvtColor(output_img, cv2.COLOR_BGR2BGRA) + output_img[:, :, 3] = output_alpha + + # ------------------------------ return ------------------------------ # + if max_range == 65535: # 16-bit image + output = (output_img * 65535.0).round().astype(np.uint16) + else: + output = (output_img * 255.0).round().astype(np.uint8) + + if outscale is not None and outscale != float(self.scale): + output = cv2.resize( + output, + ( + int(w_input * outscale), + int(h_input * outscale), + ), + interpolation=cv2.INTER_LANCZOS4, + ) + + return output, img_mode + + +class SRVGGNetCompact(nn.Module): + """A compact VGG-style network structure for super-resolution. + + It is a compact network structure, which performs upsampling in the last layer and no convolution is + conducted on the HR feature space. + + Args: + num_in_ch (int): Channel number of inputs. Default: 3. + num_out_ch (int): Channel number of outputs. Default: 3. + num_feat (int): Channel number of intermediate features. Default: 64. + num_conv (int): Number of convolution layers in the body network. Default: 16. + upscale (int): Upsampling factor. Default: 4. + act_type (str): Activation type, options: 'relu', 'prelu', 'leakyrelu'. Default: prelu. 
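+
+    Example (illustrative sketch; these settings mirror the realesr-general-x4v3 entry
+    defined later in this file, while the input size is an assumption):
+
+        net = SRVGGNetCompact(num_in_ch=3, num_out_ch=3, num_feat=64, num_conv=32,
+                              upscale=4, act_type="prelu")
+        lr = torch.randn(1, 3, 64, 64)
+        sr = net(lr)   # (1, 3, 256, 256): PixelShuffle output plus a nearest-upsampled residual base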
+ """ + + def __init__( + self, + num_in_ch=3, + num_out_ch=3, + num_feat=64, + num_conv=16, + upscale=4, + act_type="prelu", + ): + super(SRVGGNetCompact, self).__init__() + self.num_in_ch = num_in_ch + self.num_out_ch = num_out_ch + self.num_feat = num_feat + self.num_conv = num_conv + self.upscale = upscale + self.act_type = act_type + + self.body = nn.ModuleList() + # the first conv + self.body.append(nn.Conv2d(num_in_ch, num_feat, 3, 1, 1)) + # the first activation + if act_type == "relu": + activation = nn.ReLU(inplace=True) + elif act_type == "prelu": + activation = nn.PReLU(num_parameters=num_feat) + elif act_type == "leakyrelu": + activation = nn.LeakyReLU(negative_slope=0.1, inplace=True) + self.body.append(activation) + + # the body structure + for _ in range(num_conv): + self.body.append(nn.Conv2d(num_feat, num_feat, 3, 1, 1)) + # activation + if act_type == "relu": + activation = nn.ReLU(inplace=True) + elif act_type == "prelu": + activation = nn.PReLU(num_parameters=num_feat) + elif act_type == "leakyrelu": + activation = nn.LeakyReLU(negative_slope=0.1, inplace=True) + self.body.append(activation) + + # the last conv + self.body.append(nn.Conv2d(num_feat, num_out_ch * upscale * upscale, 3, 1, 1)) + # upsample + self.upsampler = nn.PixelShuffle(upscale) + + def forward(self, x): + out = x + for i in range(0, len(self.body)): + out = self.body[i](out) + + out = self.upsampler(out) + # add the nearest upsampled image, so that the network learns the residual + base = F.interpolate(x, scale_factor=self.upscale, mode="nearest") + out += base + return out + + +class RealESRGANUpscaler(BasePlugin): + name = "RealESRGAN" + support_gen_image = True + + def __init__(self, name, device, no_half=False): + super().__init__() + self.model_name = name + self.device = device + self.no_half = no_half + self._init_model(name) + + def _init_model(self, name): + from .basicsr import RRDBNet + + REAL_ESRGAN_MODELS = { + RealESRGANModel.realesr_general_x4v3: { + "url": "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.5.0/realesr-general-x4v3.pth", + "scale": 4, + "model": lambda: SRVGGNetCompact( + num_in_ch=3, + num_out_ch=3, + num_feat=64, + num_conv=32, + upscale=4, + act_type="prelu", + ), + "model_md5": "91a7644643c884ee00737db24e478156", + }, + RealESRGANModel.RealESRGAN_x4plus: { + "url": "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.1.0/RealESRGAN_x4plus.pth", + "scale": 4, + "model": lambda: RRDBNet( + num_in_ch=3, + num_out_ch=3, + num_feat=64, + num_block=23, + num_grow_ch=32, + scale=4, + ), + "model_md5": "99ec365d4afad750833258a1a24f44ca", + }, + RealESRGANModel.RealESRGAN_x4plus_anime_6B: { + "url": "https://github.com/xinntao/Real-ESRGAN/releases/download/v0.2.2.4/RealESRGAN_x4plus_anime_6B.pth", + "scale": 4, + "model": lambda: RRDBNet( + num_in_ch=3, + num_out_ch=3, + num_feat=64, + num_block=6, + num_grow_ch=32, + scale=4, + ), + "model_md5": "d58ce384064ec1591c2ea7b79dbf47ba", + }, + } + if name not in REAL_ESRGAN_MODELS: + raise ValueError(f"Unknown RealESRGAN model name: {name}") + model_info = REAL_ESRGAN_MODELS[name] + + model_path = download_model(model_info["url"], model_info["model_md5"]) + logger.info(f"RealESRGAN model path: {model_path}") + + self.model = RealESRGANer( + scale=model_info["scale"], + model_path=model_path, + model=model_info["model"](), + half=True if "cuda" in str(self.device) and not self.no_half else False, + tile=512, + tile_pad=10, + pre_pad=10, + device=self.device, + ) + + def switch_model(self, new_model_name: str): 
+ if self.model_name == new_model_name: + return + self._init_model(new_model_name) + self.model_name = new_model_name + + def gen_image(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray: + bgr_np_img = cv2.cvtColor(rgb_np_img, cv2.COLOR_RGB2BGR) + logger.info(f"RealESRGAN input shape: {bgr_np_img.shape}, scale: {req.scale}") + result = self.forward(bgr_np_img, req.scale) + logger.info(f"RealESRGAN output shape: {result.shape}") + return result + + @torch.inference_mode() + def forward(self, bgr_np_img, scale: float): + # 输出是 BGR + upsampled = self.model.enhance(bgr_np_img, outscale=scale)[0] + return upsampled diff --git a/inpaint/plugins/remove_bg.py b/inpaint/plugins/remove_bg.py new file mode 100644 index 0000000..64bf785 --- /dev/null +++ b/inpaint/plugins/remove_bg.py @@ -0,0 +1,71 @@ +import os +import cv2 +import numpy as np +from loguru import logger +from torch.hub import get_dir + +from iopaint.plugins.base_plugin import BasePlugin +from iopaint.schema import RunPluginRequest, RemoveBGModel + + +class RemoveBG(BasePlugin): + name = "RemoveBG" + support_gen_mask = True + support_gen_image = True + + def __init__(self, model_name): + super().__init__() + self.model_name = model_name + + hub_dir = get_dir() + model_dir = os.path.join(hub_dir, "checkpoints") + os.environ["U2NET_HOME"] = model_dir + + self._init_session(model_name) + + def _init_session(self, model_name: str): + if model_name == RemoveBGModel.briaai_rmbg_1_4: + from iopaint.plugins.briarmbg import ( + create_briarmbg_session, + briarmbg_process, + ) + + self.session = create_briarmbg_session() + self.remove = briarmbg_process + else: + from rembg import new_session, remove + + self.session = new_session(model_name=model_name) + self.remove = remove + + def switch_model(self, new_model_name): + if self.model_name == new_model_name: + return + + logger.info( + f"Switching removebg model from {self.model_name} to {new_model_name}" + ) + self._init_session(new_model_name) + self.model_name = new_model_name + + def gen_image(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray: + bgr_np_img = cv2.cvtColor(rgb_np_img, cv2.COLOR_RGB2BGR) + + # return BGRA image + output = self.remove(bgr_np_img, session=self.session) + return cv2.cvtColor(output, cv2.COLOR_BGRA2RGBA) + + def gen_mask(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray: + bgr_np_img = cv2.cvtColor(rgb_np_img, cv2.COLOR_RGB2BGR) + + # return BGR image, 255 means foreground, 0 means background + output = self.remove(bgr_np_img, session=self.session, only_mask=True) + return output + + def check_dep(self): + try: + import rembg + except ImportError: + return ( + "RemoveBG is not installed, please install it first. 
pip install rembg" + ) diff --git a/inpaint/plugins/restoreformer.py b/inpaint/plugins/restoreformer.py new file mode 100644 index 0000000..9bc3f07 --- /dev/null +++ b/inpaint/plugins/restoreformer.py @@ -0,0 +1,44 @@ +import cv2 +import numpy as np +from loguru import logger + +from iopaint.helper import download_model +from iopaint.plugins.base_plugin import BasePlugin +from iopaint.schema import RunPluginRequest + + +class RestoreFormerPlugin(BasePlugin): + name = "RestoreFormer" + support_gen_image = True + + def __init__(self, device, upscaler=None): + super().__init__() + from .gfpganer import MyGFPGANer + + url = "https://github.com/TencentARC/GFPGAN/releases/download/v1.3.4/RestoreFormer.pth" + model_md5 = "eaeeff6c4a1caa1673977cb374e6f699" + model_path = download_model(url, model_md5) + logger.info(f"RestoreFormer model path: {model_path}") + + self.face_enhancer = MyGFPGANer( + model_path=model_path, + upscale=1, + arch="RestoreFormer", + channel_multiplier=2, + device=device, + bg_upsampler=upscaler.model if upscaler is not None else None, + ) + + def gen_image(self, rgb_np_img, req: RunPluginRequest) -> np.ndarray: + weight = 0.5 + bgr_np_img = cv2.cvtColor(rgb_np_img, cv2.COLOR_RGB2BGR) + logger.info(f"RestoreFormer input shape: {bgr_np_img.shape}") + _, _, bgr_output = self.face_enhancer.enhance( + bgr_np_img, + has_aligned=False, + only_center_face=False, + paste_back=True, + weight=weight, + ) + logger.info(f"RestoreFormer output shape: {bgr_output.shape}") + return bgr_output diff --git a/inpaint/plugins/segment_anything/__init__.py b/inpaint/plugins/segment_anything/__init__.py new file mode 100644 index 0000000..420f04b --- /dev/null +++ b/inpaint/plugins/segment_anything/__init__.py @@ -0,0 +1,16 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from .build_sam import ( + build_sam_vit_h, + build_sam_vit_l, + build_sam_vit_b, + build_sam_vit_h_hq, + build_sam_vit_l_hq, + build_sam_vit_b_hq, + sam_model_registry, +) +from .predictor import SamPredictor diff --git a/inpaint/plugins/segment_anything/build_sam.py b/inpaint/plugins/segment_anything/build_sam.py new file mode 100644 index 0000000..9b905ef --- /dev/null +++ b/inpaint/plugins/segment_anything/build_sam.py @@ -0,0 +1,269 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
+ +import torch + +from functools import partial + +from iopaint.plugins.segment_anything.modeling.tiny_vit_sam import TinyViT + +from .modeling import ( + ImageEncoderViT, + MaskDecoder, + PromptEncoder, + Sam, + TwoWayTransformer, +) +from .modeling.image_encoder_hq import ImageEncoderViTHQ +from .modeling.mask_decoder import MaskDecoderHQ +from .modeling.sam_hq import SamHQ + + +def build_sam_vit_h(checkpoint=None): + return _build_sam( + encoder_embed_dim=1280, + encoder_depth=32, + encoder_num_heads=16, + encoder_global_attn_indexes=[7, 15, 23, 31], + checkpoint=checkpoint, + ) + + +def build_sam_vit_l(checkpoint=None): + return _build_sam( + encoder_embed_dim=1024, + encoder_depth=24, + encoder_num_heads=16, + encoder_global_attn_indexes=[5, 11, 17, 23], + checkpoint=checkpoint, + ) + + +def build_sam_vit_b(checkpoint=None): + return _build_sam( + encoder_embed_dim=768, + encoder_depth=12, + encoder_num_heads=12, + encoder_global_attn_indexes=[2, 5, 8, 11], + checkpoint=checkpoint, + ) + + +def build_sam_vit_t(checkpoint=None): + prompt_embed_dim = 256 + image_size = 1024 + vit_patch_size = 16 + image_embedding_size = image_size // vit_patch_size + mobile_sam = Sam( + image_encoder=TinyViT( + img_size=1024, + in_chans=3, + num_classes=1000, + embed_dims=[64, 128, 160, 320], + depths=[2, 2, 6, 2], + num_heads=[2, 4, 5, 10], + window_sizes=[7, 7, 14, 7], + mlp_ratio=4.0, + drop_rate=0.0, + drop_path_rate=0.0, + use_checkpoint=False, + mbconv_expand_ratio=4.0, + local_conv_size=3, + layer_lr_decay=0.8, + ), + prompt_encoder=PromptEncoder( + embed_dim=prompt_embed_dim, + image_embedding_size=(image_embedding_size, image_embedding_size), + input_image_size=(image_size, image_size), + mask_in_chans=16, + ), + mask_decoder=MaskDecoder( + num_multimask_outputs=3, + transformer=TwoWayTransformer( + depth=2, + embedding_dim=prompt_embed_dim, + mlp_dim=2048, + num_heads=8, + ), + transformer_dim=prompt_embed_dim, + iou_head_depth=3, + iou_head_hidden_dim=256, + ), + pixel_mean=[123.675, 116.28, 103.53], + pixel_std=[58.395, 57.12, 57.375], + ) + + mobile_sam.eval() + if checkpoint is not None: + with open(checkpoint, "rb") as f: + state_dict = torch.load(f) + mobile_sam.load_state_dict(state_dict) + return mobile_sam + + +def build_sam_vit_h_hq(checkpoint=None): + return _build_sam_hq( + encoder_embed_dim=1280, + encoder_depth=32, + encoder_num_heads=16, + encoder_global_attn_indexes=[7, 15, 23, 31], + checkpoint=checkpoint, + ) + + +def build_sam_vit_l_hq(checkpoint=None): + return _build_sam_hq( + encoder_embed_dim=1024, + encoder_depth=24, + encoder_num_heads=16, + encoder_global_attn_indexes=[5, 11, 17, 23], + checkpoint=checkpoint, + ) + + +def build_sam_vit_b_hq(checkpoint=None): + return _build_sam_hq( + encoder_embed_dim=768, + encoder_depth=12, + encoder_num_heads=12, + encoder_global_attn_indexes=[2, 5, 8, 11], + checkpoint=checkpoint, + ) + + +sam_model_registry = { + "default": build_sam_vit_h, + "vit_h": build_sam_vit_h, + "vit_l": build_sam_vit_l, + "vit_b": build_sam_vit_b, + "sam_hq_vit_h": build_sam_vit_h_hq, + "sam_hq_vit_l": build_sam_vit_l_hq, + "sam_hq_vit_b": build_sam_vit_b_hq, + "mobile_sam": build_sam_vit_t, +} + + +def _build_sam( + encoder_embed_dim, + encoder_depth, + encoder_num_heads, + encoder_global_attn_indexes, + checkpoint=None, +): + prompt_embed_dim = 256 + image_size = 1024 + vit_patch_size = 16 + image_embedding_size = image_size // vit_patch_size + sam = Sam( + image_encoder=ImageEncoderViT( + depth=encoder_depth, + embed_dim=encoder_embed_dim, + 
img_size=image_size, + mlp_ratio=4, + norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), + num_heads=encoder_num_heads, + patch_size=vit_patch_size, + qkv_bias=True, + use_rel_pos=True, + global_attn_indexes=encoder_global_attn_indexes, + window_size=14, + out_chans=prompt_embed_dim, + ), + prompt_encoder=PromptEncoder( + embed_dim=prompt_embed_dim, + image_embedding_size=(image_embedding_size, image_embedding_size), + input_image_size=(image_size, image_size), + mask_in_chans=16, + ), + mask_decoder=MaskDecoder( + num_multimask_outputs=3, + transformer=TwoWayTransformer( + depth=2, + embedding_dim=prompt_embed_dim, + mlp_dim=2048, + num_heads=8, + ), + transformer_dim=prompt_embed_dim, + iou_head_depth=3, + iou_head_hidden_dim=256, + ), + pixel_mean=[123.675, 116.28, 103.53], + pixel_std=[58.395, 57.12, 57.375], + ) + sam.eval() + if checkpoint is not None: + with open(checkpoint, "rb") as f: + state_dict = torch.load(f) + sam.load_state_dict(state_dict) + return sam + + +def _build_sam_hq( + encoder_embed_dim, + encoder_depth, + encoder_num_heads, + encoder_global_attn_indexes, + checkpoint=None, +): + prompt_embed_dim = 256 + image_size = 1024 + vit_patch_size = 16 + image_embedding_size = image_size // vit_patch_size + sam = SamHQ( + image_encoder=ImageEncoderViTHQ( + depth=encoder_depth, + embed_dim=encoder_embed_dim, + img_size=image_size, + mlp_ratio=4, + norm_layer=partial(torch.nn.LayerNorm, eps=1e-6), + num_heads=encoder_num_heads, + patch_size=vit_patch_size, + qkv_bias=True, + use_rel_pos=True, + global_attn_indexes=encoder_global_attn_indexes, + window_size=14, + out_chans=prompt_embed_dim, + ), + prompt_encoder=PromptEncoder( + embed_dim=prompt_embed_dim, + image_embedding_size=(image_embedding_size, image_embedding_size), + input_image_size=(image_size, image_size), + mask_in_chans=16, + ), + mask_decoder=MaskDecoderHQ( + num_multimask_outputs=3, + transformer=TwoWayTransformer( + depth=2, + embedding_dim=prompt_embed_dim, + mlp_dim=2048, + num_heads=8, + ), + transformer_dim=prompt_embed_dim, + iou_head_depth=3, + iou_head_hidden_dim=256, + vit_dim=encoder_embed_dim, + ), + pixel_mean=[123.675, 116.28, 103.53], + pixel_std=[58.395, 57.12, 57.375], + ) + sam.eval() + if checkpoint is not None: + with open(checkpoint, "rb") as f: + device = "cuda" if torch.cuda.is_available() else "cpu" + state_dict = torch.load(f, map_location=device) + info = sam.load_state_dict(state_dict, strict=False) + print(info) + for n, p in sam.named_parameters(): + if ( + "hf_token" not in n + and "hf_mlp" not in n + and "compress_vit_feat" not in n + and "embedding_encoder" not in n + and "embedding_maskfeature" not in n + ): + p.requires_grad = False + + return sam diff --git a/inpaint/plugins/segment_anything/modeling/__init__.py b/inpaint/plugins/segment_anything/modeling/__init__.py new file mode 100644 index 0000000..38e9062 --- /dev/null +++ b/inpaint/plugins/segment_anything/modeling/__init__.py @@ -0,0 +1,11 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
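+# Re-exported building blocks consumed by build_sam.py.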
+ +from .sam import Sam +from .image_encoder import ImageEncoderViT +from .mask_decoder import MaskDecoder +from .prompt_encoder import PromptEncoder +from .transformer import TwoWayTransformer diff --git a/inpaint/plugins/segment_anything/modeling/common.py b/inpaint/plugins/segment_anything/modeling/common.py new file mode 100644 index 0000000..2bf1523 --- /dev/null +++ b/inpaint/plugins/segment_anything/modeling/common.py @@ -0,0 +1,43 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn + +from typing import Type + + +class MLPBlock(nn.Module): + def __init__( + self, + embedding_dim: int, + mlp_dim: int, + act: Type[nn.Module] = nn.GELU, + ) -> None: + super().__init__() + self.lin1 = nn.Linear(embedding_dim, mlp_dim) + self.lin2 = nn.Linear(mlp_dim, embedding_dim) + self.act = act() + + def forward(self, x: torch.Tensor) -> torch.Tensor: + return self.lin2(self.act(self.lin1(x))) + + +# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa +# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa +class LayerNorm2d(nn.Module): + def __init__(self, num_channels: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(num_channels)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.eps = eps + + def forward(self, x: torch.Tensor) -> torch.Tensor: + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x diff --git a/inpaint/plugins/segment_anything/modeling/image_encoder.py b/inpaint/plugins/segment_anything/modeling/image_encoder.py new file mode 100644 index 0000000..a6ad9ad --- /dev/null +++ b/inpaint/plugins/segment_anything/modeling/image_encoder.py @@ -0,0 +1,395 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from typing import Optional, Tuple, Type + +from .common import LayerNorm2d, MLPBlock + + +# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa +class ImageEncoderViT(nn.Module): + def __init__( + self, + img_size: int = 1024, + patch_size: int = 16, + in_chans: int = 3, + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + out_chans: int = 256, + qkv_bias: bool = True, + norm_layer: Type[nn.Module] = nn.LayerNorm, + act_layer: Type[nn.Module] = nn.GELU, + use_abs_pos: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + global_attn_indexes: Tuple[int, ...] = (), + ) -> None: + """ + Args: + img_size (int): Input image size. + patch_size (int): Patch size. + in_chans (int): Number of input image channels. + embed_dim (int): Patch embedding dimension. + depth (int): Depth of ViT. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. 
+ qkv_bias (bool): If True, add a learnable bias to query, key, value. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_abs_pos (bool): If True, use absolute positional embeddings. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. + global_attn_indexes (list): Indexes for blocks using global attention. + """ + super().__init__() + self.img_size = img_size + + self.patch_embed = PatchEmbed( + kernel_size=(patch_size, patch_size), + stride=(patch_size, patch_size), + in_chans=in_chans, + embed_dim=embed_dim, + ) + + self.pos_embed: Optional[nn.Parameter] = None + if use_abs_pos: + # Initialize absolute positional embedding with pretrain image size. + self.pos_embed = nn.Parameter( + torch.zeros(1, img_size // patch_size, img_size // patch_size, embed_dim) + ) + + self.blocks = nn.ModuleList() + for i in range(depth): + block = Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + act_layer=act_layer, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + window_size=window_size if i not in global_attn_indexes else 0, + input_size=(img_size // patch_size, img_size // patch_size), + ) + self.blocks.append(block) + + self.neck = nn.Sequential( + nn.Conv2d( + embed_dim, + out_chans, + kernel_size=1, + bias=False, + ), + LayerNorm2d(out_chans), + nn.Conv2d( + out_chans, + out_chans, + kernel_size=3, + padding=1, + bias=False, + ), + LayerNorm2d(out_chans), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.patch_embed(x) + if self.pos_embed is not None: + x = x + self.pos_embed + + for blk in self.blocks: + x = blk(x) + + x = self.neck(x.permute(0, 3, 1, 2)) + + return x + + +class Block(nn.Module): + """Transformer blocks with support of window attention and residual propagation blocks""" + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + norm_layer: Type[nn.Module] = nn.LayerNorm, + act_layer: Type[nn.Module] = nn.GELU, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + input_size: Optional[Tuple[int, int]] = None, + ) -> None: + """ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. If it equals 0, then + use global attention. + input_size (int or None): Input resolution for calculating the relative positional + parameter size. 
+ """ + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + input_size=input_size if window_size == 0 else (window_size, window_size), + ) + + self.norm2 = norm_layer(dim) + self.mlp = MLPBlock(embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer) + + self.window_size = window_size + + def forward(self, x: torch.Tensor) -> torch.Tensor: + shortcut = x + x = self.norm1(x) + # Window partition + if self.window_size > 0: + H, W = x.shape[1], x.shape[2] + x, pad_hw = window_partition(x, self.window_size) + + x = self.attn(x) + # Reverse window partition + if self.window_size > 0: + x = window_unpartition(x, self.window_size, pad_hw, (H, W)) + + x = shortcut + x + x = x + self.mlp(self.norm2(x)) + + return x + + +class Attention(nn.Module): + """Multi-head Attention block with relative position embeddings.""" + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + input_size: Optional[Tuple[int, int]] = None, + ) -> None: + """ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + qkv_bias (bool: If True, add a learnable bias to query, key, value. + rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + input_size (int or None): Input resolution for calculating the relative positional + parameter size. + """ + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.proj = nn.Linear(dim, dim) + + self.use_rel_pos = use_rel_pos + if self.use_rel_pos: + assert ( + input_size is not None + ), "Input size must be provided if using relative positional encoding." + # initialize relative positional embeddings + self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, H, W, _ = x.shape + # qkv with shape (3, B, nHead, H * W, C) + qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + # q, k, v with shape (B * nHead, H * W, C) + q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0) + + attn = (q * self.scale) @ k.transpose(-2, -1) + + if self.use_rel_pos: + attn = add_decomposed_rel_pos(attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W)) + + attn = attn.softmax(dim=-1) + x = (attn @ v).view(B, self.num_heads, H, W, -1).permute(0, 2, 3, 1, 4).reshape(B, H, W, -1) + x = self.proj(x) + + return x + + +def window_partition(x: torch.Tensor, window_size: int) -> Tuple[torch.Tensor, Tuple[int, int]]: + """ + Partition into non-overlapping windows with padding if needed. + Args: + x (tensor): input tokens with [B, H, W, C]. + window_size (int): window size. + + Returns: + windows: windows after partition with [B * num_windows, window_size, window_size, C]. 
+ (Hp, Wp): padded height and width before partition + """ + B, H, W, C = x.shape + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + if pad_h > 0 or pad_w > 0: + x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) + Hp, Wp = H + pad_h, W + pad_w + + x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) + windows = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + return windows, (Hp, Wp) + + +def window_unpartition( + windows: torch.Tensor, window_size: int, pad_hw: Tuple[int, int], hw: Tuple[int, int] +) -> torch.Tensor: + """ + Window unpartition into original sequences and removing padding. + Args: + x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. + window_size (int): window size. + pad_hw (Tuple): padded height and width (Hp, Wp). + hw (Tuple): original height and width (H, W) before padding. + + Returns: + x: unpartitioned sequences with [B, H, W, C]. + """ + Hp, Wp = pad_hw + H, W = hw + B = windows.shape[0] // (Hp * Wp // window_size // window_size) + x = windows.view(B, Hp // window_size, Wp // window_size, window_size, window_size, -1) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) + + if Hp > H or Wp > W: + x = x[:, :H, :W, :].contiguous() + return x + + +def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: + """ + Get relative positional embeddings according to the relative positions of + query and key sizes. + Args: + q_size (int): size of query q. + k_size (int): size of key k. + rel_pos (Tensor): relative position embeddings (L, C). + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel pos. + rel_pos_resized = F.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), + size=max_rel_dist, + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) + else: + rel_pos_resized = rel_pos + + # Scale the coords with short length if shapes for q and k are different. + q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) + k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) + relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) + + return rel_pos_resized[relative_coords.long()] + + +def add_decomposed_rel_pos( + attn: torch.Tensor, + q: torch.Tensor, + rel_pos_h: torch.Tensor, + rel_pos_w: torch.Tensor, + q_size: Tuple[int, int], + k_size: Tuple[int, int], +) -> torch.Tensor: + """ + Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. + https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 + Args: + attn (Tensor): attention map. + q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). + rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. + rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. + q_size (Tuple): spatial sequence size of query q with (q_h, q_w). + k_size (Tuple): spatial sequence size of key k with (k_h, k_w). + + Returns: + attn (Tensor): attention map with added relative positional embeddings. 
+ """ + q_h, q_w = q_size + k_h, k_w = k_size + Rh = get_rel_pos(q_h, k_h, rel_pos_h) + Rw = get_rel_pos(q_w, k_w, rel_pos_w) + + B, _, dim = q.shape + r_q = q.reshape(B, q_h, q_w, dim) + rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) + rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) + + attn = ( + attn.view(B, q_h, q_w, k_h, k_w) + rel_h[:, :, :, :, None] + rel_w[:, :, :, None, :] + ).view(B, q_h * q_w, k_h * k_w) + + return attn + + +class PatchEmbed(nn.Module): + """ + Image to Patch Embedding. + """ + + def __init__( + self, + kernel_size: Tuple[int, int] = (16, 16), + stride: Tuple[int, int] = (16, 16), + padding: Tuple[int, int] = (0, 0), + in_chans: int = 3, + embed_dim: int = 768, + ) -> None: + """ + Args: + kernel_size (Tuple): kernel size of the projection layer. + stride (Tuple): stride of the projection layer. + padding (Tuple): padding size of the projection layer. + in_chans (int): Number of input image channels. + embed_dim (int): embed_dim (int): Patch embedding dimension. + """ + super().__init__() + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + # B C H W -> B H W C + x = x.permute(0, 2, 3, 1) + return x diff --git a/inpaint/plugins/segment_anything/modeling/image_encoder_hq.py b/inpaint/plugins/segment_anything/modeling/image_encoder_hq.py new file mode 100644 index 0000000..f12803b --- /dev/null +++ b/inpaint/plugins/segment_anything/modeling/image_encoder_hq.py @@ -0,0 +1,422 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from typing import Optional, Tuple, Type + +from .common import LayerNorm2d, MLPBlock + + +# This class and its supporting functions below lightly adapted from the ViTDet backbone available at: https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/vit.py # noqa +class ImageEncoderViTHQ(nn.Module): + def __init__( + self, + img_size: int = 1024, + patch_size: int = 16, + in_chans: int = 3, + embed_dim: int = 768, + depth: int = 12, + num_heads: int = 12, + mlp_ratio: float = 4.0, + out_chans: int = 256, + qkv_bias: bool = True, + norm_layer: Type[nn.Module] = nn.LayerNorm, + act_layer: Type[nn.Module] = nn.GELU, + use_abs_pos: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + global_attn_indexes: Tuple[int, ...] = (), + ) -> None: + """ + Args: + img_size (int): Input image size. + patch_size (int): Patch size. + in_chans (int): Number of input image channels. + embed_dim (int): Patch embedding dimension. + depth (int): Depth of ViT. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_abs_pos (bool): If True, use absolute positional embeddings. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. + global_attn_indexes (list): Indexes for blocks using global attention. 
+ """ + super().__init__() + self.img_size = img_size + + self.patch_embed = PatchEmbed( + kernel_size=(patch_size, patch_size), + stride=(patch_size, patch_size), + in_chans=in_chans, + embed_dim=embed_dim, + ) + + self.pos_embed: Optional[nn.Parameter] = None + if use_abs_pos: + # Initialize absolute positional embedding with pretrain image size. + self.pos_embed = nn.Parameter( + torch.zeros( + 1, img_size // patch_size, img_size // patch_size, embed_dim + ) + ) + + self.blocks = nn.ModuleList() + for i in range(depth): + block = Block( + dim=embed_dim, + num_heads=num_heads, + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + norm_layer=norm_layer, + act_layer=act_layer, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + window_size=window_size if i not in global_attn_indexes else 0, + input_size=(img_size // patch_size, img_size // patch_size), + ) + self.blocks.append(block) + + self.neck = nn.Sequential( + nn.Conv2d( + embed_dim, + out_chans, + kernel_size=1, + bias=False, + ), + LayerNorm2d(out_chans), + nn.Conv2d( + out_chans, + out_chans, + kernel_size=3, + padding=1, + bias=False, + ), + LayerNorm2d(out_chans), + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.patch_embed(x) + if self.pos_embed is not None: + x = x + self.pos_embed + + interm_embeddings = [] + for blk in self.blocks: + x = blk(x) + if blk.window_size == 0: + interm_embeddings.append(x) + + x = self.neck(x.permute(0, 3, 1, 2)) + + return x, interm_embeddings + + +class Block(nn.Module): + """Transformer blocks with support of window attention and residual propagation blocks""" + + def __init__( + self, + dim: int, + num_heads: int, + mlp_ratio: float = 4.0, + qkv_bias: bool = True, + norm_layer: Type[nn.Module] = nn.LayerNorm, + act_layer: Type[nn.Module] = nn.GELU, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + window_size: int = 0, + input_size: Optional[Tuple[int, int]] = None, + ) -> None: + """ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads in each ViT block. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + norm_layer (nn.Module): Normalization layer. + act_layer (nn.Module): Activation layer. + use_rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + window_size (int): Window size for window attention blocks. If it equals 0, then + use global attention. + input_size (tuple(int, int) or None): Input resolution for calculating the relative + positional parameter size. 
+ """ + super().__init__() + self.norm1 = norm_layer(dim) + self.attn = Attention( + dim, + num_heads=num_heads, + qkv_bias=qkv_bias, + use_rel_pos=use_rel_pos, + rel_pos_zero_init=rel_pos_zero_init, + input_size=input_size if window_size == 0 else (window_size, window_size), + ) + + self.norm2 = norm_layer(dim) + self.mlp = MLPBlock( + embedding_dim=dim, mlp_dim=int(dim * mlp_ratio), act=act_layer + ) + + self.window_size = window_size + + def forward(self, x: torch.Tensor) -> torch.Tensor: + shortcut = x + x = self.norm1(x) + # Window partition + if self.window_size > 0: + H, W = x.shape[1], x.shape[2] + x, pad_hw = window_partition(x, self.window_size) + + x = self.attn(x) + # Reverse window partition + if self.window_size > 0: + x = window_unpartition(x, self.window_size, pad_hw, (H, W)) + + x = shortcut + x + x = x + self.mlp(self.norm2(x)) + + return x + + +class Attention(nn.Module): + """Multi-head Attention block with relative position embeddings.""" + + def __init__( + self, + dim: int, + num_heads: int = 8, + qkv_bias: bool = True, + use_rel_pos: bool = False, + rel_pos_zero_init: bool = True, + input_size: Optional[Tuple[int, int]] = None, + ) -> None: + """ + Args: + dim (int): Number of input channels. + num_heads (int): Number of attention heads. + qkv_bias (bool): If True, add a learnable bias to query, key, value. + rel_pos (bool): If True, add relative positional embeddings to the attention map. + rel_pos_zero_init (bool): If True, zero initialize relative positional parameters. + input_size (tuple(int, int) or None): Input resolution for calculating the relative + positional parameter size. + """ + super().__init__() + self.num_heads = num_heads + head_dim = dim // num_heads + self.scale = head_dim**-0.5 + + self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias) + self.proj = nn.Linear(dim, dim) + + self.use_rel_pos = use_rel_pos + if self.use_rel_pos: + assert ( + input_size is not None + ), "Input size must be provided if using relative positional encoding." + # initialize relative positional embeddings + self.rel_pos_h = nn.Parameter(torch.zeros(2 * input_size[0] - 1, head_dim)) + self.rel_pos_w = nn.Parameter(torch.zeros(2 * input_size[1] - 1, head_dim)) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, H, W, _ = x.shape + # qkv with shape (3, B, nHead, H * W, C) + qkv = ( + self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1).permute(2, 0, 3, 1, 4) + ) + # q, k, v with shape (B * nHead, H * W, C) + q, k, v = qkv.reshape(3, B * self.num_heads, H * W, -1).unbind(0) + + attn = (q * self.scale) @ k.transpose(-2, -1) + + if self.use_rel_pos: + attn = add_decomposed_rel_pos( + attn, q, self.rel_pos_h, self.rel_pos_w, (H, W), (H, W) + ) + + attn = attn.softmax(dim=-1) + x = ( + (attn @ v) + .view(B, self.num_heads, H, W, -1) + .permute(0, 2, 3, 1, 4) + .reshape(B, H, W, -1) + ) + x = self.proj(x) + + return x + + +def window_partition( + x: torch.Tensor, window_size: int +) -> Tuple[torch.Tensor, Tuple[int, int]]: + """ + Partition into non-overlapping windows with padding if needed. + Args: + x (tensor): input tokens with [B, H, W, C]. + window_size (int): window size. + + Returns: + windows: windows after partition with [B * num_windows, window_size, window_size, C]. 
+ (Hp, Wp): padded height and width before partition + """ + B, H, W, C = x.shape + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + if pad_h > 0 or pad_w > 0: + x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) + Hp, Wp = H + pad_h, W + pad_w + + x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) + windows = ( + x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + ) + return windows, (Hp, Wp) + + +def window_unpartition( + windows: torch.Tensor, + window_size: int, + pad_hw: Tuple[int, int], + hw: Tuple[int, int], +) -> torch.Tensor: + """ + Window unpartition into original sequences and removing padding. + Args: + windows (tensor): input tokens with [B * num_windows, window_size, window_size, C]. + window_size (int): window size. + pad_hw (Tuple): padded height and width (Hp, Wp). + hw (Tuple): original height and width (H, W) before padding. + + Returns: + x: unpartitioned sequences with [B, H, W, C]. + """ + Hp, Wp = pad_hw + H, W = hw + B = windows.shape[0] // (Hp * Wp // window_size // window_size) + x = windows.view( + B, Hp // window_size, Wp // window_size, window_size, window_size, -1 + ) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) + + if Hp > H or Wp > W: + x = x[:, :H, :W, :].contiguous() + return x + + +def get_rel_pos(q_size: int, k_size: int, rel_pos: torch.Tensor) -> torch.Tensor: + """ + Get relative positional embeddings according to the relative positions of + query and key sizes. + Args: + q_size (int): size of query q. + k_size (int): size of key k. + rel_pos (Tensor): relative position embeddings (L, C). + + Returns: + Extracted positional embeddings according to relative positions. + """ + max_rel_dist = int(2 * max(q_size, k_size) - 1) + # Interpolate rel pos if needed. + if rel_pos.shape[0] != max_rel_dist: + # Interpolate rel pos. + rel_pos_resized = F.interpolate( + rel_pos.reshape(1, rel_pos.shape[0], -1).permute(0, 2, 1), + size=max_rel_dist, + mode="linear", + ) + rel_pos_resized = rel_pos_resized.reshape(-1, max_rel_dist).permute(1, 0) + else: + rel_pos_resized = rel_pos + + # Scale the coords with short length if shapes for q and k are different. + q_coords = torch.arange(q_size)[:, None] * max(k_size / q_size, 1.0) + k_coords = torch.arange(k_size)[None, :] * max(q_size / k_size, 1.0) + relative_coords = (q_coords - k_coords) + (k_size - 1) * max(q_size / k_size, 1.0) + + return rel_pos_resized[relative_coords.long()] + + +def add_decomposed_rel_pos( + attn: torch.Tensor, + q: torch.Tensor, + rel_pos_h: torch.Tensor, + rel_pos_w: torch.Tensor, + q_size: Tuple[int, int], + k_size: Tuple[int, int], +) -> torch.Tensor: + """ + Calculate decomposed Relative Positional Embeddings from :paper:`mvitv2`. + https://github.com/facebookresearch/mvit/blob/19786631e330df9f3622e5402b4a419a263a2c80/mvit/models/attention.py # noqa B950 + Args: + attn (Tensor): attention map. + q (Tensor): query q in the attention layer with shape (B, q_h * q_w, C). + rel_pos_h (Tensor): relative position embeddings (Lh, C) for height axis. + rel_pos_w (Tensor): relative position embeddings (Lw, C) for width axis. + q_size (Tuple): spatial sequence size of query q with (q_h, q_w). + k_size (Tuple): spatial sequence size of key k with (k_h, k_w). + + Returns: + attn (Tensor): attention map with added relative positional embeddings. 
+ """ + q_h, q_w = q_size + k_h, k_w = k_size + Rh = get_rel_pos(q_h, k_h, rel_pos_h) + Rw = get_rel_pos(q_w, k_w, rel_pos_w) + + B, _, dim = q.shape + r_q = q.reshape(B, q_h, q_w, dim) + rel_h = torch.einsum("bhwc,hkc->bhwk", r_q, Rh) + rel_w = torch.einsum("bhwc,wkc->bhwk", r_q, Rw) + + attn = ( + attn.view(B, q_h, q_w, k_h, k_w) + + rel_h[:, :, :, :, None] + + rel_w[:, :, :, None, :] + ).view(B, q_h * q_w, k_h * k_w) + + return attn + + +class PatchEmbed(nn.Module): + """ + Image to Patch Embedding. + """ + + def __init__( + self, + kernel_size: Tuple[int, int] = (16, 16), + stride: Tuple[int, int] = (16, 16), + padding: Tuple[int, int] = (0, 0), + in_chans: int = 3, + embed_dim: int = 768, + ) -> None: + """ + Args: + kernel_size (Tuple): kernel size of the projection layer. + stride (Tuple): stride of the projection layer. + padding (Tuple): padding size of the projection layer. + in_chans (int): Number of input image channels. + embed_dim (int): Patch embedding dimension. + """ + super().__init__() + + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + # B C H W -> B H W C + x = x.permute(0, 2, 3, 1) + return x diff --git a/inpaint/plugins/segment_anything/modeling/mask_decoder.py b/inpaint/plugins/segment_anything/modeling/mask_decoder.py new file mode 100644 index 0000000..67e0f77 --- /dev/null +++ b/inpaint/plugins/segment_anything/modeling/mask_decoder.py @@ -0,0 +1,410 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from torch import nn +from torch.nn import functional as F + +from typing import List, Tuple, Type + +from .common import LayerNorm2d + + +class MaskDecoder(nn.Module): + def __init__( + self, + *, + transformer_dim: int, + transformer: nn.Module, + num_multimask_outputs: int = 3, + activation: Type[nn.Module] = nn.GELU, + iou_head_depth: int = 3, + iou_head_hidden_dim: int = 256, + ) -> None: + """ + Predicts masks given an image and prompt embeddings, using a + tranformer architecture. 
+ + Arguments: + transformer_dim (int): the channel dimension of the transformer + transformer (nn.Module): the transformer used to predict masks + num_multimask_outputs (int): the number of masks to predict + when disambiguating masks + activation (nn.Module): the type of activation to use when + upscaling masks + iou_head_depth (int): the depth of the MLP used to predict + mask quality + iou_head_hidden_dim (int): the hidden dimension of the MLP + used to predict mask quality + """ + super().__init__() + self.transformer_dim = transformer_dim + self.transformer = transformer + + self.num_multimask_outputs = num_multimask_outputs + + self.iou_token = nn.Embedding(1, transformer_dim) + self.num_mask_tokens = num_multimask_outputs + 1 + self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim) + + self.output_upscaling = nn.Sequential( + nn.ConvTranspose2d( + transformer_dim, transformer_dim // 4, kernel_size=2, stride=2 + ), + LayerNorm2d(transformer_dim // 4), + activation(), + nn.ConvTranspose2d( + transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2 + ), + activation(), + ) + self.output_hypernetworks_mlps = nn.ModuleList( + [ + MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) + for i in range(self.num_mask_tokens) + ] + ) + + self.iou_prediction_head = MLP( + transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth + ) + + def forward( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + multimask_output: bool, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Predict masks given image and prompt embeddings. + + Arguments: + image_embeddings (torch.Tensor): the embeddings from the image encoder + image_pe (torch.Tensor): positional encoding with the shape of image_embeddings + sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes + dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs + multimask_output (bool): Whether to return multiple masks or a single + mask. + + Returns: + torch.Tensor: batched predicted masks + torch.Tensor: batched predictions of mask quality + """ + masks, iou_pred = self.predict_masks( + image_embeddings=image_embeddings, + image_pe=image_pe, + sparse_prompt_embeddings=sparse_prompt_embeddings, + dense_prompt_embeddings=dense_prompt_embeddings, + ) + + # Select the correct mask or masks for outptu + if multimask_output: + mask_slice = slice(1, None) + else: + mask_slice = slice(0, 1) + masks = masks[:, mask_slice, :, :] + iou_pred = iou_pred[:, mask_slice] + + # Prepare output + return masks, iou_pred + + def predict_masks( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Predicts masks. 
See 'forward' for more details.""" + # Concatenate output tokens + output_tokens = torch.cat( + [self.iou_token.weight, self.mask_tokens.weight], dim=0 + ) + output_tokens = output_tokens.unsqueeze(0).expand( + sparse_prompt_embeddings.size(0), -1, -1 + ) + tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1) + + # Expand per-image data in batch direction to be per-mask + src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0) + src = src + dense_prompt_embeddings + pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0) + b, c, h, w = src.shape + + # Run the transformer + hs, src = self.transformer(src, pos_src, tokens) + iou_token_out = hs[:, 0, :] + mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :] + + # Upscale mask embeddings and predict masks using the mask tokens + src = src.transpose(1, 2).view(b, c, h, w) + upscaled_embedding = self.output_upscaling(src) + hyper_in_list: List[torch.Tensor] = [] + for i in range(self.num_mask_tokens): + hyper_in_list.append( + self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) + ) + hyper_in = torch.stack(hyper_in_list, dim=1) + b, c, h, w = upscaled_embedding.shape + masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w) + + # Generate mask quality predictions + iou_pred = self.iou_prediction_head(iou_token_out) + + return masks, iou_pred + +# https://github.com/SysCV/sam-hq/blob/main/segment_anything/modeling/mask_decoder_hq.py#L17 +class MaskDecoderHQ(nn.Module): + def __init__( + self, + *, + transformer_dim: int, + transformer: nn.Module, + num_multimask_outputs: int = 3, + activation: Type[nn.Module] = nn.GELU, + iou_head_depth: int = 3, + iou_head_hidden_dim: int = 256, + vit_dim: int = 1024, + ) -> None: + """ + Predicts masks given an image and prompt embeddings, using a + transformer architecture. 
+ + Arguments: + transformer_dim (int): the channel dimension of the transformer + transformer (nn.Module): the transformer used to predict masks + num_multimask_outputs (int): the number of masks to predict + when disambiguating masks + activation (nn.Module): the type of activation to use when + upscaling masks + iou_head_depth (int): the depth of the MLP used to predict + mask quality + iou_head_hidden_dim (int): the hidden dimension of the MLP + used to predict mask quality + """ + super().__init__() + self.transformer_dim = transformer_dim + self.transformer = transformer + + self.num_multimask_outputs = num_multimask_outputs + + self.iou_token = nn.Embedding(1, transformer_dim) + self.num_mask_tokens = num_multimask_outputs + 1 + self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim) + + self.output_upscaling = nn.Sequential( + nn.ConvTranspose2d( + transformer_dim, transformer_dim // 4, kernel_size=2, stride=2 + ), + LayerNorm2d(transformer_dim // 4), + activation(), + nn.ConvTranspose2d( + transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2 + ), + activation(), + ) + self.output_hypernetworks_mlps = nn.ModuleList( + [ + MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) + for i in range(self.num_mask_tokens) + ] + ) + + self.iou_prediction_head = MLP( + transformer_dim, iou_head_hidden_dim, self.num_mask_tokens, iou_head_depth + ) + + # HQ-SAM parameters + self.hf_token = nn.Embedding(1, transformer_dim) # HQ-Ouptput-Token + self.hf_mlp = MLP( + transformer_dim, transformer_dim, transformer_dim // 8, 3 + ) # corresponding new MLP layer for HQ-Ouptput-Token + self.num_mask_tokens = self.num_mask_tokens + 1 + + # three conv fusion layers for obtaining HQ-Feature + self.compress_vit_feat = nn.Sequential( + nn.ConvTranspose2d(vit_dim, transformer_dim, kernel_size=2, stride=2), + LayerNorm2d(transformer_dim), + nn.GELU(), + nn.ConvTranspose2d( + transformer_dim, transformer_dim // 8, kernel_size=2, stride=2 + ), + ) + + self.embedding_encoder = nn.Sequential( + nn.ConvTranspose2d( + transformer_dim, transformer_dim // 4, kernel_size=2, stride=2 + ), + LayerNorm2d(transformer_dim // 4), + nn.GELU(), + nn.ConvTranspose2d( + transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2 + ), + ) + self.embedding_maskfeature = nn.Sequential( + nn.Conv2d(transformer_dim // 8, transformer_dim // 4, 3, 1, 1), + LayerNorm2d(transformer_dim // 4), + nn.GELU(), + nn.Conv2d(transformer_dim // 4, transformer_dim // 8, 3, 1, 1), + ) + + def forward( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + multimask_output: bool, + hq_token_only: bool, + interm_embeddings: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Predict masks given image and prompt embeddings. + + Arguments: + image_embeddings (torch.Tensor): the embeddings from the ViT image encoder + image_pe (torch.Tensor): positional encoding with the shape of image_embeddings + sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes + dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs + multimask_output (bool): Whether to return multiple masks or a single + mask. 
+ + Returns: + torch.Tensor: batched predicted masks + torch.Tensor: batched predictions of mask quality + """ + vit_features = interm_embeddings[0].permute( + 0, 3, 1, 2 + ) # early-layer ViT feature, after 1st global attention block in ViT + hq_features = self.embedding_encoder(image_embeddings) + self.compress_vit_feat( + vit_features + ) + + masks, iou_pred = self.predict_masks( + image_embeddings=image_embeddings, + image_pe=image_pe, + sparse_prompt_embeddings=sparse_prompt_embeddings, + dense_prompt_embeddings=dense_prompt_embeddings, + hq_features=hq_features, + ) + + # Select the correct mask or masks for output + if multimask_output: + # mask with highest score + mask_slice = slice(1, self.num_mask_tokens - 1) + iou_pred = iou_pred[:, mask_slice] + iou_pred, max_iou_idx = torch.max(iou_pred, dim=1) + iou_pred = iou_pred.unsqueeze(1) + masks_multi = masks[:, mask_slice, :, :] + masks_sam = masks_multi[ + torch.arange(masks_multi.size(0)), max_iou_idx + ].unsqueeze(1) + else: + # singale mask output, default + mask_slice = slice(0, 1) + iou_pred = iou_pred[:, mask_slice] + masks_sam = masks[:, mask_slice] + + masks_hq = masks[:, slice(self.num_mask_tokens - 1, self.num_mask_tokens)] + if hq_token_only: + masks = masks_hq + else: + masks = masks_sam + masks_hq + # Prepare output + return masks, iou_pred + + def predict_masks( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + hq_features: torch.Tensor, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Predicts masks. See 'forward' for more details.""" + # Concatenate output tokens + output_tokens = torch.cat( + [self.iou_token.weight, self.mask_tokens.weight, self.hf_token.weight], + dim=0, + ) + output_tokens = output_tokens.unsqueeze(0).expand( + sparse_prompt_embeddings.size(0), -1, -1 + ) + tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1) + + # Expand per-image data in batch direction to be per-mask + src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0) + src = src + dense_prompt_embeddings + pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0) + b, c, h, w = src.shape + + # Run the transformer + hs, src = self.transformer(src, pos_src, tokens) + iou_token_out = hs[:, 0, :] + mask_tokens_out = hs[:, 1 : (1 + self.num_mask_tokens), :] + + # Upscale mask embeddings and predict masks using the mask tokens + src = src.transpose(1, 2).view(b, c, h, w) + + upscaled_embedding_sam = self.output_upscaling(src) + upscaled_embedding_hq = self.embedding_maskfeature( + upscaled_embedding_sam + ) + hq_features.repeat(b, 1, 1, 1) + + hyper_in_list: List[torch.Tensor] = [] + for i in range(self.num_mask_tokens): + if i < self.num_mask_tokens - 1: + hyper_in_list.append( + self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) + ) + else: + hyper_in_list.append(self.hf_mlp(mask_tokens_out[:, i, :])) + + hyper_in = torch.stack(hyper_in_list, dim=1) + b, c, h, w = upscaled_embedding_sam.shape + + masks_sam = ( + hyper_in[:, : self.num_mask_tokens - 1] + @ upscaled_embedding_sam.view(b, c, h * w) + ).view(b, -1, h, w) + masks_sam_hq = ( + hyper_in[:, self.num_mask_tokens - 1 :] + @ upscaled_embedding_hq.view(b, c, h * w) + ).view(b, -1, h, w) + masks = torch.cat([masks_sam, masks_sam_hq], dim=1) + # Generate mask quality predictions + iou_pred = self.iou_prediction_head(iou_token_out) + + return masks, iou_pred + + +# Lightly adapted from +# 
https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa +class MLP(nn.Module): + def __init__( + self, + input_dim: int, + hidden_dim: int, + output_dim: int, + num_layers: int, + sigmoid_output: bool = False, + ) -> None: + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) + self.sigmoid_output = sigmoid_output + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = F.relu(layer(x)) if i < self.num_layers - 1 else layer(x) + if self.sigmoid_output: + x = F.sigmoid(x) + return x diff --git a/inpaint/plugins/segment_anything/modeling/prompt_encoder.py b/inpaint/plugins/segment_anything/modeling/prompt_encoder.py new file mode 100644 index 0000000..c3143f4 --- /dev/null +++ b/inpaint/plugins/segment_anything/modeling/prompt_encoder.py @@ -0,0 +1,214 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +import torch +from torch import nn + +from typing import Any, Optional, Tuple, Type + +from .common import LayerNorm2d + + +class PromptEncoder(nn.Module): + def __init__( + self, + embed_dim: int, + image_embedding_size: Tuple[int, int], + input_image_size: Tuple[int, int], + mask_in_chans: int, + activation: Type[nn.Module] = nn.GELU, + ) -> None: + """ + Encodes prompts for input to SAM's mask decoder. + + Arguments: + embed_dim (int): The prompts' embedding dimension + image_embedding_size (tuple(int, int)): The spatial size of the + image embedding, as (H, W). + input_image_size (int): The padded size of the image as input + to the image encoder, as (H, W). + mask_in_chans (int): The number of hidden channels used for + encoding input masks. + activation (nn.Module): The activation to use when encoding + input masks. + """ + super().__init__() + self.embed_dim = embed_dim + self.input_image_size = input_image_size + self.image_embedding_size = image_embedding_size + self.pe_layer = PositionEmbeddingRandom(embed_dim // 2) + + self.num_point_embeddings: int = 4 # pos/neg point + 2 box corners + point_embeddings = [nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings)] + self.point_embeddings = nn.ModuleList(point_embeddings) + self.not_a_point_embed = nn.Embedding(1, embed_dim) + + self.mask_input_size = (4 * image_embedding_size[0], 4 * image_embedding_size[1]) + self.mask_downscaling = nn.Sequential( + nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2), + LayerNorm2d(mask_in_chans // 4), + activation(), + nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2), + LayerNorm2d(mask_in_chans), + activation(), + nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1), + ) + self.no_mask_embed = nn.Embedding(1, embed_dim) + + def get_dense_pe(self) -> torch.Tensor: + """ + Returns the positional encoding used to encode point prompts, + applied to a dense set of points the shape of the image encoding. 
+ + Returns: + torch.Tensor: Positional encoding with shape + 1x(embed_dim)x(embedding_h)x(embedding_w) + """ + return self.pe_layer(self.image_embedding_size).unsqueeze(0) + + def _embed_points( + self, + points: torch.Tensor, + labels: torch.Tensor, + pad: bool, + ) -> torch.Tensor: + """Embeds point prompts.""" + points = points + 0.5 # Shift to center of pixel + if pad: + padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device) + padding_label = -torch.ones((labels.shape[0], 1), device=labels.device) + points = torch.cat([points, padding_point], dim=1) + labels = torch.cat([labels, padding_label], dim=1) + point_embedding = self.pe_layer.forward_with_coords(points, self.input_image_size) + point_embedding[labels == -1] = 0.0 + point_embedding[labels == -1] += self.not_a_point_embed.weight + point_embedding[labels == 0] += self.point_embeddings[0].weight + point_embedding[labels == 1] += self.point_embeddings[1].weight + return point_embedding + + def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor: + """Embeds box prompts.""" + boxes = boxes + 0.5 # Shift to center of pixel + coords = boxes.reshape(-1, 2, 2) + corner_embedding = self.pe_layer.forward_with_coords(coords, self.input_image_size) + corner_embedding[:, 0, :] += self.point_embeddings[2].weight + corner_embedding[:, 1, :] += self.point_embeddings[3].weight + return corner_embedding + + def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor: + """Embeds mask inputs.""" + mask_embedding = self.mask_downscaling(masks) + return mask_embedding + + def _get_batch_size( + self, + points: Optional[Tuple[torch.Tensor, torch.Tensor]], + boxes: Optional[torch.Tensor], + masks: Optional[torch.Tensor], + ) -> int: + """ + Gets the batch size of the output given the batch size of the input prompts. + """ + if points is not None: + return points[0].shape[0] + elif boxes is not None: + return boxes.shape[0] + elif masks is not None: + return masks.shape[0] + else: + return 1 + + def _get_device(self) -> torch.device: + return self.point_embeddings[0].weight.device + + def forward( + self, + points: Optional[Tuple[torch.Tensor, torch.Tensor]], + boxes: Optional[torch.Tensor], + masks: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Embeds different types of prompts, returning both sparse and dense + embeddings. + + Arguments: + points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates + and labels to embed. + boxes (torch.Tensor or none): boxes to embed + masks (torch.Tensor or none): masks to embed + + Returns: + torch.Tensor: sparse embeddings for the points and boxes, with shape + BxNx(embed_dim), where N is determined by the number of input points + and boxes. 
+ torch.Tensor: dense embeddings for the masks, in the shape + Bx(embed_dim)x(embed_H)x(embed_W) + """ + bs = self._get_batch_size(points, boxes, masks) + sparse_embeddings = torch.empty((bs, 0, self.embed_dim), device=self._get_device()) + if points is not None: + coords, labels = points + point_embeddings = self._embed_points(coords, labels, pad=(boxes is None)) + sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1) + if boxes is not None: + box_embeddings = self._embed_boxes(boxes) + sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1) + + if masks is not None: + dense_embeddings = self._embed_masks(masks) + else: + dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand( + bs, -1, self.image_embedding_size[0], self.image_embedding_size[1] + ) + + return sparse_embeddings, dense_embeddings + + +class PositionEmbeddingRandom(nn.Module): + """ + Positional encoding using random spatial frequencies. + """ + + def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None: + super().__init__() + if scale is None or scale <= 0.0: + scale = 1.0 + self.register_buffer( + "positional_encoding_gaussian_matrix", + scale * torch.randn((2, num_pos_feats)), + ) + + def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor: + """Positionally encode points that are normalized to [0,1].""" + # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape + coords = 2 * coords - 1 + coords = coords @ self.positional_encoding_gaussian_matrix + coords = 2 * np.pi * coords + # outputs d_1 x ... x d_n x C shape + return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1) + + def forward(self, size: Tuple[int, int]) -> torch.Tensor: + """Generate positional encoding for a grid of the specified size.""" + h, w = size + device: Any = self.positional_encoding_gaussian_matrix.device + grid = torch.ones((h, w), device=device, dtype=torch.float32) + y_embed = grid.cumsum(dim=0) - 0.5 + x_embed = grid.cumsum(dim=1) - 0.5 + y_embed = y_embed / h + x_embed = x_embed / w + + pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1)) + return pe.permute(2, 0, 1) # C x H x W + + def forward_with_coords( + self, coords_input: torch.Tensor, image_size: Tuple[int, int] + ) -> torch.Tensor: + """Positionally encode points that are not normalized to [0,1].""" + coords = coords_input.clone() + coords[:, :, 0] = coords[:, :, 0] / image_size[1] + coords[:, :, 1] = coords[:, :, 1] / image_size[0] + return self._pe_encoding(coords.to(torch.float)) # B x N x C diff --git a/inpaint/plugins/segment_anything/modeling/sam.py b/inpaint/plugins/segment_anything/modeling/sam.py new file mode 100644 index 0000000..303bc2f --- /dev/null +++ b/inpaint/plugins/segment_anything/modeling/sam.py @@ -0,0 +1,174 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
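# A minimal sketch exercising the PromptEncoder above on its own, using the same
# hyperparameters the builders pass (embed_dim=256, a 64x64 embedding grid for a
# 1024x1024 input). The point coordinates and label are illustrative values.
import torch

from inpaint.plugins.segment_anything.modeling import PromptEncoder

prompt_encoder = PromptEncoder(
    embed_dim=256,
    image_embedding_size=(64, 64),    # 1024 // 16, matching the builders
    input_image_size=(1024, 1024),
    mask_in_chans=16,
)
points = torch.tensor([[[512.0, 512.0]]])  # BxNx2, coordinates in the 1024x1024 input frame
labels = torch.tensor([[1]])               # BxN, 1 = foreground, 0 = background
sparse, dense = prompt_encoder(points=(points, labels), boxes=None, masks=None)
# sparse: (1, 2, 256)     -> the point plus one padding token added when no box is given
# dense:  (1, 256, 64, 64) -> the learned no-mask embedding broadcast over the grid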
+ +import torch +from torch import nn +from torch.nn import functional as F + +from typing import Any, Dict, List, Tuple + +from .image_encoder import ImageEncoderViT +from .mask_decoder import MaskDecoder +from .prompt_encoder import PromptEncoder + + +class Sam(nn.Module): + mask_threshold: float = 0.0 + image_format: str = "RGB" + + def __init__( + self, + image_encoder: ImageEncoderViT, + prompt_encoder: PromptEncoder, + mask_decoder: MaskDecoder, + pixel_mean: List[float] = [123.675, 116.28, 103.53], + pixel_std: List[float] = [58.395, 57.12, 57.375], + ) -> None: + """ + SAM predicts object masks from an image and input prompts. + + Arguments: + image_encoder (ImageEncoderViT): The backbone used to encode the + image into image embeddings that allow for efficient mask prediction. + prompt_encoder (PromptEncoder): Encodes various types of input prompts. + mask_decoder (MaskDecoder): Predicts masks from the image embeddings + and encoded prompts. + pixel_mean (list(float)): Mean values for normalizing pixels in the input image. + pixel_std (list(float)): Std values for normalizing pixels in the input image. + """ + super().__init__() + self.image_encoder = image_encoder + self.prompt_encoder = prompt_encoder + self.mask_decoder = mask_decoder + self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False) + self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) + + @property + def device(self) -> Any: + return self.pixel_mean.device + + @torch.no_grad() + def forward( + self, + batched_input: List[Dict[str, Any]], + multimask_output: bool, + ) -> List[Dict[str, torch.Tensor]]: + """ + Predicts masks end-to-end from provided images and prompts. + If prompts are not known in advance, using SamPredictor is + recommended over calling the model directly. + + Arguments: + batched_input (list(dict)): A list over input images, each a + dictionary with the following keys. A prompt key can be + excluded if it is not present. + 'image': The image as a torch tensor in 3xHxW format, + already transformed for input to the model. + 'original_size': (tuple(int, int)) The original size of + the image before transformation, as (H, W). + 'point_coords': (torch.Tensor) Batched point prompts for + this image, with shape BxNx2. Already transformed to the + input frame of the model. + 'point_labels': (torch.Tensor) Batched labels for point prompts, + with shape BxN. + 'boxes': (torch.Tensor) Batched box inputs, with shape Bx4. + Already transformed to the input frame of the model. + 'mask_inputs': (torch.Tensor) Batched mask inputs to the model, + in the form Bx1xHxW. + multimask_output (bool): Whether the model should predict multiple + disambiguating masks, or return a single mask. + + Returns: + (list(dict)): A list over input images, where each element is + as dictionary with the following keys. + 'masks': (torch.Tensor) Batched binary mask predictions, + with shape BxCxHxW, where B is the number of input promts, + C is determiend by multimask_output, and (H, W) is the + original size of the image. + 'iou_predictions': (torch.Tensor) The model's predictions + of mask quality, in shape BxC. + 'low_res_logits': (torch.Tensor) Low resolution logits with + shape BxCxHxW, where H=W=256. Can be passed as mask input + to subsequent iterations of prediction. 
+ """ + input_images = torch.stack([self.preprocess(x["image"]) for x in batched_input], dim=0) + image_embeddings = self.image_encoder(input_images) + + outputs = [] + for image_record, curr_embedding in zip(batched_input, image_embeddings): + if "point_coords" in image_record: + points = (image_record["point_coords"], image_record["point_labels"]) + else: + points = None + sparse_embeddings, dense_embeddings = self.prompt_encoder( + points=points, + boxes=image_record.get("boxes", None), + masks=image_record.get("mask_inputs", None), + ) + low_res_masks, iou_predictions = self.mask_decoder( + image_embeddings=curr_embedding.unsqueeze(0), + image_pe=self.prompt_encoder.get_dense_pe(), + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + ) + masks = self.postprocess_masks( + low_res_masks, + input_size=image_record["image"].shape[-2:], + original_size=image_record["original_size"], + ) + masks = masks > self.mask_threshold + outputs.append( + { + "masks": masks, + "iou_predictions": iou_predictions, + "low_res_logits": low_res_masks, + } + ) + return outputs + + def postprocess_masks( + self, + masks: torch.Tensor, + input_size: Tuple[int, ...], + original_size: Tuple[int, ...], + ) -> torch.Tensor: + """ + Remove padding and upscale masks to the original image size. + + Arguments: + masks (torch.Tensor): Batched masks from the mask_decoder, + in BxCxHxW format. + input_size (tuple(int, int)): The size of the image input to the + model, in (H, W) format. Used to remove padding. + original_size (tuple(int, int)): The original size of the image + before resizing for input to the model, in (H, W) format. + + Returns: + (torch.Tensor): Batched masks in BxCxHxW format, where (H, W) + is given by original_size. + """ + masks = F.interpolate( + masks, + (self.image_encoder.img_size, self.image_encoder.img_size), + mode="bilinear", + align_corners=False, + ) + masks = masks[..., : input_size[0], : input_size[1]] + masks = F.interpolate(masks, original_size, mode="bilinear", align_corners=False) + return masks + + def preprocess(self, x: torch.Tensor) -> torch.Tensor: + """Normalize pixel values and pad to a square input.""" + # Normalize colors + x = (x - self.pixel_mean) / self.pixel_std + + # Pad + h, w = x.shape[-2:] + padh = self.image_encoder.img_size - h + padw = self.image_encoder.img_size - w + x = F.pad(x, (0, padw, 0, padh)) + return x diff --git a/inpaint/plugins/segment_anything/modeling/sam_hq.py b/inpaint/plugins/segment_anything/modeling/sam_hq.py new file mode 100644 index 0000000..d2ae3a3 --- /dev/null +++ b/inpaint/plugins/segment_anything/modeling/sam_hq.py @@ -0,0 +1,177 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
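# A minimal sketch of the batched_input format that Sam.forward above expects.
# `sam` is assumed to be a Sam instance produced by the builders earlier in this diff;
# the image tensor, original size, and prompt values are placeholders for illustration.
import torch

image = torch.rand(3, 768, 1024) * 255.0  # 3xHxW, already resized so the long side is 1024
batched_input = [
    {
        "image": image,
        "original_size": (1536, 2048),                     # (H, W) before resizing
        "point_coords": torch.tensor([[[400.0, 300.0]]]),  # BxNx2, in the resized input frame
        "point_labels": torch.tensor([[1]]),               # BxN, 1 = foreground
    }
]
outputs = sam(batched_input, multimask_output=True)
masks = outputs[0]["masks"]             # BxCxHxW booleans at original_size
scores = outputs[0]["iou_predictions"]  # BxC predicted mask quality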
+ +import torch +from torch import nn +from torch.nn import functional as F + +from typing import Any, Dict, List, Tuple + +from .image_encoder import ImageEncoderViT +from .mask_decoder import MaskDecoder +from .prompt_encoder import PromptEncoder + + +class SamHQ(nn.Module): + mask_threshold: float = 0.0 + image_format: str = "RGB" + + def __init__( + self, + image_encoder: ImageEncoderViT, + prompt_encoder: PromptEncoder, + mask_decoder: MaskDecoder, + pixel_mean: List[float] = [123.675, 116.28, 103.53], + pixel_std: List[float] = [58.395, 57.12, 57.375], + ) -> None: + """ + SAM predicts object masks from an image and input prompts. + + Arguments: + image_encoder (ImageEncoderViT): The backbone used to encode the + image into image embeddings that allow for efficient mask prediction. + prompt_encoder (PromptEncoder): Encodes various types of input prompts. + mask_decoder (MaskDecoder): Predicts masks from the image embeddings + and encoded prompts. + pixel_mean (list(float)): Mean values for normalizing pixels in the input image. + pixel_std (list(float)): Std values for normalizing pixels in the input image. + """ + super().__init__() + self.image_encoder = image_encoder + self.prompt_encoder = prompt_encoder + self.mask_decoder = mask_decoder + self.register_buffer("pixel_mean", torch.Tensor(pixel_mean).view(-1, 1, 1), False) + self.register_buffer("pixel_std", torch.Tensor(pixel_std).view(-1, 1, 1), False) + + @property + def device(self) -> Any: + return self.pixel_mean.device + + def forward( + self, + batched_input: List[Dict[str, Any]], + multimask_output: bool, + hq_token_only: bool =False, + ) -> List[Dict[str, torch.Tensor]]: + """ + Predicts masks end-to-end from provided images and prompts. + If prompts are not known in advance, using SamPredictor is + recommended over calling the model directly. + + Arguments: + batched_input (list(dict)): A list over input images, each a + dictionary with the following keys. A prompt key can be + excluded if it is not present. + 'image': The image as a torch tensor in 3xHxW format, + already transformed for input to the model. + 'original_size': (tuple(int, int)) The original size of + the image before transformation, as (H, W). + 'point_coords': (torch.Tensor) Batched point prompts for + this image, with shape BxNx2. Already transformed to the + input frame of the model. + 'point_labels': (torch.Tensor) Batched labels for point prompts, + with shape BxN. + 'boxes': (torch.Tensor) Batched box inputs, with shape Bx4. + Already transformed to the input frame of the model. + 'mask_inputs': (torch.Tensor) Batched mask inputs to the model, + in the form Bx1xHxW. + multimask_output (bool): Whether the model should predict multiple + disambiguating masks, or return a single mask. + + Returns: + (list(dict)): A list over input images, where each element is + as dictionary with the following keys. + 'masks': (torch.Tensor) Batched binary mask predictions, + with shape BxCxHxW, where B is the number of input prompts, + C is determined by multimask_output, and (H, W) is the + original size of the image. + 'iou_predictions': (torch.Tensor) The model's predictions + of mask quality, in shape BxC. + 'low_res_logits': (torch.Tensor) Low resolution logits with + shape BxCxHxW, where H=W=256. Can be passed as mask input + to subsequent iterations of prediction. 
+ """ + input_images = torch.stack([self.preprocess(x["image"]) for x in batched_input], dim=0) + image_embeddings, interm_embeddings = self.image_encoder(input_images) + interm_embeddings = interm_embeddings[0] # early layer + + outputs = [] + for image_record, curr_embedding, curr_interm in zip(batched_input, image_embeddings, interm_embeddings): + if "point_coords" in image_record: + points = (image_record["point_coords"], image_record["point_labels"]) + else: + points = None + sparse_embeddings, dense_embeddings = self.prompt_encoder( + points=points, + boxes=image_record.get("boxes", None), + masks=image_record.get("mask_inputs", None), + ) + low_res_masks, iou_predictions = self.mask_decoder( + image_embeddings=curr_embedding.unsqueeze(0), + image_pe=self.prompt_encoder.get_dense_pe(), + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + hq_token_only=hq_token_only, + interm_embeddings=curr_interm.unsqueeze(0).unsqueeze(0), + ) + masks = self.postprocess_masks( + low_res_masks, + input_size=image_record["image"].shape[-2:], + original_size=image_record["original_size"], + ) + masks = masks > self.mask_threshold + outputs.append( + { + "masks": masks, + "iou_predictions": iou_predictions, + "low_res_logits": low_res_masks, + } + ) + return outputs + + def postprocess_masks( + self, + masks: torch.Tensor, + input_size: Tuple[int, ...], + original_size: Tuple[int, ...], + ) -> torch.Tensor: + """ + Remove padding and upscale masks to the original image size. + + Arguments: + masks (torch.Tensor): Batched masks from the mask_decoder, + in BxCxHxW format. + input_size (tuple(int, int)): The size of the image input to the + model, in (H, W) format. Used to remove padding. + original_size (tuple(int, int)): The original size of the image + before resizing for input to the model, in (H, W) format. + + Returns: + (torch.Tensor): Batched masks in BxCxHxW format, where (H, W) + is given by original_size. 
+ """ + masks = F.interpolate( + masks, + (self.image_encoder.img_size, self.image_encoder.img_size), + mode="bilinear", + align_corners=False, + ) + masks = masks[..., : input_size[0], : input_size[1]] + masks = F.interpolate(masks, original_size, mode="bilinear", align_corners=False) + return masks + + def preprocess(self, x: torch.Tensor) -> torch.Tensor: + """Normalize pixel values and pad to a square input.""" + # Normalize colors + x = (x - self.pixel_mean) / self.pixel_std + + # Pad + h, w = x.shape[-2:] + padh = self.image_encoder.img_size - h + padw = self.image_encoder.img_size - w + x = F.pad(x, (0, padw, 0, padh)) + return x \ No newline at end of file diff --git a/inpaint/plugins/segment_anything/modeling/tiny_vit_sam.py b/inpaint/plugins/segment_anything/modeling/tiny_vit_sam.py new file mode 100644 index 0000000..a5127c7 --- /dev/null +++ b/inpaint/plugins/segment_anything/modeling/tiny_vit_sam.py @@ -0,0 +1,822 @@ +# -------------------------------------------------------- +# TinyViT Model Architecture +# Copyright (c) 2022 Microsoft +# Adapted from LeViT and Swin Transformer +# LeViT: (https://github.com/facebookresearch/levit) +# Swin: (https://github.com/microsoft/swin-transformer) +# Build the TinyViT Model +# -------------------------------------------------------- + +import collections +import itertools +import math +import warnings +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.utils.checkpoint as checkpoint +from typing import Tuple + + +def _ntuple(n): + def parse(x): + if isinstance(x, collections.abc.Iterable) and not isinstance(x, str): + return x + return tuple(itertools.repeat(x, n)) + + return parse + + +to_2tuple = _ntuple(2) + + +def _trunc_normal_(tensor, mean, std, a, b): + # Cut & paste from PyTorch official master until it's in a few official releases - RW + # Method based on https://people.sc.fsu.edu/~jburkardt/presentations/truncated_normal.pdf + def norm_cdf(x): + # Computes standard normal cumulative distribution function + return (1.0 + math.erf(x / math.sqrt(2.0))) / 2.0 + + if (mean < a - 2 * std) or (mean > b + 2 * std): + warnings.warn( + "mean is more than 2 std from [a, b] in nn.init.trunc_normal_. " + "The distribution of values may be incorrect.", + stacklevel=2, + ) + + # Values are generated by using a truncated uniform distribution and + # then using the inverse CDF for the normal distribution. + # Get upper and lower cdf values + l = norm_cdf((a - mean) / std) + u = norm_cdf((b - mean) / std) + + # Uniformly fill tensor with values from [l, u], then translate to + # [2l-1, 2u-1]. + tensor.uniform_(2 * l - 1, 2 * u - 1) + + # Use inverse cdf transform for normal distribution to get truncated + # standard normal + tensor.erfinv_() + + # Transform to proper mean, std + tensor.mul_(std * math.sqrt(2.0)) + tensor.add_(mean) + + # Clamp to ensure it's in the proper range + tensor.clamp_(min=a, max=b) + return tensor + + +def trunc_normal_(tensor, mean=0.0, std=1.0, a=-2.0, b=2.0): + # type: (Tensor, float, float, float, float) -> Tensor + r"""Fills the input Tensor with values drawn from a truncated + normal distribution. The values are effectively drawn from the + normal distribution :math:`\mathcal{N}(\text{mean}, \text{std}^2)` + with values outside :math:`[a, b]` redrawn until they are within + the bounds. The method used for generating the random values works + best when :math:`a \leq \text{mean} \leq b`. 
+ + NOTE: this impl is similar to the PyTorch trunc_normal_, the bounds [a, b] are + applied while sampling the normal with mean/std applied, therefore a, b args + should be adjusted to match the range of mean, std args. + + Args: + tensor: an n-dimensional `torch.Tensor` + mean: the mean of the normal distribution + std: the standard deviation of the normal distribution + a: the minimum cutoff value + b: the maximum cutoff value + Examples: + >>> w = torch.empty(3, 5) + >>> nn.init.trunc_normal_(w) + """ + with torch.no_grad(): + return _trunc_normal_(tensor, mean, std, a, b) + + +def drop_path( + x, drop_prob: float = 0.0, training: bool = False, scale_by_keep: bool = True +): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks). + + This is the same as the DropConnect impl I created for EfficientNet, etc networks, however, + the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper... + See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for + changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use + 'survival rate' as the argument. + + """ + if drop_prob == 0.0 or not training: + return x + keep_prob = 1 - drop_prob + shape = (x.shape[0],) + (1,) * ( + x.ndim - 1 + ) # work with diff dim tensors, not just 2D ConvNets + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + +class TimmDropPath(nn.Module): + """Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).""" + + def __init__(self, drop_prob: float = 0.0, scale_by_keep: bool = True): + super(TimmDropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + return drop_path(x, self.drop_prob, self.training, self.scale_by_keep) + + def extra_repr(self): + return f"drop_prob={round(self.drop_prob,3):0.3f}" + + +class Conv2d_BN(torch.nn.Sequential): + def __init__( + self, a, b, ks=1, stride=1, pad=0, dilation=1, groups=1, bn_weight_init=1 + ): + super().__init__() + self.add_module( + "c", torch.nn.Conv2d(a, b, ks, stride, pad, dilation, groups, bias=False) + ) + bn = torch.nn.BatchNorm2d(b) + torch.nn.init.constant_(bn.weight, bn_weight_init) + torch.nn.init.constant_(bn.bias, 0) + self.add_module("bn", bn) + + @torch.no_grad() + def fuse(self): + c, bn = self._modules.values() + w = bn.weight / (bn.running_var + bn.eps) ** 0.5 + w = c.weight * w[:, None, None, None] + b = bn.bias - bn.running_mean * bn.weight / (bn.running_var + bn.eps) ** 0.5 + m = torch.nn.Conv2d( + w.size(1) * self.c.groups, + w.size(0), + w.shape[2:], + stride=self.c.stride, + padding=self.c.padding, + dilation=self.c.dilation, + groups=self.c.groups, + ) + m.weight.data.copy_(w) + m.bias.data.copy_(b) + return m + + +class DropPath(TimmDropPath): + def __init__(self, drop_prob=None): + super().__init__(drop_prob=drop_prob) + self.drop_prob = drop_prob + + def __repr__(self): + msg = super().__repr__() + msg += f"(drop_prob={self.drop_prob})" + return msg + + +class PatchEmbed(nn.Module): + def __init__(self, in_chans, embed_dim, resolution, activation): + super().__init__() + img_size: Tuple[int, int] = to_2tuple(resolution) + self.patches_resolution = (img_size[0] // 4, img_size[1] // 4) + self.num_patches = self.patches_resolution[0] * self.patches_resolution[1] + self.in_chans = 
in_chans + self.embed_dim = embed_dim + n = embed_dim + self.seq = nn.Sequential( + Conv2d_BN(in_chans, n // 2, 3, 2, 1), + activation(), + Conv2d_BN(n // 2, n, 3, 2, 1), + ) + + def forward(self, x): + return self.seq(x) + + +class MBConv(nn.Module): + def __init__(self, in_chans, out_chans, expand_ratio, activation, drop_path): + super().__init__() + self.in_chans = in_chans + self.hidden_chans = int(in_chans * expand_ratio) + self.out_chans = out_chans + + self.conv1 = Conv2d_BN(in_chans, self.hidden_chans, ks=1) + self.act1 = activation() + + self.conv2 = Conv2d_BN( + self.hidden_chans, + self.hidden_chans, + ks=3, + stride=1, + pad=1, + groups=self.hidden_chans, + ) + self.act2 = activation() + + self.conv3 = Conv2d_BN(self.hidden_chans, out_chans, ks=1, bn_weight_init=0.0) + self.act3 = activation() + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, x): + shortcut = x + + x = self.conv1(x) + x = self.act1(x) + + x = self.conv2(x) + x = self.act2(x) + + x = self.conv3(x) + + x = self.drop_path(x) + + x += shortcut + x = self.act3(x) + + return x + + +class PatchMerging(nn.Module): + def __init__(self, input_resolution, dim, out_dim, activation): + super().__init__() + + self.input_resolution = input_resolution + self.dim = dim + self.out_dim = out_dim + self.act = activation() + self.conv1 = Conv2d_BN(dim, out_dim, 1, 1, 0) + stride_c = 2 + if out_dim == 320 or out_dim == 448 or out_dim == 576: + stride_c = 1 + self.conv2 = Conv2d_BN(out_dim, out_dim, 3, stride_c, 1, groups=out_dim) + self.conv3 = Conv2d_BN(out_dim, out_dim, 1, 1, 0) + + def forward(self, x): + if x.ndim == 3: + H, W = self.input_resolution + B = len(x) + # (B, C, H, W) + x = x.view(B, H, W, -1).permute(0, 3, 1, 2) + + x = self.conv1(x) + x = self.act(x) + + x = self.conv2(x) + x = self.act(x) + x = self.conv3(x) + x = x.flatten(2).transpose(1, 2) + return x + + +class ConvLayer(nn.Module): + def __init__( + self, + dim, + input_resolution, + depth, + activation, + drop_path=0.0, + downsample=None, + use_checkpoint=False, + out_dim=None, + conv_expand_ratio=4.0, + ): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList( + [ + MBConv( + dim, + dim, + conv_expand_ratio, + activation, + drop_path[i] if isinstance(drop_path, list) else drop_path, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + input_resolution, dim=dim, out_dim=out_dim, activation=activation + ) + else: + self.downsample = None + + def forward(self, x): + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + if self.downsample is not None: + x = self.downsample(x) + return x + + +class Mlp(nn.Module): + def __init__( + self, + in_features, + hidden_features=None, + out_features=None, + act_layer=nn.GELU, + drop=0.0, + ): + super().__init__() + out_features = out_features or in_features + hidden_features = hidden_features or in_features + self.norm = nn.LayerNorm(in_features) + self.fc1 = nn.Linear(in_features, hidden_features) + self.fc2 = nn.Linear(hidden_features, out_features) + self.act = act_layer() + self.drop = nn.Dropout(drop) + + def forward(self, x): + x = self.norm(x) + + x = self.fc1(x) + x = self.act(x) + x = self.drop(x) + x = self.fc2(x) + x = self.drop(x) + return x + + +class Attention(torch.nn.Module): + def __init__( + 
self, + dim, + key_dim, + num_heads=8, + attn_ratio=4, + resolution=(14, 14), + ): + super().__init__() + # (h, w) + assert isinstance(resolution, tuple) and len(resolution) == 2 + self.num_heads = num_heads + self.scale = key_dim**-0.5 + self.key_dim = key_dim + self.nh_kd = nh_kd = key_dim * num_heads + self.d = int(attn_ratio * key_dim) + self.dh = int(attn_ratio * key_dim) * num_heads + self.attn_ratio = attn_ratio + h = self.dh + nh_kd * 2 + + self.norm = nn.LayerNorm(dim) + self.qkv = nn.Linear(dim, h) + self.proj = nn.Linear(self.dh, dim) + + points = list(itertools.product(range(resolution[0]), range(resolution[1]))) + N = len(points) + attention_offsets = {} + idxs = [] + for p1 in points: + for p2 in points: + offset = (abs(p1[0] - p2[0]), abs(p1[1] - p2[1])) + if offset not in attention_offsets: + attention_offsets[offset] = len(attention_offsets) + idxs.append(attention_offsets[offset]) + self.attention_biases = torch.nn.Parameter( + torch.zeros(num_heads, len(attention_offsets)) + ) + self.register_buffer( + "attention_bias_idxs", torch.LongTensor(idxs).view(N, N), persistent=False + ) + + @torch.no_grad() + def train(self, mode=True): + super().train(mode) + if mode and hasattr(self, "ab"): + del self.ab + else: + self.register_buffer( + "ab", + self.attention_biases[:, self.attention_bias_idxs], + persistent=False, + ) + + def forward(self, x): # x (B,N,C) + B, N, _ = x.shape + + # Normalization + x = self.norm(x) + + qkv = self.qkv(x) + # (B, N, num_heads, d) + q, k, v = qkv.view(B, N, self.num_heads, -1).split( + [self.key_dim, self.key_dim, self.d], dim=3 + ) + # (B, num_heads, N, d) + q = q.permute(0, 2, 1, 3) + k = k.permute(0, 2, 1, 3) + v = v.permute(0, 2, 1, 3) + + attn = (q @ k.transpose(-2, -1)) * self.scale + ( + self.attention_biases[:, self.attention_bias_idxs] + if self.training + else self.ab + ) + attn = attn.softmax(dim=-1) + x = (attn @ v).transpose(1, 2).reshape(B, N, self.dh) + x = self.proj(x) + return x + + +class TinyViTBlock(nn.Module): + r"""TinyViT Block. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int, int]): Input resolution. + num_heads (int): Number of attention heads. + window_size (int): Window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + drop (float, optional): Dropout rate. Default: 0.0 + drop_path (float, optional): Stochastic depth rate. Default: 0.0 + local_conv_size (int): the kernel size of the convolution between + Attention and MLP. Default: 3 + activation: the activation function. 
Default: nn.GELU + """ + + def __init__( + self, + dim, + input_resolution, + num_heads, + window_size=7, + mlp_ratio=4.0, + drop=0.0, + drop_path=0.0, + local_conv_size=3, + activation=nn.GELU, + ): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.num_heads = num_heads + assert window_size > 0, "window_size must be greater than 0" + self.window_size = window_size + self.mlp_ratio = mlp_ratio + + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + assert dim % num_heads == 0, "dim must be divisible by num_heads" + head_dim = dim // num_heads + + window_resolution = (window_size, window_size) + self.attn = Attention( + dim, head_dim, num_heads, attn_ratio=1, resolution=window_resolution + ) + + mlp_hidden_dim = int(dim * mlp_ratio) + mlp_activation = activation + self.mlp = Mlp( + in_features=dim, + hidden_features=mlp_hidden_dim, + act_layer=mlp_activation, + drop=drop, + ) + + pad = local_conv_size // 2 + self.local_conv = Conv2d_BN( + dim, dim, ks=local_conv_size, stride=1, pad=pad, groups=dim + ) + + def forward(self, x): + H, W = self.input_resolution + B, L, C = x.shape + assert L == H * W, "input feature has wrong size" + res_x = x + if H == self.window_size and W == self.window_size: + x = self.attn(x) + else: + x = x.view(B, H, W, C) + pad_b = (self.window_size - H % self.window_size) % self.window_size + pad_r = (self.window_size - W % self.window_size) % self.window_size + padding = pad_b > 0 or pad_r > 0 + + if padding: + x = F.pad(x, (0, 0, 0, pad_r, 0, pad_b)) + + pH, pW = H + pad_b, W + pad_r + nH = pH // self.window_size + nW = pW // self.window_size + # window partition + x = ( + x.view(B, nH, self.window_size, nW, self.window_size, C) + .transpose(2, 3) + .reshape(B * nH * nW, self.window_size * self.window_size, C) + ) + x = self.attn(x) + # window reverse + x = ( + x.view(B, nH, nW, self.window_size, self.window_size, C) + .transpose(2, 3) + .reshape(B, pH, pW, C) + ) + + if padding: + x = x[:, :H, :W].contiguous() + + x = x.view(B, L, C) + + x = res_x + self.drop_path(x) + + x = x.transpose(1, 2).reshape(B, C, H, W) + x = self.local_conv(x) + x = x.view(B, C, L).transpose(1, 2) + + x = x + self.drop_path(self.mlp(x)) + return x + + def extra_repr(self) -> str: + return ( + f"dim={self.dim}, input_resolution={self.input_resolution}, num_heads={self.num_heads}, " + f"window_size={self.window_size}, mlp_ratio={self.mlp_ratio}" + ) + + +class BasicLayer(nn.Module): + """A basic TinyViT layer for one stage. + + Args: + dim (int): Number of input channels. + input_resolution (tuple[int]): Input resolution. + depth (int): Number of blocks. + num_heads (int): Number of attention heads. + window_size (int): Local window size. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + drop (float, optional): Dropout rate. Default: 0.0 + drop_path (float | tuple[float], optional): Stochastic depth rate. Default: 0.0 + downsample (nn.Module | None, optional): Downsample layer at the end of the layer. Default: None + use_checkpoint (bool): Whether to use checkpointing to save memory. Default: False. + local_conv_size: the kernel size of the depthwise convolution between attention and MLP. Default: 3 + activation: the activation function. Default: nn.GELU + out_dim: the output dimension of the layer. 
Default: dim + """ + + def __init__( + self, + dim, + input_resolution, + depth, + num_heads, + window_size, + mlp_ratio=4.0, + drop=0.0, + drop_path=0.0, + downsample=None, + use_checkpoint=False, + local_conv_size=3, + activation=nn.GELU, + out_dim=None, + ): + super().__init__() + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.use_checkpoint = use_checkpoint + + # build blocks + self.blocks = nn.ModuleList( + [ + TinyViTBlock( + dim=dim, + input_resolution=input_resolution, + num_heads=num_heads, + window_size=window_size, + mlp_ratio=mlp_ratio, + drop=drop, + drop_path=drop_path[i] + if isinstance(drop_path, list) + else drop_path, + local_conv_size=local_conv_size, + activation=activation, + ) + for i in range(depth) + ] + ) + + # patch merging layer + if downsample is not None: + self.downsample = downsample( + input_resolution, dim=dim, out_dim=out_dim, activation=activation + ) + else: + self.downsample = None + + def forward(self, x): + for blk in self.blocks: + if self.use_checkpoint: + x = checkpoint.checkpoint(blk, x) + else: + x = blk(x) + if self.downsample is not None: + x = self.downsample(x) + return x + + def extra_repr(self) -> str: + return f"dim={self.dim}, input_resolution={self.input_resolution}, depth={self.depth}" + + +class LayerNorm2d(nn.Module): + def __init__(self, num_channels: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(num_channels)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.eps = eps + + def forward(self, x: torch.Tensor) -> torch.Tensor: + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x + + +class TinyViT(nn.Module): + def __init__( + self, + img_size=224, + in_chans=3, + num_classes=1000, + embed_dims=[96, 192, 384, 768], + depths=[2, 2, 6, 2], + num_heads=[3, 6, 12, 24], + window_sizes=[7, 7, 14, 7], + mlp_ratio=4.0, + drop_rate=0.0, + drop_path_rate=0.1, + use_checkpoint=False, + mbconv_expand_ratio=4.0, + local_conv_size=3, + layer_lr_decay=1.0, + ): + super().__init__() + self.img_size = img_size + self.num_classes = num_classes + self.depths = depths + self.num_layers = len(depths) + self.mlp_ratio = mlp_ratio + + activation = nn.GELU + + self.patch_embed = PatchEmbed( + in_chans=in_chans, + embed_dim=embed_dims[0], + resolution=img_size, + activation=activation, + ) + + patches_resolution = self.patch_embed.patches_resolution + self.patches_resolution = patches_resolution + + # stochastic depth + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, sum(depths)) + ] # stochastic depth decay rule + + # build layers + self.layers = nn.ModuleList() + for i_layer in range(self.num_layers): + kwargs = dict( + dim=embed_dims[i_layer], + input_resolution=( + patches_resolution[0] + // (2 ** (i_layer - 1 if i_layer == 3 else i_layer)), + patches_resolution[1] + // (2 ** (i_layer - 1 if i_layer == 3 else i_layer)), + ), + # input_resolution=(patches_resolution[0] // (2 ** i_layer), + # patches_resolution[1] // (2 ** i_layer)), + depth=depths[i_layer], + drop_path=dpr[sum(depths[:i_layer]) : sum(depths[: i_layer + 1])], + downsample=PatchMerging if (i_layer < self.num_layers - 1) else None, + use_checkpoint=use_checkpoint, + out_dim=embed_dims[min(i_layer + 1, len(embed_dims) - 1)], + activation=activation, + ) + if i_layer == 0: + layer = ConvLayer( + conv_expand_ratio=mbconv_expand_ratio, + **kwargs, + ) + 
else: + layer = BasicLayer( + num_heads=num_heads[i_layer], + window_size=window_sizes[i_layer], + mlp_ratio=self.mlp_ratio, + drop=drop_rate, + local_conv_size=local_conv_size, + **kwargs, + ) + self.layers.append(layer) + + # Classifier head + self.norm_head = nn.LayerNorm(embed_dims[-1]) + self.head = ( + nn.Linear(embed_dims[-1], num_classes) + if num_classes > 0 + else torch.nn.Identity() + ) + + # init weights + self.apply(self._init_weights) + self.set_layer_lr_decay(layer_lr_decay) + self.neck = nn.Sequential( + nn.Conv2d( + embed_dims[-1], + 256, + kernel_size=1, + bias=False, + ), + LayerNorm2d(256), + nn.Conv2d( + 256, + 256, + kernel_size=3, + padding=1, + bias=False, + ), + LayerNorm2d(256), + ) + + def set_layer_lr_decay(self, layer_lr_decay): + decay_rate = layer_lr_decay + + # layers -> blocks (depth) + depth = sum(self.depths) + lr_scales = [decay_rate ** (depth - i - 1) for i in range(depth)] + # print("LR SCALES:", lr_scales) + + def _set_lr_scale(m, scale): + for p in m.parameters(): + p.lr_scale = scale + + self.patch_embed.apply(lambda x: _set_lr_scale(x, lr_scales[0])) + i = 0 + for layer in self.layers: + for block in layer.blocks: + block.apply(lambda x: _set_lr_scale(x, lr_scales[i])) + i += 1 + if layer.downsample is not None: + layer.downsample.apply(lambda x: _set_lr_scale(x, lr_scales[i - 1])) + assert i == depth + for m in [self.norm_head, self.head]: + m.apply(lambda x: _set_lr_scale(x, lr_scales[-1])) + + for k, p in self.named_parameters(): + p.param_name = k + + def _check_lr_scale(m): + for p in m.parameters(): + assert hasattr(p, "lr_scale"), p.param_name + + self.apply(_check_lr_scale) + + def _init_weights(self, m): + if isinstance(m, nn.Linear): + trunc_normal_(m.weight, std=0.02) + if isinstance(m, nn.Linear) and m.bias is not None: + nn.init.constant_(m.bias, 0) + elif isinstance(m, nn.LayerNorm): + nn.init.constant_(m.bias, 0) + nn.init.constant_(m.weight, 1.0) + + @torch.jit.ignore + def no_weight_decay_keywords(self): + return {"attention_biases"} + + def forward_features(self, x): + # x: (N, C, H, W) + x = self.patch_embed(x) + + x = self.layers[0](x) + start_i = 1 + + for i in range(start_i, len(self.layers)): + layer = self.layers[i] + x = layer(x) + B, _, C = x.size() + x = x.view(B, 64, 64, C) + x = x.permute(0, 3, 1, 2) + x = self.neck(x) + return x + + def forward(self, x): + x = self.forward_features(x) + # x = self.norm_head(x) + # x = self.head(x) + return x diff --git a/inpaint/plugins/segment_anything/modeling/transformer.py b/inpaint/plugins/segment_anything/modeling/transformer.py new file mode 100644 index 0000000..f1a2812 --- /dev/null +++ b/inpaint/plugins/segment_anything/modeling/transformer.py @@ -0,0 +1,240 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +from torch import Tensor, nn + +import math +from typing import Tuple, Type + +from .common import MLPBlock + + +class TwoWayTransformer(nn.Module): + def __init__( + self, + depth: int, + embedding_dim: int, + num_heads: int, + mlp_dim: int, + activation: Type[nn.Module] = nn.ReLU, + attention_downsample_rate: int = 2, + ) -> None: + """ + A transformer decoder that attends to an input image using + queries whose positional embedding is supplied. 
+ + Args: + depth (int): number of layers in the transformer + embedding_dim (int): the channel dimension for the input embeddings + num_heads (int): the number of heads for multihead attention. Must + divide embedding_dim + mlp_dim (int): the channel dimension internal to the MLP block + activation (nn.Module): the activation to use in the MLP block + """ + super().__init__() + self.depth = depth + self.embedding_dim = embedding_dim + self.num_heads = num_heads + self.mlp_dim = mlp_dim + self.layers = nn.ModuleList() + + for i in range(depth): + self.layers.append( + TwoWayAttentionBlock( + embedding_dim=embedding_dim, + num_heads=num_heads, + mlp_dim=mlp_dim, + activation=activation, + attention_downsample_rate=attention_downsample_rate, + skip_first_layer_pe=(i == 0), + ) + ) + + self.final_attn_token_to_image = Attention( + embedding_dim, num_heads, downsample_rate=attention_downsample_rate + ) + self.norm_final_attn = nn.LayerNorm(embedding_dim) + + def forward( + self, + image_embedding: Tensor, + image_pe: Tensor, + point_embedding: Tensor, + ) -> Tuple[Tensor, Tensor]: + """ + Args: + image_embedding (torch.Tensor): image to attend to. Should be shape + B x embedding_dim x h x w for any h and w. + image_pe (torch.Tensor): the positional encoding to add to the image. Must + have the same shape as image_embedding. + point_embedding (torch.Tensor): the embedding to add to the query points. + Must have shape B x N_points x embedding_dim for any N_points. + + Returns: + torch.Tensor: the processed point_embedding + torch.Tensor: the processed image_embedding + """ + # BxCxHxW -> BxHWxC == B x N_image_tokens x C + bs, c, h, w = image_embedding.shape + image_embedding = image_embedding.flatten(2).permute(0, 2, 1) + image_pe = image_pe.flatten(2).permute(0, 2, 1) + + # Prepare queries + queries = point_embedding + keys = image_embedding + + # Apply transformer blocks and final layernorm + for layer in self.layers: + queries, keys = layer( + queries=queries, + keys=keys, + query_pe=point_embedding, + key_pe=image_pe, + ) + + # Apply the final attenion layer from the points to the image + q = queries + point_embedding + k = keys + image_pe + attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys) + queries = queries + attn_out + queries = self.norm_final_attn(queries) + + return queries, keys + + +class TwoWayAttentionBlock(nn.Module): + def __init__( + self, + embedding_dim: int, + num_heads: int, + mlp_dim: int = 2048, + activation: Type[nn.Module] = nn.ReLU, + attention_downsample_rate: int = 2, + skip_first_layer_pe: bool = False, + ) -> None: + """ + A transformer block with four layers: (1) self-attention of sparse + inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp + block on sparse inputs, and (4) cross attention of dense inputs to sparse + inputs. 
+ + Arguments: + embedding_dim (int): the channel dimension of the embeddings + num_heads (int): the number of heads in the attention layers + mlp_dim (int): the hidden dimension of the mlp block + activation (nn.Module): the activation of the mlp block + skip_first_layer_pe (bool): skip the PE on the first layer + """ + super().__init__() + self.self_attn = Attention(embedding_dim, num_heads) + self.norm1 = nn.LayerNorm(embedding_dim) + + self.cross_attn_token_to_image = Attention( + embedding_dim, num_heads, downsample_rate=attention_downsample_rate + ) + self.norm2 = nn.LayerNorm(embedding_dim) + + self.mlp = MLPBlock(embedding_dim, mlp_dim, activation) + self.norm3 = nn.LayerNorm(embedding_dim) + + self.norm4 = nn.LayerNorm(embedding_dim) + self.cross_attn_image_to_token = Attention( + embedding_dim, num_heads, downsample_rate=attention_downsample_rate + ) + + self.skip_first_layer_pe = skip_first_layer_pe + + def forward( + self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor + ) -> Tuple[Tensor, Tensor]: + # Self attention block + if self.skip_first_layer_pe: + queries = self.self_attn(q=queries, k=queries, v=queries) + else: + q = queries + query_pe + attn_out = self.self_attn(q=q, k=q, v=queries) + queries = queries + attn_out + queries = self.norm1(queries) + + # Cross attention block, tokens attending to image embedding + q = queries + query_pe + k = keys + key_pe + attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys) + queries = queries + attn_out + queries = self.norm2(queries) + + # MLP block + mlp_out = self.mlp(queries) + queries = queries + mlp_out + queries = self.norm3(queries) + + # Cross attention block, image embedding attending to tokens + q = queries + query_pe + k = keys + key_pe + attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries) + keys = keys + attn_out + keys = self.norm4(keys) + + return queries, keys + + +class Attention(nn.Module): + """ + An attention layer that allows for downscaling the size of the embedding + after projection to queries, keys, and values. + """ + + def __init__( + self, + embedding_dim: int, + num_heads: int, + downsample_rate: int = 1, + ) -> None: + super().__init__() + self.embedding_dim = embedding_dim + self.internal_dim = embedding_dim // downsample_rate + self.num_heads = num_heads + assert self.internal_dim % num_heads == 0, "num_heads must divide embedding_dim." 
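+        # Note: queries, keys, and values are projected down to internal_dim
+        # (embedding_dim // downsample_rate) to reduce the cost of attention;
+        # out_proj maps the attended result back up to embedding_dim.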
+ + self.q_proj = nn.Linear(embedding_dim, self.internal_dim) + self.k_proj = nn.Linear(embedding_dim, self.internal_dim) + self.v_proj = nn.Linear(embedding_dim, self.internal_dim) + self.out_proj = nn.Linear(self.internal_dim, embedding_dim) + + def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor: + b, n, c = x.shape + x = x.reshape(b, n, num_heads, c // num_heads) + return x.transpose(1, 2) # B x N_heads x N_tokens x C_per_head + + def _recombine_heads(self, x: Tensor) -> Tensor: + b, n_heads, n_tokens, c_per_head = x.shape + x = x.transpose(1, 2) + return x.reshape(b, n_tokens, n_heads * c_per_head) # B x N_tokens x C + + def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor: + # Input projections + q = self.q_proj(q) + k = self.k_proj(k) + v = self.v_proj(v) + + # Separate into heads + q = self._separate_heads(q, self.num_heads) + k = self._separate_heads(k, self.num_heads) + v = self._separate_heads(v, self.num_heads) + + # Attention + _, _, _, c_per_head = q.shape + attn = q @ k.permute(0, 1, 3, 2) # B x N_heads x N_tokens x N_tokens + attn = attn / math.sqrt(c_per_head) + attn = torch.softmax(attn, dim=-1) + + # Get output + out = attn @ v + out = self._recombine_heads(out) + out = self.out_proj(out) + + return out diff --git a/inpaint/plugins/segment_anything/predictor.py b/inpaint/plugins/segment_anything/predictor.py new file mode 100644 index 0000000..23d0649 --- /dev/null +++ b/inpaint/plugins/segment_anything/predictor.py @@ -0,0 +1,285 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +import torch + +from .modeling import Sam + +from typing import Optional, Tuple + + +class SamPredictor: + def __init__( + self, + sam_model: Sam, + ) -> None: + """ + Uses SAM to calculate the image embedding for an image, and then + allow repeated, efficient mask prediction given prompts. + + Arguments: + sam_model (Sam): The model to use for mask prediction. + """ + super().__init__() + self.model = sam_model + from .utils.transforms import ResizeLongestSide + + self.transform = ResizeLongestSide(sam_model.image_encoder.img_size) + self.reset_image() + + def set_image( + self, + image: np.ndarray, + image_format: str = "RGB", + ) -> None: + """ + Calculates the image embeddings for the provided image, allowing + masks to be predicted with the 'predict' method. + + Arguments: + image (np.ndarray): The image for calculating masks. Expects an + image in HWC uint8 format, with pixel values in [0, 255]. + image_format (str): The color format of the image, in ['RGB', 'BGR']. + """ + assert image_format in [ + "RGB", + "BGR", + ], f"image_format must be in ['RGB', 'BGR'], is {image_format}." + if image_format != self.model.image_format: + image = image[..., ::-1] + + # Transform the image to the form expected by the model + input_image = self.transform.apply_image(image) + input_image_torch = torch.as_tensor(input_image, device=self.device) + input_image_torch = input_image_torch.permute(2, 0, 1).contiguous()[ + None, :, :, : + ] + + self.set_torch_image(input_image_torch, image.shape[:2]) + + @torch.no_grad() + def set_torch_image( + self, + transformed_image: torch.Tensor, + original_image_size: Tuple[int, ...], + ) -> None: + """ + Calculates the image embeddings for the provided image, allowing + masks to be predicted with the 'predict' method. 
Expects the input + image to be already transformed to the format expected by the model. + + Arguments: + transformed_image (torch.Tensor): The input image, with shape + 1x3xHxW, which has been transformed with ResizeLongestSide. + original_image_size (tuple(int, int)): The size of the image + before transformation, in (H, W) format. + """ + assert ( + len(transformed_image.shape) == 4 + and transformed_image.shape[1] == 3 + and max(*transformed_image.shape[2:]) == self.model.image_encoder.img_size + ), f"set_torch_image input must be BCHW with long side {self.model.image_encoder.img_size}." + self.reset_image() + + self.original_size = original_image_size + self.input_size = tuple(transformed_image.shape[-2:]) + input_image = self.model.preprocess(transformed_image) + self.features = self.model.image_encoder(input_image) + self.is_image_set = True + + def predict( + self, + point_coords: Optional[np.ndarray] = None, + point_labels: Optional[np.ndarray] = None, + box: Optional[np.ndarray] = None, + mask_input: Optional[np.ndarray] = None, + multimask_output: bool = True, + return_logits: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Predict masks for the given input prompts, using the currently set image. + + Arguments: + point_coords (np.ndarray or None): A Nx2 array of point prompts to the + model. Each point is in (X,Y) in pixels. + point_labels (np.ndarray or None): A length N array of labels for the + point prompts. 1 indicates a foreground point and 0 indicates a + background point. + box (np.ndarray or None): A length 4 array given a box prompt to the + model, in XYXY format. + mask_input (np.ndarray): A low resolution mask input to the model, typically + coming from a previous prediction iteration. Has form 1xHxW, where + for SAM, H=W=256. + multimask_output (bool): If true, the model will return three masks. + For ambiguous input prompts (such as a single click), this will often + produce better masks than a single prediction. If only a single + mask is needed, the model's predicted quality score can be used + to select the best mask. For non-ambiguous prompts, such as multiple + input prompts, multimask_output=False can give better results. + return_logits (bool): If true, returns un-thresholded masks logits + instead of a binary mask. + + Returns: + (np.ndarray): The output masks in CxHxW format, where C is the + number of masks, and (H, W) is the original image size. + (np.ndarray): An array of length C containing the model's + predictions for the quality of each mask. + (np.ndarray): An array of shape CxHxW, where C is the number + of masks and H=W=256. These low resolution logits can be passed to + a subsequent iteration as mask input. + """ + if not self.is_image_set: + raise RuntimeError( + "An image must be set with .set_image(...) before mask prediction." + ) + + # Transform input prompts + coords_torch, labels_torch, box_torch, mask_input_torch = None, None, None, None + if point_coords is not None: + assert ( + point_labels is not None + ), "point_labels must be supplied if point_coords is supplied." 
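+            # Map point prompts from original-image pixel coordinates into the
+            # resized input frame of the model, then add a batch dimension.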
+ point_coords = self.transform.apply_coords(point_coords, self.original_size) + coords_torch = torch.as_tensor( + point_coords, dtype=torch.float, device=self.device + ) + labels_torch = torch.as_tensor( + point_labels, dtype=torch.int, device=self.device + ) + coords_torch, labels_torch = coords_torch[None, :, :], labels_torch[None, :] + if box is not None: + box = self.transform.apply_boxes(box, self.original_size) + box_torch = torch.as_tensor(box, dtype=torch.float, device=self.device) + box_torch = box_torch[None, :] + if mask_input is not None: + mask_input_torch = torch.as_tensor( + mask_input, dtype=torch.float, device=self.device + ) + mask_input_torch = mask_input_torch[None, :, :, :] + + masks, iou_predictions, low_res_masks = self.predict_torch( + coords_torch, + labels_torch, + box_torch, + mask_input_torch, + multimask_output, + return_logits=return_logits, + ) + + masks = masks[0].detach().cpu().numpy() + iou_predictions = iou_predictions[0].detach().cpu().numpy() + low_res_masks = low_res_masks[0].detach().cpu().numpy() + return masks, iou_predictions, low_res_masks + + @torch.no_grad() + def predict_torch( + self, + point_coords: Optional[torch.Tensor], + point_labels: Optional[torch.Tensor], + boxes: Optional[torch.Tensor] = None, + mask_input: Optional[torch.Tensor] = None, + multimask_output: bool = True, + return_logits: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Predict masks for the given input prompts, using the currently set image. + Input prompts are batched torch tensors and are expected to already be + transformed to the input frame using ResizeLongestSide. + + Arguments: + point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the + model. Each point is in (X,Y) in pixels. + point_labels (torch.Tensor or None): A BxN array of labels for the + point prompts. 1 indicates a foreground point and 0 indicates a + background point. + box (np.ndarray or None): A Bx4 array given a box prompt to the + model, in XYXY format. + mask_input (np.ndarray): A low resolution mask input to the model, typically + coming from a previous prediction iteration. Has form Bx1xHxW, where + for SAM, H=W=256. Masks returned by a previous iteration of the + predict method do not need further transformation. + multimask_output (bool): If true, the model will return three masks. + For ambiguous input prompts (such as a single click), this will often + produce better masks than a single prediction. If only a single + mask is needed, the model's predicted quality score can be used + to select the best mask. For non-ambiguous prompts, such as multiple + input prompts, multimask_output=False can give better results. + return_logits (bool): If true, returns un-thresholded masks logits + instead of a binary mask. + + Returns: + (torch.Tensor): The output masks in BxCxHxW format, where C is the + number of masks, and (H, W) is the original image size. + (torch.Tensor): An array of shape BxC containing the model's + predictions for the quality of each mask. + (torch.Tensor): An array of shape BxCxHxW, where C is the number + of masks and H=W=256. These low res logits can be passed to + a subsequent iteration as mask input. + """ + if not self.is_image_set: + raise RuntimeError( + "An image must be set with .set_image(...) before mask prediction." 
+ ) + + if point_coords is not None: + points = (point_coords, point_labels) + else: + points = None + + # Embed prompts + sparse_embeddings, dense_embeddings = self.model.prompt_encoder( + points=points, + boxes=boxes, + masks=mask_input, + ) + + # Predict masks + low_res_masks, iou_predictions = self.model.mask_decoder( + image_embeddings=self.features, + image_pe=self.model.prompt_encoder.get_dense_pe(), + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + ) + + # Upscale the masks to the original image resolution + masks = self.model.postprocess_masks( + low_res_masks, self.input_size, self.original_size + ) + + if not return_logits: + masks = masks > self.model.mask_threshold + + return masks, iou_predictions, low_res_masks + + def get_image_embedding(self) -> torch.Tensor: + """ + Returns the image embeddings for the currently set image, with + shape 1xCxHxW, where C is the embedding dimension and (H,W) are + the embedding spatial dimension of SAM (typically C=256, H=W=64). + """ + if not self.is_image_set: + raise RuntimeError( + "An image must be set with .set_image(...) to generate an embedding." + ) + assert ( + self.features is not None + ), "Features must exist if an image has been set." + return self.features + + @property + def device(self) -> torch.device: + return self.model.device + + def reset_image(self) -> None: + """Resets the currently set image.""" + self.is_image_set = False + self.features = None + self.orig_h = None + self.orig_w = None + self.input_h = None + self.input_w = None diff --git a/inpaint/plugins/segment_anything/predictor_hq.py b/inpaint/plugins/segment_anything/predictor_hq.py new file mode 100644 index 0000000..d8fd50f --- /dev/null +++ b/inpaint/plugins/segment_anything/predictor_hq.py @@ -0,0 +1,292 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +import torch + +from .modeling import Sam + +from typing import Optional, Tuple + +from .utils.transforms import ResizeLongestSide + + +class SamHQPredictor: + def __init__( + self, + sam_model: Sam, + ) -> None: + """ + Uses SAM to calculate the image embedding for an image, and then + allow repeated, efficient mask prediction given prompts. + + Arguments: + sam_model (Sam): The model to use for mask prediction. + """ + super().__init__() + self.model = sam_model + self.transform = ResizeLongestSide(sam_model.image_encoder.img_size) + self.reset_image() + + def set_image( + self, + image: np.ndarray, + image_format: str = "RGB", + ) -> None: + """ + Calculates the image embeddings for the provided image, allowing + masks to be predicted with the 'predict' method. + + Arguments: + image (np.ndarray): The image for calculating masks. Expects an + image in HWC uint8 format, with pixel values in [0, 255]. + image_format (str): The color format of the image, in ['RGB', 'BGR']. + """ + assert image_format in [ + "RGB", + "BGR", + ], f"image_format must be in ['RGB', 'BGR'], is {image_format}." 
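+        # If the caller's channel order differs from what the model expects,
+        # reverse the last axis (RGB <-> BGR) before any further preprocessing.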
+ # import pdb;pdb.set_trace() + if image_format != self.model.image_format: + image = image[..., ::-1] + + # Transform the image to the form expected by the model + # import pdb;pdb.set_trace() + input_image = self.transform.apply_image(image) + input_image_torch = torch.as_tensor(input_image, device=self.device) + input_image_torch = input_image_torch.permute(2, 0, 1).contiguous()[ + None, :, :, : + ] + + self.set_torch_image(input_image_torch, image.shape[:2]) + + @torch.no_grad() + def set_torch_image( + self, + transformed_image: torch.Tensor, + original_image_size: Tuple[int, ...], + ) -> None: + """ + Calculates the image embeddings for the provided image, allowing + masks to be predicted with the 'predict' method. Expects the input + image to be already transformed to the format expected by the model. + + Arguments: + transformed_image (torch.Tensor): The input image, with shape + 1x3xHxW, which has been transformed with ResizeLongestSide. + original_image_size (tuple(int, int)): The size of the image + before transformation, in (H, W) format. + """ + assert ( + len(transformed_image.shape) == 4 + and transformed_image.shape[1] == 3 + and max(*transformed_image.shape[2:]) == self.model.image_encoder.img_size + ), f"set_torch_image input must be BCHW with long side {self.model.image_encoder.img_size}." + self.reset_image() + + self.original_size = original_image_size + self.input_size = tuple(transformed_image.shape[-2:]) + input_image = self.model.preprocess(transformed_image) + self.features, self.interm_features = self.model.image_encoder(input_image) + self.is_image_set = True + + def predict( + self, + point_coords: Optional[np.ndarray] = None, + point_labels: Optional[np.ndarray] = None, + box: Optional[np.ndarray] = None, + mask_input: Optional[np.ndarray] = None, + multimask_output: bool = True, + return_logits: bool = False, + hq_token_only: bool = False, + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Predict masks for the given input prompts, using the currently set image. + + Arguments: + point_coords (np.ndarray or None): A Nx2 array of point prompts to the + model. Each point is in (X,Y) in pixels. + point_labels (np.ndarray or None): A length N array of labels for the + point prompts. 1 indicates a foreground point and 0 indicates a + background point. + box (np.ndarray or None): A length 4 array given a box prompt to the + model, in XYXY format. + mask_input (np.ndarray): A low resolution mask input to the model, typically + coming from a previous prediction iteration. Has form 1xHxW, where + for SAM, H=W=256. + multimask_output (bool): If true, the model will return three masks. + For ambiguous input prompts (such as a single click), this will often + produce better masks than a single prediction. If only a single + mask is needed, the model's predicted quality score can be used + to select the best mask. For non-ambiguous prompts, such as multiple + input prompts, multimask_output=False can give better results. + return_logits (bool): If true, returns un-thresholded masks logits + instead of a binary mask. + + Returns: + (np.ndarray): The output masks in CxHxW format, where C is the + number of masks, and (H, W) is the original image size. + (np.ndarray): An array of length C containing the model's + predictions for the quality of each mask. + (np.ndarray): An array of shape CxHxW, where C is the number + of masks and H=W=256. These low resolution logits can be passed to + a subsequent iteration as mask input. 
+ """ + if not self.is_image_set: + raise RuntimeError( + "An image must be set with .set_image(...) before mask prediction." + ) + + # Transform input prompts + coords_torch, labels_torch, box_torch, mask_input_torch = None, None, None, None + if point_coords is not None: + assert ( + point_labels is not None + ), "point_labels must be supplied if point_coords is supplied." + point_coords = self.transform.apply_coords(point_coords, self.original_size) + coords_torch = torch.as_tensor( + point_coords, dtype=torch.float, device=self.device + ) + labels_torch = torch.as_tensor( + point_labels, dtype=torch.int, device=self.device + ) + coords_torch, labels_torch = coords_torch[None, :, :], labels_torch[None, :] + if box is not None: + box = self.transform.apply_boxes(box, self.original_size) + box_torch = torch.as_tensor(box, dtype=torch.float, device=self.device) + box_torch = box_torch[None, :] + if mask_input is not None: + mask_input_torch = torch.as_tensor( + mask_input, dtype=torch.float, device=self.device + ) + mask_input_torch = mask_input_torch[None, :, :, :] + + masks, iou_predictions, low_res_masks = self.predict_torch( + coords_torch, + labels_torch, + box_torch, + mask_input_torch, + multimask_output, + return_logits=return_logits, + hq_token_only=hq_token_only, + ) + + masks_np = masks[0].detach().cpu().numpy() + iou_predictions_np = iou_predictions[0].detach().cpu().numpy() + low_res_masks_np = low_res_masks[0].detach().cpu().numpy() + return masks_np, iou_predictions_np, low_res_masks_np + + @torch.no_grad() + def predict_torch( + self, + point_coords: Optional[torch.Tensor], + point_labels: Optional[torch.Tensor], + boxes: Optional[torch.Tensor] = None, + mask_input: Optional[torch.Tensor] = None, + multimask_output: bool = True, + return_logits: bool = False, + hq_token_only: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Predict masks for the given input prompts, using the currently set image. + Input prompts are batched torch tensors and are expected to already be + transformed to the input frame using ResizeLongestSide. + + Arguments: + point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the + model. Each point is in (X,Y) in pixels. + point_labels (torch.Tensor or None): A BxN array of labels for the + point prompts. 1 indicates a foreground point and 0 indicates a + background point. + boxes (np.ndarray or None): A Bx4 array given a box prompt to the + model, in XYXY format. + mask_input (np.ndarray): A low resolution mask input to the model, typically + coming from a previous prediction iteration. Has form Bx1xHxW, where + for SAM, H=W=256. Masks returned by a previous iteration of the + predict method do not need further transformation. + multimask_output (bool): If true, the model will return three masks. + For ambiguous input prompts (such as a single click), this will often + produce better masks than a single prediction. If only a single + mask is needed, the model's predicted quality score can be used + to select the best mask. For non-ambiguous prompts, such as multiple + input prompts, multimask_output=False can give better results. + return_logits (bool): If true, returns un-thresholded masks logits + instead of a binary mask. + + Returns: + (torch.Tensor): The output masks in BxCxHxW format, where C is the + number of masks, and (H, W) is the original image size. + (torch.Tensor): An array of shape BxC containing the model's + predictions for the quality of each mask. 
+ (torch.Tensor): An array of shape BxCxHxW, where C is the number + of masks and H=W=256. These low res logits can be passed to + a subsequent iteration as mask input. + """ + if not self.is_image_set: + raise RuntimeError( + "An image must be set with .set_image(...) before mask prediction." + ) + + if point_coords is not None: + points = (point_coords, point_labels) + else: + points = None + + # Embed prompts + sparse_embeddings, dense_embeddings = self.model.prompt_encoder( + points=points, + boxes=boxes, + masks=mask_input, + ) + + # Predict masks + low_res_masks, iou_predictions = self.model.mask_decoder( + image_embeddings=self.features, + image_pe=self.model.prompt_encoder.get_dense_pe(), + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + hq_token_only=hq_token_only, + interm_embeddings=self.interm_features, + ) + + # Upscale the masks to the original image resolution + masks = self.model.postprocess_masks( + low_res_masks, self.input_size, self.original_size + ) + + if not return_logits: + masks = masks > self.model.mask_threshold + + return masks, iou_predictions, low_res_masks + + def get_image_embedding(self) -> torch.Tensor: + """ + Returns the image embeddings for the currently set image, with + shape 1xCxHxW, where C is the embedding dimension and (H,W) are + the embedding spatial dimension of SAM (typically C=256, H=W=64). + """ + if not self.is_image_set: + raise RuntimeError( + "An image must be set with .set_image(...) to generate an embedding." + ) + assert ( + self.features is not None + ), "Features must exist if an image has been set." + return self.features + + @property + def device(self) -> torch.device: + return self.model.device + + def reset_image(self) -> None: + """Resets the currently set image.""" + self.is_image_set = False + self.features = None + self.orig_h = None + self.orig_w = None + self.input_h = None + self.input_w = None diff --git a/inpaint/plugins/segment_anything/utils/__init__.py b/inpaint/plugins/segment_anything/utils/__init__.py new file mode 100644 index 0000000..5277f46 --- /dev/null +++ b/inpaint/plugins/segment_anything/utils/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/inpaint/plugins/segment_anything/utils/transforms.py b/inpaint/plugins/segment_anything/utils/transforms.py new file mode 100644 index 0000000..90f50ed --- /dev/null +++ b/inpaint/plugins/segment_anything/utils/transforms.py @@ -0,0 +1,112 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import numpy as np +import torch +from torch.nn import functional as F +from torchvision.transforms.functional import resize, to_pil_image # type: ignore + +from copy import deepcopy +from typing import Tuple + + +class ResizeLongestSide: + """ + Resizes images to longest side 'target_length', as well as provides + methods for resizing coordinates and boxes. Provides methods for + transforming both numpy array and batched torch tensors. + """ + + def __init__(self, target_length: int) -> None: + self.target_length = target_length + + def apply_image(self, image: np.ndarray) -> np.ndarray: + """ + Expects a numpy array with shape HxWxC in uint8 format. 
+ """ + target_size = self.get_preprocess_shape( + image.shape[0], image.shape[1], self.target_length + ) + return np.array(resize(to_pil_image(image), target_size)) + + def apply_coords( + self, coords: np.ndarray, original_size: Tuple[int, ...] + ) -> np.ndarray: + """ + Expects a numpy array of length 2 in the final dimension. Requires the + original image size in (H, W) format. + """ + old_h, old_w = original_size + new_h, new_w = self.get_preprocess_shape( + original_size[0], original_size[1], self.target_length + ) + coords = deepcopy(coords).astype(float) + coords[..., 0] = coords[..., 0] * (new_w / old_w) + coords[..., 1] = coords[..., 1] * (new_h / old_h) + return coords + + def apply_boxes( + self, boxes: np.ndarray, original_size: Tuple[int, ...] + ) -> np.ndarray: + """ + Expects a numpy array shape Bx4. Requires the original image size + in (H, W) format. + """ + boxes = self.apply_coords(boxes.reshape(-1, 2, 2), original_size) + return boxes.reshape(-1, 4) + + def apply_image_torch(self, image: torch.Tensor) -> torch.Tensor: + """ + Expects batched images with shape BxCxHxW and float format. This + transformation may not exactly match apply_image. apply_image is + the transformation expected by the model. + """ + # Expects an image in BCHW format. May not exactly match apply_image. + target_size = self.get_preprocess_shape( + image.shape[0], image.shape[1], self.target_length + ) + return F.interpolate( + image, target_size, mode="bilinear", align_corners=False, antialias=True + ) + + def apply_coords_torch( + self, coords: torch.Tensor, original_size: Tuple[int, ...] + ) -> torch.Tensor: + """ + Expects a torch tensor with length 2 in the last dimension. Requires the + original image size in (H, W) format. + """ + old_h, old_w = original_size + new_h, new_w = self.get_preprocess_shape( + original_size[0], original_size[1], self.target_length + ) + coords = deepcopy(coords).to(torch.float) + coords[..., 0] = coords[..., 0] * (new_w / old_w) + coords[..., 1] = coords[..., 1] * (new_h / old_h) + return coords + + def apply_boxes_torch( + self, boxes: torch.Tensor, original_size: Tuple[int, ...] + ) -> torch.Tensor: + """ + Expects a torch tensor with shape Bx4. Requires the original image + size in (H, W) format. + """ + boxes = self.apply_coords_torch(boxes.reshape(-1, 2, 2), original_size) + return boxes.reshape(-1, 4) + + @staticmethod + def get_preprocess_shape( + oldh: int, oldw: int, long_side_length: int + ) -> Tuple[int, int]: + """ + Compute the output size given input size and target long side length. + """ + scale = long_side_length * 1.0 / max(oldh, oldw) + newh, neww = oldh * scale, oldw * scale + neww = int(neww + 0.5) + newh = int(newh + 0.5) + return (newh, neww) diff --git a/inpaint/plugins/segment_anything2/__init__.py b/inpaint/plugins/segment_anything2/__init__.py new file mode 100644 index 0000000..5277f46 --- /dev/null +++ b/inpaint/plugins/segment_anything2/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/inpaint/plugins/segment_anything2/build_sam.py b/inpaint/plugins/segment_anything2/build_sam.py new file mode 100644 index 0000000..5100f70 --- /dev/null +++ b/inpaint/plugins/segment_anything2/build_sam.py @@ -0,0 +1,262 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+ +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import logging + +import torch +from pathlib import Path + +from .modeling.backbones.hieradet import Hiera +from .modeling.backbones.image_encoder import ImageEncoder, FpnNeck +from .modeling.memory_attention import MemoryAttention, MemoryAttentionLayer +from .modeling.memory_encoder import MemoryEncoder, MaskDownSampler, Fuser, CXBlock +from .modeling.position_encoding import PositionEmbeddingSine +from .modeling.sam.transformer import RoPEAttention +from .modeling.sam2_base import SAM2Base + +CURRENT_DIR = Path(__file__).parent +CONFIG_DIR = CURRENT_DIR / "sam2_configs" + +common_kwargs = dict( + num_maskmem=7, + image_size=1024, + sigmoid_scale_for_mem_enc=20.0, + sigmoid_bias_for_mem_enc=-10.0, + use_mask_input_as_output_without_sam=True, + directly_add_no_mem_embed=True, + use_high_res_features_in_sam=True, + multimask_output_in_sam=True, + iou_prediction_use_sigmoid=True, + use_obj_ptrs_in_encoder=True, + add_tpos_enc_to_obj_ptrs=False, + only_obj_ptrs_in_the_past_for_eval=True, + pred_obj_scores=True, + pred_obj_scores_mlp=True, + fixed_no_obj_ptr=True, + multimask_output_for_tracking=True, + use_multimask_token_for_obj_ptr=True, + multimask_min_pt_num=0, + multimask_max_pt_num=1, + use_mlp_for_obj_ptr_proj=True, + compile_image_encoder=False, +) + + +def build_memory_attention(): + return MemoryAttention( + d_model=256, + pos_enc_at_input=True, + layer=MemoryAttentionLayer( + activation="relu", + dim_feedforward=2048, + dropout=0.1, + pos_enc_at_attn=False, + self_attention=RoPEAttention( + rope_theta=10000.0, + feat_sizes=[32, 32], + embedding_dim=256, + num_heads=1, + downsample_rate=1, + dropout=0.1, + ), + d_model=256, + pos_enc_at_cross_attn_keys=True, + pos_enc_at_cross_attn_queries=False, + cross_attention=RoPEAttention( + rope_theta=10000.0, + feat_sizes=[32, 32], + embedding_dim=256, + num_heads=1, + downsample_rate=1, + dropout=0.1, + kv_in_dim=64, + ), + ), + num_layers=4, + ) + + +def build_memory_encoder(): + return MemoryEncoder( + out_dim=64, + position_encoding=PositionEmbeddingSine( + num_pos_feats=64, normalize=True, scale=None, temperature=10000 + ), + mask_downsampler=MaskDownSampler( + kernel_size=3, + stride=2, + padding=1, + ), + fuser=Fuser( + layer=CXBlock( + dim=256, + kernel_size=7, + padding=3, + layer_scale_init_value=1e-6, + use_dwconv=True, + ), + num_layers=2, + ), + ) + + +def build_sam2_tiny(): + return SAM2Base( + **common_kwargs, + image_encoder=ImageEncoder( + scalp=1, + trunk=Hiera( + embed_dim=96, + num_heads=1, + stages=(1, 2, 7, 2), + global_att_blocks=(5, 7, 9), + window_pos_embed_bkg_spatial_size=(7, 7), + window_spec=(8, 4, 14, 7), + ), + neck=FpnNeck( + position_encoding=PositionEmbeddingSine( + num_pos_feats=256, + normalize=True, + scale=None, + temperature=10000, + ), + d_model=256, + backbone_channel_list=[768, 384, 192, 96], + fpn_top_down_levels=[2, 3], + fpn_interp_model="nearest", + ), + ), + memory_attention=build_memory_attention(), + memory_encoder=build_memory_encoder(), + ) + + +def build_sam2_small(): + return SAM2Base( + **common_kwargs, + image_encoder=ImageEncoder( + scalp=1, + trunk=Hiera( + embed_dim=96, + num_heads=1, + stages=(1, 2, 11, 2), + global_att_blocks=(7, 10, 13), + window_pos_embed_bkg_spatial_size=(7, 7), + window_spec=(8, 4, 14, 7), + ), + neck=FpnNeck( + position_encoding=PositionEmbeddingSine( + num_pos_feats=256, + normalize=True, + scale=None, + temperature=10000, + ), + 
d_model=256, + backbone_channel_list=[768, 384, 192, 96], + fpn_top_down_levels=[2, 3], + fpn_interp_model="nearest", + ), + ), + memory_attention=build_memory_attention(), + memory_encoder=build_memory_encoder(), + ) + + +def build_sam2_base(): + return SAM2Base( + **common_kwargs, + image_encoder=ImageEncoder( + scalp=1, + trunk=Hiera( + embed_dim=112, + num_heads=2, + stages=(2, 3, 16, 3), + global_att_blocks=(12, 16, 20), + window_pos_embed_bkg_spatial_size=(14, 14), + window_spec=(8, 4, 14, 7), + ), + neck=FpnNeck( + position_encoding=PositionEmbeddingSine( + num_pos_feats=256, + normalize=True, + scale=None, + temperature=10000, + ), + d_model=256, + backbone_channel_list=[896, 448, 224, 112], + fpn_top_down_levels=[2, 3], + fpn_interp_model="nearest", + ), + ), + memory_attention=build_memory_attention(), + memory_encoder=build_memory_encoder(), + ) + + +def build_sam2_large(): + return SAM2Base( + **common_kwargs, + image_encoder=ImageEncoder( + scalp=1, + trunk=Hiera( + embed_dim=144, + num_heads=2, + stages=(2, 6, 36, 4), + global_att_blocks=(23, 33, 43), + window_pos_embed_bkg_spatial_size=(7, 7), + window_spec=(8, 4, 16, 8), + ), + neck=FpnNeck( + position_encoding=PositionEmbeddingSine( + num_pos_feats=256, + normalize=True, + scale=None, + temperature=10000, + ), + d_model=256, + backbone_channel_list=[1152, 576, 288, 144], + fpn_top_down_levels=[2, 3], + fpn_interp_model="nearest", + ), + ), + memory_attention=build_memory_attention(), + memory_encoder=build_memory_encoder(), + ) + + +sam2_model_registry = { + "sam2_tiny": build_sam2_tiny, + "sam2_small": build_sam2_small, + "sam2_base": build_sam2_base, + "sam2_large": build_sam2_large, +} + + +def build_sam2( + name, + ckpt_path=None, + device="cuda", + mode="eval", +): + model = sam2_model_registry[name]() + _load_checkpoint(model, ckpt_path) + model = model.to(device) + if mode == "eval": + model.eval() + return model + + +def _load_checkpoint(model, ckpt_path): + if ckpt_path is not None: + sd = torch.load(ckpt_path, map_location="cpu")["model"] + missing_keys, unexpected_keys = model.load_state_dict(sd) + if missing_keys: + logging.error(missing_keys) + raise RuntimeError() + if unexpected_keys: + logging.error(unexpected_keys) + raise RuntimeError() + logging.info("Loaded checkpoint sucessfully") diff --git a/inpaint/plugins/segment_anything2/modeling/__init__.py b/inpaint/plugins/segment_anything2/modeling/__init__.py new file mode 100644 index 0000000..5277f46 --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/inpaint/plugins/segment_anything2/modeling/backbones/__init__.py b/inpaint/plugins/segment_anything2/modeling/backbones/__init__.py new file mode 100644 index 0000000..5277f46 --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/backbones/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
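A minimal usage sketch for the builder above, assuming a locally downloaded SAM 2 checkpoint (the path below is hypothetical; `_load_checkpoint` reads the weights from the checkpoint's "model" key):

import torch
from inpaint.plugins.segment_anything2.build_sam import build_sam2, sam2_model_registry

print(list(sam2_model_registry))  # ['sam2_tiny', 'sam2_small', 'sam2_base', 'sam2_large']

device = "cuda" if torch.cuda.is_available() else "cpu"
model = build_sam2(
    "sam2_large",
    ckpt_path="models/sam2_hiera_large.pt",  # hypothetical local checkpoint path
    device=device,
    mode="eval",  # default; switches the module to eval mode after loading
)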
diff --git a/inpaint/plugins/segment_anything2/modeling/backbones/hieradet.py b/inpaint/plugins/segment_anything2/modeling/backbones/hieradet.py new file mode 100644 index 0000000..9375b6a --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/backbones/hieradet.py @@ -0,0 +1,295 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from functools import partial +from typing import List, Tuple, Union + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from ..backbones.utils import ( + PatchEmbed, + window_partition, + window_unpartition, +) + +from ..sam2_utils import DropPath, MLP + + +def do_pool(x: torch.Tensor, pool: nn.Module, norm: nn.Module = None) -> torch.Tensor: + if pool is None: + return x + # (B, H, W, C) -> (B, C, H, W) + x = x.permute(0, 3, 1, 2) + x = pool(x) + # (B, C, H', W') -> (B, H', W', C) + x = x.permute(0, 2, 3, 1) + if norm: + x = norm(x) + + return x + + +class MultiScaleAttention(nn.Module): + def __init__( + self, + dim: int, + dim_out: int, + num_heads: int, + q_pool: nn.Module = None, + ): + super().__init__() + + self.dim = dim + self.dim_out = dim_out + + self.num_heads = num_heads + head_dim = dim_out // num_heads + self.scale = head_dim**-0.5 + + self.q_pool = q_pool + self.qkv = nn.Linear(dim, dim_out * 3) + self.proj = nn.Linear(dim_out, dim_out) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + B, H, W, _ = x.shape + # qkv with shape (B, H * W, 3, nHead, C) + qkv = self.qkv(x).reshape(B, H * W, 3, self.num_heads, -1) + # q, k, v with shape (B, H * W, nheads, C) + q, k, v = torch.unbind(qkv, 2) + + # Q pooling (for downsample at stage changes) + if self.q_pool: + q = do_pool(q.reshape(B, H, W, -1), self.q_pool) + H, W = q.shape[1:3] # downsampled shape + q = q.reshape(B, H * W, self.num_heads, -1) + + # Torch's SDPA expects [B, nheads, H*W, C] so we transpose + x = F.scaled_dot_product_attention( + q.transpose(1, 2), + k.transpose(1, 2), + v.transpose(1, 2), + ) + # Transpose back + x = x.transpose(1, 2) + x = x.reshape(B, H, W, -1) + + x = self.proj(x) + + return x + + +class MultiScaleBlock(nn.Module): + def __init__( + self, + dim: int, + dim_out: int, + num_heads: int, + mlp_ratio: float = 4.0, + drop_path: float = 0.0, + norm_layer: Union[nn.Module, str] = "LayerNorm", + q_stride: Tuple[int, int] = None, + act_layer: nn.Module = nn.GELU, + window_size: int = 0, + ): + super().__init__() + + if isinstance(norm_layer, str): + norm_layer = partial(getattr(nn, norm_layer), eps=1e-6) + + self.dim = dim + self.dim_out = dim_out + self.norm1 = norm_layer(dim) + + self.window_size = window_size + + self.pool, self.q_stride = None, q_stride + if self.q_stride: + self.pool = nn.MaxPool2d( + kernel_size=q_stride, stride=q_stride, ceil_mode=False + ) + + self.attn = MultiScaleAttention( + dim, + dim_out, + num_heads=num_heads, + q_pool=self.pool, + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + self.norm2 = norm_layer(dim_out) + self.mlp = MLP( + dim_out, + int(dim_out * mlp_ratio), + dim_out, + num_layers=2, + activation=act_layer, + ) + + if dim != dim_out: + self.proj = nn.Linear(dim, dim_out) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + shortcut = x # B, H, W, C + x = self.norm1(x) + + # Skip connection + if self.dim != self.dim_out: + shortcut = do_pool(self.proj(x), self.pool) + + # Window partition + window_size = 
self.window_size + if window_size > 0: + H, W = x.shape[1], x.shape[2] + x, pad_hw = window_partition(x, window_size) + + # Window Attention + Q Pooling (if stage change) + x = self.attn(x) + if self.q_stride: + # Shapes have changed due to Q pooling + window_size = self.window_size // self.q_stride[0] + H, W = shortcut.shape[1:3] + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + pad_hw = (H + pad_h, W + pad_w) + + # Reverse window partition + if self.window_size > 0: + x = window_unpartition(x, window_size, pad_hw, (H, W)) + + x = shortcut + self.drop_path(x) + # MLP + x = x + self.drop_path(self.mlp(self.norm2(x))) + return x + + +class Hiera(nn.Module): + """ + Reference: https://arxiv.org/abs/2306.00989 + """ + + def __init__( + self, + embed_dim: int = 96, # initial embed dim + num_heads: int = 1, # initial number of heads + drop_path_rate: float = 0.0, # stochastic depth + q_pool: int = 3, # number of q_pool stages + q_stride: Tuple[int, int] = (2, 2), # downsample stride bet. stages + stages: Tuple[int, ...] = (2, 3, 16, 3), # blocks per stage + dim_mul: float = 2.0, # dim_mul factor at stage shift + head_mul: float = 2.0, # head_mul factor at stage shift + window_pos_embed_bkg_spatial_size: Tuple[int, int] = (14, 14), + # window size per stage, when not using global att. + window_spec: Tuple[int, ...] = ( + 8, + 4, + 14, + 7, + ), + # global attn in these blocks + global_att_blocks: Tuple[int, ...] = ( + 12, + 16, + 20, + ), + return_interm_layers=True, # return feats from every stage + ): + super().__init__() + + assert len(stages) == len(window_spec) + self.window_spec = window_spec + + depth = sum(stages) + self.q_stride = q_stride + self.stage_ends = [sum(stages[:i]) - 1 for i in range(1, len(stages) + 1)] + assert 0 <= q_pool <= len(self.stage_ends[:-1]) + self.q_pool_blocks = [x + 1 for x in self.stage_ends[:-1]][:q_pool] + self.return_interm_layers = return_interm_layers + + self.patch_embed = PatchEmbed( + embed_dim=embed_dim, + ) + # Which blocks have global att? 
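+ # (block indices are counted across all stages; any block listed here is
+ # assigned window_size 0 in the loop below, i.e. it attends globally
+ # instead of over local windows)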
+ self.global_att_blocks = global_att_blocks + + # Windowed positional embedding (https://arxiv.org/abs/2311.05613) + self.window_pos_embed_bkg_spatial_size = window_pos_embed_bkg_spatial_size + self.pos_embed = nn.Parameter( + torch.zeros(1, embed_dim, *self.window_pos_embed_bkg_spatial_size) + ) + self.pos_embed_window = nn.Parameter( + torch.zeros(1, embed_dim, self.window_spec[0], self.window_spec[0]) + ) + + dpr = [ + x.item() for x in torch.linspace(0, drop_path_rate, depth) + ] # stochastic depth decay rule + + cur_stage = 1 + self.blocks = nn.ModuleList() + + for i in range(depth): + dim_out = embed_dim + # lags by a block, so first block of + # next stage uses an initial window size + # of previous stage and final window size of current stage + window_size = self.window_spec[cur_stage - 1] + + if self.global_att_blocks is not None: + window_size = 0 if i in self.global_att_blocks else window_size + + if i - 1 in self.stage_ends: + dim_out = int(embed_dim * dim_mul) + num_heads = int(num_heads * head_mul) + cur_stage += 1 + + block = MultiScaleBlock( + dim=embed_dim, + dim_out=dim_out, + num_heads=num_heads, + drop_path=dpr[i], + q_stride=self.q_stride if i in self.q_pool_blocks else None, + window_size=window_size, + ) + + embed_dim = dim_out + self.blocks.append(block) + + self.channel_list = ( + [self.blocks[i].dim_out for i in self.stage_ends[::-1]] + if return_interm_layers + else [self.blocks[-1].dim_out] + ) + + def _get_pos_embed(self, hw: Tuple[int, int]) -> torch.Tensor: + h, w = hw + window_embed = self.pos_embed_window + pos_embed = F.interpolate(self.pos_embed, size=(h, w), mode="bicubic") + pos_embed = pos_embed + window_embed.tile( + [x // y for x, y in zip(pos_embed.shape, window_embed.shape)] + ) + pos_embed = pos_embed.permute(0, 2, 3, 1) + return pos_embed + + def forward(self, x: torch.Tensor) -> List[torch.Tensor]: + x = self.patch_embed(x) + # x: (B, H, W, C) + + # Add pos embed + x = x + self._get_pos_embed(x.shape[1:3]) + + outputs = [] + for i, blk in enumerate(self.blocks): + x = blk(x) + if (i == self.stage_ends[-1]) or ( + i in self.stage_ends and self.return_interm_layers + ): + feats = x.permute(0, 3, 1, 2) + outputs.append(feats) + + return outputs diff --git a/inpaint/plugins/segment_anything2/modeling/backbones/image_encoder.py b/inpaint/plugins/segment_anything2/modeling/backbones/image_encoder.py new file mode 100644 index 0000000..5f92baf --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/backbones/image_encoder.py @@ -0,0 +1,133 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Optional + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +class ImageEncoder(nn.Module): + def __init__( + self, + trunk: nn.Module, + neck: nn.Module, + scalp: int = 0, + ): + super().__init__() + self.trunk = trunk + self.neck = neck + self.scalp = scalp + assert ( + self.trunk.channel_list == self.neck.backbone_channel_list + ), f"Channel dims of trunk and neck do not match. 
Trunk: {self.trunk.channel_list}, neck: {self.neck.backbone_channel_list}" + + def forward(self, sample: torch.Tensor): + # Forward through backbone + features, pos = self.neck(self.trunk(sample)) + if self.scalp > 0: + # Discard the lowest resolution features + features, pos = features[: -self.scalp], pos[: -self.scalp] + + src = features[-1] + output = { + "vision_features": src, + "vision_pos_enc": pos, + "backbone_fpn": features, + } + return output + + +class FpnNeck(nn.Module): + """ + A modified variant of Feature Pyramid Network (FPN) neck + (we remove output conv and also do bicubic interpolation similar to ViT + pos embed interpolation) + """ + + def __init__( + self, + position_encoding: nn.Module, + d_model: int, + backbone_channel_list: List[int], + kernel_size: int = 1, + stride: int = 1, + padding: int = 0, + fpn_interp_model: str = "bilinear", + fuse_type: str = "sum", + fpn_top_down_levels: Optional[List[int]] = None, + ): + """Initialize the neck + :param trunk: the backbone + :param position_encoding: the positional encoding to use + :param d_model: the dimension of the model + :param neck_norm: the normalization to use + """ + super().__init__() + self.position_encoding = position_encoding + self.convs = nn.ModuleList() + self.backbone_channel_list = backbone_channel_list + for dim in backbone_channel_list: + current = nn.Sequential() + current.add_module( + "conv", + nn.Conv2d( + in_channels=dim, + out_channels=d_model, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ), + ) + + self.convs.append(current) + self.fpn_interp_model = fpn_interp_model + assert fuse_type in ["sum", "avg"] + self.fuse_type = fuse_type + + # levels to have top-down features in its outputs + # e.g. if fpn_top_down_levels is [2, 3], then only outputs of level 2 and 3 + # have top-down propagation, while outputs of level 0 and level 1 have only + # lateral features from the same backbone level. + if fpn_top_down_levels is None: + # default is to have top-down features on all levels + fpn_top_down_levels = range(len(self.convs)) + self.fpn_top_down_levels = list(fpn_top_down_levels) + + def forward(self, xs: List[torch.Tensor]): + + out = [None] * len(self.convs) + pos = [None] * len(self.convs) + assert len(xs) == len(self.convs) + # fpn forward pass + # see https://github.com/facebookresearch/detectron2/blob/main/detectron2/modeling/backbone/fpn.py + prev_features = None + # forward in top-down order (from low to high resolution) + n = len(self.convs) - 1 + for i in range(n, -1, -1): + x = xs[i] + lateral_features = self.convs[n - i](x) + if i in self.fpn_top_down_levels and prev_features is not None: + top_down_features = F.interpolate( + prev_features.to(dtype=torch.float32), + scale_factor=2.0, + mode=self.fpn_interp_model, + align_corners=( + None if self.fpn_interp_model == "nearest" else False + ), + antialias=False, + ) + prev_features = lateral_features + top_down_features + if self.fuse_type == "avg": + prev_features /= 2 + else: + prev_features = lateral_features + x_out = prev_features + out[i] = x_out + pos[i] = self.position_encoding(x_out).to(x_out.dtype) + + return out, pos diff --git a/inpaint/plugins/segment_anything2/modeling/backbones/utils.py b/inpaint/plugins/segment_anything2/modeling/backbones/utils.py new file mode 100644 index 0000000..32d55c7 --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/backbones/utils.py @@ -0,0 +1,95 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+ +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +"""Some utilities for backbones, in particular for windowing""" + +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def window_partition(x, window_size): + """ + Partition into non-overlapping windows with padding if needed. + Args: + x (tensor): input tokens with [B, H, W, C]. + window_size (int): window size. + Returns: + windows: windows after partition with [B * num_windows, window_size, window_size, C]. + (Hp, Wp): padded height and width before partition + """ + B, H, W, C = x.shape + + pad_h = (window_size - H % window_size) % window_size + pad_w = (window_size - W % window_size) % window_size + if pad_h > 0 or pad_w > 0: + x = F.pad(x, (0, 0, 0, pad_w, 0, pad_h)) + Hp, Wp = H + pad_h, W + pad_w + + x = x.view(B, Hp // window_size, window_size, Wp // window_size, window_size, C) + windows = ( + x.permute(0, 1, 3, 2, 4, 5).contiguous().view(-1, window_size, window_size, C) + ) + return windows, (Hp, Wp) + + +def window_unpartition(windows, window_size, pad_hw, hw): + """ + Window unpartition into original sequences and removing padding. + Args: + x (tensor): input tokens with [B * num_windows, window_size, window_size, C]. + window_size (int): window size. + pad_hw (Tuple): padded height and width (Hp, Wp). + hw (Tuple): original height and width (H, W) before padding. + Returns: + x: unpartitioned sequences with [B, H, W, C]. + """ + Hp, Wp = pad_hw + H, W = hw + B = windows.shape[0] // (Hp * Wp // window_size // window_size) + x = windows.view( + B, Hp // window_size, Wp // window_size, window_size, window_size, -1 + ) + x = x.permute(0, 1, 3, 2, 4, 5).contiguous().view(B, Hp, Wp, -1) + + if Hp > H or Wp > W: + x = x[:, :H, :W, :].contiguous() + return x + + +class PatchEmbed(nn.Module): + """ + Image to Patch Embedding. + """ + + def __init__( + self, + kernel_size: Tuple[int, ...] = (7, 7), + stride: Tuple[int, ...] = (4, 4), + padding: Tuple[int, ...] = (3, 3), + in_chans: int = 3, + embed_dim: int = 768, + ): + """ + Args: + kernel_size (Tuple): kernel size of the projection layer. + stride (Tuple): stride of the projection layer. + padding (Tuple): padding size of the projection layer. + in_chans (int): Number of input image channels. + embed_dim (int): embed_dim (int): Patch embedding dimension. + """ + super().__init__() + self.proj = nn.Conv2d( + in_chans, embed_dim, kernel_size=kernel_size, stride=stride, padding=padding + ) + + def forward(self, x: torch.Tensor) -> torch.Tensor: + x = self.proj(x) + # B C H W -> B H W C + x = x.permute(0, 2, 3, 1) + return x diff --git a/inpaint/plugins/segment_anything2/modeling/memory_attention.py b/inpaint/plugins/segment_anything2/modeling/memory_attention.py new file mode 100644 index 0000000..8a14327 --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/memory_attention.py @@ -0,0 +1,169 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
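The padding helpers in backbones/utils.py above round-trip cleanly; a small shape-check sketch (tensor sizes are arbitrary):

import torch
from inpaint.plugins.segment_anything2.modeling.backbones.utils import (
    window_partition,
    window_unpartition,
)

x = torch.randn(2, 50, 70, 96)  # (B, H, W, C); 50 and 70 are not multiples of 8
windows, (Hp, Wp) = window_partition(x, window_size=8)  # pads up to Hp=56, Wp=72
assert windows.shape == (2 * (56 // 8) * (72 // 8), 8, 8, 96)
restored = window_unpartition(windows, 8, (Hp, Wp), (50, 70))  # padding cropped off
assert restored.shape == x.shape and torch.allclose(restored, x)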
+ +from typing import Optional + +import torch +from torch import nn, Tensor + +from .sam.transformer import RoPEAttention + +from .sam2_utils import get_activation_fn, get_clones + + +class MemoryAttentionLayer(nn.Module): + + def __init__( + self, + activation: str, + cross_attention: nn.Module, + d_model: int, + dim_feedforward: int, + dropout: float, + pos_enc_at_attn: bool, + pos_enc_at_cross_attn_keys: bool, + pos_enc_at_cross_attn_queries: bool, + self_attention: nn.Module, + ): + super().__init__() + self.d_model = d_model + self.dim_feedforward = dim_feedforward + self.dropout_value = dropout + self.self_attn = self_attention + self.cross_attn_image = cross_attention + + # Implementation of Feedforward model + self.linear1 = nn.Linear(d_model, dim_feedforward) + self.dropout = nn.Dropout(dropout) + self.linear2 = nn.Linear(dim_feedforward, d_model) + + self.norm1 = nn.LayerNorm(d_model) + self.norm2 = nn.LayerNorm(d_model) + self.norm3 = nn.LayerNorm(d_model) + self.dropout1 = nn.Dropout(dropout) + self.dropout2 = nn.Dropout(dropout) + self.dropout3 = nn.Dropout(dropout) + + self.activation_str = activation + self.activation = get_activation_fn(activation) + + # Where to add pos enc + self.pos_enc_at_attn = pos_enc_at_attn + self.pos_enc_at_cross_attn_queries = pos_enc_at_cross_attn_queries + self.pos_enc_at_cross_attn_keys = pos_enc_at_cross_attn_keys + + def _forward_sa(self, tgt, query_pos): + # Self-Attention + tgt2 = self.norm1(tgt) + q = k = tgt2 + query_pos if self.pos_enc_at_attn else tgt2 + tgt2 = self.self_attn(q, k, v=tgt2) + tgt = tgt + self.dropout1(tgt2) + return tgt + + def _forward_ca(self, tgt, memory, query_pos, pos, num_k_exclude_rope=0): + kwds = {} + if num_k_exclude_rope > 0: + assert isinstance(self.cross_attn_image, RoPEAttention) + kwds = {"num_k_exclude_rope": num_k_exclude_rope} + + # Cross-Attention + tgt2 = self.norm2(tgt) + tgt2 = self.cross_attn_image( + q=tgt2 + query_pos if self.pos_enc_at_cross_attn_queries else tgt2, + k=memory + pos if self.pos_enc_at_cross_attn_keys else memory, + v=memory, + **kwds, + ) + tgt = tgt + self.dropout2(tgt2) + return tgt + + def forward( + self, + tgt, + memory, + pos: Optional[Tensor] = None, + query_pos: Optional[Tensor] = None, + num_k_exclude_rope: int = 0, + ) -> torch.Tensor: + + # Self-Attn, Cross-Attn + tgt = self._forward_sa(tgt, query_pos) + tgt = self._forward_ca(tgt, memory, query_pos, pos, num_k_exclude_rope) + # MLP + tgt2 = self.norm3(tgt) + tgt2 = self.linear2(self.dropout(self.activation(self.linear1(tgt2)))) + tgt = tgt + self.dropout3(tgt2) + return tgt + + +class MemoryAttention(nn.Module): + def __init__( + self, + d_model: int, + pos_enc_at_input: bool, + layer: nn.Module, + num_layers: int, + batch_first: bool = True, # Do layers expect batch first input? 
+ ): + super().__init__() + self.d_model = d_model + self.layers = get_clones(layer, num_layers) + self.num_layers = num_layers + self.norm = nn.LayerNorm(d_model) + self.pos_enc_at_input = pos_enc_at_input + self.batch_first = batch_first + + def forward( + self, + curr: torch.Tensor, # self-attention inputs + memory: torch.Tensor, # cross-attention inputs + curr_pos: Optional[Tensor] = None, # pos_enc for self-attention inputs + memory_pos: Optional[Tensor] = None, # pos_enc for cross-attention inputs + num_obj_ptr_tokens: int = 0, # number of object pointer *tokens* + ): + if isinstance(curr, list): + assert isinstance(curr_pos, list) + assert len(curr) == len(curr_pos) == 1 + curr, curr_pos = ( + curr[0], + curr_pos[0], + ) + + assert ( + curr.shape[1] == memory.shape[1] + ), "Batch size must be the same for curr and memory" + + output = curr + if self.pos_enc_at_input and curr_pos is not None: + output = output + 0.1 * curr_pos + + if self.batch_first: + # Convert to batch first + output = output.transpose(0, 1) + curr_pos = curr_pos.transpose(0, 1) + memory = memory.transpose(0, 1) + memory_pos = memory_pos.transpose(0, 1) + + for layer in self.layers: + kwds = {} + if isinstance(layer.cross_attn_image, RoPEAttention): + kwds = {"num_k_exclude_rope": num_obj_ptr_tokens} + + output = layer( + tgt=output, + memory=memory, + pos=memory_pos, + query_pos=curr_pos, + **kwds, + ) + normed_output = self.norm(output) + + if self.batch_first: + # Convert back to seq first + normed_output = normed_output.transpose(0, 1) + curr_pos = curr_pos.transpose(0, 1) + + return normed_output diff --git a/inpaint/plugins/segment_anything2/modeling/memory_encoder.py b/inpaint/plugins/segment_anything2/modeling/memory_encoder.py new file mode 100644 index 0000000..14cb6e7 --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/memory_encoder.py @@ -0,0 +1,181 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import Tuple + +import torch +import torch.nn as nn +import torch.nn.functional as F + +from .sam2_utils import DropPath, get_clones, LayerNorm2d + + +class MaskDownSampler(nn.Module): + """ + Progressively downsample a mask by total_stride, each time by stride. + Note that LayerNorm is applied per *token*, like in ViT. + + With each downsample (by a factor stride**2), channel capacity increases by the same factor. + In the end, we linearly project to embed_dim channels. + """ + + def __init__( + self, + embed_dim=256, + kernel_size=4, + stride=4, + padding=0, + total_stride=16, + activation=nn.GELU, + ): + super().__init__() + num_layers = int(math.log2(total_stride) // math.log2(stride)) + assert stride**num_layers == total_stride + self.encoder = nn.Sequential() + mask_in_chans, mask_out_chans = 1, 1 + for _ in range(num_layers): + mask_out_chans = mask_in_chans * (stride**2) + self.encoder.append( + nn.Conv2d( + mask_in_chans, + mask_out_chans, + kernel_size=kernel_size, + stride=stride, + padding=padding, + ) + ) + self.encoder.append(LayerNorm2d(mask_out_chans)) + self.encoder.append(activation()) + mask_in_chans = mask_out_chans + + self.encoder.append(nn.Conv2d(mask_out_chans, embed_dim, kernel_size=1)) + + def forward(self, x): + return self.encoder(x) + + +# Lightly adapted from ConvNext (https://github.com/facebookresearch/ConvNeXt) +class CXBlock(nn.Module): + r"""ConvNeXt Block. 
There are two equivalent implementations: + (1) DwConv -> LayerNorm (channels_first) -> 1x1 Conv -> GELU -> 1x1 Conv; all in (N, C, H, W) + (2) DwConv -> Permute to (N, H, W, C); LayerNorm (channels_last) -> Linear -> GELU -> Linear; Permute back + We use (2) as we find it slightly faster in PyTorch + + Args: + dim (int): Number of input channels. + drop_path (float): Stochastic depth rate. Default: 0.0 + layer_scale_init_value (float): Init value for Layer Scale. Default: 1e-6. + """ + + def __init__( + self, + dim, + kernel_size=7, + padding=3, + drop_path=0.0, + layer_scale_init_value=1e-6, + use_dwconv=True, + ): + super().__init__() + self.dwconv = nn.Conv2d( + dim, + dim, + kernel_size=kernel_size, + padding=padding, + groups=dim if use_dwconv else 1, + ) # depthwise conv + self.norm = LayerNorm2d(dim, eps=1e-6) + self.pwconv1 = nn.Linear( + dim, 4 * dim + ) # pointwise/1x1 convs, implemented with linear layers + self.act = nn.GELU() + self.pwconv2 = nn.Linear(4 * dim, dim) + self.gamma = ( + nn.Parameter(layer_scale_init_value * torch.ones((dim)), requires_grad=True) + if layer_scale_init_value > 0 + else None + ) + self.drop_path = DropPath(drop_path) if drop_path > 0.0 else nn.Identity() + + def forward(self, x): + input = x + x = self.dwconv(x) + x = self.norm(x) + x = x.permute(0, 2, 3, 1) # (N, C, H, W) -> (N, H, W, C) + x = self.pwconv1(x) + x = self.act(x) + x = self.pwconv2(x) + if self.gamma is not None: + x = self.gamma * x + x = x.permute(0, 3, 1, 2) # (N, H, W, C) -> (N, C, H, W) + + x = input + self.drop_path(x) + return x + + +class Fuser(nn.Module): + def __init__(self, layer, num_layers, dim=None, input_projection=False): + super().__init__() + self.proj = nn.Identity() + self.layers = get_clones(layer, num_layers) + + if input_projection: + assert dim is not None + self.proj = nn.Conv2d(dim, dim, kernel_size=1) + + def forward(self, x): + # normally x: (N, C, H, W) + x = self.proj(x) + for layer in self.layers: + x = layer(x) + return x + + +class MemoryEncoder(nn.Module): + def __init__( + self, + out_dim, + mask_downsampler, + fuser, + position_encoding, + in_dim=256, # in_dim of pix_feats + ): + super().__init__() + + self.mask_downsampler = mask_downsampler + + self.pix_feat_proj = nn.Conv2d(in_dim, in_dim, kernel_size=1) + self.fuser = fuser + self.position_encoding = position_encoding + self.out_proj = nn.Identity() + if out_dim != in_dim: + self.out_proj = nn.Conv2d(in_dim, out_dim, kernel_size=1) + + def forward( + self, + pix_feat: torch.Tensor, + masks: torch.Tensor, + skip_mask_sigmoid: bool = False, + ) -> Tuple[torch.Tensor, torch.Tensor]: + ## Process masks + # sigmoid, so that less domain shift from gt masks which are bool + if not skip_mask_sigmoid: + masks = F.sigmoid(masks) + masks = self.mask_downsampler(masks) + + ## Fuse pix_feats and downsampled masks + # in case the visual features are on CPU, cast them to CUDA + pix_feat = pix_feat.to(masks.device) + + x = self.pix_feat_proj(pix_feat) + x = x + masks + x = self.fuser(x) + x = self.out_proj(x) + + pos = self.position_encoding(x).to(x.dtype) + + return {"vision_features": x, "vision_pos_enc": [pos]} diff --git a/inpaint/plugins/segment_anything2/modeling/position_encoding.py b/inpaint/plugins/segment_anything2/modeling/position_encoding.py new file mode 100644 index 0000000..f4b57ae --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/position_encoding.py @@ -0,0 +1,216 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
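For reference, the mask-downsampling branch configured in build_memory_encoder() uses stride-2 convolutions repeated until the default total_stride of 16 is reached; a small sketch using the 1024-pixel image_size set in common_kwargs:

import torch
from inpaint.plugins.segment_anything2.modeling.memory_encoder import MaskDownSampler

ds = MaskDownSampler(kernel_size=3, stride=2, padding=1)  # defaults: embed_dim=256, total_stride=16
mask_logits = torch.randn(1, 1, 1024, 1024)  # one mask at full image resolution
print(ds(mask_logits).shape)  # torch.Size([1, 256, 64, 64])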
+ +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math +from typing import Any, Optional, Tuple + +import numpy as np + +import torch +from torch import nn + + +class PositionEmbeddingSine(nn.Module): + """ + This is a more standard version of the position embedding, very similar to the one + used by the Attention is all you need paper, generalized to work on images. + """ + + def __init__( + self, + num_pos_feats, + temperature: int = 10000, + normalize: bool = True, + scale: Optional[float] = None, + ): + super().__init__() + assert num_pos_feats % 2 == 0, "Expecting even model width" + self.num_pos_feats = num_pos_feats // 2 + self.temperature = temperature + self.normalize = normalize + if scale is not None and normalize is False: + raise ValueError("normalize should be True if scale is passed") + if scale is None: + scale = 2 * math.pi + self.scale = scale + + self.cache = {} + + def _encode_xy(self, x, y): + # The positions are expected to be normalized + assert len(x) == len(y) and x.ndim == y.ndim == 1 + x_embed = x * self.scale + y_embed = y * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, None] / dim_t + pos_y = y_embed[:, None] / dim_t + pos_x = torch.stack( + (pos_x[:, 0::2].sin(), pos_x[:, 1::2].cos()), dim=2 + ).flatten(1) + pos_y = torch.stack( + (pos_y[:, 0::2].sin(), pos_y[:, 1::2].cos()), dim=2 + ).flatten(1) + return pos_x, pos_y + + @torch.no_grad() + def encode_boxes(self, x, y, w, h): + pos_x, pos_y = self._encode_xy(x, y) + pos = torch.cat((pos_y, pos_x, h[:, None], w[:, None]), dim=1) + return pos + + encode = encode_boxes # Backwards compatibility + + @torch.no_grad() + def encode_points(self, x, y, labels): + (bx, nx), (by, ny), (bl, nl) = x.shape, y.shape, labels.shape + assert bx == by and nx == ny and bx == bl and nx == nl + pos_x, pos_y = self._encode_xy(x.flatten(), y.flatten()) + pos_x, pos_y = pos_x.reshape(bx, nx, -1), pos_y.reshape(by, ny, -1) + pos = torch.cat((pos_y, pos_x, labels[:, :, None]), dim=2) + return pos + + @torch.no_grad() + def forward(self, x: torch.Tensor): + cache_key = (x.shape[-2], x.shape[-1]) + if cache_key in self.cache: + return self.cache[cache_key][None].repeat(x.shape[0], 1, 1, 1) + y_embed = ( + torch.arange(1, x.shape[-2] + 1, dtype=torch.float32, device=x.device) + .view(1, -1, 1) + .repeat(x.shape[0], 1, x.shape[-1]) + ) + x_embed = ( + torch.arange(1, x.shape[-1] + 1, dtype=torch.float32, device=x.device) + .view(1, 1, -1) + .repeat(x.shape[0], x.shape[-2], 1) + ) + + if self.normalize: + eps = 1e-6 + y_embed = y_embed / (y_embed[:, -1:, :] + eps) * self.scale + x_embed = x_embed / (x_embed[:, :, -1:] + eps) * self.scale + + dim_t = torch.arange(self.num_pos_feats, dtype=torch.float32, device=x.device) + dim_t = self.temperature ** (2 * (dim_t // 2) / self.num_pos_feats) + + pos_x = x_embed[:, :, :, None] / dim_t + pos_y = y_embed[:, :, :, None] / dim_t + pos_x = torch.stack( + (pos_x[:, :, :, 0::2].sin(), pos_x[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos_y = torch.stack( + (pos_y[:, :, :, 0::2].sin(), pos_y[:, :, :, 1::2].cos()), dim=4 + ).flatten(3) + pos = torch.cat((pos_y, pos_x), dim=3).permute(0, 3, 1, 2) + self.cache[cache_key] = pos[0] + return pos + + +class PositionEmbeddingRandom(nn.Module): + """ + Positional encoding using random spatial frequencies. 
+ """ + + def __init__(self, num_pos_feats: int = 64, scale: Optional[float] = None) -> None: + super().__init__() + if scale is None or scale <= 0.0: + scale = 1.0 + self.register_buffer( + "positional_encoding_gaussian_matrix", + scale * torch.randn((2, num_pos_feats)), + ) + + def _pe_encoding(self, coords: torch.Tensor) -> torch.Tensor: + """Positionally encode points that are normalized to [0,1].""" + # assuming coords are in [0, 1]^2 square and have d_1 x ... x d_n x 2 shape + coords = 2 * coords - 1 + coords = coords @ self.positional_encoding_gaussian_matrix + coords = 2 * np.pi * coords + # outputs d_1 x ... x d_n x C shape + return torch.cat([torch.sin(coords), torch.cos(coords)], dim=-1) + + def forward(self, size: Tuple[int, int]) -> torch.Tensor: + """Generate positional encoding for a grid of the specified size.""" + h, w = size + device: Any = self.positional_encoding_gaussian_matrix.device + grid = torch.ones((h, w), device=device, dtype=torch.float32) + y_embed = grid.cumsum(dim=0) - 0.5 + x_embed = grid.cumsum(dim=1) - 0.5 + y_embed = y_embed / h + x_embed = x_embed / w + + pe = self._pe_encoding(torch.stack([x_embed, y_embed], dim=-1)) + return pe.permute(2, 0, 1) # C x H x W + + def forward_with_coords( + self, coords_input: torch.Tensor, image_size: Tuple[int, int] + ) -> torch.Tensor: + """Positionally encode points that are not normalized to [0,1].""" + coords = coords_input.clone() + coords[:, :, 0] = coords[:, :, 0] / image_size[1] + coords[:, :, 1] = coords[:, :, 1] / image_size[0] + return self._pe_encoding(coords.to(torch.float)) # B x N x C + + +# Rotary Positional Encoding, adapted from: +# 1. https://github.com/meta-llama/codellama/blob/main/llama/model.py +# 2. https://github.com/naver-ai/rope-vit +# 3. https://github.com/lucidrains/rotary-embedding-torch + + +def init_t_xy(end_x: int, end_y: int): + t = torch.arange(end_x * end_y, dtype=torch.float32) + t_x = (t % end_x).float() + t_y = torch.div(t, end_x, rounding_mode="floor").float() + return t_x, t_y + + +def compute_axial_cis(dim: int, end_x: int, end_y: int, theta: float = 10000.0): + freqs_x = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim)) + freqs_y = 1.0 / (theta ** (torch.arange(0, dim, 4)[: (dim // 4)].float() / dim)) + + t_x, t_y = init_t_xy(end_x, end_y) + freqs_x = torch.outer(t_x, freqs_x) + freqs_y = torch.outer(t_y, freqs_y) + freqs_cis_x = torch.polar(torch.ones_like(freqs_x), freqs_x) + freqs_cis_y = torch.polar(torch.ones_like(freqs_y), freqs_y) + return torch.cat([freqs_cis_x, freqs_cis_y], dim=-1) + + +def reshape_for_broadcast(freqs_cis: torch.Tensor, x: torch.Tensor): + ndim = x.ndim + assert 0 <= 1 < ndim + assert freqs_cis.shape == (x.shape[-2], x.shape[-1]) + shape = [d if i >= ndim - 2 else 1 for i, d in enumerate(x.shape)] + return freqs_cis.view(*shape) + + +def apply_rotary_enc( + xq: torch.Tensor, + xk: torch.Tensor, + freqs_cis: torch.Tensor, + repeat_freqs_k: bool = False, +): + xq_ = torch.view_as_complex(xq.float().reshape(*xq.shape[:-1], -1, 2)) + xk_ = ( + torch.view_as_complex(xk.float().reshape(*xk.shape[:-1], -1, 2)) + if xk.shape[-2] != 0 + else None + ) + freqs_cis = reshape_for_broadcast(freqs_cis, xq_) + xq_out = torch.view_as_real(xq_ * freqs_cis).flatten(3) + if xk_ is None: + # no keys to rotate, due to dropout + return xq_out.type_as(xq).to(xq.device), xk + # repeat freqs along seq_len dim to match k seq_len + if repeat_freqs_k: + r = xk_.shape[-2] // xq_.shape[-2] + freqs_cis = freqs_cis.repeat(*([1] * (freqs_cis.ndim - 2)), r, 1) + 
xk_out = torch.view_as_real(xk_ * freqs_cis).flatten(3) + return xq_out.type_as(xq).to(xq.device), xk_out.type_as(xk).to(xk.device) diff --git a/inpaint/plugins/segment_anything2/modeling/sam/__init__.py b/inpaint/plugins/segment_anything2/modeling/sam/__init__.py new file mode 100644 index 0000000..5277f46 --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/sam/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/inpaint/plugins/segment_anything2/modeling/sam/mask_decoder.py b/inpaint/plugins/segment_anything2/modeling/sam/mask_decoder.py new file mode 100644 index 0000000..fb8bb05 --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/sam/mask_decoder.py @@ -0,0 +1,295 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import List, Optional, Tuple, Type + +import torch +from torch import nn + +from ..sam2_utils import LayerNorm2d, MLP + + +class MaskDecoder(nn.Module): + def __init__( + self, + *, + transformer_dim: int, + transformer: nn.Module, + num_multimask_outputs: int = 3, + activation: Type[nn.Module] = nn.GELU, + iou_head_depth: int = 3, + iou_head_hidden_dim: int = 256, + use_high_res_features: bool = False, + iou_prediction_use_sigmoid=False, + dynamic_multimask_via_stability=False, + dynamic_multimask_stability_delta=0.05, + dynamic_multimask_stability_thresh=0.98, + pred_obj_scores: bool = False, + pred_obj_scores_mlp: bool = False, + use_multimask_token_for_obj_ptr: bool = False, + ) -> None: + """ + Predicts masks given an image and prompt embeddings, using a + transformer architecture. 
+ + Arguments: + transformer_dim (int): the channel dimension of the transformer + transformer (nn.Module): the transformer used to predict masks + num_multimask_outputs (int): the number of masks to predict + when disambiguating masks + activation (nn.Module): the type of activation to use when + upscaling masks + iou_head_depth (int): the depth of the MLP used to predict + mask quality + iou_head_hidden_dim (int): the hidden dimension of the MLP + used to predict mask quality + """ + super().__init__() + self.transformer_dim = transformer_dim + self.transformer = transformer + + self.num_multimask_outputs = num_multimask_outputs + + self.iou_token = nn.Embedding(1, transformer_dim) + self.num_mask_tokens = num_multimask_outputs + 1 + self.mask_tokens = nn.Embedding(self.num_mask_tokens, transformer_dim) + + self.pred_obj_scores = pred_obj_scores + if self.pred_obj_scores: + self.obj_score_token = nn.Embedding(1, transformer_dim) + self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr + + self.output_upscaling = nn.Sequential( + nn.ConvTranspose2d( + transformer_dim, transformer_dim // 4, kernel_size=2, stride=2 + ), + LayerNorm2d(transformer_dim // 4), + activation(), + nn.ConvTranspose2d( + transformer_dim // 4, transformer_dim // 8, kernel_size=2, stride=2 + ), + activation(), + ) + self.use_high_res_features = use_high_res_features + if use_high_res_features: + self.conv_s0 = nn.Conv2d( + transformer_dim, transformer_dim // 8, kernel_size=1, stride=1 + ) + self.conv_s1 = nn.Conv2d( + transformer_dim, transformer_dim // 4, kernel_size=1, stride=1 + ) + + self.output_hypernetworks_mlps = nn.ModuleList( + [ + MLP(transformer_dim, transformer_dim, transformer_dim // 8, 3) + for i in range(self.num_mask_tokens) + ] + ) + + self.iou_prediction_head = MLP( + transformer_dim, + iou_head_hidden_dim, + self.num_mask_tokens, + iou_head_depth, + sigmoid_output=iou_prediction_use_sigmoid, + ) + if self.pred_obj_scores: + self.pred_obj_score_head = nn.Linear(transformer_dim, 1) + if pred_obj_scores_mlp: + self.pred_obj_score_head = MLP(transformer_dim, transformer_dim, 1, 3) + + # When outputting a single mask, optionally we can dynamically fall back to the best + # multimask output token if the single mask output token gives low stability scores. + self.dynamic_multimask_via_stability = dynamic_multimask_via_stability + self.dynamic_multimask_stability_delta = dynamic_multimask_stability_delta + self.dynamic_multimask_stability_thresh = dynamic_multimask_stability_thresh + + def forward( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + multimask_output: bool, + repeat_image: bool, + high_res_features: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Predict masks given image and prompt embeddings. + + Arguments: + image_embeddings (torch.Tensor): the embeddings from the image encoder + image_pe (torch.Tensor): positional encoding with the shape of image_embeddings + sparse_prompt_embeddings (torch.Tensor): the embeddings of the points and boxes + dense_prompt_embeddings (torch.Tensor): the embeddings of the mask inputs + multimask_output (bool): Whether to return multiple masks or a single + mask. 
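+ repeat_image (bool): whether to tile image_embeddings along the batch
+ dimension so there is one copy per set of prompts (when False, the
+ batch sizes of image_embeddings and the prompts must already match)
+ high_res_features (list(torch.Tensor) or none): optional pair of higher
+ resolution image features (at 4x and 2x the spatial size of
+ image_embeddings, respectively) added into the mask upscaling path
+ when use_high_res_features is enabled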
+ + Returns: + torch.Tensor: batched predicted masks + torch.Tensor: batched predictions of mask quality + torch.Tensor: batched SAM token for mask output + """ + masks, iou_pred, mask_tokens_out, object_score_logits = self.predict_masks( + image_embeddings=image_embeddings, + image_pe=image_pe, + sparse_prompt_embeddings=sparse_prompt_embeddings, + dense_prompt_embeddings=dense_prompt_embeddings, + repeat_image=repeat_image, + high_res_features=high_res_features, + ) + + # Select the correct mask or masks for output + if multimask_output: + masks = masks[:, 1:, :, :] + iou_pred = iou_pred[:, 1:] + elif self.dynamic_multimask_via_stability and not self.training: + masks, iou_pred = self._dynamic_multimask_via_stability(masks, iou_pred) + else: + masks = masks[:, 0:1, :, :] + iou_pred = iou_pred[:, 0:1] + + if multimask_output and self.use_multimask_token_for_obj_ptr: + sam_tokens_out = mask_tokens_out[:, 1:] # [b, 3, c] shape + else: + # Take the mask output token. Here we *always* use the token for single mask output. + # At test time, even if we track after 1-click (and using multimask_output=True), + # we still take the single mask token here. The rationale is that we always track + # after multiple clicks during training, so the past tokens seen during training + # are always the single mask token (and we'll let it be the object-memory token). + sam_tokens_out = mask_tokens_out[:, 0:1] # [b, 1, c] shape + + # Prepare output + return masks, iou_pred, sam_tokens_out, object_score_logits + + def predict_masks( + self, + image_embeddings: torch.Tensor, + image_pe: torch.Tensor, + sparse_prompt_embeddings: torch.Tensor, + dense_prompt_embeddings: torch.Tensor, + repeat_image: bool, + high_res_features: Optional[List[torch.Tensor]] = None, + ) -> Tuple[torch.Tensor, torch.Tensor]: + """Predicts masks. 
See 'forward' for more details.""" + # Concatenate output tokens + s = 0 + if self.pred_obj_scores: + output_tokens = torch.cat( + [ + self.obj_score_token.weight, + self.iou_token.weight, + self.mask_tokens.weight, + ], + dim=0, + ) + s = 1 + else: + output_tokens = torch.cat( + [self.iou_token.weight, self.mask_tokens.weight], dim=0 + ) + output_tokens = output_tokens.unsqueeze(0).expand( + sparse_prompt_embeddings.size(0), -1, -1 + ) + tokens = torch.cat((output_tokens, sparse_prompt_embeddings), dim=1) + + # Expand per-image data in batch direction to be per-mask + if repeat_image: + src = torch.repeat_interleave(image_embeddings, tokens.shape[0], dim=0) + else: + assert image_embeddings.shape[0] == tokens.shape[0] + src = image_embeddings + src = src + dense_prompt_embeddings + assert ( + image_pe.size(0) == 1 + ), "image_pe should have size 1 in batch dim (from `get_dense_pe()`)" + pos_src = torch.repeat_interleave(image_pe, tokens.shape[0], dim=0) + b, c, h, w = src.shape + + # Run the transformer + hs, src = self.transformer(src, pos_src, tokens) + iou_token_out = hs[:, s, :] + mask_tokens_out = hs[:, s + 1 : (s + 1 + self.num_mask_tokens), :] + + # Upscale mask embeddings and predict masks using the mask tokens + src = src.transpose(1, 2).view(b, c, h, w) + if not self.use_high_res_features: + upscaled_embedding = self.output_upscaling(src) + else: + dc1, ln1, act1, dc2, act2 = self.output_upscaling + feat_s0, feat_s1 = high_res_features + upscaled_embedding = act1(ln1(dc1(src) + feat_s1)) + upscaled_embedding = act2(dc2(upscaled_embedding) + feat_s0) + + hyper_in_list: List[torch.Tensor] = [] + for i in range(self.num_mask_tokens): + hyper_in_list.append( + self.output_hypernetworks_mlps[i](mask_tokens_out[:, i, :]) + ) + hyper_in = torch.stack(hyper_in_list, dim=1) + b, c, h, w = upscaled_embedding.shape + masks = (hyper_in @ upscaled_embedding.view(b, c, h * w)).view(b, -1, h, w) + + # Generate mask quality predictions + iou_pred = self.iou_prediction_head(iou_token_out) + if self.pred_obj_scores: + assert s == 1 + object_score_logits = self.pred_obj_score_head(hs[:, 0, :]) + else: + # Obj scores logits - default to 10.0, i.e. assuming the object is present, sigmoid(10)=1 + object_score_logits = 10.0 * iou_pred.new_ones(iou_pred.shape[0], 1) + + return masks, iou_pred, mask_tokens_out, object_score_logits + + def _get_stability_scores(self, mask_logits): + """ + Compute stability scores of the mask logits based on the IoU between upper and + lower thresholds, similar to https://github.com/fairinternal/onevision/pull/568. + """ + mask_logits = mask_logits.flatten(-2) + stability_delta = self.dynamic_multimask_stability_delta + area_i = torch.sum(mask_logits > stability_delta, dim=-1).float() + area_u = torch.sum(mask_logits > -stability_delta, dim=-1).float() + stability_scores = torch.where(area_u > 0, area_i / area_u, 1.0) + return stability_scores + + def _dynamic_multimask_via_stability(self, all_mask_logits, all_iou_scores): + """ + When outputting a single mask, if the stability score from the current single-mask + output (based on output token 0) falls below a threshold, we instead select from + multi-mask outputs (based on output token 1~3) the mask with the highest predicted + IoU score. This is intended to ensure a valid mask for both clicking and tracking. 
+ """ + # The best mask from multimask output tokens (1~3) + multimask_logits = all_mask_logits[:, 1:, :, :] + multimask_iou_scores = all_iou_scores[:, 1:] + best_scores_inds = torch.argmax(multimask_iou_scores, dim=-1) + batch_inds = torch.arange( + multimask_iou_scores.size(0), device=all_iou_scores.device + ) + best_multimask_logits = multimask_logits[batch_inds, best_scores_inds] + best_multimask_logits = best_multimask_logits.unsqueeze(1) + best_multimask_iou_scores = multimask_iou_scores[batch_inds, best_scores_inds] + best_multimask_iou_scores = best_multimask_iou_scores.unsqueeze(1) + + # The mask from singlemask output token 0 and its stability score + singlemask_logits = all_mask_logits[:, 0:1, :, :] + singlemask_iou_scores = all_iou_scores[:, 0:1] + stability_scores = self._get_stability_scores(singlemask_logits) + is_stable = stability_scores >= self.dynamic_multimask_stability_thresh + + # Dynamically fall back to best multimask output upon low stability scores. + mask_logits_out = torch.where( + is_stable[..., None, None].expand_as(singlemask_logits), + singlemask_logits, + best_multimask_logits, + ) + iou_scores_out = torch.where( + is_stable.expand_as(singlemask_iou_scores), + singlemask_iou_scores, + best_multimask_iou_scores, + ) + return mask_logits_out, iou_scores_out diff --git a/inpaint/plugins/segment_anything2/modeling/sam/prompt_encoder.py b/inpaint/plugins/segment_anything2/modeling/sam/prompt_encoder.py new file mode 100644 index 0000000..0f6d46e --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/sam/prompt_encoder.py @@ -0,0 +1,182 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +from typing import Optional, Tuple, Type + +import torch +from torch import nn + +from ..position_encoding import PositionEmbeddingRandom + +from ..sam2_utils import LayerNorm2d + + +class PromptEncoder(nn.Module): + def __init__( + self, + embed_dim: int, + image_embedding_size: Tuple[int, int], + input_image_size: Tuple[int, int], + mask_in_chans: int, + activation: Type[nn.Module] = nn.GELU, + ) -> None: + """ + Encodes prompts for input to SAM's mask decoder. + + Arguments: + embed_dim (int): The prompts' embedding dimension + image_embedding_size (tuple(int, int)): The spatial size of the + image embedding, as (H, W). + input_image_size (int): The padded size of the image as input + to the image encoder, as (H, W). + mask_in_chans (int): The number of hidden channels used for + encoding input masks. + activation (nn.Module): The activation to use when encoding + input masks. 
+ """ + super().__init__() + self.embed_dim = embed_dim + self.input_image_size = input_image_size + self.image_embedding_size = image_embedding_size + self.pe_layer = PositionEmbeddingRandom(embed_dim // 2) + + self.num_point_embeddings: int = 4 # pos/neg point + 2 box corners + point_embeddings = [ + nn.Embedding(1, embed_dim) for i in range(self.num_point_embeddings) + ] + self.point_embeddings = nn.ModuleList(point_embeddings) + self.not_a_point_embed = nn.Embedding(1, embed_dim) + + self.mask_input_size = ( + 4 * image_embedding_size[0], + 4 * image_embedding_size[1], + ) + self.mask_downscaling = nn.Sequential( + nn.Conv2d(1, mask_in_chans // 4, kernel_size=2, stride=2), + LayerNorm2d(mask_in_chans // 4), + activation(), + nn.Conv2d(mask_in_chans // 4, mask_in_chans, kernel_size=2, stride=2), + LayerNorm2d(mask_in_chans), + activation(), + nn.Conv2d(mask_in_chans, embed_dim, kernel_size=1), + ) + self.no_mask_embed = nn.Embedding(1, embed_dim) + + def get_dense_pe(self) -> torch.Tensor: + """ + Returns the positional encoding used to encode point prompts, + applied to a dense set of points the shape of the image encoding. + + Returns: + torch.Tensor: Positional encoding with shape + 1x(embed_dim)x(embedding_h)x(embedding_w) + """ + return self.pe_layer(self.image_embedding_size).unsqueeze(0) + + def _embed_points( + self, + points: torch.Tensor, + labels: torch.Tensor, + pad: bool, + ) -> torch.Tensor: + """Embeds point prompts.""" + points = points + 0.5 # Shift to center of pixel + if pad: + padding_point = torch.zeros((points.shape[0], 1, 2), device=points.device) + padding_label = -torch.ones((labels.shape[0], 1), device=labels.device) + points = torch.cat([points, padding_point], dim=1) + labels = torch.cat([labels, padding_label], dim=1) + point_embedding = self.pe_layer.forward_with_coords( + points, self.input_image_size + ) + point_embedding[labels == -1] = 0.0 + point_embedding[labels == -1] += self.not_a_point_embed.weight + point_embedding[labels == 0] += self.point_embeddings[0].weight + point_embedding[labels == 1] += self.point_embeddings[1].weight + point_embedding[labels == 2] += self.point_embeddings[2].weight + point_embedding[labels == 3] += self.point_embeddings[3].weight + return point_embedding + + def _embed_boxes(self, boxes: torch.Tensor) -> torch.Tensor: + """Embeds box prompts.""" + boxes = boxes + 0.5 # Shift to center of pixel + coords = boxes.reshape(-1, 2, 2) + corner_embedding = self.pe_layer.forward_with_coords( + coords, self.input_image_size + ) + corner_embedding[:, 0, :] += self.point_embeddings[2].weight + corner_embedding[:, 1, :] += self.point_embeddings[3].weight + return corner_embedding + + def _embed_masks(self, masks: torch.Tensor) -> torch.Tensor: + """Embeds mask inputs.""" + mask_embedding = self.mask_downscaling(masks) + return mask_embedding + + def _get_batch_size( + self, + points: Optional[Tuple[torch.Tensor, torch.Tensor]], + boxes: Optional[torch.Tensor], + masks: Optional[torch.Tensor], + ) -> int: + """ + Gets the batch size of the output given the batch size of the input prompts. 
+ """ + if points is not None: + return points[0].shape[0] + elif boxes is not None: + return boxes.shape[0] + elif masks is not None: + return masks.shape[0] + else: + return 1 + + def _get_device(self) -> torch.device: + return self.point_embeddings[0].weight.device + + def forward( + self, + points: Optional[Tuple[torch.Tensor, torch.Tensor]], + boxes: Optional[torch.Tensor], + masks: Optional[torch.Tensor], + ) -> Tuple[torch.Tensor, torch.Tensor]: + """ + Embeds different types of prompts, returning both sparse and dense + embeddings. + + Arguments: + points (tuple(torch.Tensor, torch.Tensor) or none): point coordinates + and labels to embed. + boxes (torch.Tensor or none): boxes to embed + masks (torch.Tensor or none): masks to embed + + Returns: + torch.Tensor: sparse embeddings for the points and boxes, with shape + BxNx(embed_dim), where N is determined by the number of input points + and boxes. + torch.Tensor: dense embeddings for the masks, in the shape + Bx(embed_dim)x(embed_H)x(embed_W) + """ + bs = self._get_batch_size(points, boxes, masks) + sparse_embeddings = torch.empty( + (bs, 0, self.embed_dim), device=self._get_device() + ) + if points is not None: + coords, labels = points + point_embeddings = self._embed_points(coords, labels, pad=(boxes is None)) + sparse_embeddings = torch.cat([sparse_embeddings, point_embeddings], dim=1) + if boxes is not None: + box_embeddings = self._embed_boxes(boxes) + sparse_embeddings = torch.cat([sparse_embeddings, box_embeddings], dim=1) + + if masks is not None: + dense_embeddings = self._embed_masks(masks) + else: + dense_embeddings = self.no_mask_embed.weight.reshape(1, -1, 1, 1).expand( + bs, -1, self.image_embedding_size[0], self.image_embedding_size[1] + ) + + return sparse_embeddings, dense_embeddings diff --git a/inpaint/plugins/segment_anything2/modeling/sam/transformer.py b/inpaint/plugins/segment_anything2/modeling/sam/transformer.py new file mode 100644 index 0000000..2dedccb --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/sam/transformer.py @@ -0,0 +1,327 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import math +import warnings +from functools import partial +from typing import Tuple, Type + +import torch +import torch.nn.functional as F +from torch import nn, Tensor + +from ..position_encoding import apply_rotary_enc, compute_axial_cis + +from ..sam2_utils import MLP +from ...utils.misc import get_sdpa_settings + +warnings.simplefilter(action="ignore", category=FutureWarning) +OLD_GPU, USE_FLASH_ATTN, MATH_KERNEL_ON = get_sdpa_settings() + + +class TwoWayTransformer(nn.Module): + def __init__( + self, + depth: int, + embedding_dim: int, + num_heads: int, + mlp_dim: int, + activation: Type[nn.Module] = nn.ReLU, + attention_downsample_rate: int = 2, + ) -> None: + """ + A transformer decoder that attends to an input image using + queries whose positional embedding is supplied. + + Args: + depth (int): number of layers in the transformer + embedding_dim (int): the channel dimension for the input embeddings + num_heads (int): the number of heads for multihead attention. 
Must + divide embedding_dim + mlp_dim (int): the channel dimension internal to the MLP block + activation (nn.Module): the activation to use in the MLP block + """ + super().__init__() + self.depth = depth + self.embedding_dim = embedding_dim + self.num_heads = num_heads + self.mlp_dim = mlp_dim + self.layers = nn.ModuleList() + + for i in range(depth): + self.layers.append( + TwoWayAttentionBlock( + embedding_dim=embedding_dim, + num_heads=num_heads, + mlp_dim=mlp_dim, + activation=activation, + attention_downsample_rate=attention_downsample_rate, + skip_first_layer_pe=(i == 0), + ) + ) + + self.final_attn_token_to_image = Attention( + embedding_dim, num_heads, downsample_rate=attention_downsample_rate + ) + self.norm_final_attn = nn.LayerNorm(embedding_dim) + + def forward( + self, + image_embedding: Tensor, + image_pe: Tensor, + point_embedding: Tensor, + ) -> Tuple[Tensor, Tensor]: + """ + Args: + image_embedding (torch.Tensor): image to attend to. Should be shape + B x embedding_dim x h x w for any h and w. + image_pe (torch.Tensor): the positional encoding to add to the image. Must + have the same shape as image_embedding. + point_embedding (torch.Tensor): the embedding to add to the query points. + Must have shape B x N_points x embedding_dim for any N_points. + + Returns: + torch.Tensor: the processed point_embedding + torch.Tensor: the processed image_embedding + """ + # BxCxHxW -> BxHWxC == B x N_image_tokens x C + bs, c, h, w = image_embedding.shape + image_embedding = image_embedding.flatten(2).permute(0, 2, 1) + image_pe = image_pe.flatten(2).permute(0, 2, 1) + + # Prepare queries + queries = point_embedding + keys = image_embedding + + # Apply transformer blocks and final layernorm + for layer in self.layers: + queries, keys = layer( + queries=queries, + keys=keys, + query_pe=point_embedding, + key_pe=image_pe, + ) + + # Apply the final attention layer from the points to the image + q = queries + point_embedding + k = keys + image_pe + attn_out = self.final_attn_token_to_image(q=q, k=k, v=keys) + queries = queries + attn_out + queries = self.norm_final_attn(queries) + + return queries, keys + + +class TwoWayAttentionBlock(nn.Module): + def __init__( + self, + embedding_dim: int, + num_heads: int, + mlp_dim: int = 2048, + activation: Type[nn.Module] = nn.ReLU, + attention_downsample_rate: int = 2, + skip_first_layer_pe: bool = False, + ) -> None: + """ + A transformer block with four layers: (1) self-attention of sparse + inputs, (2) cross attention of sparse inputs to dense inputs, (3) mlp + block on sparse inputs, and (4) cross attention of dense inputs to sparse + inputs. 
+ + Arguments: + embedding_dim (int): the channel dimension of the embeddings + num_heads (int): the number of heads in the attention layers + mlp_dim (int): the hidden dimension of the mlp block + activation (nn.Module): the activation of the mlp block + skip_first_layer_pe (bool): skip the PE on the first layer + """ + super().__init__() + self.self_attn = Attention(embedding_dim, num_heads) + self.norm1 = nn.LayerNorm(embedding_dim) + + self.cross_attn_token_to_image = Attention( + embedding_dim, num_heads, downsample_rate=attention_downsample_rate + ) + self.norm2 = nn.LayerNorm(embedding_dim) + + self.mlp = MLP( + embedding_dim, mlp_dim, embedding_dim, num_layers=2, activation=activation + ) + self.norm3 = nn.LayerNorm(embedding_dim) + + self.norm4 = nn.LayerNorm(embedding_dim) + self.cross_attn_image_to_token = Attention( + embedding_dim, num_heads, downsample_rate=attention_downsample_rate + ) + + self.skip_first_layer_pe = skip_first_layer_pe + + def forward( + self, queries: Tensor, keys: Tensor, query_pe: Tensor, key_pe: Tensor + ) -> Tuple[Tensor, Tensor]: + # Self attention block + if self.skip_first_layer_pe: + queries = self.self_attn(q=queries, k=queries, v=queries) + else: + q = queries + query_pe + attn_out = self.self_attn(q=q, k=q, v=queries) + queries = queries + attn_out + queries = self.norm1(queries) + + # Cross attention block, tokens attending to image embedding + q = queries + query_pe + k = keys + key_pe + attn_out = self.cross_attn_token_to_image(q=q, k=k, v=keys) + queries = queries + attn_out + queries = self.norm2(queries) + + # MLP block + mlp_out = self.mlp(queries) + queries = queries + mlp_out + queries = self.norm3(queries) + + # Cross attention block, image embedding attending to tokens + q = queries + query_pe + k = keys + key_pe + attn_out = self.cross_attn_image_to_token(q=k, k=q, v=queries) + keys = keys + attn_out + keys = self.norm4(keys) + + return queries, keys + + +class Attention(nn.Module): + """ + An attention layer that allows for downscaling the size of the embedding + after projection to queries, keys, and values. + """ + + def __init__( + self, + embedding_dim: int, + num_heads: int, + downsample_rate: int = 1, + dropout: float = 0.0, + kv_in_dim: int = None, + ) -> None: + super().__init__() + self.embedding_dim = embedding_dim + self.kv_in_dim = kv_in_dim if kv_in_dim is not None else embedding_dim + self.internal_dim = embedding_dim // downsample_rate + self.num_heads = num_heads + assert ( + self.internal_dim % num_heads == 0 + ), "num_heads must divide embedding_dim." 
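As a quick sanity check of the shape contract described by `TwoWayTransformer` above, the sketch below runs a dummy forward pass. It assumes the package imports cleanly under the path laid out in this diff, and the concrete tensor sizes are illustrative only (they mirror the values `SAM2Base` passes to this transformer later in the patch).

```python
import torch

# Import path as laid out in this diff; adjust if the package is vendored elsewhere.
from inpaint.plugins.segment_anything2.modeling.sam.transformer import TwoWayTransformer

# depth/embedding_dim/num_heads/mlp_dim mirror what SAM2Base uses further below.
transformer = TwoWayTransformer(depth=2, embedding_dim=256, num_heads=8, mlp_dim=2048)
transformer.eval()

image_embedding = torch.randn(1, 256, 64, 64)   # B x C x H x W (any H, W)
image_pe = torch.randn(1, 256, 64, 64)          # same shape as the image embedding
point_embedding = torch.randn(1, 5, 256)        # B x N_points x C

with torch.no_grad():
    queries, keys = transformer(image_embedding, image_pe, point_embedding)

print(queries.shape)  # torch.Size([1, 5, 256])    -> processed point embeddings
print(keys.shape)     # torch.Size([1, 4096, 256]) -> flattened image tokens (64 * 64)
```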
+ + self.q_proj = nn.Linear(embedding_dim, self.internal_dim) + self.k_proj = nn.Linear(self.kv_in_dim, self.internal_dim) + self.v_proj = nn.Linear(self.kv_in_dim, self.internal_dim) + self.out_proj = nn.Linear(self.internal_dim, embedding_dim) + + self.dropout_p = dropout + + def _separate_heads(self, x: Tensor, num_heads: int) -> Tensor: + b, n, c = x.shape + x = x.reshape(b, n, num_heads, c // num_heads) + return x.transpose(1, 2) # B x N_heads x N_tokens x C_per_head + + def _recombine_heads(self, x: Tensor) -> Tensor: + b, n_heads, n_tokens, c_per_head = x.shape + x = x.transpose(1, 2) + return x.reshape(b, n_tokens, n_heads * c_per_head) # B x N_tokens x C + + def forward(self, q: Tensor, k: Tensor, v: Tensor) -> Tensor: + # Input projections + q = self.q_proj(q) + k = self.k_proj(k) + v = self.v_proj(v) + + # Separate into heads + q = self._separate_heads(q, self.num_heads) + k = self._separate_heads(k, self.num_heads) + v = self._separate_heads(v, self.num_heads) + + dropout_p = self.dropout_p if self.training else 0.0 + # Attention + with torch.backends.cuda.sdp_kernel( + enable_flash=USE_FLASH_ATTN, + # if Flash attention kernel is off, then math kernel needs to be enabled + enable_math=(OLD_GPU and dropout_p > 0.0) or MATH_KERNEL_ON, + enable_mem_efficient=OLD_GPU, + ): + out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p) + + out = self._recombine_heads(out) + out = self.out_proj(out) + + return out + + +class RoPEAttention(Attention): + """Attention with rotary position encoding.""" + + def __init__( + self, + *args, + rope_theta=10000.0, + # whether to repeat q rope to match k length + # this is needed for cross-attention to memories + rope_k_repeat=False, + feat_sizes=(32, 32), # [w, h] for stride 16 feats at 512 resolution + **kwargs, + ): + super().__init__(*args, **kwargs) + + self.compute_cis = partial( + compute_axial_cis, dim=self.internal_dim // self.num_heads, theta=rope_theta + ) + freqs_cis = self.compute_cis(end_x=feat_sizes[0], end_y=feat_sizes[1]) + self.freqs_cis = freqs_cis + self.rope_k_repeat = rope_k_repeat + + def forward( + self, q: Tensor, k: Tensor, v: Tensor, num_k_exclude_rope: int = 0 + ) -> Tensor: + # Input projections + q = self.q_proj(q) + k = self.k_proj(k) + v = self.v_proj(v) + + # Separate into heads + q = self._separate_heads(q, self.num_heads) + k = self._separate_heads(k, self.num_heads) + v = self._separate_heads(v, self.num_heads) + + # Apply rotary position encoding + w = h = math.sqrt(q.shape[-2]) + self.freqs_cis = self.freqs_cis.to(q.device) + if self.freqs_cis.shape[0] != q.shape[-2]: + self.freqs_cis = self.compute_cis(end_x=w, end_y=h).to(q.device) + if q.shape[-2] != k.shape[-2]: + assert self.rope_k_repeat + + num_k_rope = k.size(-2) - num_k_exclude_rope + q, k[:, :, :num_k_rope] = apply_rotary_enc( + q, + k[:, :, :num_k_rope], + freqs_cis=self.freqs_cis, + repeat_freqs_k=self.rope_k_repeat, + ) + + dropout_p = self.dropout_p if self.training else 0.0 + # Attention + with torch.backends.cuda.sdp_kernel( + enable_flash=USE_FLASH_ATTN, + # if Flash attention kernel is off, then math kernel needs to be enabled + enable_math=(OLD_GPU and dropout_p > 0.0) or MATH_KERNEL_ON, + enable_mem_efficient=OLD_GPU, + ): + out = F.scaled_dot_product_attention(q, k, v, dropout_p=dropout_p) + + out = self._recombine_heads(out) + out = self.out_proj(out) + + return out diff --git a/inpaint/plugins/segment_anything2/modeling/sam2_base.py b/inpaint/plugins/segment_anything2/modeling/sam2_base.py new file mode 100644 index 
0000000..7896060 --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/sam2_base.py @@ -0,0 +1,832 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.distributed +import torch.nn.functional as F + +from torch.nn.init import trunc_normal_ + +from .sam.mask_decoder import MaskDecoder +from .sam.prompt_encoder import PromptEncoder +from .sam.transformer import TwoWayTransformer +from .sam2_utils import get_1d_sine_pe, MLP, select_closest_cond_frames + +# a large negative value as a placeholder score for missing objects +NO_OBJ_SCORE = -1024.0 + + +class SAM2Base(torch.nn.Module): + def __init__( + self, + image_encoder, + memory_attention, + memory_encoder, + num_maskmem=7, # default 1 input frame + 6 previous frames + image_size=512, + backbone_stride=16, # stride of the image backbone output + sigmoid_scale_for_mem_enc=1.0, # scale factor for mask sigmoid prob + sigmoid_bias_for_mem_enc=0.0, # bias factor for mask sigmoid prob + # During evaluation, whether to binarize the sigmoid mask logits on interacted frames with clicks + binarize_mask_from_pts_for_mem_enc=False, + use_mask_input_as_output_without_sam=False, + # on frames with mask input, whether to directly output the input mask without using a SAM prompt encoder + mask decoder + # The maximum number of conditioning frames to participate in the memory attention (-1 means no limit; if there are more conditioning frames than this limit, + # we only cross-attend to the temporally closest `max_cond_frames_in_attn` conditioning frames in the encoder when tracking each frame). This gives the model + # a temporal locality when handling a large number of annotated frames (since closer frames should be more important) and also avoids GPU OOM. + max_cond_frames_in_attn=-1, + # on the first frame, whether to directly add the no-memory embedding to the image feature + # (instead of using the transformer encoder) + directly_add_no_mem_embed=False, + # whether to use high-resolution feature maps in the SAM mask decoder + use_high_res_features_in_sam=False, + # whether to output multiple (3) masks for the first click on initial conditioning frames + multimask_output_in_sam=False, + # the minimum and maximum number of clicks to use multimask_output_in_sam (only relevant when `multimask_output_in_sam=True`; + # default is 1 for both, meaning that only the first click gives multimask output; also note that a box counts as two points) + multimask_min_pt_num=1, + multimask_max_pt_num=1, + # whether to also use multimask output for tracking (not just for the first click on initial conditioning frames; only relevant when `multimask_output_in_sam=True`) + multimask_output_for_tracking=False, + # Whether to use multimask tokens for obj ptr; Only relevant when both + # use_obj_ptrs_in_encoder=True and multimask_output_for_tracking=True + use_multimask_token_for_obj_ptr: bool = False, + # whether to use sigmoid to restrict ious prediction to [0-1] + iou_prediction_use_sigmoid=False, + # The memory bank's temporal stride during evaluation (i.e. the `r` parameter in XMem and Cutie; XMem and Cutie use r=5). + # For r>1, the (self.num_maskmem - 1) non-conditioning memory frames consist of + # (self.num_maskmem - 2) nearest frames from every r-th frames, plus the last frame. 
+ memory_temporal_stride_for_eval=1, + # if `add_all_frames_to_correct_as_cond` is True, we also append to the conditioning frame list any frame that receives a later correction click + # if `add_all_frames_to_correct_as_cond` is False, we conditioning frame list to only use those initial conditioning frames + add_all_frames_to_correct_as_cond=False, + # whether to apply non-overlapping constraints on the object masks in the memory encoder during evaluation (to avoid/alleviate superposing masks) + non_overlap_masks_for_mem_enc=False, + # whether to cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder + use_obj_ptrs_in_encoder=False, + # the maximum number of object pointers from other frames in encoder cross attention (only relevant when `use_obj_ptrs_in_encoder=True`) + max_obj_ptrs_in_encoder=16, + # whether to add temporal positional encoding to the object pointers in the encoder (only relevant when `use_obj_ptrs_in_encoder=True`) + add_tpos_enc_to_obj_ptrs=True, + # whether to add an extra linear projection layer for the temporal positional encoding in the object pointers to avoid potential interference + # with spatial positional encoding (only relevant when both `use_obj_ptrs_in_encoder=True` and `add_tpos_enc_to_obj_ptrs=True`) + proj_tpos_enc_in_obj_ptrs=False, + # whether to only attend to object pointers in the past (before the current frame) in the encoder during evaluation + # (only relevant when `use_obj_ptrs_in_encoder=True`; this might avoid pointer information too far in the future to distract the initial tracking) + only_obj_ptrs_in_the_past_for_eval=False, + # Whether to predict if there is an object in the frame + pred_obj_scores: bool = False, + # Whether to use an MLP to predict object scores + pred_obj_scores_mlp: bool = False, + # Only relevant if pred_obj_scores=True and use_obj_ptrs_in_encoder=True; + # Whether to have a fixed no obj pointer when there is no object present + # or to use it as an additive embedding with obj_ptr produced by decoder + fixed_no_obj_ptr: bool = False, + # Soft no object, i.e. mix in no_obj_ptr softly, + # hope to make recovery easier if there is a mistake and mitigate accumulation of errors + soft_no_obj_ptr: bool = False, + use_mlp_for_obj_ptr_proj: bool = False, + # extra arguments used to construct the SAM mask decoder; if not None, it should be a dict of kwargs to be passed into `MaskDecoder` class. + sam_mask_decoder_extra_args=None, + compile_image_encoder: bool = False, + ): + super().__init__() + + # Part 1: the image backbone + self.image_encoder = image_encoder + # Use level 0, 1, 2 for high-res setting, or just level 2 for the default setting + self.use_high_res_features_in_sam = use_high_res_features_in_sam + self.num_feature_levels = 3 if use_high_res_features_in_sam else 1 + self.use_obj_ptrs_in_encoder = use_obj_ptrs_in_encoder + self.max_obj_ptrs_in_encoder = max_obj_ptrs_in_encoder + if use_obj_ptrs_in_encoder: + # A conv layer to downsample the mask prompt to stride 4 (the same stride as + # low-res SAM mask logits) and to change its scales from 0~1 to SAM logit scale, + # so that it can be fed into the SAM mask decoder to generate a pointer. 
+ self.mask_downsample = torch.nn.Conv2d(1, 1, kernel_size=4, stride=4) + self.add_tpos_enc_to_obj_ptrs = add_tpos_enc_to_obj_ptrs + if proj_tpos_enc_in_obj_ptrs: + assert add_tpos_enc_to_obj_ptrs # these options need to be used together + self.proj_tpos_enc_in_obj_ptrs = proj_tpos_enc_in_obj_ptrs + self.only_obj_ptrs_in_the_past_for_eval = only_obj_ptrs_in_the_past_for_eval + + # Part 2: memory attention to condition current frame's visual features + # with memories (and obj ptrs) from past frames + self.memory_attention = memory_attention + self.hidden_dim = memory_attention.d_model + + # Part 3: memory encoder for the previous frame's outputs + self.memory_encoder = memory_encoder + self.mem_dim = self.hidden_dim + if hasattr(self.memory_encoder, "out_proj") and hasattr( + self.memory_encoder.out_proj, "weight" + ): + # if there is compression of memories along channel dim + self.mem_dim = self.memory_encoder.out_proj.weight.shape[0] + self.num_maskmem = num_maskmem # Number of memories accessible + # Temporal encoding of the memories + self.maskmem_tpos_enc = torch.nn.Parameter( + torch.zeros(num_maskmem, 1, 1, self.mem_dim) + ) + trunc_normal_(self.maskmem_tpos_enc, std=0.02) + # a single token to indicate no memory embedding from previous frames + self.no_mem_embed = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim)) + self.no_mem_pos_enc = torch.nn.Parameter(torch.zeros(1, 1, self.hidden_dim)) + trunc_normal_(self.no_mem_embed, std=0.02) + trunc_normal_(self.no_mem_pos_enc, std=0.02) + self.directly_add_no_mem_embed = directly_add_no_mem_embed + # Apply sigmoid to the output raw mask logits (to turn them from + # range (-inf, +inf) to range (0, 1)) before feeding them into the memory encoder + self.sigmoid_scale_for_mem_enc = sigmoid_scale_for_mem_enc + self.sigmoid_bias_for_mem_enc = sigmoid_bias_for_mem_enc + self.binarize_mask_from_pts_for_mem_enc = binarize_mask_from_pts_for_mem_enc + self.non_overlap_masks_for_mem_enc = non_overlap_masks_for_mem_enc + self.memory_temporal_stride_for_eval = memory_temporal_stride_for_eval + # On frames with mask input, whether to directly output the input mask without + # using a SAM prompt encoder + mask decoder + self.use_mask_input_as_output_without_sam = use_mask_input_as_output_without_sam + self.multimask_output_in_sam = multimask_output_in_sam + self.multimask_min_pt_num = multimask_min_pt_num + self.multimask_max_pt_num = multimask_max_pt_num + self.multimask_output_for_tracking = multimask_output_for_tracking + self.use_multimask_token_for_obj_ptr = use_multimask_token_for_obj_ptr + self.iou_prediction_use_sigmoid = iou_prediction_use_sigmoid + + # Part 4: SAM-style prompt encoder (for both mask and point inputs) + # and SAM-style mask decoder for the final mask output + self.image_size = image_size + self.backbone_stride = backbone_stride + self.sam_mask_decoder_extra_args = sam_mask_decoder_extra_args + self.pred_obj_scores = pred_obj_scores + self.pred_obj_scores_mlp = pred_obj_scores_mlp + self.fixed_no_obj_ptr = fixed_no_obj_ptr + self.soft_no_obj_ptr = soft_no_obj_ptr + if self.fixed_no_obj_ptr: + assert self.pred_obj_scores + assert self.use_obj_ptrs_in_encoder + if self.pred_obj_scores and self.use_obj_ptrs_in_encoder: + self.no_obj_ptr = torch.nn.Parameter(torch.zeros(1, self.hidden_dim)) + trunc_normal_(self.no_obj_ptr, std=0.02) + self.use_mlp_for_obj_ptr_proj = use_mlp_for_obj_ptr_proj + + self._build_sam_heads() + self.add_all_frames_to_correct_as_cond = add_all_frames_to_correct_as_cond + self.max_cond_frames_in_attn = 
max_cond_frames_in_attn + + # Model compilation + if compile_image_encoder: + # Compile the forward function (not the full module) to allow loading checkpoints. + print( + "Image encoder compilation is enabled. First forward pass will be slow." + ) + self.image_encoder.forward = torch.compile( + self.image_encoder.forward, + mode="max-autotune", + fullgraph=True, + dynamic=False, + ) + + @property + def device(self): + return next(self.parameters()).device + + def forward(self, *args, **kwargs): + raise NotImplementedError( + "Please use the corresponding methods in SAM2VideoPredictor for inference." + "See notebooks/video_predictor_example.ipynb for an example." + ) + + def _build_sam_heads(self): + """Build SAM-style prompt encoder and mask decoder.""" + self.sam_prompt_embed_dim = self.hidden_dim + self.sam_image_embedding_size = self.image_size // self.backbone_stride + + # build PromptEncoder and MaskDecoder from SAM + # (their hyperparameters like `mask_in_chans=16` are from SAM code) + self.sam_prompt_encoder = PromptEncoder( + embed_dim=self.sam_prompt_embed_dim, + image_embedding_size=( + self.sam_image_embedding_size, + self.sam_image_embedding_size, + ), + input_image_size=(self.image_size, self.image_size), + mask_in_chans=16, + ) + self.sam_mask_decoder = MaskDecoder( + num_multimask_outputs=3, + transformer=TwoWayTransformer( + depth=2, + embedding_dim=self.sam_prompt_embed_dim, + mlp_dim=2048, + num_heads=8, + ), + transformer_dim=self.sam_prompt_embed_dim, + iou_head_depth=3, + iou_head_hidden_dim=256, + use_high_res_features=self.use_high_res_features_in_sam, + iou_prediction_use_sigmoid=self.iou_prediction_use_sigmoid, + pred_obj_scores=self.pred_obj_scores, + pred_obj_scores_mlp=self.pred_obj_scores_mlp, + use_multimask_token_for_obj_ptr=self.use_multimask_token_for_obj_ptr, + **(self.sam_mask_decoder_extra_args or {}), + ) + if self.use_obj_ptrs_in_encoder: + # a linear projection on SAM output tokens to turn them into object pointers + self.obj_ptr_proj = torch.nn.Linear(self.hidden_dim, self.hidden_dim) + if self.use_mlp_for_obj_ptr_proj: + self.obj_ptr_proj = MLP( + self.hidden_dim, self.hidden_dim, self.hidden_dim, 3 + ) + else: + self.obj_ptr_proj = torch.nn.Identity() + if self.proj_tpos_enc_in_obj_ptrs: + # a linear projection on temporal positional encoding in object pointers to + # avoid potential interference with spatial positional encoding + self.obj_ptr_tpos_proj = torch.nn.Linear(self.hidden_dim, self.mem_dim) + else: + self.obj_ptr_tpos_proj = torch.nn.Identity() + + def _forward_sam_heads( + self, + backbone_features, + point_inputs=None, + mask_inputs=None, + high_res_features=None, + multimask_output=False, + ): + """ + Forward SAM prompt encoders and mask heads. + + Inputs: + - backbone_features: image features of [B, C, H, W] shape + - point_inputs: a dictionary with "point_coords" and "point_labels", where + 1) "point_coords" has [B, P, 2] shape and float32 dtype and contains the + absolute pixel-unit coordinate in (x, y) format of the P input points + 2) "point_labels" has shape [B, P] and int32 dtype, where 1 means + positive clicks, 0 means negative clicks, and -1 means padding + - mask_inputs: a mask of [B, 1, H*16, W*16] shape, float or bool, with the + same spatial size as the image. + - high_res_features: either 1) None or 2) or a list of length 2 containing + two feature maps of [B, C, 4*H, 4*W] and [B, C, 2*H, 2*W] shapes respectively, + which will be used as high-resolution feature maps for SAM decoder. 
+ - multimask_output: if it's True, we output 3 candidate masks and their 3 + corresponding IoU estimates, and if it's False, we output only 1 mask and + its corresponding IoU estimate. + + Outputs: + - low_res_multimasks: [B, M, H*4, W*4] shape (where M = 3 if + `multimask_output=True` and M = 1 if `multimask_output=False`), the SAM + output mask logits (before sigmoid) for the low-resolution masks, with 4x + the resolution (1/4 stride) of the input backbone_features. + - high_res_multimasks: [B, M, H*16, W*16] shape (where M = 3 + if `multimask_output=True` and M = 1 if `multimask_output=False`), + upsampled from the low-resolution masks, with shape size as the image + (stride is 1 pixel). + - ious, [B, M] shape, where (where M = 3 if `multimask_output=True` and M = 1 + if `multimask_output=False`), the estimated IoU of each output mask. + - low_res_masks: [B, 1, H*4, W*4] shape, the best mask in `low_res_multimasks`. + If `multimask_output=True`, it's the mask with the highest IoU estimate. + If `multimask_output=False`, it's the same as `low_res_multimasks`. + - high_res_masks: [B, 1, H*16, W*16] shape, the best mask in `high_res_multimasks`. + If `multimask_output=True`, it's the mask with the highest IoU estimate. + If `multimask_output=False`, it's the same as `high_res_multimasks`. + - obj_ptr: [B, C] shape, the object pointer vector for the output mask, extracted + based on the output token from the SAM mask decoder. + """ + B = backbone_features.size(0) + device = backbone_features.device + assert backbone_features.size(1) == self.sam_prompt_embed_dim + assert backbone_features.size(2) == self.sam_image_embedding_size + assert backbone_features.size(3) == self.sam_image_embedding_size + + # a) Handle point prompts + if point_inputs is not None: + sam_point_coords = point_inputs["point_coords"] + sam_point_labels = point_inputs["point_labels"] + assert sam_point_coords.size(0) == B and sam_point_labels.size(0) == B + else: + # If no points are provide, pad with an empty point (with label -1) + sam_point_coords = torch.zeros(B, 1, 2, device=device) + sam_point_labels = -torch.ones(B, 1, dtype=torch.int32, device=device) + + # b) Handle mask prompts + if mask_inputs is not None: + # If mask_inputs is provided, downsize it into low-res mask input if needed + # and feed it as a dense mask prompt into the SAM mask encoder + assert len(mask_inputs.shape) == 4 and mask_inputs.shape[:2] == (B, 1) + if mask_inputs.shape[-2:] != self.sam_prompt_encoder.mask_input_size: + sam_mask_prompt = F.interpolate( + mask_inputs.float(), + size=self.sam_prompt_encoder.mask_input_size, + align_corners=False, + mode="bilinear", + antialias=True, # use antialias for downsampling + ) + else: + sam_mask_prompt = mask_inputs + else: + # Otherwise, simply feed None (and SAM's prompt encoder will add + # a learned `no_mask_embed` to indicate no mask input in this case). 
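The two prompt-normalization steps in `_forward_sam_heads` above are easy to misread in diff form, so here is a stand-alone sketch of the same conventions. The batch size, image resolution, and the 256x256 low-res mask size are assumptions for illustration, not values read from this patch.

```python
import torch
import torch.nn.functional as F

B = 2

# a) No point prompts: pad with a single dummy point labelled -1 ("padding"),
#    matching the branch above.
sam_point_coords = torch.zeros(B, 1, 2)
sam_point_labels = -torch.ones(B, 1, dtype=torch.int32)

# b) A mask prompt is resized to the prompt encoder's expected low-res size
#    (assumed 256x256 here) with antialiased bilinear interpolation.
mask_inputs = torch.rand(B, 1, 1024, 1024)   # full-resolution mask, float
mask_input_size = (256, 256)                 # assumption: 4x a 64x64 image embedding
sam_mask_prompt = F.interpolate(
    mask_inputs.float(),
    size=mask_input_size,
    mode="bilinear",
    align_corners=False,
    antialias=True,                          # antialias when downsampling
)

print(sam_point_coords.shape, sam_point_labels.shape, sam_mask_prompt.shape)
# torch.Size([2, 1, 2]) torch.Size([2, 1]) torch.Size([2, 1, 256, 256])
```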
+ sam_mask_prompt = None + + sparse_embeddings, dense_embeddings = self.sam_prompt_encoder( + points=(sam_point_coords, sam_point_labels), + boxes=None, + masks=sam_mask_prompt, + ) + ( + low_res_multimasks, + ious, + sam_output_tokens, + object_score_logits, + ) = self.sam_mask_decoder( + image_embeddings=backbone_features, + image_pe=self.sam_prompt_encoder.get_dense_pe(), + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + repeat_image=False, # the image is already batched + high_res_features=high_res_features, + ) + if self.pred_obj_scores: + is_obj_appearing = object_score_logits > 0 + + # Mask used for spatial memories is always a *hard* choice between obj and no obj, + # consistent with the actual mask prediction + low_res_multimasks = torch.where( + is_obj_appearing[:, None, None], + low_res_multimasks, + NO_OBJ_SCORE, + ) + + # convert masks from possibly bfloat16 (or float16) to float32 + # (older PyTorch versions before 2.1 don't support `interpolate` on bf16) + low_res_multimasks = low_res_multimasks.float() + high_res_multimasks = F.interpolate( + low_res_multimasks, + size=(self.image_size, self.image_size), + mode="bilinear", + align_corners=False, + ) + + sam_output_token = sam_output_tokens[:, 0] + if multimask_output: + # take the best mask prediction (with the highest IoU estimation) + best_iou_inds = torch.argmax(ious, dim=-1) + batch_inds = torch.arange(B, device=device) + low_res_masks = low_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1) + high_res_masks = high_res_multimasks[batch_inds, best_iou_inds].unsqueeze(1) + if sam_output_tokens.size(1) > 1: + sam_output_token = sam_output_tokens[batch_inds, best_iou_inds] + else: + low_res_masks, high_res_masks = low_res_multimasks, high_res_multimasks + + # Extract object pointer from the SAM output token (with occlusion handling) + obj_ptr = self.obj_ptr_proj(sam_output_token) + if self.pred_obj_scores: + # Allow *soft* no obj ptr, unlike for masks + if self.soft_no_obj_ptr: + # Only hard possible with gt + assert not self.teacher_force_obj_scores_for_mem + lambda_is_obj_appearing = object_score_logits.sigmoid() + else: + lambda_is_obj_appearing = is_obj_appearing.float() + + if self.fixed_no_obj_ptr: + obj_ptr = lambda_is_obj_appearing * obj_ptr + obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr + + return ( + low_res_multimasks, + high_res_multimasks, + ious, + low_res_masks, + high_res_masks, + obj_ptr, + object_score_logits, + ) + + def _use_mask_as_output(self, backbone_features, high_res_features, mask_inputs): + """ + Directly turn binary `mask_inputs` into a output mask logits without using SAM. + (same input and output shapes as in _forward_sam_heads above). + """ + # Use -10/+10 as logits for neg/pos pixels (very close to 0/1 in prob after sigmoid). 
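For reference, the scale/bias trick used by `_use_mask_as_output` maps a binary mask to logits of -10/+10, which sigmoid maps back to probabilities extremely close to 0 and 1. A minimal numeric check:

```python
import torch

out_scale, out_bias = 20.0, -10.0
binary_mask = torch.tensor([[0.0, 1.0]])

logits = binary_mask * out_scale + out_bias   # 0 -> -10, 1 -> +10
probs = torch.sigmoid(logits)
print(logits)  # tensor([[-10., 10.]])
print(probs)   # approximately 4.54e-05 and 0.99995
```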
+ out_scale, out_bias = 20.0, -10.0 # sigmoid(-10.0)=4.5398e-05 + mask_inputs_float = mask_inputs.float() + high_res_masks = mask_inputs_float * out_scale + out_bias + low_res_masks = F.interpolate( + high_res_masks, + size=(high_res_masks.size(-2) // 4, high_res_masks.size(-1) // 4), + align_corners=False, + mode="bilinear", + antialias=True, # use antialias for downsampling + ) + # a dummy IoU prediction of all 1's under mask input + ious = mask_inputs.new_ones(mask_inputs.size(0), 1).float() + if not self.use_obj_ptrs_in_encoder: + # all zeros as a dummy object pointer (of shape [B, C]) + obj_ptr = torch.zeros( + mask_inputs.size(0), self.hidden_dim, device=mask_inputs.device + ) + else: + # produce an object pointer using the SAM decoder from the mask input + _, _, _, _, _, obj_ptr, _ = self._forward_sam_heads( + backbone_features=backbone_features, + mask_inputs=self.mask_downsample(mask_inputs_float), + high_res_features=high_res_features, + ) + # In this method, we are treating mask_input as output, e.g. using it directly to create spatial mem; + # Below, we follow the same design axiom to use mask_input to decide if obj appears or not instead of relying + # on the object_scores from the SAM decoder. + is_obj_appearing = torch.any(mask_inputs.flatten(1).float() > 0.0, dim=1) + is_obj_appearing = is_obj_appearing[..., None] + lambda_is_obj_appearing = is_obj_appearing.float() + object_score_logits = out_scale * lambda_is_obj_appearing + out_bias + if self.pred_obj_scores: + if self.fixed_no_obj_ptr: + obj_ptr = lambda_is_obj_appearing * obj_ptr + obj_ptr = obj_ptr + (1 - lambda_is_obj_appearing) * self.no_obj_ptr + + return ( + low_res_masks, + high_res_masks, + ious, + low_res_masks, + high_res_masks, + obj_ptr, + object_score_logits, + ) + + def forward_image(self, img_batch: torch.Tensor): + """Get the image feature on the input batch.""" + backbone_out = self.image_encoder(img_batch) + if self.use_high_res_features_in_sam: + # precompute projected level 0 and level 1 features in SAM decoder + # to avoid running it again on every SAM click + backbone_out["backbone_fpn"][0] = self.sam_mask_decoder.conv_s0( + backbone_out["backbone_fpn"][0] + ) + backbone_out["backbone_fpn"][1] = self.sam_mask_decoder.conv_s1( + backbone_out["backbone_fpn"][1] + ) + return backbone_out + + def _prepare_backbone_features(self, backbone_out): + """Prepare and flatten visual features.""" + backbone_out = backbone_out.copy() + assert len(backbone_out["backbone_fpn"]) == len(backbone_out["vision_pos_enc"]) + assert len(backbone_out["backbone_fpn"]) >= self.num_feature_levels + + feature_maps = backbone_out["backbone_fpn"][-self.num_feature_levels :] + vision_pos_embeds = backbone_out["vision_pos_enc"][-self.num_feature_levels :] + + feat_sizes = [(x.shape[-2], x.shape[-1]) for x in vision_pos_embeds] + # flatten NxCxHxW to HWxNxC + vision_feats = [x.flatten(2).permute(2, 0, 1) for x in feature_maps] + vision_pos_embeds = [x.flatten(2).permute(2, 0, 1) for x in vision_pos_embeds] + + return backbone_out, vision_feats, vision_pos_embeds, feat_sizes + + def _prepare_memory_conditioned_features( + self, + frame_idx, + is_init_cond_frame, + current_vision_feats, + current_vision_pos_embeds, + feat_sizes, + output_dict, + num_frames, + track_in_reverse=False, # tracking in reverse time order (for demo usage) + ): + """Fuse the current frame's visual feature map with previous memory.""" + B = current_vision_feats[-1].size(1) # batch size on this frame + C = self.hidden_dim + H, W = feat_sizes[-1] # top-level 
(lowest-resolution) feature size + device = current_vision_feats[-1].device + # The case of `self.num_maskmem == 0` below is primarily used for reproducing SAM on images. + # In this case, we skip the fusion with any memory. + if self.num_maskmem == 0: # Disable memory and skip fusion + pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W) + return pix_feat + + num_obj_ptr_tokens = 0 + # Step 1: condition the visual features of the current frame on previous memories + if not is_init_cond_frame: + # Retrieve the memories encoded with the maskmem backbone + to_cat_memory, to_cat_memory_pos_embed = [], [] + # Add conditioning frames's output first (all cond frames have t_pos=0 for + # when getting temporal positional embedding below) + assert len(output_dict["cond_frame_outputs"]) > 0 + # Select a maximum number of temporally closest cond frames for cross attention + cond_outputs = output_dict["cond_frame_outputs"] + selected_cond_outputs, unselected_cond_outputs = select_closest_cond_frames( + frame_idx, cond_outputs, self.max_cond_frames_in_attn + ) + t_pos_and_prevs = [(0, out) for out in selected_cond_outputs.values()] + # Add last (self.num_maskmem - 1) frames before current frame for non-conditioning memory + # the earliest one has t_pos=1 and the latest one has t_pos=self.num_maskmem-1 + # We also allow taking the memory frame non-consecutively (with r>1), in which case + # we take (self.num_maskmem - 2) frames among every r-th frames plus the last frame. + r = self.memory_temporal_stride_for_eval + for t_pos in range(1, self.num_maskmem): + t_rel = self.num_maskmem - t_pos # how many frames before current frame + if t_rel == 1: + # for t_rel == 1, we take the last frame (regardless of r) + if not track_in_reverse: + # the frame immediately before this frame (i.e. frame_idx - 1) + prev_frame_idx = frame_idx - t_rel + else: + # the frame immediately after this frame (i.e. frame_idx + 1) + prev_frame_idx = frame_idx + t_rel + else: + # for t_rel >= 2, we take the memory frame from every r-th frames + if not track_in_reverse: + # first find the nearest frame among every r-th frames before this frame + # for r=1, this would be (frame_idx - 2) + prev_frame_idx = ((frame_idx - 2) // r) * r + # then seek further among every r-th frames + prev_frame_idx = prev_frame_idx - (t_rel - 2) * r + else: + # first find the nearest frame among every r-th frames after this frame + # for r=1, this would be (frame_idx + 2) + prev_frame_idx = -(-(frame_idx + 2) // r) * r + # then seek further among every r-th frames + prev_frame_idx = prev_frame_idx + (t_rel - 2) * r + out = output_dict["non_cond_frame_outputs"].get(prev_frame_idx, None) + if out is None: + # If an unselected conditioning frame is among the last (self.num_maskmem - 1) + # frames, we still attend to it as if it's a non-conditioning frame. + out = unselected_cond_outputs.get(prev_frame_idx, None) + t_pos_and_prevs.append((t_pos, out)) + + for t_pos, prev in t_pos_and_prevs: + if prev is None: + continue # skip padding frames + # "maskmem_features" might have been offloaded to CPU in demo use cases, + # so we load it back to GPU (it's a no-op if it's already on GPU). 
+ feats = prev["maskmem_features"].cuda(non_blocking=True) + to_cat_memory.append(feats.flatten(2).permute(2, 0, 1)) + # Spatial positional encoding (it might have been offloaded to CPU in eval) + maskmem_enc = prev["maskmem_pos_enc"][-1].cuda() + maskmem_enc = maskmem_enc.flatten(2).permute(2, 0, 1) + # Temporal positional encoding + maskmem_enc = ( + maskmem_enc + self.maskmem_tpos_enc[self.num_maskmem - t_pos - 1] + ) + to_cat_memory_pos_embed.append(maskmem_enc) + + # Construct the list of past object pointers + if self.use_obj_ptrs_in_encoder: + max_obj_ptrs_in_encoder = min(num_frames, self.max_obj_ptrs_in_encoder) + # First add those object pointers from selected conditioning frames + # (optionally, only include object pointers in the past during evaluation) + if not self.training and self.only_obj_ptrs_in_the_past_for_eval: + ptr_cond_outputs = { + t: out + for t, out in selected_cond_outputs.items() + if (t >= frame_idx if track_in_reverse else t <= frame_idx) + } + else: + ptr_cond_outputs = selected_cond_outputs + pos_and_ptrs = [ + # Temporal pos encoding contains how far away each pointer is from current frame + (abs(frame_idx - t), out["obj_ptr"]) + for t, out in ptr_cond_outputs.items() + ] + # Add up to (max_obj_ptrs_in_encoder - 1) non-conditioning frames before current frame + for t_diff in range(1, max_obj_ptrs_in_encoder): + t = frame_idx + t_diff if track_in_reverse else frame_idx - t_diff + if t < 0 or (num_frames is not None and t >= num_frames): + break + out = output_dict["non_cond_frame_outputs"].get( + t, unselected_cond_outputs.get(t, None) + ) + if out is not None: + pos_and_ptrs.append((t_diff, out["obj_ptr"])) + # If we have at least one object pointer, add them to the across attention + if len(pos_and_ptrs) > 0: + pos_list, ptrs_list = zip(*pos_and_ptrs) + # stack object pointers along dim=0 into [ptr_seq_len, B, C] shape + obj_ptrs = torch.stack(ptrs_list, dim=0) + # a temporal positional embedding based on how far each object pointer is from + # the current frame (sine embedding normalized by the max pointer num). 
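Just below, when the memory channel width is smaller than the hidden dimension (`self.mem_dim < C`), each object pointer is reshaped into `C // mem_dim` tokens of width `mem_dim` before being appended to the memory. A shape-only sketch of that reshape, with illustrative sizes (hidden dim 256, memory dim 64):

```python
import torch

C, mem_dim, B = 256, 64, 1               # illustrative sizes, not read from the config
num_ptrs = 3
obj_ptrs = torch.randn(num_ptrs, B, C)   # [ptr_seq_len, B, C], as stacked above

# split each pointer into (C // mem_dim) tokens of width mem_dim
obj_ptrs = obj_ptrs.reshape(-1, B, C // mem_dim, mem_dim)
obj_ptrs = obj_ptrs.permute(0, 2, 1, 3).flatten(0, 1)
print(obj_ptrs.shape)                    # torch.Size([12, 1, 64]) -> 3 pointers * 4 tokens
```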
+ if self.add_tpos_enc_to_obj_ptrs: + t_diff_max = max_obj_ptrs_in_encoder - 1 + tpos_dim = C if self.proj_tpos_enc_in_obj_ptrs else self.mem_dim + obj_pos = torch.tensor(pos_list, device=device) + obj_pos = get_1d_sine_pe(obj_pos / t_diff_max, dim=tpos_dim) + obj_pos = self.obj_ptr_tpos_proj(obj_pos) + obj_pos = obj_pos.unsqueeze(1).expand(-1, B, self.mem_dim) + else: + obj_pos = obj_ptrs.new_zeros(len(pos_list), B, self.mem_dim) + if self.mem_dim < C: + # split a pointer into (C // self.mem_dim) tokens for self.mem_dim < C + obj_ptrs = obj_ptrs.reshape( + -1, B, C // self.mem_dim, self.mem_dim + ) + obj_ptrs = obj_ptrs.permute(0, 2, 1, 3).flatten(0, 1) + obj_pos = obj_pos.repeat_interleave(C // self.mem_dim, dim=0) + to_cat_memory.append(obj_ptrs) + to_cat_memory_pos_embed.append(obj_pos) + num_obj_ptr_tokens = obj_ptrs.shape[0] + else: + num_obj_ptr_tokens = 0 + else: + # for initial conditioning frames, encode them without using any previous memory + if self.directly_add_no_mem_embed: + # directly add no-mem embedding (instead of using the transformer encoder) + pix_feat_with_mem = current_vision_feats[-1] + self.no_mem_embed + pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W) + return pix_feat_with_mem + + # Use a dummy token on the first frame (to avoid emtpy memory input to tranformer encoder) + to_cat_memory = [self.no_mem_embed.expand(1, B, self.mem_dim)] + to_cat_memory_pos_embed = [self.no_mem_pos_enc.expand(1, B, self.mem_dim)] + + # Step 2: Concatenate the memories and forward through the transformer encoder + memory = torch.cat(to_cat_memory, dim=0) + memory_pos_embed = torch.cat(to_cat_memory_pos_embed, dim=0) + + pix_feat_with_mem = self.memory_attention( + curr=current_vision_feats, + curr_pos=current_vision_pos_embeds, + memory=memory, + memory_pos=memory_pos_embed, + num_obj_ptr_tokens=num_obj_ptr_tokens, + ) + # reshape the output (HW)BC => BCHW + pix_feat_with_mem = pix_feat_with_mem.permute(1, 2, 0).view(B, C, H, W) + return pix_feat_with_mem + + def _encode_new_memory( + self, + current_vision_feats, + feat_sizes, + pred_masks_high_res, + is_mask_from_pts, + ): + """Encode the current image and its prediction into a memory feature.""" + B = current_vision_feats[-1].size(1) # batch size on this frame + C = self.hidden_dim + H, W = feat_sizes[-1] # top-level (lowest-resolution) feature size + # top-level feature, (HW)BC => BCHW + pix_feat = current_vision_feats[-1].permute(1, 2, 0).view(B, C, H, W) + if self.non_overlap_masks_for_mem_enc and not self.training: + # optionally, apply non-overlapping constraints to the masks (it's applied + # in the batch dimension and should only be used during eval, where all + # the objects come from the same video under batch size 1). 
+ pred_masks_high_res = self._apply_non_overlapping_constraints( + pred_masks_high_res + ) + # scale the raw mask logits with a temperature before applying sigmoid + binarize = self.binarize_mask_from_pts_for_mem_enc and is_mask_from_pts + if binarize and not self.training: + mask_for_mem = (pred_masks_high_res > 0).float() + else: + # apply sigmoid on the raw mask logits to turn them into range (0, 1) + mask_for_mem = torch.sigmoid(pred_masks_high_res) + # apply scale and bias terms to the sigmoid probabilities + if self.sigmoid_scale_for_mem_enc != 1.0: + mask_for_mem = mask_for_mem * self.sigmoid_scale_for_mem_enc + if self.sigmoid_bias_for_mem_enc != 0.0: + mask_for_mem = mask_for_mem + self.sigmoid_bias_for_mem_enc + maskmem_out = self.memory_encoder( + pix_feat, + mask_for_mem, + skip_mask_sigmoid=True, # sigmoid already applied + ) + maskmem_features = maskmem_out["vision_features"] + maskmem_pos_enc = maskmem_out["vision_pos_enc"] + + return maskmem_features, maskmem_pos_enc + + def track_step( + self, + frame_idx, + is_init_cond_frame, + current_vision_feats, + current_vision_pos_embeds, + feat_sizes, + point_inputs, + mask_inputs, + output_dict, + num_frames, + track_in_reverse=False, # tracking in reverse time order (for demo usage) + # Whether to run the memory encoder on the predicted masks. Sometimes we might want + # to skip the memory encoder with `run_mem_encoder=False`. For example, + # in demo we might call `track_step` multiple times for each user click, + # and only encode the memory when the user finalizes their clicks. And in ablation + # settings like SAM training on static images, we don't need the memory encoder. + run_mem_encoder=True, + # The previously predicted SAM mask logits (which can be fed together with new clicks in demo). + prev_sam_mask_logits=None, + ): + current_out = {"point_inputs": point_inputs, "mask_inputs": mask_inputs} + # High-resolution feature maps for the SAM head, reshape (HW)BC => BCHW + if len(current_vision_feats) > 1: + high_res_features = [ + x.permute(1, 2, 0).view(x.size(1), x.size(2), *s) + for x, s in zip(current_vision_feats[:-1], feat_sizes[:-1]) + ] + else: + high_res_features = None + if mask_inputs is not None and self.use_mask_input_as_output_without_sam: + # When use_mask_input_as_output_without_sam=True, we directly output the mask input + # (see it as a GT mask) without using a SAM prompt encoder + mask decoder. + pix_feat = current_vision_feats[-1].permute(1, 2, 0) + pix_feat = pix_feat.view(-1, self.hidden_dim, *feat_sizes[-1]) + sam_outputs = self._use_mask_as_output( + pix_feat, high_res_features, mask_inputs + ) + else: + # fused the visual feature with previous memory features in the memory bank + pix_feat_with_mem = self._prepare_memory_conditioned_features( + frame_idx=frame_idx, + is_init_cond_frame=is_init_cond_frame, + current_vision_feats=current_vision_feats[-1:], + current_vision_pos_embeds=current_vision_pos_embeds[-1:], + feat_sizes=feat_sizes[-1:], + output_dict=output_dict, + num_frames=num_frames, + track_in_reverse=track_in_reverse, + ) + # apply SAM-style segmentation head + # here we might feed previously predicted low-res SAM mask logits into the SAM mask decoder, + # e.g. 
in demo where such logits come from earlier interaction instead of correction sampling + # (in this case, any `mask_inputs` shouldn't reach here as they are sent to _use_mask_as_output instead) + if prev_sam_mask_logits is not None: + assert point_inputs is not None and mask_inputs is None + mask_inputs = prev_sam_mask_logits + multimask_output = self._use_multimask(is_init_cond_frame, point_inputs) + sam_outputs = self._forward_sam_heads( + backbone_features=pix_feat_with_mem, + point_inputs=point_inputs, + mask_inputs=mask_inputs, + high_res_features=high_res_features, + multimask_output=multimask_output, + ) + ( + _, + _, + _, + low_res_masks, + high_res_masks, + obj_ptr, + _, + ) = sam_outputs + + current_out["pred_masks"] = low_res_masks + current_out["pred_masks_high_res"] = high_res_masks + current_out["obj_ptr"] = obj_ptr + + # Finally run the memory encoder on the predicted mask to encode + # it into a new memory feature (that can be used in future frames) + if run_mem_encoder and self.num_maskmem > 0: + high_res_masks_for_mem_enc = high_res_masks + maskmem_features, maskmem_pos_enc = self._encode_new_memory( + current_vision_feats=current_vision_feats, + feat_sizes=feat_sizes, + pred_masks_high_res=high_res_masks_for_mem_enc, + is_mask_from_pts=(point_inputs is not None), + ) + current_out["maskmem_features"] = maskmem_features + current_out["maskmem_pos_enc"] = maskmem_pos_enc + else: + current_out["maskmem_features"] = None + current_out["maskmem_pos_enc"] = None + + return current_out + + def _use_multimask(self, is_init_cond_frame, point_inputs): + """Whether to use multimask output in the SAM head.""" + num_pts = 0 if point_inputs is None else point_inputs["point_labels"].size(1) + multimask_output = ( + self.multimask_output_in_sam + and (is_init_cond_frame or self.multimask_output_for_tracking) + and (self.multimask_min_pt_num <= num_pts <= self.multimask_max_pt_num) + ) + return multimask_output + + def _apply_non_overlapping_constraints(self, pred_masks): + """ + Apply non-overlapping constraints to the object scores in pred_masks. Here we + keep only the highest scoring object at each spatial location in pred_masks. + """ + batch_size = pred_masks.size(0) + if batch_size == 1: + return pred_masks + + device = pred_masks.device + # "max_obj_inds": object index of the object with the highest score at each location + max_obj_inds = torch.argmax(pred_masks, dim=0, keepdim=True) + # "batch_obj_inds": object index of each object slice (along dim 0) in `pred_masks` + batch_obj_inds = torch.arange(batch_size, device=device)[:, None, None, None] + keep = max_obj_inds == batch_obj_inds + # suppress overlapping regions' scores below -10.0 so that the foreground regions + # don't overlap (here sigmoid(-10.0)=4.5398e-05) + pred_masks = torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0)) + return pred_masks diff --git a/inpaint/plugins/segment_anything2/modeling/sam2_utils.py b/inpaint/plugins/segment_anything2/modeling/sam2_utils.py new file mode 100644 index 0000000..6d97059 --- /dev/null +++ b/inpaint/plugins/segment_anything2/modeling/sam2_utils.py @@ -0,0 +1,149 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
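Before moving on to `sam2_utils.py`: the `_apply_non_overlapping_constraints` method at the end of `sam2_base.py` above keeps only the highest-scoring object at each pixel and pushes every other object's score to at most -10. A self-contained sketch of the same rule:

```python
import torch

def apply_non_overlapping_constraints(pred_masks: torch.Tensor) -> torch.Tensor:
    """Mirrors the method above: one winning object per spatial location."""
    batch_size = pred_masks.size(0)
    if batch_size == 1:
        return pred_masks
    max_obj_inds = torch.argmax(pred_masks, dim=0, keepdim=True)   # winner per pixel
    batch_obj_inds = torch.arange(batch_size, device=pred_masks.device)[:, None, None, None]
    keep = max_obj_inds == batch_obj_inds
    # losing objects are clamped to <= -10.0, i.e. sigmoid(-10.0) ~= 4.5e-05
    return torch.where(keep, pred_masks, torch.clamp(pred_masks, max=-10.0))

masks = torch.randn(3, 1, 4, 4)            # three objects over a 4x4 grid
resolved = apply_non_overlapping_constraints(masks)
# at every pixel, at most one object keeps a score above -10
assert ((resolved > -10.0).sum(dim=0) <= 1).all()
```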
+ + +import copy + +import torch +import torch.nn as nn +import torch.nn.functional as F + + +def select_closest_cond_frames(frame_idx, cond_frame_outputs, max_cond_frame_num): + """ + Select up to `max_cond_frame_num` conditioning frames from `cond_frame_outputs` + that are temporally closest to the current frame at `frame_idx`. Here, we take + - a) the closest conditioning frame before `frame_idx` (if any); + - b) the closest conditioning frame after `frame_idx` (if any); + - c) any other temporally closest conditioning frames until reaching a total + of `max_cond_frame_num` conditioning frames. + + Outputs: + - selected_outputs: selected items (keys & values) from `cond_frame_outputs`. + - unselected_outputs: items (keys & values) not selected in `cond_frame_outputs`. + """ + if max_cond_frame_num == -1 or len(cond_frame_outputs) <= max_cond_frame_num: + selected_outputs = cond_frame_outputs + unselected_outputs = {} + else: + assert max_cond_frame_num >= 2, "we should allow using 2+ conditioning frames" + selected_outputs = {} + + # the closest conditioning frame before `frame_idx` (if any) + idx_before = max((t for t in cond_frame_outputs if t < frame_idx), default=None) + if idx_before is not None: + selected_outputs[idx_before] = cond_frame_outputs[idx_before] + + # the closest conditioning frame after `frame_idx` (if any) + idx_after = min((t for t in cond_frame_outputs if t >= frame_idx), default=None) + if idx_after is not None: + selected_outputs[idx_after] = cond_frame_outputs[idx_after] + + # add other temporally closest conditioning frames until reaching a total + # of `max_cond_frame_num` conditioning frames. + num_remain = max_cond_frame_num - len(selected_outputs) + inds_remain = sorted( + (t for t in cond_frame_outputs if t not in selected_outputs), + key=lambda x: abs(x - frame_idx), + )[:num_remain] + selected_outputs.update((t, cond_frame_outputs[t]) for t in inds_remain) + unselected_outputs = { + t: v for t, v in cond_frame_outputs.items() if t not in selected_outputs + } + + return selected_outputs, unselected_outputs + + +def get_1d_sine_pe(pos_inds, dim, temperature=10000): + """ + Get 1D sine positional embedding as in the original Transformer paper. 
+ """ + pe_dim = dim // 2 + dim_t = torch.arange(pe_dim, dtype=torch.float32, device=pos_inds.device) + dim_t = temperature ** (2 * (dim_t // 2) / pe_dim) + + pos_embed = pos_inds.unsqueeze(-1) / dim_t + pos_embed = torch.cat([pos_embed.sin(), pos_embed.cos()], dim=-1) + return pos_embed + + +def get_activation_fn(activation): + """Return an activation function given a string""" + if activation == "relu": + return F.relu + if activation == "gelu": + return F.gelu + if activation == "glu": + return F.glu + raise RuntimeError(f"activation should be relu/gelu, not {activation}.") + + +def get_clones(module, N): + return nn.ModuleList([copy.deepcopy(module) for i in range(N)]) + + +class DropPath(nn.Module): + # adapted from https://github.com/huggingface/pytorch-image-models/blob/main/timm/layers/drop.py + def __init__(self, drop_prob=0.0, scale_by_keep=True): + super(DropPath, self).__init__() + self.drop_prob = drop_prob + self.scale_by_keep = scale_by_keep + + def forward(self, x): + if self.drop_prob == 0.0 or not self.training: + return x + keep_prob = 1 - self.drop_prob + shape = (x.shape[0],) + (1,) * (x.ndim - 1) + random_tensor = x.new_empty(shape).bernoulli_(keep_prob) + if keep_prob > 0.0 and self.scale_by_keep: + random_tensor.div_(keep_prob) + return x * random_tensor + + +# Lightly adapted from +# https://github.com/facebookresearch/MaskFormer/blob/main/mask_former/modeling/transformer/transformer_predictor.py # noqa +class MLP(nn.Module): + def __init__( + self, + input_dim: int, + hidden_dim: int, + output_dim: int, + num_layers: int, + activation: nn.Module = nn.ReLU, + sigmoid_output: bool = False, + ) -> None: + super().__init__() + self.num_layers = num_layers + h = [hidden_dim] * (num_layers - 1) + self.layers = nn.ModuleList( + nn.Linear(n, k) for n, k in zip([input_dim] + h, h + [output_dim]) + ) + self.sigmoid_output = sigmoid_output + self.act = activation() + + def forward(self, x): + for i, layer in enumerate(self.layers): + x = self.act(layer(x)) if i < self.num_layers - 1 else layer(x) + if self.sigmoid_output: + x = F.sigmoid(x) + return x + + +# From https://github.com/facebookresearch/detectron2/blob/main/detectron2/layers/batch_norm.py # noqa +# Itself from https://github.com/facebookresearch/ConvNeXt/blob/d1fa8f6fef0a165b27399986cc2bdacc92777e40/models/convnext.py#L119 # noqa +class LayerNorm2d(nn.Module): + def __init__(self, num_channels: int, eps: float = 1e-6) -> None: + super().__init__() + self.weight = nn.Parameter(torch.ones(num_channels)) + self.bias = nn.Parameter(torch.zeros(num_channels)) + self.eps = eps + + def forward(self, x: torch.Tensor) -> torch.Tensor: + u = x.mean(1, keepdim=True) + s = (x - u).pow(2).mean(1, keepdim=True) + x = (x - u) / torch.sqrt(s + self.eps) + x = self.weight[:, None, None] * x + self.bias[:, None, None] + return x diff --git a/inpaint/plugins/segment_anything2/sam2_image_predictor.py b/inpaint/plugins/segment_anything2/sam2_image_predictor.py new file mode 100644 index 0000000..99ac570 --- /dev/null +++ b/inpaint/plugins/segment_anything2/sam2_image_predictor.py @@ -0,0 +1,445 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. 
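Two helpers from the `sam2_utils.py` file just completed above are worth exercising in isolation: `select_closest_cond_frames` and `get_1d_sine_pe`. The frame indices and embedding dimension below are made up for illustration; the import path follows this diff's layout.

```python
import torch

from inpaint.plugins.segment_anything2.modeling.sam2_utils import (
    get_1d_sine_pe,
    select_closest_cond_frames,
)

# Conditioning-frame outputs keyed by frame index (values are dicts in real use).
cond_frame_outputs = {0: "out@0", 10: "out@10", 25: "out@25", 40: "out@40"}

selected, unselected = select_closest_cond_frames(
    frame_idx=26, cond_frame_outputs=cond_frame_outputs, max_cond_frame_num=2
)
print(sorted(selected))    # [25, 40] -> closest frame before 26 and closest at/after 26
print(sorted(unselected))  # [0, 10]

# 1D sine positional embedding for three temporal offsets, 64 dimensions.
pos = torch.tensor([0.0, 0.5, 1.0])
pe = get_1d_sine_pe(pos, dim=64)
print(pe.shape)            # torch.Size([3, 64])
```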
+ +import logging + +from typing import List, Optional, Tuple, Union + +import numpy as np +import torch +from PIL.Image import Image + +from .modeling.sam2_base import SAM2Base + +from .utils.transforms import SAM2Transforms + + +class SAM2ImagePredictor: + def __init__( + self, + sam_model: SAM2Base, + mask_threshold=0.0, + max_hole_area=0.0, + max_sprinkle_area=0.0, + ) -> None: + """ + Uses SAM-2 to calculate the image embedding for an image, and then + allow repeated, efficient mask prediction given prompts. + + Arguments: + sam_model (Sam-2): The model to use for mask prediction. + mask_threshold (float): The threshold to use when converting mask logits + to binary masks. Masks are thresholded at 0 by default. + fill_hole_area (int): If fill_hole_area > 0, we fill small holes in up to + the maximum area of fill_hole_area in low_res_masks. + """ + super().__init__() + self.model = sam_model + self._transforms = SAM2Transforms( + resolution=self.model.image_size, + mask_threshold=mask_threshold, + max_hole_area=max_hole_area, + max_sprinkle_area=max_sprinkle_area, + ) + + # Predictor state + self._is_image_set = False + self._features = None + self._orig_hw = None + # Whether the predictor is set for single image or a batch of images + self._is_batch = False + + # Predictor config + self.mask_threshold = mask_threshold + + # Spatial dim for backbone feature maps + self._bb_feat_sizes = [ + (256, 256), + (128, 128), + (64, 64), + ] + + @torch.no_grad() + def set_image( + self, + image: Union[np.ndarray, Image], + ) -> None: + """ + Calculates the image embeddings for the provided image, allowing + masks to be predicted with the 'predict' method. + + Arguments: + image (np.ndarray or PIL Image): The input image to embed in RGB format. The image should be in HWC format if np.ndarray, or WHC format if PIL Image + with pixel values in [0, 255]. + image_format (str): The color format of the image, in ['RGB', 'BGR']. + """ + self.reset_predictor() + # Transform the image to the form expected by the model + if isinstance(image, np.ndarray): + logging.info("For numpy array image, we assume (HxWxC) format") + self._orig_hw = [image.shape[:2]] + elif isinstance(image, Image): + w, h = image.size + self._orig_hw = [(h, w)] + else: + raise NotImplementedError("Image format not supported") + + input_image = self._transforms(image) + input_image = input_image[None, ...].to(self.device) + + assert ( + len(input_image.shape) == 4 and input_image.shape[1] == 3 + ), f"input_image must be of size 1x3xHxW, got {input_image.shape}" + logging.info("Computing image embeddings for the provided image...") + backbone_out = self.model.forward_image(input_image) + _, vision_feats, _, _ = self.model._prepare_backbone_features(backbone_out) + # Add no_mem_embed, which is added to the lowest rest feat. map during training on videos + if self.model.directly_add_no_mem_embed: + vision_feats[-1] = vision_feats[-1] + self.model.no_mem_embed + + feats = [ + feat.permute(1, 2, 0).view(1, -1, *feat_size) + for feat, feat_size in zip(vision_feats[::-1], self._bb_feat_sizes[::-1]) + ][::-1] + self._features = {"image_embed": feats[-1], "high_res_feats": feats[:-1]} + self._is_image_set = True + logging.info("Image embeddings computed.") + + @torch.no_grad() + def set_image_batch( + self, + image_list: List[Union[np.ndarray]], + ) -> None: + """ + Calculates the image embeddings for the provided image batch, allowing + masks to be predicted with the 'predict_batch' method. 
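`set_image_batch` pairs with `predict_batch` further down in this file. A usage sketch, under the assumption that an already-constructed `SAM2ImagePredictor` is available as `predictor` (model building and checkpoint loading live elsewhere in this plugin) and that the inputs are RGB uint8 arrays in HWC layout:

```python
import numpy as np

# Assumption: `predictor` is an existing SAM2ImagePredictor wrapping a loaded SAM-2 model.
images = [
    np.zeros((480, 640, 3), dtype=np.uint8),
    np.zeros((720, 1280, 3), dtype=np.uint8),
]
predictor.set_image_batch(images)

masks_list, ious_list, low_res_list = predictor.predict_batch(
    point_coords_batch=[np.array([[320, 240]]), np.array([[640, 360]])],
    point_labels_batch=[np.array([1]), np.array([1])],   # 1 = foreground click
    multimask_output=True,
)
# One entry per input image; each masks entry is CxHxW at that image's original size.
print(len(masks_list), masks_list[0].shape)
```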
+ + Arguments: + image_list (List[np.ndarray]): The input images to embed in RGB format. The image should be in HWC format if np.ndarray + with pixel values in [0, 255]. + """ + self.reset_predictor() + assert isinstance(image_list, list) + self._orig_hw = [] + for image in image_list: + assert isinstance( + image, np.ndarray + ), "Images are expected to be an np.ndarray in RGB format, and of shape HWC" + self._orig_hw.append(image.shape[:2]) + # Transform the image to the form expected by the model + img_batch = self._transforms.forward_batch(image_list) + img_batch = img_batch.to(self.device) + batch_size = img_batch.shape[0] + assert ( + len(img_batch.shape) == 4 and img_batch.shape[1] == 3 + ), f"img_batch must be of size Bx3xHxW, got {img_batch.shape}" + logging.info("Computing image embeddings for the provided images...") + backbone_out = self.model.forward_image(img_batch) + _, vision_feats, _, _ = self.model._prepare_backbone_features(backbone_out) + # Add no_mem_embed, which is added to the lowest rest feat. map during training on videos + if self.model.directly_add_no_mem_embed: + vision_feats[-1] = vision_feats[-1] + self.model.no_mem_embed + + feats = [ + feat.permute(1, 2, 0).view(batch_size, -1, *feat_size) + for feat, feat_size in zip(vision_feats[::-1], self._bb_feat_sizes[::-1]) + ][::-1] + self._features = {"image_embed": feats[-1], "high_res_feats": feats[:-1]} + self._is_image_set = True + self._is_batch = True + logging.info("Image embeddings computed.") + + def predict_batch( + self, + point_coords_batch: List[np.ndarray] = None, + point_labels_batch: List[np.ndarray] = None, + box_batch: List[np.ndarray] = None, + mask_input_batch: List[np.ndarray] = None, + multimask_output: bool = True, + return_logits: bool = False, + normalize_coords=True, + ) -> Tuple[List[np.ndarray], List[np.ndarray], List[np.ndarray]]: + """This function is very similar to predict(...), however it is used for batched mode, when the model is expected to generate predictions on multiple images. + It returns a tupele of lists of masks, ious, and low_res_masks_logits. + """ + assert self._is_batch, "This function should only be used when in batched mode" + if not self._is_image_set: + raise RuntimeError( + "An image must be set with .set_image_batch(...) before mask prediction." 
+ ) + num_images = len(self._features["image_embed"]) + all_masks = [] + all_ious = [] + all_low_res_masks = [] + for img_idx in range(num_images): + # Transform input prompts + point_coords = ( + point_coords_batch[img_idx] if point_coords_batch is not None else None + ) + point_labels = ( + point_labels_batch[img_idx] if point_labels_batch is not None else None + ) + box = box_batch[img_idx] if box_batch is not None else None + mask_input = ( + mask_input_batch[img_idx] if mask_input_batch is not None else None + ) + mask_input, unnorm_coords, labels, unnorm_box = self._prep_prompts( + point_coords, + point_labels, + box, + mask_input, + normalize_coords, + img_idx=img_idx, + ) + masks, iou_predictions, low_res_masks = self._predict( + unnorm_coords, + labels, + unnorm_box, + mask_input, + multimask_output, + return_logits=return_logits, + img_idx=img_idx, + ) + masks_np = masks.squeeze(0).float().detach().cpu().numpy() + iou_predictions_np = ( + iou_predictions.squeeze(0).float().detach().cpu().numpy() + ) + low_res_masks_np = low_res_masks.squeeze(0).float().detach().cpu().numpy() + all_masks.append(masks_np) + all_ious.append(iou_predictions_np) + all_low_res_masks.append(low_res_masks_np) + + return all_masks, all_ious, all_low_res_masks + + def predict( + self, + point_coords: Optional[np.ndarray] = None, + point_labels: Optional[np.ndarray] = None, + box: Optional[np.ndarray] = None, + mask_input: Optional[np.ndarray] = None, + multimask_output: bool = True, + return_logits: bool = False, + normalize_coords=True, + ) -> Tuple[np.ndarray, np.ndarray, np.ndarray]: + """ + Predict masks for the given input prompts, using the currently set image. + + Arguments: + point_coords (np.ndarray or None): A Nx2 array of point prompts to the + model. Each point is in (X,Y) in pixels. + point_labels (np.ndarray or None): A length N array of labels for the + point prompts. 1 indicates a foreground point and 0 indicates a + background point. + box (np.ndarray or None): A length 4 array given a box prompt to the + model, in XYXY format. + mask_input (np.ndarray): A low resolution mask input to the model, typically + coming from a previous prediction iteration. Has form 1xHxW, where + for SAM, H=W=256. + multimask_output (bool): If true, the model will return three masks. + For ambiguous input prompts (such as a single click), this will often + produce better masks than a single prediction. If only a single + mask is needed, the model's predicted quality score can be used + to select the best mask. For non-ambiguous prompts, such as multiple + input prompts, multimask_output=False can give better results. + return_logits (bool): If true, returns un-thresholded masks logits + instead of a binary mask. + normalize_coords (bool): If true, the point coordinates will be normalized to the range [0,1] and point_coords is expected to be wrt. image dimensions. + + Returns: + (np.ndarray): The output masks in CxHxW format, where C is the + number of masks, and (H, W) is the original image size. + (np.ndarray): An array of length C containing the model's + predictions for the quality of each mask. + (np.ndarray): An array of shape CxHxW, where C is the number + of masks and H=W=256. These low resolution logits can be passed to + a subsequent iteration as mask input. + """ + if not self._is_image_set: + raise RuntimeError( + "An image must be set with .set_image(...) before mask prediction." 
+ ) + + # Transform input prompts + + mask_input, unnorm_coords, labels, unnorm_box = self._prep_prompts( + point_coords, point_labels, box, mask_input, normalize_coords + ) + + masks, iou_predictions, low_res_masks = self._predict( + unnorm_coords, + labels, + unnorm_box, + mask_input, + multimask_output, + return_logits=return_logits, + ) + + masks_np = masks.squeeze(0).float().detach().cpu().numpy() + iou_predictions_np = iou_predictions.squeeze(0).float().detach().cpu().numpy() + low_res_masks_np = low_res_masks.squeeze(0).float().detach().cpu().numpy() + return masks_np, iou_predictions_np, low_res_masks_np + + def _prep_prompts( + self, point_coords, point_labels, box, mask_logits, normalize_coords, img_idx=-1 + ): + unnorm_coords, labels, unnorm_box, mask_input = None, None, None, None + if point_coords is not None: + assert ( + point_labels is not None + ), "point_labels must be supplied if point_coords is supplied." + point_coords = torch.as_tensor( + point_coords, dtype=torch.float, device=self.device + ) + unnorm_coords = self._transforms.transform_coords( + point_coords, normalize=normalize_coords, orig_hw=self._orig_hw[img_idx] + ) + labels = torch.as_tensor(point_labels, dtype=torch.int, device=self.device) + if len(unnorm_coords.shape) == 2: + unnorm_coords, labels = unnorm_coords[None, ...], labels[None, ...] + if box is not None: + box = torch.as_tensor(box, dtype=torch.float, device=self.device) + unnorm_box = self._transforms.transform_boxes( + box, normalize=normalize_coords, orig_hw=self._orig_hw[img_idx] + ) # Bx2x2 + if mask_logits is not None: + mask_input = torch.as_tensor( + mask_logits, dtype=torch.float, device=self.device + ) + if len(mask_input.shape) == 3: + mask_input = mask_input[None, :, :, :] + return mask_input, unnorm_coords, labels, unnorm_box + + @torch.no_grad() + def _predict( + self, + point_coords: Optional[torch.Tensor], + point_labels: Optional[torch.Tensor], + boxes: Optional[torch.Tensor] = None, + mask_input: Optional[torch.Tensor] = None, + multimask_output: bool = True, + return_logits: bool = False, + img_idx: int = -1, + ) -> Tuple[torch.Tensor, torch.Tensor, torch.Tensor]: + """ + Predict masks for the given input prompts, using the currently set image. + Input prompts are batched torch tensors and are expected to already be + transformed to the input frame using SAM2Transforms. + + Arguments: + point_coords (torch.Tensor or None): A BxNx2 array of point prompts to the + model. Each point is in (X,Y) in pixels. + point_labels (torch.Tensor or None): A BxN array of labels for the + point prompts. 1 indicates a foreground point and 0 indicates a + background point. + boxes (np.ndarray or None): A Bx4 array given a box prompt to the + model, in XYXY format. + mask_input (np.ndarray): A low resolution mask input to the model, typically + coming from a previous prediction iteration. Has form Bx1xHxW, where + for SAM, H=W=256. Masks returned by a previous iteration of the + predict method do not need further transformation. + multimask_output (bool): If true, the model will return three masks. + For ambiguous input prompts (such as a single click), this will often + produce better masks than a single prediction. If only a single + mask is needed, the model's predicted quality score can be used + to select the best mask. For non-ambiguous prompts, such as multiple + input prompts, multimask_output=False can give better results. + return_logits (bool): If true, returns un-thresholded masks logits + instead of a binary mask. 
+ + Returns: + (torch.Tensor): The output masks in BxCxHxW format, where C is the + number of masks, and (H, W) is the original image size. + (torch.Tensor): An array of shape BxC containing the model's + predictions for the quality of each mask. + (torch.Tensor): An array of shape BxCxHxW, where C is the number + of masks and H=W=256. These low res logits can be passed to + a subsequent iteration as mask input. + """ + if not self._is_image_set: + raise RuntimeError( + "An image must be set with .set_image(...) before mask prediction." + ) + + if point_coords is not None: + concat_points = (point_coords, point_labels) + else: + concat_points = None + + # Embed prompts + if boxes is not None: + box_coords = boxes.reshape(-1, 2, 2) + box_labels = torch.tensor([[2, 3]], dtype=torch.int, device=boxes.device) + box_labels = box_labels.repeat(boxes.size(0), 1) + # we merge "boxes" and "points" into a single "concat_points" input (where + # boxes are added at the beginning) to sam_prompt_encoder + if concat_points is not None: + concat_coords = torch.cat([box_coords, concat_points[0]], dim=1) + concat_labels = torch.cat([box_labels, concat_points[1]], dim=1) + concat_points = (concat_coords, concat_labels) + else: + concat_points = (box_coords, box_labels) + + sparse_embeddings, dense_embeddings = self.model.sam_prompt_encoder( + points=concat_points, + boxes=None, + masks=mask_input, + ) + + # Predict masks + batched_mode = ( + concat_points is not None and concat_points[0].shape[0] > 1 + ) # multi object prediction + high_res_features = [ + feat_level[img_idx].unsqueeze(0) + for feat_level in self._features["high_res_feats"] + ] + low_res_masks, iou_predictions, _, _ = self.model.sam_mask_decoder( + image_embeddings=self._features["image_embed"][img_idx].unsqueeze(0), + image_pe=self.model.sam_prompt_encoder.get_dense_pe(), + sparse_prompt_embeddings=sparse_embeddings, + dense_prompt_embeddings=dense_embeddings, + multimask_output=multimask_output, + repeat_image=batched_mode, + high_res_features=high_res_features, + ) + + # Upscale the masks to the original image resolution + masks = self._transforms.postprocess_masks( + low_res_masks, self._orig_hw[img_idx] + ) + low_res_masks = torch.clamp(low_res_masks, -32.0, 32.0) + if not return_logits: + masks = masks > self.mask_threshold + + return masks, iou_predictions, low_res_masks + + def get_image_embedding(self) -> torch.Tensor: + """ + Returns the image embeddings for the currently set image, with + shape 1xCxHxW, where C is the embedding dimension and (H,W) are + the embedding spatial dimension of SAM (typically C=256, H=W=64). + """ + if not self._is_image_set: + raise RuntimeError( + "An image must be set with .set_image(...) to generate an embedding." + ) + assert ( + self._features is not None + ), "Features must exist if an image has been set." + return self._features["image_embed"] + + @property + def device(self) -> torch.device: + return self.model.device + + def reset_predictor(self) -> None: + """ + Resets the image embeddings and other state variables. + """ + self._is_image_set = False + self._features = None + self._orig_hw = None + self._is_batch = False diff --git a/inpaint/plugins/segment_anything2/utils/__init__.py b/inpaint/plugins/segment_anything2/utils/__init__.py new file mode 100644 index 0000000..5277f46 --- /dev/null +++ b/inpaint/plugins/segment_anything2/utils/__init__.py @@ -0,0 +1,5 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. 
+ +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. diff --git a/inpaint/plugins/segment_anything2/utils/misc.py b/inpaint/plugins/segment_anything2/utils/misc.py new file mode 100644 index 0000000..296ecc1 --- /dev/null +++ b/inpaint/plugins/segment_anything2/utils/misc.py @@ -0,0 +1,90 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import warnings + +import numpy as np +import torch +from PIL import Image + + +def get_sdpa_settings(): + if torch.cuda.is_available(): + old_gpu = torch.cuda.get_device_properties(0).major < 7 + # only use Flash Attention on Ampere (8.0) or newer GPUs + use_flash_attn = torch.cuda.get_device_properties(0).major >= 8 + if not use_flash_attn: + warnings.warn( + "Flash Attention is disabled as it requires a GPU with Ampere (8.0) CUDA capability.", + category=UserWarning, + stacklevel=2, + ) + # keep math kernel for PyTorch versions before 2.2 (Flash Attention v2 is only + # available on PyTorch 2.2+, while Flash Attention v1 cannot handle all cases) + pytorch_version = tuple(int(v) for v in torch.__version__.split(".")[:2]) + if pytorch_version < (2, 2): + warnings.warn( + f"You are using PyTorch {torch.__version__} without Flash Attention v2 support. " + "Consider upgrading to PyTorch 2.2+ for Flash Attention v2 (which could be faster).", + category=UserWarning, + stacklevel=2, + ) + math_kernel_on = pytorch_version < (2, 2) or not use_flash_attn + else: + old_gpu = True + use_flash_attn = False + math_kernel_on = True + + return old_gpu, use_flash_attn, math_kernel_on + + +def mask_to_box(masks: torch.Tensor): + """ + compute bounding box given an input mask + + Inputs: + - masks: [B, 1, H, W] boxes, dtype=torch.Tensor + + Returns: + - box_coords: [B, 1, 4], contains (x, y) coordinates of top left and bottom right box corners, dtype=torch.Tensor + """ + B, _, h, w = masks.shape + device = masks.device + xs = torch.arange(w, device=device, dtype=torch.int32) + ys = torch.arange(h, device=device, dtype=torch.int32) + grid_xs, grid_ys = torch.meshgrid(xs, ys, indexing="xy") + grid_xs = grid_xs[None, None, ...].expand(B, 1, h, w) + grid_ys = grid_ys[None, None, ...].expand(B, 1, h, w) + min_xs, _ = torch.min(torch.where(masks, grid_xs, w).flatten(-2), dim=-1) + max_xs, _ = torch.max(torch.where(masks, grid_xs, -1).flatten(-2), dim=-1) + min_ys, _ = torch.min(torch.where(masks, grid_ys, h).flatten(-2), dim=-1) + max_ys, _ = torch.max(torch.where(masks, grid_ys, -1).flatten(-2), dim=-1) + bbox_coords = torch.stack((min_xs, min_ys, max_xs, max_ys), dim=-1) + + return bbox_coords + + +def _load_img_as_tensor(img_path, image_size): + img_pil = Image.open(img_path) + img_np = np.array(img_pil.convert("RGB").resize((image_size, image_size))) + if img_np.dtype == np.uint8: # np.uint8 is expected for JPEG images + img_np = img_np / 255.0 + else: + raise RuntimeError(f"Unknown image dtype: {img_np.dtype} on {img_path}") + img = torch.from_numpy(img_np).permute(2, 0, 1) + video_width, video_height = img_pil.size # the original video size + return img, video_height, video_width + + +def concat_points(old_point_inputs, new_points, new_labels): + """Add new points and labels to previous point inputs (add at the end).""" + if old_point_inputs is None: + points, labels = new_points, new_labels + else: + points = 
torch.cat([old_point_inputs["point_coords"], new_points], dim=1) + labels = torch.cat([old_point_inputs["point_labels"], new_labels], dim=1) + + return {"point_coords": points, "point_labels": labels} diff --git a/inpaint/plugins/segment_anything2/utils/transforms.py b/inpaint/plugins/segment_anything2/utils/transforms.py new file mode 100644 index 0000000..fe552e0 --- /dev/null +++ b/inpaint/plugins/segment_anything2/utils/transforms.py @@ -0,0 +1,77 @@ +# Copyright (c) Meta Platforms, Inc. and affiliates. +# All rights reserved. + +# This source code is licensed under the license found in the +# LICENSE file in the root directory of this source tree. + +import torch +import torch.nn as nn +from torchvision.transforms import Normalize, Resize, ToTensor + + +class SAM2Transforms(nn.Module): + def __init__( + self, resolution, mask_threshold, max_hole_area=0.0, max_sprinkle_area=0.0 + ): + """ + Transforms for SAM2. + """ + super().__init__() + self.resolution = resolution + self.mask_threshold = mask_threshold + self.max_hole_area = max_hole_area + self.max_sprinkle_area = max_sprinkle_area + self.mean = [0.485, 0.456, 0.406] + self.std = [0.229, 0.224, 0.225] + self.to_tensor = ToTensor() + self.transforms = torch.jit.script( + nn.Sequential( + Resize((self.resolution, self.resolution)), + Normalize(self.mean, self.std), + ) + ) + + def __call__(self, x): + x = self.to_tensor(x) + return self.transforms(x) + + def forward_batch(self, img_list): + img_batch = [self.transforms(self.to_tensor(img)) for img in img_list] + img_batch = torch.stack(img_batch, dim=0) + return img_batch + + def transform_coords( + self, coords: torch.Tensor, normalize=False, orig_hw=None + ) -> torch.Tensor: + """ + Expects a torch tensor with length 2 in the last dimension. The coordinates can be in absolute image or normalized coordinates, + If the coords are in absolute image coordinates, normalize should be set to True and original image size is required. + + Returns + Un-normalized coordinates in the range of [0, 1] which is expected by the SAM2 model. + """ + if normalize: + assert orig_hw is not None + h, w = orig_hw + coords = coords.clone() + coords[..., 0] = coords[..., 0] / w + coords[..., 1] = coords[..., 1] / h + + coords = coords * self.resolution # unnormalize coords + return coords + + def transform_boxes( + self, boxes: torch.Tensor, normalize=False, orig_hw=None + ) -> torch.Tensor: + """ + Expects a tensor of shape Bx4. The coordinates can be in absolute image or normalized coordinates, + if the coords are in absolute image coordinates, normalize should be set to True and original image size is required. + """ + boxes = self.transform_coords(boxes.reshape(-1, 2, 2), normalize, orig_hw) + return boxes + + def postprocess_masks(self, masks: torch.Tensor, orig_hw) -> torch.Tensor: + """ + Perform PostProcessing on output masks. 
+ """ + return masks diff --git a/inpaint/runtime.py b/inpaint/runtime.py new file mode 100644 index 0000000..e109528 --- /dev/null +++ b/inpaint/runtime.py @@ -0,0 +1,86 @@ +# https://github.com/huggingface/huggingface_hub/blob/5a12851f54bf614be39614034ed3a9031922d297/src/huggingface_hub/utils/_runtime.py +import os +import platform +import sys +from pathlib import Path + +import packaging.version +from inpaint.schema import Device +from loguru import logger +from rich import print +from typing import Dict, Any + + +_PY_VERSION: str = sys.version.split()[0].rstrip("+") + +if packaging.version.Version(_PY_VERSION) < packaging.version.Version("3.8.0"): + import importlib_metadata # type: ignore +else: + import importlib.metadata as importlib_metadata # type: ignore + +_package_versions = {} + +_CANDIDATES = [ + "torch", + "torchvision", + "Pillow", + "diffusers", + "transformers", + "opencv-python", + "accelerate", + "iopaint", + "rembg", +] +# Check once at runtime +for name in _CANDIDATES: + _package_versions[name] = "N/A" + try: + _package_versions[name] = importlib_metadata.version(name) + except importlib_metadata.PackageNotFoundError: + pass + + +def dump_environment_info() -> Dict[str, str]: + """Dump information about the machine to help debugging issues.""" + + # Generic machine info + info: Dict[str, Any] = { + "Platform": platform.platform(), + "Python version": platform.python_version(), + } + info.update(_package_versions) + print("\n".join([f"- {prop}: {val}" for prop, val in info.items()]) + "\n") + return info + + +def check_device(device: Device) -> Device: + if device == Device.cuda: + import platform + + if platform.system() == "Darwin": + logger.warning("MacOS does not support cuda, use cpu instead") + return Device.cpu + else: + import torch + + if not torch.cuda.is_available(): + logger.warning("CUDA is not available, use cpu instead") + return Device.cpu + elif device == Device.mps: + import torch + + if not torch.backends.mps.is_available(): + logger.warning("mps is not available, use cpu instead") + return Device.cpu + return device + + +def setup_model_dir(model_dir: Path): + model_dir = model_dir.expanduser().absolute() + logger.info(f"Model directory: {model_dir}") + os.environ["U2NET_HOME"] = str(model_dir) + os.environ["XDG_CACHE_HOME"] = str(model_dir) + if not model_dir.exists(): + logger.info(f"Create model directory: {model_dir}") + model_dir.mkdir(exist_ok=True, parents=True) + return model_dir diff --git a/inpaint/schema.py b/inpaint/schema.py new file mode 100644 index 0000000..659e341 --- /dev/null +++ b/inpaint/schema.py @@ -0,0 +1,491 @@ +import random +from enum import Enum +from pathlib import Path +from typing import Optional, Literal, List + +from loguru import logger + +from inpaint.const import ( + INSTRUCT_PIX2PIX_NAME, + KANDINSKY22_NAME, + POWERPAINT_NAME, + ANYTEXT_NAME, + SDXL_CONTROLNET_CHOICES, + SD2_CONTROLNET_CHOICES, + SD_CONTROLNET_CHOICES, + SD_BRUSHNET_CHOICES, +) +from pydantic import BaseModel, Field, computed_field, model_validator + + +class ModelType(str, Enum): + INPAINT = "inpaint" # LaMa, MAT... 
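+    # Diffusers-based checkpoints: plain text-to-image models and their dedicated
+    # inpainting variants, for both SD 1.x/2.x and SDXL, plus a catch-all for
+    # other diffusers pipelines.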
+ DIFFUSERS_SD = "diffusers_sd" + DIFFUSERS_SD_INPAINT = "diffusers_sd_inpaint" + DIFFUSERS_SDXL = "diffusers_sdxl" + DIFFUSERS_SDXL_INPAINT = "diffusers_sdxl_inpaint" + DIFFUSERS_OTHER = "diffusers_other" + + +class ModelInfo(BaseModel): + name: str + path: str + model_type: ModelType + is_single_file_diffusers: bool = False + + @computed_field + @property + def need_prompt(self) -> bool: + return self.model_type in [ + ModelType.DIFFUSERS_SD, + ModelType.DIFFUSERS_SDXL, + ModelType.DIFFUSERS_SD_INPAINT, + ModelType.DIFFUSERS_SDXL_INPAINT, + ] or self.name in [ + INSTRUCT_PIX2PIX_NAME, + KANDINSKY22_NAME, + POWERPAINT_NAME, + ANYTEXT_NAME, + ] + + @computed_field + @property + def controlnets(self) -> List[str]: + if self.model_type in [ + ModelType.DIFFUSERS_SDXL, + ModelType.DIFFUSERS_SDXL_INPAINT, + ]: + return SDXL_CONTROLNET_CHOICES + if self.model_type in [ModelType.DIFFUSERS_SD, ModelType.DIFFUSERS_SD_INPAINT]: + if "sd2" in self.name.lower(): + return SD2_CONTROLNET_CHOICES + else: + return SD_CONTROLNET_CHOICES + if self.name == POWERPAINT_NAME: + return SD_CONTROLNET_CHOICES + return [] + + @computed_field + @property + def brushnets(self) -> List[str]: + if self.model_type in [ModelType.DIFFUSERS_SD]: + return SD_BRUSHNET_CHOICES + return [] + + @computed_field + @property + def support_strength(self) -> bool: + return self.model_type in [ + ModelType.DIFFUSERS_SD, + ModelType.DIFFUSERS_SDXL, + ModelType.DIFFUSERS_SD_INPAINT, + ModelType.DIFFUSERS_SDXL_INPAINT, + ] or self.name in [POWERPAINT_NAME, ANYTEXT_NAME] + + @computed_field + @property + def support_outpainting(self) -> bool: + return self.model_type in [ + ModelType.DIFFUSERS_SD, + ModelType.DIFFUSERS_SDXL, + ModelType.DIFFUSERS_SD_INPAINT, + ModelType.DIFFUSERS_SDXL_INPAINT, + ] or self.name in [KANDINSKY22_NAME, POWERPAINT_NAME] + + @computed_field + @property + def support_lcm_lora(self) -> bool: + return self.model_type in [ + ModelType.DIFFUSERS_SD, + ModelType.DIFFUSERS_SDXL, + ModelType.DIFFUSERS_SD_INPAINT, + ModelType.DIFFUSERS_SDXL_INPAINT, + ] + + @computed_field + @property + def support_controlnet(self) -> bool: + return self.model_type in [ + ModelType.DIFFUSERS_SD, + ModelType.DIFFUSERS_SDXL, + ModelType.DIFFUSERS_SD_INPAINT, + ModelType.DIFFUSERS_SDXL_INPAINT, + ] + + @computed_field + @property + def support_brushnet(self) -> bool: + return self.model_type in [ + ModelType.DIFFUSERS_SD, + ] + + @computed_field + @property + def support_powerpaint_v2(self) -> bool: + return ( + self.model_type + in [ + ModelType.DIFFUSERS_SD, + ] + and self.name != POWERPAINT_NAME + ) + + +class Choices(str, Enum): + @classmethod + def values(cls): + return [member.value for member in cls] + + +class RealESRGANModel(Choices): + realesr_general_x4v3 = "realesr-general-x4v3" + RealESRGAN_x4plus = "RealESRGAN_x4plus" + RealESRGAN_x4plus_anime_6B = "RealESRGAN_x4plus_anime_6B" + + +class RemoveBGModel(Choices): + u2net = "u2net" + u2netp = "u2netp" + u2net_human_seg = "u2net_human_seg" + u2net_cloth_seg = "u2net_cloth_seg" + silueta = "silueta" + isnet_general_use = "isnet-general-use" + briaai_rmbg_1_4 = "briaai/RMBG-1.4" + + +class Device(Choices): + cpu = "cpu" + cuda = "cuda" + mps = "mps" + + +class InteractiveSegModel(Choices): + vit_b = "vit_b" + vit_l = "vit_l" + vit_h = "vit_h" + sam_hq_vit_b = "sam_hq_vit_b" + sam_hq_vit_l = "sam_hq_vit_l" + sam_hq_vit_h = "sam_hq_vit_h" + mobile_sam = "mobile_sam" + sam2_tiny = "sam2_tiny" + sam2_small = "sam2_small" + sam2_base = "sam2_base" + sam2_large = "sam2_large" + + 
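# --- Editor's note: the snippet below is an illustrative sketch, not part of the
# original diff. It shows one way the computed fields on ModelInfo above might be
# consumed (e.g. by a frontend deciding which controls to expose); the exact usage
# pattern is an assumption. The model name is the SD 1.5 inpainting checkpoint
# already used by the tests added later in this diff.
from inpaint.schema import ModelInfo, ModelType

info = ModelInfo(
    name="runwayml/stable-diffusion-inpainting",
    path="runwayml/stable-diffusion-inpainting",
    model_type=ModelType.DIFFUSERS_SD_INPAINT,
)
assert info.need_prompt            # SD inpaint models take a text prompt
assert info.support_controlnet     # ControlNet options come from SD_CONTROLNET_CHOICES
assert not info.support_brushnet   # BrushNet is only offered for plain DIFFUSERS_SD models
print(info.controlnets)            # list of ControlNet method names for this model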
+class PluginInfo(BaseModel): + name: str + support_gen_image: bool = False + support_gen_mask: bool = False + + +class CV2Flag(str, Enum): + INPAINT_NS = "INPAINT_NS" + INPAINT_TELEA = "INPAINT_TELEA" + + +class HDStrategy(str, Enum): + # Use original image size + ORIGINAL = "Original" + # Resize the longer side of the image to a specific size(hd_strategy_resize_limit), + # then do inpainting on the resized image. Finally, resize the inpainting result to the original size. + # The area outside the mask will not lose quality. + RESIZE = "Resize" + # Crop masking area(with a margin controlled by hd_strategy_crop_margin) from the original image to do inpainting + CROP = "Crop" + + +class LDMSampler(str, Enum): + ddim = "ddim" + plms = "plms" + + +class SDSampler(str, Enum): + dpm_plus_plus_2m = "DPM++ 2M" + dpm_plus_plus_2m_karras = "DPM++ 2M Karras" + dpm_plus_plus_2m_sde = "DPM++ 2M SDE" + dpm_plus_plus_2m_sde_karras = "DPM++ 2M SDE Karras" + dpm_plus_plus_sde = "DPM++ SDE" + dpm_plus_plus_sde_karras = "DPM++ SDE Karras" + dpm2 = "DPM2" + dpm2_karras = "DPM2 Karras" + dpm2_a = "DPM2 a" + dpm2_a_karras = "DPM2 a Karras" + euler = "Euler" + euler_a = "Euler a" + heun = "Heun" + lms = "LMS" + lms_karras = "LMS Karras" + + ddim = "DDIM" + pndm = "PNDM" + uni_pc = "UniPC" + lcm = "LCM" + + +class PowerPaintTask(Choices): + text_guided = "text-guided" + context_aware = "context-aware" + shape_guided = "shape-guided" + object_remove = "object-remove" + outpainting = "outpainting" + + +class ApiConfig(BaseModel): + host: str + port: int + inbrowser: bool + model: str + no_half: bool + low_mem: bool + cpu_offload: bool + disable_nsfw_checker: bool + local_files_only: bool + cpu_textencoder: bool + device: Device + input: Optional[Path] + mask_dir: Optional[Path] + output_dir: Optional[Path] + quality: int + enable_interactive_seg: bool + interactive_seg_model: InteractiveSegModel + interactive_seg_device: Device + enable_remove_bg: bool + remove_bg_model: str + enable_anime_seg: bool + enable_realesrgan: bool + realesrgan_device: Device + realesrgan_model: RealESRGANModel + enable_gfpgan: bool + gfpgan_device: Device + enable_restoreformer: bool + restoreformer_device: Device + + +class InpaintRequest(BaseModel): + image: Optional[str] = Field(None, description="base64 encoded image") + mask: Optional[str] = Field(None, description="base64 encoded mask") + + ldm_steps: int = Field(20, description="Steps for ldm model.") + ldm_sampler: str = Field(LDMSampler.plms, discription="Sampler for ldm model.") + zits_wireframe: bool = Field(True, description="Enable wireframe for zits model.") + + hd_strategy: str = Field( + HDStrategy.CROP, + description="Different way to preprocess image, only used by erase models(e.g. lama/mat)", + ) + hd_strategy_crop_trigger_size: int = Field( + 800, + description="Crop trigger size for hd_strategy=CROP, if the longer side of the image is larger than this value, use crop strategy", + ) + hd_strategy_crop_margin: int = Field( + 128, description="Crop margin for hd_strategy=CROP" + ) + hd_strategy_resize_limit: int = Field( + 1280, description="Resize limit for hd_strategy=RESIZE" + ) + + prompt: str = Field("", description="Prompt for diffusion models.") + negative_prompt: str = Field( + "", description="Negative prompt for diffusion models." 
+    )
+    use_croper: bool = Field(
+        False, description="Crop image before doing diffusion inpainting"
+    )
+    croper_x: int = Field(0, description="Crop x for croper")
+    croper_y: int = Field(0, description="Crop y for croper")
+    croper_height: int = Field(512, description="Crop height for croper")
+    croper_width: int = Field(512, description="Crop width for croper")
+
+    use_extender: bool = Field(
+        False, description="Extend image before doing sd outpainting"
+    )
+    extender_x: int = Field(0, description="Extend x for extender")
+    extender_y: int = Field(0, description="Extend y for extender")
+    extender_height: int = Field(640, description="Extend height for extender")
+    extender_width: int = Field(640, description="Extend width for extender")
+
+    sd_scale: float = Field(
+        1.0,
+        description="Resize the image before doing sd inpainting; the area outside the mask will not lose quality.",
+        gt=0.0,
+        le=1.0,
+    )
+    sd_mask_blur: int = Field(
+        11,
+        description="Blur the edge of the mask area. The higher the number, the smoother the blend with the original image.",
+    )
+    sd_strength: float = Field(
+        1.0,
+        description="Strength is a measure of how much noise is added to the base image, which influences how similar the output is to the base image. A higher value means more noise and an output that differs more from the base image.",
+        le=1.0,
+    )
+    sd_steps: int = Field(
+        50,
+        description="The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference.",
+    )
+    sd_guidance_scale: float = Field(
+        7.5,
+        description="A higher guidance scale encourages the model to generate images closely linked to the text prompt, usually at the expense of lower image quality.",
+    )
+    sd_sampler: str = Field(
+        SDSampler.uni_pc, description="Sampler for diffusion model."
+    )
+    sd_seed: int = Field(
+        42,
+        description="Seed for diffusion model. -1 means a random seed.",
+        validate_default=True,
+    )
+    sd_match_histograms: bool = Field(
+        False,
+        description="Match histograms between inpainting area and original image.",
+    )
+
+    sd_outpainting_softness: float = Field(20.0)
+    sd_outpainting_space: float = Field(20.0)
+
+    sd_lcm_lora: bool = Field(
+        False,
+        description="Enable lcm-lora mode.
https://huggingface.co/docs/diffusers/main/en/using-diffusers/inference_with_lcm#texttoimage", + ) + + sd_keep_unmasked_area: bool = Field( + True, description="Keep unmasked area unchanged" + ) + + cv2_flag: CV2Flag = Field( + CV2Flag.INPAINT_NS, + description="Flag for opencv inpainting: https://docs.opencv.org/4.6.0/d7/d8b/group__photo__inpaint.html#gga8002a65f5a3328fbf15df81b842d3c3ca05e763003a805e6c11c673a9f4ba7d07", + ) + cv2_radius: int = Field( + 4, + description="Radius of a circular neighborhood of each point inpainted that is considered by the algorithm", + ) + + # Paint by Example + paint_by_example_example_image: Optional[str] = Field( + None, description="Base64 encoded example image for paint by example model" + ) + + # InstructPix2Pix + p2p_image_guidance_scale: float = Field(1.5, description="Image guidance scale") + + # ControlNet + enable_controlnet: bool = Field(False, description="Enable controlnet") + controlnet_conditioning_scale: float = Field( + 0.4, description="Conditioning scale", ge=0.0, le=1.0 + ) + controlnet_method: str = Field( + "lllyasviel/control_v11p_sd15_canny", description="Controlnet method" + ) + + # BrushNet + enable_brushnet: bool = Field(False, description="Enable brushnet") + brushnet_method: str = Field(SD_BRUSHNET_CHOICES[0], description="Brushnet method") + brushnet_conditioning_scale: float = Field( + 1.0, description="brushnet conditioning scale", ge=0.0, le=1.0 + ) + + # PowerPaint + enable_powerpaint_v2: bool = Field(False, description="Enable PowerPaint v2") + powerpaint_task: PowerPaintTask = Field( + PowerPaintTask.text_guided, description="PowerPaint task" + ) + fitting_degree: float = Field( + 1.0, + description="Control the fitting degree of the generated objects to the mask shape.", + gt=0.0, + le=1.0, + ) + + @model_validator(mode="after") + def validate_field(cls, values: "InpaintRequest"): + if values.sd_seed == -1: + values.sd_seed = random.randint(1, 99999999) + logger.info(f"Generate random seed: {values.sd_seed}") + + if values.use_extender and values.enable_controlnet: + logger.info("Extender is enabled, set controlnet_conditioning_scale=0") + values.controlnet_conditioning_scale = 0 + + if values.use_extender: + logger.info("Extender is enabled, set sd_strength=1") + values.sd_strength = 1.0 + + if values.enable_brushnet: + logger.info("BrushNet is enabled, set enable_controlnet=False") + if values.enable_controlnet: + values.enable_controlnet = False + if values.sd_lcm_lora: + logger.info("BrushNet is enabled, set sd_lcm_lora=False") + values.sd_lcm_lora = False + + if values.enable_controlnet: + logger.info("ControlNet is enabled, set enable_brushnet=False") + if values.enable_brushnet: + values.enable_brushnet = False + + return values + + +class RunPluginRequest(BaseModel): + name: str + image: str = Field(..., description="base64 encoded image") + clicks: List[List[int]] = Field( + [], description="Clicks for interactive seg, [[x,y,0/1], [x2,y2,0/1]]" + ) + scale: float = Field(2.0, description="Scale for upscaling") + + +MediaTab = Literal["input", "output", "mask"] + + +class MediasResponse(BaseModel): + name: str + height: int + width: int + ctime: float + mtime: float + + +class GenInfoResponse(BaseModel): + prompt: str = "" + negative_prompt: str = "" + + +class ServerConfigResponse(BaseModel): + plugins: List[PluginInfo] + modelInfos: List[ModelInfo] + removeBGModel: RemoveBGModel + removeBGModels: List[RemoveBGModel] + realesrganModel: RealESRGANModel + realesrganModels: List[RealESRGANModel] + 
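+    # Currently selected interactive segmentation model and the list of selectable models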
interactiveSegModel: InteractiveSegModel + interactiveSegModels: List[InteractiveSegModel] + enableFileManager: bool + enableAutoSaving: bool + enableControlnet: bool + controlnetMethod: Optional[str] + disableModelSwitch: bool + isDesktop: bool + samplers: List[str] + + +class SwitchModelRequest(BaseModel): + name: str + + +class SwitchPluginModelRequest(BaseModel): + plugin_name: str + model_name: str + + +AdjustMaskOperate = Literal["expand", "shrink", "reverse"] + + +class AdjustMaskRequest(BaseModel): + mask: str = Field( + ..., description="base64 encoded mask. 255 means area to do inpaint" + ) + operate: AdjustMaskOperate = Field(..., description="expand/shrink/reverse") + kernel_size: int = Field(5, description="Kernel size for expanding mask") diff --git a/inpaint/tests/.gitignore b/inpaint/tests/.gitignore new file mode 100644 index 0000000..89b7717 --- /dev/null +++ b/inpaint/tests/.gitignore @@ -0,0 +1,2 @@ +*_result.png +result/ \ No newline at end of file diff --git a/inpaint/tests/__init__.py b/inpaint/tests/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/inpaint/tests/anime_test.png b/inpaint/tests/anime_test.png new file mode 100644 index 0000000..6b86838 Binary files /dev/null and b/inpaint/tests/anime_test.png differ diff --git a/inpaint/tests/anytext_mask.jpg b/inpaint/tests/anytext_mask.jpg new file mode 100644 index 0000000..43d8b12 Binary files /dev/null and b/inpaint/tests/anytext_mask.jpg differ diff --git a/inpaint/tests/anytext_ref.jpg b/inpaint/tests/anytext_ref.jpg new file mode 100644 index 0000000..c36b3c5 Binary files /dev/null and b/inpaint/tests/anytext_ref.jpg differ diff --git a/inpaint/tests/bunny.jpeg b/inpaint/tests/bunny.jpeg new file mode 100644 index 0000000..3727a45 Binary files /dev/null and b/inpaint/tests/bunny.jpeg differ diff --git a/inpaint/tests/cat.png b/inpaint/tests/cat.png new file mode 100644 index 0000000..dee9eb6 Binary files /dev/null and b/inpaint/tests/cat.png differ diff --git a/inpaint/tests/icc_profile_test.jpg b/inpaint/tests/icc_profile_test.jpg new file mode 100644 index 0000000..b603ef9 Binary files /dev/null and b/inpaint/tests/icc_profile_test.jpg differ diff --git a/inpaint/tests/icc_profile_test.png b/inpaint/tests/icc_profile_test.png new file mode 100644 index 0000000..90d18ac Binary files /dev/null and b/inpaint/tests/icc_profile_test.png differ diff --git a/inpaint/tests/image.png b/inpaint/tests/image.png new file mode 100644 index 0000000..74c7a7b Binary files /dev/null and b/inpaint/tests/image.png differ diff --git a/inpaint/tests/mask.png b/inpaint/tests/mask.png new file mode 100644 index 0000000..29cf20b Binary files /dev/null and b/inpaint/tests/mask.png differ diff --git a/inpaint/tests/overture-creations-5sI6fQgYIuo.png b/inpaint/tests/overture-creations-5sI6fQgYIuo.png new file mode 100644 index 0000000..e84dfc8 Binary files /dev/null and b/inpaint/tests/overture-creations-5sI6fQgYIuo.png differ diff --git a/inpaint/tests/overture-creations-5sI6fQgYIuo_mask.png b/inpaint/tests/overture-creations-5sI6fQgYIuo_mask.png new file mode 100644 index 0000000..7f3c753 Binary files /dev/null and b/inpaint/tests/overture-creations-5sI6fQgYIuo_mask.png differ diff --git a/inpaint/tests/overture-creations-5sI6fQgYIuo_mask_blur.png b/inpaint/tests/overture-creations-5sI6fQgYIuo_mask_blur.png new file mode 100644 index 0000000..a630379 Binary files /dev/null and b/inpaint/tests/overture-creations-5sI6fQgYIuo_mask_blur.png differ diff --git a/inpaint/tests/png_parameter_test.png 
b/inpaint/tests/png_parameter_test.png new file mode 100644 index 0000000..dc18bce Binary files /dev/null and b/inpaint/tests/png_parameter_test.png differ diff --git a/inpaint/tests/test_adjust_mask.py b/inpaint/tests/test_adjust_mask.py new file mode 100644 index 0000000..1f01713 --- /dev/null +++ b/inpaint/tests/test_adjust_mask.py @@ -0,0 +1,17 @@ +import cv2 +from iopaint.helper import adjust_mask +from iopaint.tests.utils import current_dir, save_dir + +mask_p = current_dir / "overture-creations-5sI6fQgYIuo_mask.png" + + +def test_adjust_mask(): + mask = cv2.imread(str(mask_p), cv2.IMREAD_GRAYSCALE) + res_mask = adjust_mask(mask, 0, "expand") + cv2.imwrite(str(save_dir / "adjust_mask_original.png"), res_mask) + res_mask = adjust_mask(mask, 40, "expand") + cv2.imwrite(str(save_dir / "adjust_mask_expand.png"), res_mask) + res_mask = adjust_mask(mask, 20, "shrink") + cv2.imwrite(str(save_dir / "adjust_mask_shrink.png"), res_mask) + res_mask = adjust_mask(mask, 20, "reverse") + cv2.imwrite(str(save_dir / "adjust_mask_reverse.png"), res_mask) diff --git a/inpaint/tests/test_anytext.py b/inpaint/tests/test_anytext.py new file mode 100644 index 0000000..996176f --- /dev/null +++ b/inpaint/tests/test_anytext.py @@ -0,0 +1,45 @@ +import os + +from iopaint.tests.utils import check_device, get_config, assert_equal + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" +from pathlib import Path + +import pytest +import torch + +from iopaint.model_manager import ModelManager +from iopaint.schema import HDStrategy + +current_dir = Path(__file__).parent.absolute().resolve() +save_dir = current_dir / "result" +save_dir.mkdir(exist_ok=True, parents=True) + + +@pytest.mark.parametrize("device", ["cuda", "mps"]) +def test_anytext(device): + sd_steps = check_device(device) + model = ModelManager( + name="Sanster/AnyText", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + ) + + cfg = get_config( + strategy=HDStrategy.ORIGINAL, + prompt='Characters written in chalk on the blackboard that says "DADDY", best quality, extremely detailed,4k, HD, supper legible text, clear text edges, clear strokes, neat writing, no watermarks', + negative_prompt="low-res, bad anatomy, extra digit, fewer digits, cropped, worst quality, low quality, watermark, unreadable text, messy words, distorted text, disorganized writing, advertising picture", + sd_steps=sd_steps, + sd_guidance_scale=9.0, + sd_seed=66273235, + sd_match_histograms=True + ) + + assert_equal( + model, + cfg, + f"anytext.png", + img_p=current_dir / "anytext_ref.jpg", + mask_p=current_dir / "anytext_mask.jpg", + ) diff --git a/inpaint/tests/test_brushnet.py b/inpaint/tests/test_brushnet.py new file mode 100644 index 0000000..73394c0 --- /dev/null +++ b/inpaint/tests/test_brushnet.py @@ -0,0 +1,110 @@ +import os + +from iopaint.const import SD_BRUSHNET_CHOICES +from iopaint.tests.utils import check_device, get_config, assert_equal + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" +from pathlib import Path + +import pytest +import torch + +from iopaint.model_manager import ModelManager +from iopaint.schema import HDStrategy, SDSampler, PowerPaintTask + +current_dir = Path(__file__).parent.absolute().resolve() +save_dir = current_dir / "result" +save_dir.mkdir(exist_ok=True, parents=True) + + +@pytest.mark.parametrize("device", ["cuda", "mps", "cpu"]) +@pytest.mark.parametrize("sampler", [SDSampler.dpm_plus_plus_2m_karras]) +def test_runway_brushnet(device, sampler): + sd_steps = check_device(device) + model = ModelManager( + 
name="runwayml/stable-diffusion-v1-5", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + ) + cfg = get_config( + strategy=HDStrategy.ORIGINAL, + prompt="face of a fox, sitting on a bench", + sd_steps=sd_steps, + sd_guidance_scale=7.5, + enable_brushnet=True, + brushnet_method=SD_BRUSHNET_CHOICES[0], + ) + cfg.sd_sampler = sampler + + assert_equal( + model, + cfg, + f"brushnet_random_mask_{device}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) + + +@pytest.mark.parametrize("device", ["cuda", "mps"]) +@pytest.mark.parametrize("sampler", [SDSampler.dpm_plus_plus_2m]) +def test_runway_powerpaint_v2(device, sampler): + sd_steps = check_device(device) + model = ModelManager( + name="runwayml/stable-diffusion-v1-5", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + ) + + tasks = { + PowerPaintTask.text_guided: { + "prompt": "face of a fox, sitting on a bench", + "scale": 7.5, + }, + PowerPaintTask.context_aware: { + "prompt": "face of a fox, sitting on a bench", + "scale": 7.5, + }, + PowerPaintTask.shape_guided: { + "prompt": "face of a fox, sitting on a bench", + "scale": 7.5, + }, + PowerPaintTask.object_remove: { + "prompt": "", + "scale": 12, + }, + PowerPaintTask.outpainting: { + "prompt": "", + "scale": 7.5, + }, + } + + for task, data in tasks.items(): + cfg = get_config( + strategy=HDStrategy.ORIGINAL, + prompt=data["prompt"], + negative_prompt="out of frame, lowres, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, disfigured, gross proportions, malformed limbs, watermark, signature", + sd_steps=sd_steps, + sd_guidance_scale=data["scale"], + enable_powerpaint_v2=True, + powerpaint_task=task, + sd_sampler=sampler, + sd_mask_blur=11, + sd_seed=42, + # sd_keep_unmasked_area=False + ) + if task == PowerPaintTask.outpainting: + cfg.use_extender = True + cfg.extender_x = -128 + cfg.extender_y = -128 + cfg.extender_width = 768 + cfg.extender_height = 768 + + assert_equal( + model, + cfg, + f"powerpaint_v2_{device}_{task}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) diff --git a/inpaint/tests/test_controlnet.py b/inpaint/tests/test_controlnet.py new file mode 100644 index 0000000..c271345 --- /dev/null +++ b/inpaint/tests/test_controlnet.py @@ -0,0 +1,118 @@ +import os + +from iopaint.const import SD_CONTROLNET_CHOICES +from iopaint.tests.utils import current_dir, check_device, get_config, assert_equal + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" +from pathlib import Path + +import pytest +import torch + +from iopaint.model_manager import ModelManager +from iopaint.schema import HDStrategy, SDSampler + + +model_name = "runwayml/stable-diffusion-inpainting" + + +def convert_controlnet_method_name(name): + return name.replace("/", "--") + + +@pytest.mark.parametrize("device", ["cuda", "mps", "cpu"]) +@pytest.mark.parametrize("controlnet_method", [SD_CONTROLNET_CHOICES[0]]) +def test_runway_sd_1_5(device, controlnet_method): + sd_steps = check_device(device) + + model = ModelManager( + name=model_name, + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=device == "cuda", + enable_controlnet=True, + controlnet_method=controlnet_method, + ) + + cfg = get_config( + prompt="a fox sitting on a 
bench", + sd_steps=sd_steps, + enable_controlnet=True, + controlnet_conditioning_scale=0.5, + controlnet_method=controlnet_method, + ) + name = f"device_{device}" + + assert_equal( + model, + cfg, + f"sd_controlnet_{convert_controlnet_method_name(controlnet_method)}_{name}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) + + +@pytest.mark.parametrize("device", ["cuda", "mps", "cpu"]) +def test_controlnet_switch(device): + sd_steps = check_device(device) + model = ModelManager( + name=model_name, + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + cpu_offload=True, + enable_controlnet=True, + controlnet_method="lllyasviel/control_v11p_sd15_canny", + ) + cfg = get_config( + prompt="a fox sitting on a bench", + sd_steps=sd_steps, + enable_controlnet=True, + controlnet_method="lllyasviel/control_v11f1p_sd15_depth", + ) + + assert_equal( + model, + cfg, + f"controlnet_switch_canny_to_depth_device_{device}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + fx=1.2 + ) + + +@pytest.mark.parametrize("device", ["cuda", "mps", "cpu"]) +@pytest.mark.parametrize( + "local_file", ["sd-v1-5-inpainting.ckpt", "v1-5-pruned-emaonly.safetensors"] +) +def test_local_file_path(device, local_file): + sd_steps = check_device(device) + + controlnet_kwargs = dict( + enable_controlnet=True, + controlnet_method=SD_CONTROLNET_CHOICES[0], + ) + + model = ModelManager( + name=local_file, + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + cpu_offload=True, + **controlnet_kwargs, + ) + cfg = get_config( + prompt="a fox sitting on a bench", + sd_steps=sd_steps, + **controlnet_kwargs, + ) + + name = f"device_{device}" + + assert_equal( + model, + cfg, + f"{convert_controlnet_method_name(controlnet_kwargs['controlnet_method'])}_local_model_{name}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) diff --git a/inpaint/tests/test_instruct_pix2pix.py b/inpaint/tests/test_instruct_pix2pix.py new file mode 100644 index 0000000..f1ab4e2 --- /dev/null +++ b/inpaint/tests/test_instruct_pix2pix.py @@ -0,0 +1,40 @@ +from pathlib import Path + +import pytest +import torch + +from iopaint.model_manager import ModelManager +from iopaint.schema import HDStrategy +from iopaint.tests.utils import get_config, check_device, assert_equal, current_dir + +model_name = "timbrooks/instruct-pix2pix" + + +@pytest.mark.parametrize("device", ["cuda", "mps", "cpu"]) +@pytest.mark.parametrize("disable_nsfw", [True, False]) +@pytest.mark.parametrize("cpu_offload", [False, True]) +def test_instruct_pix2pix(device, disable_nsfw, cpu_offload): + sd_steps = check_device(device) + model = ModelManager( + name=model_name, + device=torch.device(device), + disable_nsfw=disable_nsfw, + sd_cpu_textencoder=False, + cpu_offload=cpu_offload, + ) + cfg = get_config( + strategy=HDStrategy.ORIGINAL, + prompt="What if it were snowing?", + sd_steps=sd_steps + ) + + name = f"device_{device}_disnsfw_{disable_nsfw}_cpu_offload_{cpu_offload}" + + assert_equal( + model, + cfg, + f"instruct_pix2pix_{name}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + fx=1.3, + ) diff --git a/inpaint/tests/test_load_img.py b/inpaint/tests/test_load_img.py new file mode 100644 index 0000000..f7071bf 
--- /dev/null +++ b/inpaint/tests/test_load_img.py @@ -0,0 +1,19 @@ +from iopaint.helper import load_img +from iopaint.tests.utils import current_dir + +png_img_p = current_dir / "image.png" +jpg_img_p = current_dir / "bunny.jpeg" + + +def test_load_png_image(): + with open(png_img_p, "rb") as f: + np_img, alpha_channel = load_img(f.read()) + assert np_img.shape == (256, 256, 3) + assert alpha_channel.shape == (256, 256) + + +def test_load_jpg_image(): + with open(jpg_img_p, "rb") as f: + np_img, alpha_channel = load_img(f.read()) + assert np_img.shape == (394, 448, 3) + assert alpha_channel is None diff --git a/inpaint/tests/test_low_mem.py b/inpaint/tests/test_low_mem.py new file mode 100644 index 0000000..e3c1b91 --- /dev/null +++ b/inpaint/tests/test_low_mem.py @@ -0,0 +1,102 @@ +import os + +from loguru import logger + +from iopaint.tests.utils import check_device, get_config, assert_equal, current_dir + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" + +import pytest +import torch + +from iopaint.model_manager import ModelManager +from iopaint.schema import HDStrategy, SDSampler + + +@pytest.mark.parametrize("device", ["cuda", "mps"]) +def test_runway_sd_1_5_low_mem(device): + sd_steps = check_device(device) + model = ModelManager( + name="runwayml/stable-diffusion-inpainting", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + low_mem=True, + ) + + all_samplers = [member.value for member in SDSampler.__members__.values()] + print(all_samplers) + cfg = get_config( + strategy=HDStrategy.ORIGINAL, + prompt="a fox sitting on a bench", + sd_steps=sd_steps, + sd_sampler=SDSampler.ddim, + ) + + name = f"device_{device}" + + assert_equal( + model, + cfg, + f"runway_sd_{name}_low_mem.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) + + +@pytest.mark.parametrize("device", ["cuda", "mps", "cpu"]) +@pytest.mark.parametrize("sampler", [SDSampler.lcm]) +def test_runway_sd_lcm_lora_low_mem(device, sampler): + check_device(device) + + sd_steps = 5 + model = ModelManager( + name="runwayml/stable-diffusion-inpainting", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + low_mem=True, + ) + cfg = get_config( + strategy=HDStrategy.ORIGINAL, + prompt="face of a fox, sitting on a bench", + sd_steps=sd_steps, + sd_guidance_scale=2, + sd_lcm_lora=True, + ) + cfg.sd_sampler = sampler + + assert_equal( + model, + cfg, + f"runway_sd_1_5_lcm_lora_device_{device}_low_mem.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) + + + +@pytest.mark.parametrize("device", ["cuda", "mps", "cpu"]) +@pytest.mark.parametrize("strategy", [HDStrategy.ORIGINAL]) +@pytest.mark.parametrize("sampler", [SDSampler.ddim]) +def test_runway_norm_sd_model(device, strategy, sampler): + sd_steps = check_device(device) + model = ModelManager( + name="runwayml/stable-diffusion-v1-5", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + low_mem=True, + ) + cfg = get_config( + strategy=strategy, prompt="face of a fox, sitting on a bench", sd_steps=sd_steps + ) + cfg.sd_sampler = sampler + + assert_equal( + model, + cfg, + f"runway_{device}_norm_sd_model_device_{device}_low_mem.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) diff --git a/inpaint/tests/test_match_histograms.py 
b/inpaint/tests/test_match_histograms.py new file mode 100644 index 0000000..c20a283 --- /dev/null +++ b/inpaint/tests/test_match_histograms.py @@ -0,0 +1,36 @@ +import pytest +import torch + +from iopaint.model_manager import ModelManager +from iopaint.schema import SDSampler, HDStrategy +from iopaint.tests.utils import check_device, get_config, assert_equal, current_dir + + +@pytest.mark.parametrize("device", ["cuda", "mps"]) +@pytest.mark.parametrize("sampler", [SDSampler.ddim]) +def test_sd_match_histograms(device, sampler): + sd_steps = check_device(device) + + model = ModelManager( + name="runwayml/stable-diffusion-inpainting", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + ) + cfg = get_config( + strategy=HDStrategy.ORIGINAL, + prompt="face of a fox, sitting on a bench", + sd_steps=sd_steps, + sd_guidance_scale=7.5, + sd_lcm_lora=False, + sd_match_histograms=True, + sd_sampler=sampler + ) + + assert_equal( + model, + cfg, + f"runway_sd_1_5_device_{device}_match_histograms.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) diff --git a/inpaint/tests/test_model.py b/inpaint/tests/test_model.py new file mode 100644 index 0000000..dd84b12 --- /dev/null +++ b/inpaint/tests/test_model.py @@ -0,0 +1,160 @@ +import pytest +import torch + +from iopaint.model_manager import ModelManager +from iopaint.schema import HDStrategy, LDMSampler +from iopaint.tests.utils import assert_equal, get_config, current_dir, check_device + + +@pytest.mark.parametrize("device", ["cuda", "mps", "cpu"]) +@pytest.mark.parametrize( + "strategy", [HDStrategy.ORIGINAL, HDStrategy.RESIZE, HDStrategy.CROP] +) +def test_lama(device, strategy): + check_device(device) + model = ModelManager(name="lama", device=device) + assert_equal( + model, + get_config(strategy=strategy), + f"lama_{strategy[0].upper() + strategy[1:]}_result.png", + ) + + fx = 1.3 + assert_equal( + model, + get_config(strategy=strategy), + f"lama_{strategy[0].upper() + strategy[1:]}_fx_{fx}_result.png", + fx=1.3, + ) + + +@pytest.mark.parametrize("device", ["cuda", "cpu"]) +@pytest.mark.parametrize( + "strategy", [HDStrategy.ORIGINAL, HDStrategy.RESIZE, HDStrategy.CROP] +) +@pytest.mark.parametrize("ldm_sampler", [LDMSampler.ddim, LDMSampler.plms]) +def test_ldm(device, strategy, ldm_sampler): + check_device(device) + model = ModelManager(name="ldm", device=device) + cfg = get_config(strategy=strategy, ldm_sampler=ldm_sampler) + assert_equal( + model, cfg, f"ldm_{strategy[0].upper() + strategy[1:]}_{ldm_sampler}_result.png" + ) + + fx = 1.3 + assert_equal( + model, + cfg, + f"ldm_{strategy[0].upper() + strategy[1:]}_{ldm_sampler}_fx_{fx}_result.png", + fx=fx, + ) + + +@pytest.mark.parametrize("device", ["cuda", "cpu"]) +@pytest.mark.parametrize( + "strategy", [HDStrategy.ORIGINAL, HDStrategy.RESIZE, HDStrategy.CROP] +) +@pytest.mark.parametrize("zits_wireframe", [False, True]) +def test_zits(device, strategy, zits_wireframe): + check_device(device) + model = ModelManager(name="zits", device=device) + cfg = get_config(strategy=strategy, zits_wireframe=zits_wireframe) + assert_equal( + model, + cfg, + f"zits_{strategy[0].upper() + strategy[1:]}_wireframe_{zits_wireframe}_result.png", + ) + + fx = 1.3 + assert_equal( + model, + cfg, + f"zits_{strategy.capitalize()}_wireframe_{zits_wireframe}_fx_{fx}_result.png", + fx=fx, + ) + + +@pytest.mark.parametrize("device", ["cuda", "cpu"]) +@pytest.mark.parametrize("strategy", 
[HDStrategy.ORIGINAL]) +@pytest.mark.parametrize("no_half", [True, False]) +def test_mat(device, strategy, no_half): + check_device(device) + model = ModelManager(name="mat", device=device, no_half=no_half) + cfg = get_config(strategy=strategy) + + assert_equal( + model, + cfg, + f"mat_{strategy.capitalize()}_result.png", + ) + + +@pytest.mark.parametrize("device", ["cuda", "cpu"]) +@pytest.mark.parametrize("strategy", [HDStrategy.ORIGINAL]) +def test_fcf(device, strategy): + check_device(device) + model = ModelManager(name="fcf", device=device) + cfg = get_config(strategy=strategy) + + assert_equal(model, cfg, f"fcf_{strategy.capitalize()}_result.png", fx=2, fy=2) + assert_equal(model, cfg, f"fcf_{strategy.capitalize()}_result.png", fx=3.8, fy=2) + + +@pytest.mark.parametrize( + "strategy", [HDStrategy.ORIGINAL, HDStrategy.RESIZE, HDStrategy.CROP] +) +@pytest.mark.parametrize("cv2_flag", ["INPAINT_NS", "INPAINT_TELEA"]) +@pytest.mark.parametrize("cv2_radius", [3, 15]) +def test_cv2(strategy, cv2_flag, cv2_radius): + model = ModelManager( + name="cv2", + device=torch.device("cpu"), + ) + cfg = get_config(strategy=strategy, cv2_flag=cv2_flag, cv2_radius=cv2_radius) + assert_equal( + model, + cfg, + f"cv2_{strategy.capitalize()}_{cv2_flag}_{cv2_radius}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) + + +@pytest.mark.parametrize("device", ["cuda", "cpu"]) +@pytest.mark.parametrize( + "strategy", [HDStrategy.ORIGINAL, HDStrategy.RESIZE, HDStrategy.CROP] +) +def test_manga(device, strategy): + check_device(device) + model = ModelManager( + name="manga", + device=torch.device(device), + ) + cfg = get_config(strategy=strategy) + assert_equal( + model, + cfg, + f"manga_{strategy.capitalize()}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) + + +@pytest.mark.parametrize("device", ["cuda", "mps", "cpu"]) +@pytest.mark.parametrize("strategy", [HDStrategy.ORIGINAL]) +def test_mi_gan(device, strategy): + check_device(device) + model = ModelManager( + name="migan", + device=torch.device(device), + ) + cfg = get_config(strategy=strategy) + assert_equal( + model, + cfg, + f"migan_device_{device}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + fx=1.5, + fy=1.7 + ) diff --git a/inpaint/tests/test_model_md5.py b/inpaint/tests/test_model_md5.py new file mode 100644 index 0000000..3a81d72 --- /dev/null +++ b/inpaint/tests/test_model_md5.py @@ -0,0 +1,16 @@ +def test_load_model(): + from iopaint.plugins import InteractiveSeg + from iopaint.model_manager import ModelManager + + interactive_seg_model = InteractiveSeg("vit_l", "cpu") + + models = ["lama", "ldm", "zits", "mat", "fcf", "manga", "migan"] + for m in models: + ModelManager( + name=m, + device="cpu", + no_half=False, + disable_nsfw=False, + sd_cpu_textencoder=True, + cpu_offload=True, + ) diff --git a/inpaint/tests/test_model_switch.py b/inpaint/tests/test_model_switch.py new file mode 100644 index 0000000..735e1bd --- /dev/null +++ b/inpaint/tests/test_model_switch.py @@ -0,0 +1,70 @@ +import os + +from iopaint.schema import InpaintRequest + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" + +import torch + +from iopaint.model_manager import ModelManager + + +def test_model_switch(): + model = ModelManager( + name="runwayml/stable-diffusion-inpainting", + enable_controlnet=True, + 
controlnet_method="lllyasviel/control_v11p_sd15_canny", + device=torch.device("mps"), + disable_nsfw=True, + sd_cpu_textencoder=True, + cpu_offload=False, + ) + + model.switch("lama") + + +def test_controlnet_switch_onoff(caplog): + name = "runwayml/stable-diffusion-inpainting" + model = ModelManager( + name=name, + enable_controlnet=True, + controlnet_method="lllyasviel/control_v11p_sd15_canny", + device=torch.device("mps"), + disable_nsfw=True, + sd_cpu_textencoder=True, + cpu_offload=False, + ) + + model.switch_controlnet_method( + InpaintRequest( + name=name, + enable_controlnet=False, + ) + ) + + assert "Disable controlnet" in caplog.text + + +def test_switch_controlnet_method(caplog): + name = "runwayml/stable-diffusion-inpainting" + old_method = "lllyasviel/control_v11p_sd15_canny" + new_method = "lllyasviel/control_v11p_sd15_openpose" + model = ModelManager( + name=name, + enable_controlnet=True, + controlnet_method=old_method, + device=torch.device("mps"), + disable_nsfw=True, + sd_cpu_textencoder=True, + cpu_offload=False, + ) + + model.switch_controlnet_method( + InpaintRequest( + name=name, + enable_controlnet=True, + controlnet_method=new_method, + ) + ) + + assert f"Switch Controlnet method from {old_method} to {new_method}" in caplog.text diff --git a/inpaint/tests/test_outpainting.py b/inpaint/tests/test_outpainting.py new file mode 100644 index 0000000..ce48751 --- /dev/null +++ b/inpaint/tests/test_outpainting.py @@ -0,0 +1,137 @@ +import os + +from iopaint.tests.utils import current_dir, check_device + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" + +import pytest +import torch + +from iopaint.model_manager import ModelManager +from iopaint.schema import SDSampler +from iopaint.tests.test_model import get_config, assert_equal + + +@pytest.mark.parametrize("name", ["runwayml/stable-diffusion-inpainting"]) +@pytest.mark.parametrize("device", ["cuda", "mps"]) +@pytest.mark.parametrize( + "rect", + [ + [0, -100, 512, 512 - 128 + 100], + [0, 128, 512, 512 - 128 + 100], + [128, 0, 512 - 128 + 100, 512], + [-100, 0, 512 - 128 + 100, 512], + [0, 0, 512, 512 + 200], + [256, 0, 512 + 200, 512], + [-100, -100, 512 + 200, 512 + 200], + ], +) +def test_outpainting(name, device, rect): + sd_steps = check_device(device) + + model = ModelManager( + name=name, + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + ) + cfg = get_config( + prompt="a dog sitting on a bench in the park", + sd_steps=sd_steps, + use_extender=True, + extender_x=rect[0], + extender_y=rect[1], + extender_width=rect[2], + extender_height=rect[3], + sd_guidance_scale=8.0, + sd_sampler=SDSampler.dpm_plus_plus_2m, + ) + + assert_equal( + model, + cfg, + f"{name.replace('/', '--')}_outpainting_{'_'.join(map(str, rect))}_device_{device}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) + + +@pytest.mark.parametrize("name", ["kandinsky-community/kandinsky-2-2-decoder-inpaint"]) +@pytest.mark.parametrize("device", ["cuda", "mps"]) +@pytest.mark.parametrize( + "rect", + [ + [-128, -128, 768, 768], + ], +) +def test_kandinsky_outpainting(name, device, rect): + sd_steps = check_device(device) + + model = ModelManager( + name=name, + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + ) + cfg = get_config( + prompt="a cat", + negative_prompt="lowres, text, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, extra fingers, 
mutated hands, poorly drawn hands, poorly drawn face, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, cloned face, disfigured, gross proportions, malformed limbs, missing arms, missing legs, extra arms, extra legs, fused fingers, too many fingers, long neck, username, watermark, signature", + sd_steps=sd_steps, + use_extender=True, + extender_x=rect[0], + extender_y=rect[1], + extender_width=rect[2], + extender_height=rect[3], + sd_guidance_scale=7, + sd_sampler=SDSampler.dpm_plus_plus_2m, + ) + + assert_equal( + model, + cfg, + f"{name.replace('/', '--')}_outpainting_{'_'.join(map(str, rect))}_device_{device}.png", + img_p=current_dir / "cat.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + fx=1, + fy=1, + ) + + +@pytest.mark.parametrize("name", ["Sanster/PowerPaint-V1-stable-diffusion-inpainting"]) +@pytest.mark.parametrize("device", ["cuda", "mps"]) +@pytest.mark.parametrize( + "rect", + [ + [-100, -100, 512 + 200, 512 + 200], + ], +) +def test_powerpaint_outpainting(name, device, rect): + sd_steps = check_device(device) + + model = ModelManager( + name=name, + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + low_mem=True, + ) + cfg = get_config( + prompt="a dog sitting on a bench in the park", + sd_steps=sd_steps, + use_extender=True, + extender_x=rect[0], + extender_y=rect[1], + extender_width=rect[2], + extender_height=rect[3], + sd_guidance_scale=8.0, + sd_sampler=SDSampler.dpm_plus_plus_2m, + powerpaint_task="outpainting", + ) + + assert_equal( + model, + cfg, + f"{name.replace('/', '--')}_outpainting_{'_'.join(map(str, rect))}_device_{device}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) diff --git a/inpaint/tests/test_paint_by_example.py b/inpaint/tests/test_paint_by_example.py new file mode 100644 index 0000000..9447138 --- /dev/null +++ b/inpaint/tests/test_paint_by_example.py @@ -0,0 +1,58 @@ +import cv2 +import pytest +from PIL import Image +from iopaint.helper import encode_pil_to_base64 + +from iopaint.model_manager import ModelManager +from iopaint.schema import HDStrategy +from iopaint.tests.utils import ( + current_dir, + get_config, + get_data, + save_dir, + check_device, +) + +model_name = "Fantasy-Studio/Paint-by-Example" + + +def assert_equal( + model, + config, + save_name: str, + fx: float = 1, + fy: float = 1, + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + example_p=current_dir / "bunny.jpeg", +): + img, mask = get_data(fx=fx, fy=fy, img_p=img_p, mask_p=mask_p) + + example_image = cv2.imread(str(example_p)) + example_image = cv2.cvtColor(example_image, cv2.COLOR_BGRA2RGB) + example_image = cv2.resize( + example_image, None, fx=fx, fy=fy, interpolation=cv2.INTER_AREA + ) + + print(f"Input image shape: {img.shape}, example_image: {example_image.shape}") + config.paint_by_example_example_image = encode_pil_to_base64( + Image.fromarray(example_image), 100, {} + ).decode("utf-8") + res = model(img, mask, config) + cv2.imwrite(str(save_dir / save_name), res) + + +@pytest.mark.parametrize("device", ["cuda", "mps", "cpu"]) +def test_paint_by_example(device): + sd_steps = check_device(device) + model = ModelManager(name=model_name, device=device, disable_nsfw=True) + cfg = get_config(strategy=HDStrategy.ORIGINAL, sd_steps=sd_steps) + assert_equal( + model, + cfg, + f"paint_by_example_device_{device}.png", + 
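The extender rectangles driving the outpainting tests above are [x, y, width, height] in the source image's coordinate system and may reach outside (or start inside) the original 512x512 canvas. The following is a hedged sketch of one plausible way such a rect can be turned into a padded canvas plus a mask of the area to synthesize; it is illustrative only and not the library's actual implementation.

import numpy as np

def expand_for_outpainting(img: np.ndarray, rect):
    # rect = [x, y, w, h]: region to generate, in source-image coordinates
    x, y, w, h = rect
    src_h, src_w = img.shape[:2]
    canvas = np.zeros((h, w, 3), dtype=img.dtype)          # assumes an RGB image
    mask = np.full((h, w), 255, dtype=np.uint8)             # 255 = pixels to synthesize
    # intersection of the rect with the original image, still in source coordinates
    ix0, iy0 = max(x, 0), max(y, 0)
    ix1, iy1 = min(x + w, src_w), min(y + h, src_h)
    if ix0 < ix1 and iy0 < iy1:
        canvas[iy0 - y:iy1 - y, ix0 - x:ix1 - x] = img[iy0:iy1, ix0:ix1]
        mask[iy0 - y:iy1 - y, ix0 - x:ix1 - x] = 0           # keep known pixels
    return canvas, mask

# Example: rect = [-100, -100, 712, 712] places the 512x512 source at offset (100, 100)
# inside a 712x712 canvas and marks the 100-pixel border for generation.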
img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + fy=0.9, + fx=1.3, + ) diff --git a/inpaint/tests/test_plugins.py b/inpaint/tests/test_plugins.py new file mode 100644 index 0000000..dd1eafd --- /dev/null +++ b/inpaint/tests/test_plugins.py @@ -0,0 +1,125 @@ +import os +from PIL import Image + +from iopaint.helper import encode_pil_to_base64, gen_frontend_mask +from iopaint.plugins.anime_seg import AnimeSeg +from iopaint.schema import RunPluginRequest, RemoveBGModel, InteractiveSegModel +from iopaint.tests.utils import check_device, current_dir, save_dir + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" + +import cv2 +import pytest + +from iopaint.plugins import ( + RemoveBG, + RealESRGANUpscaler, + GFPGANPlugin, + RestoreFormerPlugin, + InteractiveSeg, +) + +img_p = current_dir / "bunny.jpeg" +img_bytes = open(img_p, "rb").read() +bgr_img = cv2.imread(str(img_p)) +rgb_img = cv2.cvtColor(bgr_img, cv2.COLOR_BGR2RGB) +rgb_img_base64 = encode_pil_to_base64(Image.fromarray(rgb_img), 100, {}) +bgr_img_base64 = encode_pil_to_base64(Image.fromarray(bgr_img), 100, {}) + +person_p = current_dir / "image.png" +person_bgr_img = cv2.imread(str(person_p)) +person_rgb_img = cv2.cvtColor(person_bgr_img, cv2.COLOR_BGR2RGB) +person_rgb_img = cv2.resize(person_rgb_img, (512, 512)) + + +def _save(img, name): + cv2.imwrite(str(save_dir / name), img) + + +def test_remove_bg(): + model = RemoveBG(RemoveBGModel.briaai_rmbg_1_4) + rgba_np_img = model.gen_image( + rgb_img, RunPluginRequest(name=RemoveBG.name, image=rgb_img_base64) + ) + res = cv2.cvtColor(rgba_np_img, cv2.COLOR_RGBA2BGRA) + _save(res, "test_remove_bg.png") + + bgr_np_img = model.gen_mask( + rgb_img, RunPluginRequest(name=RemoveBG.name, image=rgb_img_base64) + ) + + res_mask = gen_frontend_mask(bgr_np_img) + _save(res_mask, "test_remove_bg_frontend_mask.png") + + assert len(bgr_np_img.shape) == 2 + _save(bgr_np_img, "test_remove_bg_mask.jpeg") + + +def test_anime_seg(): + model = AnimeSeg() + img = cv2.imread(str(current_dir / "anime_test.png")) + img_base64 = encode_pil_to_base64(Image.fromarray(img), 100, {}) + res = model.gen_image(img, RunPluginRequest(name=AnimeSeg.name, image=img_base64)) + assert len(res.shape) == 3 + assert res.shape[-1] == 4 + _save(res, "test_anime_seg.png") + + res = model.gen_mask(img, RunPluginRequest(name=AnimeSeg.name, image=img_base64)) + assert len(res.shape) == 2 + _save(res, "test_anime_seg_mask.png") + + +@pytest.mark.parametrize("device", ["cuda", "cpu", "mps"]) +def test_upscale(device): + check_device(device) + model = RealESRGANUpscaler("realesr-general-x4v3", device) + res = model.gen_image( + rgb_img, + RunPluginRequest(name=RealESRGANUpscaler.name, image=rgb_img_base64, scale=2), + ) + _save(res, f"test_upscale_x2_{device}.png") + + res = model.gen_image( + rgb_img, + RunPluginRequest(name=RealESRGANUpscaler.name, image=rgb_img_base64, scale=4), + ) + _save(res, f"test_upscale_x4_{device}.png") + + +@pytest.mark.parametrize("device", ["cuda", "cpu", "mps"]) +def test_gfpgan(device): + check_device(device) + model = GFPGANPlugin(device) + res = model.gen_image( + person_rgb_img, RunPluginRequest(name=GFPGANPlugin.name, image=rgb_img_base64) + ) + _save(res, f"test_gfpgan_{device}.png") + + +@pytest.mark.parametrize("device", ["cuda", "cpu", "mps"]) +def test_restoreformer(device): + check_device(device) + model = RestoreFormerPlugin(device) + res = model.gen_image( + person_rgb_img, + 
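Both the Paint-by-Example test above and the plugin tests that follow ship images to the request schema as base64 strings via encode_pil_to_base64. That helper lives in iopaint.helper and is not shown in this diff; a minimal stand-in with the same intent (PIL image in, base64-encoded bytes out) might look like the sketch below, where the function name is an assumption.

import base64
import io
from PIL import Image

def encode_pil_to_base64_sketch(img: Image.Image) -> bytes:
    # Encode losslessly as PNG before base64, so the reference image is unchanged.
    buf = io.BytesIO()
    img.save(buf, format="PNG")
    return base64.b64encode(buf.getvalue())

# Usage mirroring the tests: bytes -> str so it can sit inside a JSON request body.
payload = encode_pil_to_base64_sketch(Image.new("RGB", (64, 64), "white")).decode("utf-8")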
RunPluginRequest(name=RestoreFormerPlugin.name, image=rgb_img_base64), + ) + _save(res, f"test_restoreformer_{device}.png") + + +@pytest.mark.parametrize("name", InteractiveSegModel.values()) +@pytest.mark.parametrize("device", ["cuda", "cpu", "mps"]) +def test_segment_anything(name, device): + check_device(device) + model = InteractiveSeg(name, device) + new_mask = model.gen_mask( + rgb_img, + RunPluginRequest( + name=InteractiveSeg.name, + image=rgb_img_base64, + clicks=([[448 // 2, 394 // 2, 1]]), + ), + ) + + save_name = f"test_segment_anything_{name}_{device}.png" + _save(new_mask, save_name) diff --git a/inpaint/tests/test_save_exif.py b/inpaint/tests/test_save_exif.py new file mode 100644 index 0000000..5c19810 --- /dev/null +++ b/inpaint/tests/test_save_exif.py @@ -0,0 +1,59 @@ +import io +import tempfile +from pathlib import Path +from typing import List + +from PIL import Image + +from iopaint.helper import pil_to_bytes, load_img + +current_dir = Path(__file__).parent.absolute().resolve() + + +def print_exif(exif): + for k, v in exif.items(): + print(f"{k}: {v}") + + +def extra_info(img_p: Path): + ext = img_p.suffix.strip(".") + img_bytes = img_p.read_bytes() + np_img, _, infos = load_img(img_bytes, False, True) + res_pil_bytes = pil_to_bytes(Image.fromarray(np_img), ext=ext, infos=infos) + res_img = Image.open(io.BytesIO(res_pil_bytes)) + return infos, res_img.info, res_pil_bytes + + +def assert_keys(keys: List[str], infos, res_infos): + for k in keys: + assert k in infos + assert k in res_infos + assert infos[k] == res_infos[k] + + +def run_test(file_path, keys): + infos, res_infos, res_pil_bytes = extra_info(file_path) + assert_keys(keys, infos, res_infos) + with tempfile.NamedTemporaryFile("wb", suffix=file_path.suffix) as temp_file: + temp_file.write(res_pil_bytes) + temp_file.flush() + infos, res_infos, res_pil_bytes = extra_info(Path(temp_file.name)) + assert_keys(keys, infos, res_infos) + + +def test_png_icc_profile_png(): + run_test(current_dir / "icc_profile_test.png", ["icc_profile", "exif"]) + + +def test_png_icc_profile_jpeg(): + run_test(current_dir / "icc_profile_test.jpg", ["icc_profile", "exif"]) + + +def test_jpeg(): + jpg_img_p = current_dir / "bunny.jpeg" + run_test(jpg_img_p, ["dpi", "exif"]) + + +def test_png_parameter(): + jpg_img_p = current_dir / "png_parameter_test.png" + run_test(jpg_img_p, ["parameters"]) diff --git a/inpaint/tests/test_sd_model.py b/inpaint/tests/test_sd_model.py new file mode 100644 index 0000000..89eedb5 --- /dev/null +++ b/inpaint/tests/test_sd_model.py @@ -0,0 +1,240 @@ +import os + +from loguru import logger + +from iopaint.tests.utils import check_device, get_config, assert_equal + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" +from pathlib import Path + +import pytest +import torch + +from iopaint.model_manager import ModelManager +from iopaint.schema import HDStrategy, SDSampler + +current_dir = Path(__file__).parent.absolute().resolve() +save_dir = current_dir / "result" +save_dir.mkdir(exist_ok=True, parents=True) + + +@pytest.mark.parametrize("device", ["cuda", "mps"]) +def test_runway_sd_1_5_all_samplers(device): + sd_steps = check_device(device) + model = ModelManager( + name="runwayml/stable-diffusion-inpainting", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + ) + + all_samplers = [member.value for member in SDSampler.__members__.values()] + print(all_samplers) + for sampler in all_samplers: + print(f"Testing sampler {sampler}") + if ( + sampler + in [SDSampler.dpm2_karras, 
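test_save_exif.py above verifies that metadata keys such as exif, icc_profile, dpi and parameters survive a load/save round trip through load_img and pil_to_bytes. Those helpers are not included in this diff; the sketch below shows the generic Pillow pattern they presumably rely on, with the function name and the forwarded key list chosen here purely for illustration.

import io
from PIL import Image

def resave_keeping_metadata(src_bytes: bytes, fmt: str) -> bytes:
    # Re-encode an image while carrying over common metadata entries.
    img = Image.open(io.BytesIO(src_bytes))
    infos = img.info  # may contain "exif", "icc_profile", "dpi", "parameters", ...
    kwargs = {k: infos[k] for k in ("exif", "icc_profile", "dpi") if k in infos}
    buf = io.BytesIO()
    img.save(buf, format=fmt, **kwargs)
    return buf.getvalue()

# Usage: resave_keeping_metadata(Path("bunny.jpeg").read_bytes(), "JPEG")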
SDSampler.dpm2_a_karras, SDSampler.lms_karras] + and device == "mps" + ): + # diffusers 0.25.0 still has bug on these sampler on mps, wait main branch released to fix it + logger.warning( + "skip dpm2_karras on mps, diffusers does not support it on mps. TypeError: Cannot convert a MPS Tensor to float64 dtype as the MPS framework doesn't support float64. Please use float32 instead." + ) + continue + cfg = get_config( + strategy=HDStrategy.ORIGINAL, + prompt="a fox sitting on a bench", + sd_steps=sd_steps, + sd_sampler=sampler, + ) + + name = f"device_{device}_{sampler}" + + assert_equal( + model, + cfg, + f"runway_sd_{name}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) + + +@pytest.mark.parametrize("device", ["cuda", "mps", "cpu"]) +@pytest.mark.parametrize("sampler", [SDSampler.lcm]) +def test_runway_sd_lcm_lora(device, sampler): + check_device(device) + + sd_steps = 5 + model = ModelManager( + name="runwayml/stable-diffusion-inpainting", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + ) + cfg = get_config( + strategy=HDStrategy.ORIGINAL, + prompt="face of a fox, sitting on a bench", + sd_steps=sd_steps, + sd_guidance_scale=2, + sd_lcm_lora=True, + ) + cfg.sd_sampler = sampler + + assert_equal( + model, + cfg, + f"runway_sd_1_5_lcm_lora_device_{device}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) + + +@pytest.mark.parametrize("device", ["cuda", "mps"]) +@pytest.mark.parametrize("strategy", [HDStrategy.ORIGINAL]) +@pytest.mark.parametrize("sampler", [SDSampler.ddim]) +def test_runway_sd_sd_strength(device, strategy, sampler): + sd_steps = check_device(device) + model = ModelManager( + name="runwayml/stable-diffusion-inpainting", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + ) + cfg = get_config( + strategy=strategy, + prompt="a fox sitting on a bench", + sd_steps=sd_steps, + sd_strength=0.8, + ) + cfg.sd_sampler = sampler + + assert_equal( + model, + cfg, + f"runway_sd_strength_0.8_device_{device}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) + + +@pytest.mark.parametrize("device", ["cuda", "cpu"]) +@pytest.mark.parametrize("strategy", [HDStrategy.ORIGINAL]) +@pytest.mark.parametrize("sampler", [SDSampler.ddim]) +def test_runway_sd_cpu_textencoder(device, strategy, sampler): + sd_steps = check_device(device) + model = ModelManager( + name="runwayml/stable-diffusion-inpainting", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=True, + ) + cfg = get_config( + strategy=strategy, + prompt="a fox sitting on a bench", + sd_steps=sd_steps, + sd_sampler=sampler, + ) + + assert_equal( + model, + cfg, + f"runway_sd_device_{device}_cpu_textencoder.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) + + +@pytest.mark.parametrize("device", ["cuda", "mps", "cpu"]) +@pytest.mark.parametrize("strategy", [HDStrategy.ORIGINAL]) +@pytest.mark.parametrize("sampler", [SDSampler.ddim]) +def test_runway_norm_sd_model(device, strategy, sampler): + sd_steps = check_device(device) + model = ModelManager( + name="runwayml/stable-diffusion-v1-5", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + ) + cfg = get_config( + strategy=strategy, 
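test_runway_sd_lcm_lora above enables sd_lcm_lora with only 5 steps and a low guidance scale. How the model manager wires this internally is not part of the diff; in plain diffusers terms, turning on LCM-LoRA for an SD 1.5 inpainting pipeline usually looks roughly like the hedged sketch below (the pipeline class and the public LoRA repo id are general diffusers usage, not taken from this repository).

import torch
from diffusers import AutoPipelineForInpainting, LCMScheduler

# Load the base inpainting pipeline, then swap in the LCM scheduler and the LCM-LoRA weights.
pipe = AutoPipelineForInpainting.from_pretrained(
    "runwayml/stable-diffusion-inpainting", torch_dtype=torch.float16
)
pipe.scheduler = LCMScheduler.from_config(pipe.scheduler.config)
pipe.load_lora_weights("latent-consistency/lcm-lora-sdv1-5")
pipe.fuse_lora()

# With LCM-LoRA, very few steps and low guidance are typical (compare sd_steps=5,
# sd_guidance_scale=2 in the test above):
# result = pipe(prompt=..., image=..., mask_image=...,
#               num_inference_steps=5, guidance_scale=2).images[0]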
prompt="face of a fox, sitting on a bench", sd_steps=sd_steps + ) + cfg.sd_sampler = sampler + + assert_equal( + model, + cfg, + f"runway_{device}_norm_sd_model_device_{device}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) + + +@pytest.mark.parametrize("device", ["cuda"]) +@pytest.mark.parametrize("strategy", [HDStrategy.ORIGINAL]) +@pytest.mark.parametrize("sampler", [SDSampler.dpm_plus_plus_2m]) +def test_runway_sd_1_5_cpu_offload(device, strategy, sampler): + sd_steps = check_device(device) + model = ModelManager( + name="runwayml/stable-diffusion-inpainting", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + cpu_offload=True, + ) + cfg = get_config( + strategy=strategy, prompt="a fox sitting on a bench", sd_steps=sd_steps + ) + cfg.sd_sampler = sampler + + name = f"device_{device}_{sampler}" + + assert_equal( + model, + cfg, + f"runway_sd_{strategy.capitalize()}_{name}_cpu_offload.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + ) + + +@pytest.mark.parametrize("device", ["cuda", "mps", "cpu"]) +@pytest.mark.parametrize("sampler", [SDSampler.ddim]) +@pytest.mark.parametrize( + "name", + [ + "sd-v1-5-inpainting.safetensors", + "v1-5-pruned-emaonly.safetensors", + "sd_xl_base_1.0.safetensors", + "sd_xl_base_1.0_inpainting_0.1.safetensors", + ], +) +def test_local_file_path(device, sampler, name): + sd_steps = check_device(device) + model = ModelManager( + name=name, + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + cpu_offload=False, + ) + cfg = get_config( + strategy=HDStrategy.ORIGINAL, + prompt="a fox sitting on a bench", + sd_steps=sd_steps, + ) + cfg.sd_sampler = sampler + + name = f"device_{device}_{sampler}_{name}" + + is_sdxl = "sd_xl" in name + + assert_equal( + model, + cfg, + f"sd_local_model_{name}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + fx=1.5 if is_sdxl else 1, + fy=1.5 if is_sdxl else 1, + ) diff --git a/inpaint/tests/test_sdxl.py b/inpaint/tests/test_sdxl.py new file mode 100644 index 0000000..8d5e5ab --- /dev/null +++ b/inpaint/tests/test_sdxl.py @@ -0,0 +1,118 @@ +import os + +from iopaint.tests.utils import check_device, current_dir + +os.environ["PYTORCH_ENABLE_MPS_FALLBACK"] = "1" + +import pytest +import torch + +from iopaint.model_manager import ModelManager +from iopaint.schema import HDStrategy, SDSampler +from iopaint.tests.test_model import get_config, assert_equal + + +@pytest.mark.parametrize("device", ["cuda", "mps"]) +@pytest.mark.parametrize("strategy", [HDStrategy.ORIGINAL]) +@pytest.mark.parametrize("sampler", [SDSampler.ddim]) +def test_sdxl(device, strategy, sampler): + sd_steps = check_device(device) + + model = ModelManager( + name="diffusers/stable-diffusion-xl-1.0-inpainting-0.1", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + ) + cfg = get_config( + strategy=strategy, + prompt="face of a fox, sitting on a bench", + sd_steps=sd_steps, + sd_strength=1.0, + sd_guidance_scale=7.0, + ) + cfg.sd_sampler = sampler + + assert_equal( + model, + cfg, + f"sdxl_device_{device}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + fx=2, + fy=2, + ) + + +@pytest.mark.parametrize("device", ["cuda", "cpu"]) 
+@pytest.mark.parametrize("strategy", [HDStrategy.ORIGINAL]) +@pytest.mark.parametrize("sampler", [SDSampler.ddim]) +def test_sdxl_cpu_text_encoder(device, strategy, sampler): + sd_steps = check_device(device) + + model = ModelManager( + name="diffusers/stable-diffusion-xl-1.0-inpainting-0.1", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=True, + ) + cfg = get_config( + strategy=strategy, + prompt="face of a fox, sitting on a bench", + sd_steps=sd_steps, + sd_strength=1.0, + sd_guidance_scale=7.0, + ) + cfg.sd_sampler = sampler + + assert_equal( + model, + cfg, + f"sdxl_device_{device}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + fx=2, + fy=2, + ) + + +@pytest.mark.parametrize("device", ["cuda", "mps"]) +@pytest.mark.parametrize( + "rect", + [ + [-128, -128, 1024, 1024], + ], +) +def test_sdxl_outpainting(device, rect): + sd_steps = check_device(device) + + model = ModelManager( + name="diffusers/stable-diffusion-xl-1.0-inpainting-0.1", + device=torch.device(device), + disable_nsfw=True, + sd_cpu_textencoder=False, + ) + + cfg = get_config( + strategy=HDStrategy.ORIGINAL, + prompt="a dog sitting on a bench in the park", + sd_steps=sd_steps, + use_extender=True, + extender_x=rect[0], + extender_y=rect[1], + extender_width=rect[2], + extender_height=rect[3], + sd_strength=1.0, + sd_guidance_scale=8.0, + sd_sampler=SDSampler.ddim, + ) + + assert_equal( + model, + cfg, + f"sdxl_outpainting_dog_ddim_{'_'.join(map(str, rect))}_device_{device}.png", + img_p=current_dir / "overture-creations-5sI6fQgYIuo.png", + mask_p=current_dir / "overture-creations-5sI6fQgYIuo_mask.png", + fx=1.5, + fy=1.5, + ) diff --git a/inpaint/tests/utils.py b/inpaint/tests/utils.py new file mode 100644 index 0000000..92bc5ef --- /dev/null +++ b/inpaint/tests/utils.py @@ -0,0 +1,77 @@ +from pathlib import Path +import cv2 +import pytest +import torch + +from iopaint.schema import LDMSampler, HDStrategy, InpaintRequest, SDSampler +import numpy as np + +current_dir = Path(__file__).parent.absolute().resolve() +save_dir = current_dir / "result" +save_dir.mkdir(exist_ok=True, parents=True) + + +def check_device(device: str) -> int: + if device == "cuda" and not torch.cuda.is_available(): + pytest.skip("CUDA is not available, skip test on cuda") + if device == "mps" and not torch.backends.mps.is_available(): + pytest.skip("mps is not available, skip test on mps") + steps = 2 if device == "cpu" else 20 + return steps + + +def assert_equal( + model, + config: InpaintRequest, + gt_name, + fx: float = 1, + fy: float = 1, + img_p=current_dir / "image.png", + mask_p=current_dir / "mask.png", +): + img, mask = get_data(fx=fx, fy=fy, img_p=img_p, mask_p=mask_p) + print(f"Input image shape: {img.shape}") + + res = model(img, mask, config) + ok = cv2.imwrite( + str(save_dir / gt_name), + res, + [int(cv2.IMWRITE_JPEG_QUALITY), 100, int(cv2.IMWRITE_PNG_COMPRESSION), 0], + ) + assert ok, save_dir / gt_name + + """ + Note that JPEG is lossy compression, so even if it is the highest quality 100, + when the saved images is reloaded, a difference occurs with the original pixel value. + If you want to save the original images as it is, save it as PNG or BMP. 
+ """ + # gt = cv2.imread(str(current_dir / gt_name), cv2.IMREAD_UNCHANGED) + # assert np.array_equal(res, gt) + + +def get_data( + fx: float = 1, + fy: float = 1.0, + img_p=current_dir / "image.png", + mask_p=current_dir / "mask.png", +): + img = cv2.imread(str(img_p)) + img = cv2.cvtColor(img, cv2.COLOR_BGRA2RGB) + mask = cv2.imread(str(mask_p), cv2.IMREAD_GRAYSCALE) + img = cv2.resize(img, None, fx=fx, fy=fy, interpolation=cv2.INTER_AREA) + mask = cv2.resize(mask, None, fx=fx, fy=fy, interpolation=cv2.INTER_NEAREST) + return img, mask + + +def get_config(**kwargs): + data = dict( + sd_sampler=kwargs.get("sd_sampler", SDSampler.uni_pc), + ldm_steps=1, + ldm_sampler=LDMSampler.plms, + hd_strategy=kwargs.get("strategy", HDStrategy.ORIGINAL), + hd_strategy_crop_margin=32, + hd_strategy_crop_trigger_size=200, + hd_strategy_resize_limit=200, + ) + data.update(**kwargs) + return InpaintRequest(image="", mask="", **data) diff --git a/inpaint/web_app/assets/Inter-Black-jiII8dog.woff2 b/inpaint/web_app/assets/Inter-Black-jiII8dog.woff2 new file mode 100644 index 0000000..18b35db Binary files /dev/null and b/inpaint/web_app/assets/Inter-Black-jiII8dog.woff2 differ diff --git a/inpaint/web_app/assets/Inter-BlackItalic-1413vuen.woff2 b/inpaint/web_app/assets/Inter-BlackItalic-1413vuen.woff2 new file mode 100644 index 0000000..02c9d8e Binary files /dev/null and b/inpaint/web_app/assets/Inter-BlackItalic-1413vuen.woff2 differ diff --git a/inpaint/web_app/assets/Inter-Bold-srYz_-1B.woff2 b/inpaint/web_app/assets/Inter-Bold-srYz_-1B.woff2 new file mode 100644 index 0000000..0f1b157 Binary files /dev/null and b/inpaint/web_app/assets/Inter-Bold-srYz_-1B.woff2 differ diff --git a/inpaint/web_app/assets/Inter-BoldItalic-dE_gZyur.woff2 b/inpaint/web_app/assets/Inter-BoldItalic-dE_gZyur.woff2 new file mode 100644 index 0000000..bc50f24 Binary files /dev/null and b/inpaint/web_app/assets/Inter-BoldItalic-dE_gZyur.woff2 differ diff --git a/inpaint/web_app/assets/Inter-ExtraBold-TduDdwUu.woff2 b/inpaint/web_app/assets/Inter-ExtraBold-TduDdwUu.woff2 new file mode 100644 index 0000000..b113368 Binary files /dev/null and b/inpaint/web_app/assets/Inter-ExtraBold-TduDdwUu.woff2 differ diff --git a/inpaint/web_app/assets/Inter-ExtraBoldItalic-BJafRE5I.woff2 b/inpaint/web_app/assets/Inter-ExtraBoldItalic-BJafRE5I.woff2 new file mode 100644 index 0000000..a5b76ca Binary files /dev/null and b/inpaint/web_app/assets/Inter-ExtraBoldItalic-BJafRE5I.woff2 differ diff --git a/inpaint/web_app/assets/Inter-ExtraLight-w5HAp5iF.woff2 b/inpaint/web_app/assets/Inter-ExtraLight-w5HAp5iF.woff2 new file mode 100644 index 0000000..1d77ae8 Binary files /dev/null and b/inpaint/web_app/assets/Inter-ExtraLight-w5HAp5iF.woff2 differ diff --git a/inpaint/web_app/assets/Inter-ExtraLightItalic-ZptecSuc.woff2 b/inpaint/web_app/assets/Inter-ExtraLightItalic-ZptecSuc.woff2 new file mode 100644 index 0000000..8c68492 Binary files /dev/null and b/inpaint/web_app/assets/Inter-ExtraLightItalic-ZptecSuc.woff2 differ diff --git a/inpaint/web_app/assets/Inter-Italic-f6M78thn.woff2 b/inpaint/web_app/assets/Inter-Italic-f6M78thn.woff2 new file mode 100644 index 0000000..4c24ce2 Binary files /dev/null and b/inpaint/web_app/assets/Inter-Italic-f6M78thn.woff2 differ diff --git a/inpaint/web_app/assets/Inter-Light-DFhX0qo-.woff2 b/inpaint/web_app/assets/Inter-Light-DFhX0qo-.woff2 new file mode 100644 index 0000000..dbe6143 Binary files /dev/null and b/inpaint/web_app/assets/Inter-Light-DFhX0qo-.woff2 differ diff --git 
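To make the note in assert_equal above concrete: a PNG round trip reproduces an image array bit-for-bit, while a JPEG round trip, even at quality 100, generally does not. A small self-contained demonstration on synthetic data (nothing here uses the test assets):

import io
import numpy as np
from PIL import Image

rng = np.random.default_rng(0)
arr = rng.integers(0, 256, size=(64, 64, 3), dtype=np.uint8)

def roundtrip(a: np.ndarray, fmt: str, **save_kwargs) -> np.ndarray:
    # Save to an in-memory buffer and decode it again.
    buf = io.BytesIO()
    Image.fromarray(a).save(buf, format=fmt, **save_kwargs)
    return np.asarray(Image.open(io.BytesIO(buf.getvalue())))

print(np.array_equal(arr, roundtrip(arr, "PNG")))                # True: lossless
print(np.array_equal(arr, roundtrip(arr, "JPEG", quality=100)))  # typically False: still lossy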
a/inpaint/web_app/assets/Inter-LightItalic-fu56_DRc.woff2 b/inpaint/web_app/assets/Inter-LightItalic-fu56_DRc.woff2 new file mode 100644 index 0000000..a40d042 Binary files /dev/null and b/inpaint/web_app/assets/Inter-LightItalic-fu56_DRc.woff2 differ diff --git a/inpaint/web_app/assets/Inter-Medium-dDRaJ8tM.woff2 b/inpaint/web_app/assets/Inter-Medium-dDRaJ8tM.woff2 new file mode 100644 index 0000000..0fd2ee7 Binary files /dev/null and b/inpaint/web_app/assets/Inter-Medium-dDRaJ8tM.woff2 differ diff --git a/inpaint/web_app/assets/Inter-MediumItalic-zr3roggP.woff2 b/inpaint/web_app/assets/Inter-MediumItalic-zr3roggP.woff2 new file mode 100644 index 0000000..9676715 Binary files /dev/null and b/inpaint/web_app/assets/Inter-MediumItalic-zr3roggP.woff2 differ diff --git a/inpaint/web_app/assets/Inter-Regular-dEFHw1tF.woff2 b/inpaint/web_app/assets/Inter-Regular-dEFHw1tF.woff2 new file mode 100644 index 0000000..b8699af Binary files /dev/null and b/inpaint/web_app/assets/Inter-Regular-dEFHw1tF.woff2 differ diff --git a/inpaint/web_app/assets/Inter-SemiBold-PyS8DO2L.woff2 b/inpaint/web_app/assets/Inter-SemiBold-PyS8DO2L.woff2 new file mode 100644 index 0000000..95c48b1 Binary files /dev/null and b/inpaint/web_app/assets/Inter-SemiBold-PyS8DO2L.woff2 differ diff --git a/inpaint/web_app/assets/Inter-SemiBoldItalic-uIDb7hsH.woff2 b/inpaint/web_app/assets/Inter-SemiBoldItalic-uIDb7hsH.woff2 new file mode 100644 index 0000000..ddfe19e Binary files /dev/null and b/inpaint/web_app/assets/Inter-SemiBoldItalic-uIDb7hsH.woff2 differ diff --git a/inpaint/web_app/assets/Inter-Thin-eKObIkJC.woff2 b/inpaint/web_app/assets/Inter-Thin-eKObIkJC.woff2 new file mode 100644 index 0000000..0790960 Binary files /dev/null and b/inpaint/web_app/assets/Inter-Thin-eKObIkJC.woff2 differ diff --git a/inpaint/web_app/assets/Inter-ThinItalic-L6uBn3RP.woff2 b/inpaint/web_app/assets/Inter-ThinItalic-L6uBn3RP.woff2 new file mode 100644 index 0000000..a7bf213 Binary files /dev/null and b/inpaint/web_app/assets/Inter-ThinItalic-L6uBn3RP.woff2 differ diff --git a/inpaint/web_app/assets/index-7L_lPAh0.css b/inpaint/web_app/assets/index-7L_lPAh0.css new file mode 100644 index 0000000..ac93105 --- /dev/null +++ b/inpaint/web_app/assets/index-7L_lPAh0.css @@ -0,0 +1 @@ +@font-face{font-family:Inter;font-style:normal;font-weight:100;font-display:swap;src:url(/assets/Inter-Thin-eKObIkJC.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:italic;font-weight:100;font-display:swap;src:url(/assets/Inter-ThinItalic-L6uBn3RP.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:normal;font-weight:200;font-display:swap;src:url(/assets/Inter-ExtraLight-w5HAp5iF.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:italic;font-weight:200;font-display:swap;src:url(/assets/Inter-ExtraLightItalic-ZptecSuc.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:normal;font-weight:300;font-display:swap;src:url(/assets/Inter-Light-DFhX0qo-.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:italic;font-weight:300;font-display:swap;src:url(/assets/Inter-LightItalic-fu56_DRc.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:normal;font-weight:400;font-display:swap;src:url(/assets/Inter-Regular-dEFHw1tF.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:italic;font-weight:400;font-display:swap;src:url(/assets/Inter-Italic-f6M78thn.woff2?v=4.0) 
format("woff2")}@font-face{font-family:Inter;font-style:normal;font-weight:500;font-display:swap;src:url(/assets/Inter-Medium-dDRaJ8tM.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:italic;font-weight:500;font-display:swap;src:url(/assets/Inter-MediumItalic-zr3roggP.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:normal;font-weight:600;font-display:swap;src:url(/assets/Inter-SemiBold-PyS8DO2L.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:italic;font-weight:600;font-display:swap;src:url(/assets/Inter-SemiBoldItalic-uIDb7hsH.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:normal;font-weight:700;font-display:swap;src:url(/assets/Inter-Bold-srYz_-1B.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:italic;font-weight:700;font-display:swap;src:url(/assets/Inter-BoldItalic-dE_gZyur.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:normal;font-weight:800;font-display:swap;src:url(/assets/Inter-ExtraBold-TduDdwUu.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:italic;font-weight:800;font-display:swap;src:url(/assets/Inter-ExtraBoldItalic-BJafRE5I.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:normal;font-weight:900;font-display:swap;src:url(/assets/Inter-Black-jiII8dog.woff2?v=4.0) format("woff2")}@font-face{font-family:Inter;font-style:italic;font-weight:900;font-display:swap;src:url(/assets/Inter-BlackItalic-1413vuen.woff2?v=4.0) format("woff2")}*,:before,:after{box-sizing:border-box;border-width:0;border-style:solid;border-color:#e5e7eb}:before,:after{--tw-content: ""}html{line-height:1.5;-webkit-text-size-adjust:100%;-moz-tab-size:4;-o-tab-size:4;tab-size:4;font-family:ui-sans-serif,system-ui,-apple-system,BlinkMacSystemFont,Segoe UI,Roboto,Helvetica Neue,Arial,Noto Sans,sans-serif,"Apple Color Emoji","Segoe UI Emoji",Segoe UI Symbol,"Noto Color Emoji";font-feature-settings:normal;font-variation-settings:normal}body{margin:0;line-height:inherit}hr{height:0;color:inherit;border-top-width:1px}abbr:where([title]){-webkit-text-decoration:underline dotted;text-decoration:underline dotted}h1,h2,h3,h4,h5,h6{font-size:inherit;font-weight:inherit}a{color:inherit;text-decoration:inherit}b,strong{font-weight:bolder}code,kbd,samp,pre{font-family:ui-monospace,SFMono-Regular,Menlo,Monaco,Consolas,Liberation Mono,Courier 
New,monospace;font-size:1em}small{font-size:80%}sub,sup{font-size:75%;line-height:0;position:relative;vertical-align:baseline}sub{bottom:-.25em}sup{top:-.5em}table{text-indent:0;border-color:inherit;border-collapse:collapse}button,input,optgroup,select,textarea{font-family:inherit;font-feature-settings:inherit;font-variation-settings:inherit;font-size:100%;font-weight:inherit;line-height:inherit;color:inherit;margin:0;padding:0}button,select{text-transform:none}button,[type=button],[type=reset],[type=submit]{-webkit-appearance:button;background-color:transparent;background-image:none}:-moz-focusring{outline:auto}:-moz-ui-invalid{box-shadow:none}progress{vertical-align:baseline}::-webkit-inner-spin-button,::-webkit-outer-spin-button{height:auto}[type=search]{-webkit-appearance:textfield;outline-offset:-2px}::-webkit-search-decoration{-webkit-appearance:none}::-webkit-file-upload-button{-webkit-appearance:button;font:inherit}summary{display:list-item}blockquote,dl,dd,h1,h2,h3,h4,h5,h6,hr,figure,p,pre{margin:0}fieldset{margin:0;padding:0}legend{padding:0}ol,ul,menu{list-style:none;margin:0;padding:0}dialog{padding:0}textarea{resize:vertical}input::-moz-placeholder,textarea::-moz-placeholder{opacity:1;color:#9ca3af}input::placeholder,textarea::placeholder{opacity:1;color:#9ca3af}button,[role=button]{cursor:pointer}:disabled{cursor:default}img,svg,video,canvas,audio,iframe,embed,object{display:block;vertical-align:middle}img,video{max-width:100%;height:auto}[hidden]{display:none}:root{--background: 0 0% 100%;--foreground: 224 71.4% 4.1%;--card: 0 0% 100%;--card-foreground: 224 71.4% 4.1%;--popover: 0 0% 100%;--popover-foreground: 224 71.4% 4.1%;--primary: 48 100% 50%;--primary-foreground: 210 20% 98%;--secondary: 220 14.3% 95.9%;--secondary-foreground: 220.9 39.3% 11%;--muted: 220 14.3% 95.9%;--muted-foreground: 220 8.9% 46.1%;--accent: 220 14.3% 95.9%;--accent-foreground: 220.9 39.3% 11%;--destructive: 0 84.2% 60.2%;--destructive-foreground: 210 20% 98%;--border: 220 13% 91%;--input: 220 13% 91%;--ring: 224 71.4% 4.1%;--radius: .5rem}[data-theme=dark]{--background: 240 10% 90%;--foreground: 0 0% 38%;--card: 224 71.4% 4.1%;--card-foreground: 210 20% 98%;--popover: 240 10% 3.9%;--popover-foreground: 0 0% 98%;--primary: 48 100% 50%;--primary-foreground: 220.9 39.3% 11%;--secondary: 240 3.7% 15.9%;--secondary-foreground: 0 0% 98%;--muted: 240 3.7% 15.9%;--muted-foreground: 240 5% 64.9%;--accent: 240 3.7% 15.9%;--accent-foreground: 0 0% 98%;--destructive: 0 62.8% 30.6%;--destructive-foreground: 0 85.7% 97.3%;--border: 240 3.7% 65.9%;--input: 240 3.7% 15.9%;--ring: 240 4.9% 83.9%}*{border-color:hsl(var(--border))}body{background-color:hsl(var(--background));color:hsl(var(--foreground))}*,:before,:after{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: 
;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: }::backdrop{--tw-border-spacing-x: 0;--tw-border-spacing-y: 0;--tw-translate-x: 0;--tw-translate-y: 0;--tw-rotate: 0;--tw-skew-x: 0;--tw-skew-y: 0;--tw-scale-x: 1;--tw-scale-y: 1;--tw-pan-x: ;--tw-pan-y: ;--tw-pinch-zoom: ;--tw-scroll-snap-strictness: proximity;--tw-gradient-from-position: ;--tw-gradient-via-position: ;--tw-gradient-to-position: ;--tw-ordinal: ;--tw-slashed-zero: ;--tw-numeric-figure: ;--tw-numeric-spacing: ;--tw-numeric-fraction: ;--tw-ring-inset: ;--tw-ring-offset-width: 0px;--tw-ring-offset-color: #fff;--tw-ring-color: rgb(59 130 246 / .5);--tw-ring-offset-shadow: 0 0 #0000;--tw-ring-shadow: 0 0 #0000;--tw-shadow: 0 0 #0000;--tw-shadow-colored: 0 0 #0000;--tw-blur: ;--tw-brightness: ;--tw-contrast: ;--tw-grayscale: ;--tw-hue-rotate: ;--tw-invert: ;--tw-saturate: ;--tw-sepia: ;--tw-drop-shadow: ;--tw-backdrop-blur: ;--tw-backdrop-brightness: ;--tw-backdrop-contrast: ;--tw-backdrop-grayscale: ;--tw-backdrop-hue-rotate: ;--tw-backdrop-invert: ;--tw-backdrop-opacity: ;--tw-backdrop-saturate: ;--tw-backdrop-sepia: }.sr-only{position:absolute;width:1px;height:1px;padding:0;margin:-1px;overflow:hidden;clip:rect(0,0,0,0);white-space:nowrap;border-width:0}.pointer-events-none{pointer-events:none}.pointer-events-auto{pointer-events:auto}.visible{visibility:visible}.fixed{position:fixed}.absolute{position:absolute}.relative{position:relative}.inset-0{top:0;right:0;bottom:0;left:0}.inset-x-0{left:0;right:0}.inset-y-0{top:0;bottom:0}.bottom-0{bottom:0}.bottom-5{bottom:1.25rem}.bottom-6{bottom:1.5rem}.left-0{left:0}.left-1\/2{left:50%}.left-2{left:.5rem}.left-\[24px\]{left:24px}.left-\[50\%\]{left:50%}.left-\[8px\]{left:8px}.right-0{right:0}.right-1{right:.25rem}.right-10{right:2.5rem}.right-2{right:.5rem}.right-4{right:1rem}.right-6{right:1.5rem}.top-0{top:0}.top-1{top:.25rem}.top-4{top:1rem}.top-\[0\]{top:0}.top-\[50\%\]{top:50%}.top-\[68px\]{top:68px}.z-10{z-index:10}.z-20{z-index:20}.z-50{z-index:50}.z-\[100\]{z-index:100}.z-\[2\]{z-index:2}.z-\[4\]{z-index:4}.m-3{margin:.75rem}.-mx-1{margin-left:-.25rem;margin-right:-.25rem}.my-1{margin-top:.25rem;margin-bottom:.25rem}.mb-2{margin-bottom:.5rem}.mb-\[-6px\]{margin-bottom:-6px}.ml-\[-6px\]{margin-left:-6px}.ml-auto{margin-left:auto}.mr-8{margin-right:2rem}.mr-\[-6px\]{margin-right:-6px}.mt-0{margin-top:0}.mt-2{margin-top:.5rem}.mt-4{margin-top:1rem}.mt-\[-6px\]{margin-top:-6px}.mt-\[60px\]{margin-top:60px}.block{display:block}.flex{display:flex}.inline-flex{display:inline-flex}.grid{display:grid}.hidden{display:none}.aspect-square{aspect-ratio:1 / 
1}.h-1{height:.25rem}.h-1\.5{height:.375rem}.h-10{height:2.5rem}.h-2{height:.5rem}.h-2\.5{height:.625rem}.h-3{height:.75rem}.h-3\.5{height:.875rem}.h-4{height:1rem}.h-4\/5{height:80%}.h-5{height:1.25rem}.h-6{height:1.5rem}.h-7{height:1.75rem}.h-8{height:2rem}.h-9{height:2.25rem}.h-\[10px\]{height:10px}.h-\[12px\]{height:12px}.h-\[1px\]{height:1px}.h-\[20px\]{height:20px}.h-\[240px\]{height:240px}.h-\[32px\]{height:32px}.h-\[600px\]{height:600px}.h-\[60px\]{height:60px}.h-\[var\(--radix-select-trigger-height\)\]{height:var(--radix-select-trigger-height)}.h-full{height:100%}.h-px{height:1px}.h-screen{height:100vh}.max-h-96{max-height:24rem}.max-h-\[200px\]{max-height:200px}.max-h-\[8rem\]{max-height:8rem}.max-h-screen{max-height:100vh}.min-h-\[32px\]{min-height:32px}.min-h-\[60px\]{min-height:60px}.min-h-screen{min-height:100vh}.w-2{width:.5rem}.w-2\.5{width:.625rem}.w-3{width:.75rem}.w-3\.5{width:.875rem}.w-3\/4{width:75%}.w-4{width:1rem}.w-48{width:12rem}.w-5{width:1.25rem}.w-6{width:1.5rem}.w-72{width:18rem}.w-8{width:2rem}.w-9{width:2.25rem}.w-\[100px\]{width:100px}.w-\[10px\]{width:10px}.w-\[110px\]{width:110px}.w-\[12px\]{width:12px}.w-\[130px\]{width:130px}.w-\[140px\]{width:140px}.w-\[160px\]{width:160px}.w-\[175px\]{width:175px}.w-\[180px\]{width:180px}.w-\[1px\]{width:1px}.w-\[20px\]{width:20px}.w-\[220px\]{width:220px}.w-\[250px\]{width:250px}.w-\[286px\]{width:286px}.w-\[400px\]{width:400px}.w-\[45px\]{width:45px}.w-\[500px\]{width:500px}.w-\[50px\]{width:50px}.w-\[510px\]{width:510px}.w-\[65px\]{width:65px}.w-\[6px\]{width:6px}.w-auto{width:auto}.w-full{width:100%}.w-screen{width:100vw}.min-w-\[600px\]{min-width:600px}.min-w-\[65px\]{min-width:65px}.min-w-\[8rem\]{min-width:8rem}.min-w-\[var\(--radix-select-trigger-width\)\]{min-width:var(--radix-select-trigger-width)}.max-w-3xl{max-width:48rem}.max-w-6xl{max-width:72rem}.max-w-\[200px\]{max-width:200px}.max-w-lg{max-width:32rem}.max-w-sm{max-width:24rem}.max-w-xs{max-width:20rem}.flex-1{flex:1 1 0%}.shrink{flex-shrink:1}.shrink-0{flex-shrink:0}.grow{flex-grow:1}.origin-top-left{transform-origin:top left}.-translate-x-1\/2,.translate-x-\[-50\%\]{--tw-translate-x: -50%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.translate-y-\[-50\%\]{--tw-translate-y: -50%;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.transform{transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}@keyframes pulse{50%{opacity:.5}}.animate-pulse{animation:pulse 2s cubic-bezier(.4,0,.6,1) infinite}@keyframes spin{to{transform:rotate(360deg)}}.animate-spin{animation:spin 1s linear 
infinite}.cursor-default{cursor:default}.cursor-ew-resize{cursor:ew-resize}.cursor-ne-resize{cursor:ne-resize}.cursor-ns-resize{cursor:ns-resize}.cursor-nw-resize{cursor:nw-resize}.cursor-pointer{cursor:pointer}.cursor-se-resize{cursor:se-resize}.cursor-sw-resize{cursor:sw-resize}.touch-none{touch-action:none}.select-none{-webkit-user-select:none;-moz-user-select:none;user-select:none}.resize-none{resize:none}.resize{resize:both}.flex-row{flex-direction:row}.flex-col{flex-direction:column}.flex-col-reverse{flex-direction:column-reverse}.items-center{align-items:center}.justify-start{justify-content:flex-start}.justify-end{justify-content:flex-end}.justify-center{justify-content:center}.justify-between{justify-content:space-between}.gap-1{gap:.25rem}.gap-2{gap:.5rem}.gap-3{gap:.75rem}.gap-4{gap:1rem}.gap-8{gap:2rem}.gap-\[12px\]{gap:12px}.gap-\[14px\]{gap:14px}.gap-\[18px\]{gap:18px}.gap-\[8px\]{gap:8px}.gap-y-4{row-gap:1rem}.space-x-2>:not([hidden])~:not([hidden]){--tw-space-x-reverse: 0;margin-right:calc(.5rem * var(--tw-space-x-reverse));margin-left:calc(.5rem * calc(1 - var(--tw-space-x-reverse)))}.space-x-8>:not([hidden])~:not([hidden]){--tw-space-x-reverse: 0;margin-right:calc(2rem * var(--tw-space-x-reverse));margin-left:calc(2rem * calc(1 - var(--tw-space-x-reverse)))}.space-y-0>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(0px * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(0px * var(--tw-space-y-reverse))}.space-y-0\.5>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.125rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.125rem * var(--tw-space-y-reverse))}.space-y-1>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.25rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.25rem * var(--tw-space-y-reverse))}.space-y-1\.5>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.375rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.375rem * var(--tw-space-y-reverse))}.space-y-2>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(.5rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(.5rem * var(--tw-space-y-reverse))}.space-y-4>:not([hidden])~:not([hidden]){--tw-space-y-reverse: 0;margin-top:calc(1rem * calc(1 - var(--tw-space-y-reverse)));margin-bottom:calc(1rem * var(--tw-space-y-reverse))}.justify-self-end{justify-self:end}.overflow-auto{overflow:auto}.overflow-hidden{overflow:hidden}.overflow-y-auto{overflow-y:auto}.overflow-x-hidden{overflow-x:hidden}.overflow-y-hidden{overflow-y:hidden}.whitespace-nowrap{white-space:nowrap}.rounded-\[14px\]{border-radius:14px}.rounded-\[3rem\]{border-radius:3rem}.rounded-\[50\%\]{border-radius:50%}.rounded-\[inherit\]{border-radius:inherit}.rounded-full{border-radius:9999px}.rounded-lg{border-radius:var(--radius)}.rounded-md{border-radius:calc(var(--radius) - 2px)}.rounded-sm{border-radius:calc(var(--radius) - 4px)}.rounded-xl{border-radius:.75rem}.border{border-width:1px}.border-2{border-width:2px}.border-\[1px\]{border-width:1px}.border-\[2px\]{border-width:2px}.border-b{border-bottom-width:1px}.border-l{border-left-width:1px}.border-r{border-right-width:1px}.border-t{border-top-width:1px}.border-solid{border-style:solid}.border-\[\#ffcc00\]{--tw-border-opacity: 1;border-color:rgb(255 204 0 / 
var(--tw-border-opacity))}.border-\[dashed\]{border-color:dashed}.border-\[solid\]{border-color:solid}.border-destructive{border-color:hsl(var(--destructive))}.border-input{border-color:hsl(var(--input))}.border-primary{border-color:hsl(var(--primary))}.border-primary\/60{border-color:hsl(var(--primary) / .6)}.border-transparent{border-color:transparent}.border-l-transparent{border-left-color:transparent}.border-t-transparent{border-top-color:transparent}.bg-\[\#ffcc00bb\]{background-color:#fc0b}.bg-\[rgba\(21\,_215\,_121\,_0\.936\)\]{background-color:#15d779ef}.bg-\[rgba\(237\,_49\,_55\,_0\.942\)\]{background-color:#ed3137f0}.bg-background{background-color:hsl(var(--background))}.bg-background\/70{background-color:hsl(var(--background) / .7)}.bg-background\/80{background-color:hsl(var(--background) / .8)}.bg-border{background-color:hsl(var(--border))}.bg-destructive{background-color:hsl(var(--destructive))}.bg-muted{background-color:hsl(var(--muted))}.bg-popover{background-color:hsl(var(--popover))}.bg-primary{background-color:hsl(var(--primary))}.bg-primary\/20{background-color:hsl(var(--primary) / .2)}.bg-secondary{background-color:hsl(var(--secondary))}.bg-transparent{background-color:transparent}.bg-\[radial-gradient\(circle_at_1px_1px\,_\#8e8e8e8e_1px\,_transparent_0\)\]{background-image:radial-gradient(circle at 1px 1px,#8e8e8e8e 1px,transparent 0)}.bg-repeat{background-repeat:repeat}.fill-current{fill:currentColor}.fill-primary{fill:hsl(var(--primary))}.p-1{padding:.25rem}.p-1\.5{padding:.375rem}.p-16{padding:4rem}.p-4{padding:1rem}.p-6{padding:1.5rem}.p-\[1px\]{padding:1px}.p-\[8px\]{padding:8px}.px-1{padding-left:.25rem;padding-right:.25rem}.px-2{padding-left:.5rem;padding-right:.5rem}.px-3{padding-left:.75rem;padding-right:.75rem}.px-4{padding-left:1rem;padding-right:1rem}.px-6{padding-left:1.5rem;padding-right:1.5rem}.px-8{padding-left:2rem;padding-right:2rem}.py-1{padding-top:.25rem;padding-bottom:.25rem}.py-1\.5{padding-top:.375rem;padding-bottom:.375rem}.py-2{padding-top:.5rem;padding-bottom:.5rem}.py-3{padding-top:.75rem;padding-bottom:.75rem}.py-4{padding-top:1rem;padding-bottom:1rem}.py-\[6px\]{padding-top:6px;padding-bottom:6px}.pb-4{padding-bottom:1rem}.pl-2{padding-left:.5rem}.pl-3{padding-left:.75rem}.pl-8{padding-left:2rem}.pl-\[30px\]{padding-left:30px}.pl-\[8px\]{padding-left:8px}.pr-1{padding-right:.25rem}.pr-2{padding-right:.5rem}.pr-4{padding-right:1rem}.pr-6{padding-right:1.5rem}.pr-8{padding-right:2rem}.pr-\[8px\]{padding-right:8px}.pt-0{padding-top:0}.pt-4{padding-top:1rem}.text-center{text-align:center}.text-2xl{font-size:1.5rem;line-height:2rem}.text-\[0\.8rem\]{font-size:.8rem}.text-base{font-size:1rem;line-height:1.5rem}.text-lg{font-size:1.125rem;line-height:1.75rem}.text-sm{font-size:.875rem;line-height:1.25rem}.text-xs{font-size:.75rem;line-height:1rem}.font-medium{font-weight:500}.font-semibold{font-weight:600}.leading-none{line-height:1}.tracking-tight{letter-spacing:-.025em}.tracking-widest{letter-spacing:.1em}.text-destructive{color:hsl(var(--destructive))}.text-destructive-foreground{color:hsl(var(--destructive-foreground))}.text-foreground{color:hsl(var(--foreground))}.text-foreground\/50{color:hsl(var(--foreground) / .5)}.text-gray-200{--tw-text-opacity: 1;color:rgb(229 231 235 / 
var(--tw-text-opacity))}.text-muted-foreground{color:hsl(var(--muted-foreground))}.text-popover-foreground{color:hsl(var(--popover-foreground))}.text-primary{color:hsl(var(--primary))}.text-primary-foreground{color:hsl(var(--primary-foreground))}.text-secondary-foreground{color:hsl(var(--secondary-foreground))}.underline-offset-4{text-underline-offset:4px}.opacity-0{opacity:0}.opacity-50{opacity:.5}.opacity-60{opacity:.6}.opacity-70{opacity:.7}.opacity-90{opacity:.9}.shadow{--tw-shadow: 0 1px 3px 0 rgb(0 0 0 / .1), 0 1px 2px -1px rgb(0 0 0 / .1);--tw-shadow-colored: 0 1px 3px 0 var(--tw-shadow-color), 0 1px 2px -1px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.shadow-lg{--tw-shadow: 0 10px 15px -3px rgb(0 0 0 / .1), 0 4px 6px -4px rgb(0 0 0 / .1);--tw-shadow-colored: 0 10px 15px -3px var(--tw-shadow-color), 0 4px 6px -4px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.shadow-md{--tw-shadow: 0 4px 6px -1px rgb(0 0 0 / .1), 0 2px 4px -2px rgb(0 0 0 / .1);--tw-shadow-colored: 0 4px 6px -1px var(--tw-shadow-color), 0 2px 4px -2px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.shadow-sm{--tw-shadow: 0 1px 2px 0 rgb(0 0 0 / .05);--tw-shadow-colored: 0 1px 2px 0 var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.outline-none{outline:2px solid transparent;outline-offset:2px}.outline{outline-style:solid}.outline-dashed{outline-style:dashed}.outline-8{outline-width:8px}.outline-\[rgba\(255\,89\,95\,0\.31\)\]{outline-color:#ff595f4f}.outline-\[rgba\(98\,255\,179\,0\.31\)\]{outline-color:#62ffb34f}.outline-primary{outline-color:hsl(var(--primary))}.ring-0{--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(0px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow, 0 0 #0000)}.ring-offset-background{--tw-ring-offset-color: hsl(var(--background))}.blur{--tw-blur: blur(8px);filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.filter{filter:var(--tw-blur) var(--tw-brightness) var(--tw-contrast) var(--tw-grayscale) var(--tw-hue-rotate) var(--tw-invert) var(--tw-saturate) var(--tw-sepia) var(--tw-drop-shadow)}.backdrop-blur-md{--tw-backdrop-blur: blur(12px);-webkit-backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia);backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia)}.backdrop-blur-sm{--tw-backdrop-blur: blur(4px);-webkit-backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) 
var(--tw-backdrop-sepia);backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia)}.backdrop-filter{-webkit-backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia);backdrop-filter:var(--tw-backdrop-blur) var(--tw-backdrop-brightness) var(--tw-backdrop-contrast) var(--tw-backdrop-grayscale) var(--tw-backdrop-hue-rotate) var(--tw-backdrop-invert) var(--tw-backdrop-opacity) var(--tw-backdrop-saturate) var(--tw-backdrop-sepia)}.transition{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,-webkit-backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter;transition-property:color,background-color,border-color,text-decoration-color,fill,stroke,opacity,box-shadow,transform,filter,backdrop-filter,-webkit-backdrop-filter;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-\[height\]{transition-property:height;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-all{transition-property:all;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-colors{transition-property:color,background-color,border-color,text-decoration-color,fill,stroke;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-opacity{transition-property:opacity;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.transition-transform{transition-property:transform;transition-timing-function:cubic-bezier(.4,0,.2,1);transition-duration:.15s}.duration-200{transition-duration:.2s}.ease-in-out{transition-timing-function:cubic-bezier(.4,0,.2,1)}.content-\[\'\'\]{--tw-content: "";content:var(--tw-content)}@keyframes enter{0%{opacity:var(--tw-enter-opacity, 1);transform:translate3d(var(--tw-enter-translate-x, 0),var(--tw-enter-translate-y, 0),0) scale3d(var(--tw-enter-scale, 1),var(--tw-enter-scale, 1),var(--tw-enter-scale, 1)) rotate(var(--tw-enter-rotate, 0))}}@keyframes exit{to{opacity:var(--tw-exit-opacity, 1);transform:translate3d(var(--tw-exit-translate-x, 0),var(--tw-exit-translate-y, 0),0) scale3d(var(--tw-exit-scale, 1),var(--tw-exit-scale, 1),var(--tw-exit-scale, 1)) rotate(var(--tw-exit-rotate, 0))}}.animate-in{animation-name:enter;animation-duration:.15s;--tw-enter-opacity: initial;--tw-enter-scale: initial;--tw-enter-rotate: initial;--tw-enter-translate-x: initial;--tw-enter-translate-y: initial}.fade-in-0{--tw-enter-opacity: 0}.zoom-in-95{--tw-enter-scale: .95}.duration-200{animation-duration:.2s}.ease-in-out{animation-timing-function:cubic-bezier(.4,0,.2,1)}.\[background-size\:20px_20px\]{background-size:20px 20px}.\[box-shadow\:0_0_0_0_rgba\(21\,_215\,_121\,_0\.936\)\]{box-shadow:0 0 #15d779ef}.\[box-shadow\:0_0_0_9999px_rgba\(0\,_0\,_0\,_0\.5\)\]{box-shadow:0 0 0 9999px 
#00000080}.\[grid-area\:editor-content\]{grid-area:editor-content}.\[grid-area\:original-image-content\]{grid-area:original-image-content}.\[grid-template-areas\:\'editor-content\'\]{grid-template-areas:"editor-content"}.\[grid-template-areas\:\'original-image-content\'\]{grid-template-areas:"original-image-content"}html{font-family:Inter,"system-ui";overflow:hidden}@supports (font-variation-settings: normal){html{font-family:Inter var,"system-ui"}}.react-transform-wrapper{display:grid!important;width:100%!important;height:100%!important}.react-photo-album{padding:8px}.react-photo-album--photo{-moz-user-select:none;-webkit-user-select:none;user-select:none;border-radius:8px;transition:transform .25s,visibility .25s ease-in}.react-photo-album--photo:hover{border:1px solid var(--border);transform:scale(1.03)}.icon-button-icon-wrapper svg{stroke-width:1px}.file\:border-0::file-selector-button{border-width:0px}.file\:bg-transparent::file-selector-button{background-color:transparent}.file\:text-sm::file-selector-button{font-size:.875rem;line-height:1.25rem}.file\:font-medium::file-selector-button{font-weight:500}.placeholder\:text-muted-foreground::-moz-placeholder{color:hsl(var(--muted-foreground))}.placeholder\:text-muted-foreground::placeholder{color:hsl(var(--muted-foreground))}.hover\:cursor-move:hover{cursor:move}.hover\:bg-accent:hover{background-color:hsl(var(--accent))}.hover\:bg-destructive\/90:hover{background-color:hsl(var(--destructive) / .9)}.hover\:bg-muted:hover{background-color:hsl(var(--muted))}.hover\:bg-primary:hover{background-color:hsl(var(--primary))}.hover\:bg-primary\/90:hover{background-color:hsl(var(--primary) / .9)}.hover\:bg-secondary:hover{background-color:hsl(var(--secondary))}.hover\:bg-secondary\/80:hover{background-color:hsl(var(--secondary) / .8)}.hover\:text-accent-foreground:hover{color:hsl(var(--accent-foreground))}.hover\:text-foreground:hover{color:hsl(var(--foreground))}.hover\:text-muted-foreground:hover{color:hsl(var(--muted-foreground))}.hover\:underline:hover{text-decoration-line:underline}.hover\:opacity-100:hover{opacity:1}.focus\:h-\[120px\]:focus{height:120px}.focus\:overflow-y-auto:focus{overflow-y:auto}.focus\:bg-accent:focus{background-color:hsl(var(--accent))}.focus\:text-accent-foreground:focus{color:hsl(var(--accent-foreground))}.focus\:opacity-100:focus{opacity:1}.focus\:outline-none:focus{outline:2px solid transparent;outline-offset:2px}.focus\:ring-1:focus{--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(1px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow, 0 0 #0000)}.focus\:ring-2:focus{--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(2px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow, 0 0 #0000)}.focus\:ring-ring:focus{--tw-ring-color: hsl(var(--ring))}.focus\:ring-offset-2:focus{--tw-ring-offset-width: 2px}.focus-visible\:outline-none:focus-visible{outline:2px solid transparent;outline-offset:2px}.focus-visible\:ring-1:focus-visible{--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(1px + var(--tw-ring-offset-width)) 
var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow, 0 0 #0000)}.focus-visible\:ring-2:focus-visible{--tw-ring-offset-shadow: var(--tw-ring-inset) 0 0 0 var(--tw-ring-offset-width) var(--tw-ring-offset-color);--tw-ring-shadow: var(--tw-ring-inset) 0 0 0 calc(2px + var(--tw-ring-offset-width)) var(--tw-ring-color);box-shadow:var(--tw-ring-offset-shadow),var(--tw-ring-shadow),var(--tw-shadow, 0 0 #0000)}.focus-visible\:ring-ring:focus-visible{--tw-ring-color: hsl(var(--ring))}.focus-visible\:ring-offset-2:focus-visible{--tw-ring-offset-width: 2px}.focus-visible\:ring-offset-background:focus-visible{--tw-ring-offset-color: hsl(var(--background))}.disabled\:pointer-events-none:disabled{pointer-events:none}.disabled\:cursor-not-allowed:disabled{cursor:not-allowed}.disabled\:opacity-50:disabled{opacity:.5}.group:hover .group-hover\:opacity-100{opacity:1}.group.destructive .group-\[\.destructive\]\:border-muted\/40{border-color:hsl(var(--muted) / .4)}.group.destructive .group-\[\.destructive\]\:text-red-300{--tw-text-opacity: 1;color:rgb(252 165 165 / var(--tw-text-opacity))}.group.destructive .group-\[\.destructive\]\:hover\:border-destructive\/30:hover{border-color:hsl(var(--destructive) / .3)}.group.destructive .group-\[\.destructive\]\:hover\:bg-destructive:hover{background-color:hsl(var(--destructive))}.group.destructive .group-\[\.destructive\]\:hover\:text-destructive-foreground:hover{color:hsl(var(--destructive-foreground))}.group.destructive .group-\[\.destructive\]\:hover\:text-red-50:hover{--tw-text-opacity: 1;color:rgb(254 242 242 / var(--tw-text-opacity))}.group.destructive .group-\[\.destructive\]\:focus\:ring-destructive:focus{--tw-ring-color: hsl(var(--destructive))}.group.destructive .group-\[\.destructive\]\:focus\:ring-red-400:focus{--tw-ring-opacity: 1;--tw-ring-color: rgb(248 113 113 / var(--tw-ring-opacity))}.group.destructive .group-\[\.destructive\]\:focus\:ring-offset-red-600:focus{--tw-ring-offset-color: #dc2626}.peer:disabled~.peer-disabled\:cursor-not-allowed{cursor:not-allowed}.peer:disabled~.peer-disabled\:opacity-70{opacity:.7}.data-\[disabled\]\:pointer-events-none[data-disabled]{pointer-events:none}.data-\[side\=bottom\]\:translate-y-1[data-side=bottom]{--tw-translate-y: .25rem;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.data-\[side\=left\]\:-translate-x-1[data-side=left]{--tw-translate-x: -.25rem;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.data-\[side\=right\]\:translate-x-1[data-side=right]{--tw-translate-x: .25rem;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.data-\[side\=top\]\:-translate-y-1[data-side=top]{--tw-translate-y: -.25rem;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.data-\[state\=checked\]\:translate-x-4[data-state=checked]{--tw-translate-x: 1rem;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) 
scaleY(var(--tw-scale-y))}.data-\[state\=unchecked\]\:translate-x-0[data-state=unchecked],.data-\[swipe\=cancel\]\:translate-x-0[data-swipe=cancel]{--tw-translate-x: 0px;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.data-\[swipe\=end\]\:translate-x-\[var\(--radix-toast-swipe-end-x\)\][data-swipe=end]{--tw-translate-x: var(--radix-toast-swipe-end-x);transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}.data-\[swipe\=move\]\:translate-x-\[var\(--radix-toast-swipe-move-x\)\][data-swipe=move]{--tw-translate-x: var(--radix-toast-swipe-move-x);transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))}@keyframes accordion-up{0%{height:var(--radix-accordion-content-height)}to{height:0}}.data-\[state\=closed\]\:animate-accordion-up[data-state=closed]{animation:accordion-up .2s ease-out}@keyframes accordion-down{0%{height:0}to{height:var(--radix-accordion-content-height)}}.data-\[state\=open\]\:animate-accordion-down[data-state=open]{animation:accordion-down .2s ease-out}.data-\[disabled\]\:cursor-not-allowed[data-disabled]{cursor:not-allowed}.data-\[state\=active\]\:bg-background[data-state=active]{background-color:hsl(var(--background))}.data-\[state\=checked\]\:bg-primary[data-state=checked]{background-color:hsl(var(--primary))}.data-\[state\=on\]\:bg-accent[data-state=on],.data-\[state\=open\]\:bg-accent[data-state=open]{background-color:hsl(var(--accent))}.data-\[state\=unchecked\]\:bg-input[data-state=unchecked]{background-color:hsl(var(--input))}.data-\[state\=active\]\:text-foreground[data-state=active]{color:hsl(var(--foreground))}.data-\[state\=on\]\:text-accent-foreground[data-state=on],.data-\[state\=open\]\:text-accent-foreground[data-state=open]{color:hsl(var(--accent-foreground))}.data-\[state\=open\]\:text-muted-foreground[data-state=open]{color:hsl(var(--muted-foreground))}.data-\[disabled\]\:opacity-50[data-disabled]{opacity:.5}.data-\[state\=active\]\:shadow[data-state=active]{--tw-shadow: 0 1px 3px 0 rgb(0 0 0 / .1), 0 1px 2px -1px rgb(0 0 0 / .1);--tw-shadow-colored: 0 1px 3px 0 var(--tw-shadow-color), 0 1px 2px -1px var(--tw-shadow-color);box-shadow:var(--tw-ring-offset-shadow, 0 0 #0000),var(--tw-ring-shadow, 0 0 #0000),var(--tw-shadow)}.data-\[swipe\=move\]\:transition-none[data-swipe=move]{transition-property:none}.data-\[state\=closed\]\:duration-200[data-state=closed]{transition-duration:.2s}.data-\[state\=open\]\:duration-300[data-state=open]{transition-duration:.3s}.data-\[state\=open\]\:animate-in[data-state=open]{animation-name:enter;animation-duration:.15s;--tw-enter-opacity: initial;--tw-enter-scale: initial;--tw-enter-rotate: initial;--tw-enter-translate-x: initial;--tw-enter-translate-y: initial}.data-\[state\=closed\]\:animate-out[data-state=closed],.data-\[swipe\=end\]\:animate-out[data-swipe=end]{animation-name:exit;animation-duration:.15s;--tw-exit-opacity: initial;--tw-exit-scale: initial;--tw-exit-rotate: initial;--tw-exit-translate-x: initial;--tw-exit-translate-y: initial}.data-\[state\=closed\]\:fade-out-0[data-state=closed]{--tw-exit-opacity: 0}.data-\[state\=closed\]\:fade-out-80[data-state=closed]{--tw-exit-opacity: 
.8}.data-\[state\=open\]\:fade-in-0[data-state=open]{--tw-enter-opacity: 0}.data-\[state\=closed\]\:zoom-out-95[data-state=closed]{--tw-exit-scale: .95}.data-\[state\=open\]\:zoom-in-95[data-state=open]{--tw-enter-scale: .95}.data-\[side\=bottom\]\:slide-in-from-top-2[data-side=bottom]{--tw-enter-translate-y: -.5rem}.data-\[side\=left\]\:slide-in-from-right-2[data-side=left]{--tw-enter-translate-x: .5rem}.data-\[side\=right\]\:slide-in-from-left-2[data-side=right]{--tw-enter-translate-x: -.5rem}.data-\[side\=top\]\:slide-in-from-bottom-2[data-side=top]{--tw-enter-translate-y: .5rem}.data-\[state\=closed\]\:slide-out-to-bottom[data-state=closed]{--tw-exit-translate-y: 100%}.data-\[state\=closed\]\:slide-out-to-left[data-state=closed]{--tw-exit-translate-x: -100%}.data-\[state\=closed\]\:slide-out-to-left-1\/2[data-state=closed]{--tw-exit-translate-x: -50%}.data-\[state\=closed\]\:slide-out-to-right[data-state=closed],.data-\[state\=closed\]\:slide-out-to-right-full[data-state=closed]{--tw-exit-translate-x: 100%}.data-\[state\=closed\]\:slide-out-to-top[data-state=closed]{--tw-exit-translate-y: -100%}.data-\[state\=closed\]\:slide-out-to-top-\[48\%\][data-state=closed]{--tw-exit-translate-y: -48%}.data-\[state\=open\]\:slide-in-from-bottom[data-state=open]{--tw-enter-translate-y: 100%}.data-\[state\=open\]\:slide-in-from-left[data-state=open]{--tw-enter-translate-x: -100%}.data-\[state\=open\]\:slide-in-from-left-1\/2[data-state=open]{--tw-enter-translate-x: -50%}.data-\[state\=open\]\:slide-in-from-right[data-state=open]{--tw-enter-translate-x: 100%}.data-\[state\=open\]\:slide-in-from-top[data-state=open]{--tw-enter-translate-y: -100%}.data-\[state\=open\]\:slide-in-from-top-\[48\%\][data-state=open]{--tw-enter-translate-y: -48%}.data-\[state\=open\]\:slide-in-from-top-full[data-state=open]{--tw-enter-translate-y: -100%}.data-\[state\=closed\]\:duration-200[data-state=closed]{animation-duration:.2s}.data-\[state\=open\]\:duration-300[data-state=open]{animation-duration:.3s}:is(.dark .dark\:text-gray-600){--tw-text-opacity: 1;color:rgb(75 85 99 / var(--tw-text-opacity))}@media (min-width: 640px){.sm\:bottom-0{bottom:0}.sm\:right-0{right:0}.sm\:top-auto{top:auto}.sm\:mt-0{margin-top:0}.sm\:max-w-sm{max-width:24rem}.sm\:flex-row{flex-direction:row}.sm\:flex-col{flex-direction:column}.sm\:justify-end{justify-content:flex-end}.sm\:space-x-2>:not([hidden])~:not([hidden]){--tw-space-x-reverse: 0;margin-right:calc(.5rem * var(--tw-space-x-reverse));margin-left:calc(.5rem * calc(1 - var(--tw-space-x-reverse)))}.sm\:rounded-lg{border-radius:var(--radius)}.sm\:text-left{text-align:left}.data-\[state\=open\]\:sm\:slide-in-from-bottom-full[data-state=open]{--tw-enter-translate-y: 100%}}@media (min-width: 768px){.md\:max-w-\[420px\]{max-width:420px}}.\[\&\+div\]\:text-xs+div{font-size:.75rem;line-height:1rem}.\[\&\>span\]\:line-clamp-1>span{overflow:hidden;display:-webkit-box;-webkit-box-orient:vertical;-webkit-line-clamp:1}.\[\&\[data-state\=open\]\>svg\]\:rotate-180[data-state=open]>svg{--tw-rotate: 180deg;transform:translate(var(--tw-translate-x),var(--tw-translate-y)) rotate(var(--tw-rotate)) skew(var(--tw-skew-x)) skewY(var(--tw-skew-y)) scaleX(var(--tw-scale-x)) scaleY(var(--tw-scale-y))} diff --git a/inpaint/web_app/assets/index-VrFIcmY_.js b/inpaint/web_app/assets/index-VrFIcmY_.js new file mode 100644 index 0000000..48e8952 --- /dev/null +++ b/inpaint/web_app/assets/index-VrFIcmY_.js @@ -0,0 +1,165 @@ +var Uv=(e,t,n)=>{if(!t.has(e))throw TypeError("Cannot "+n)};var j=(e,t,n)=>(Uv(e,t,"read from 
private field"),n?n.call(e):t.get(e)),ke=(e,t,n)=>{if(t.has(e))throw TypeError("Cannot add the same private member more than once");t instanceof WeakSet?t.add(e):t.set(e,n)},we=(e,t,n,r)=>(Uv(e,t,"write to private field"),r?r.call(e,n):t.set(e,n),n);var Yf=(e,t,n,r)=>({set _(o){we(e,t,o,n)},get _(){return j(e,t,r)}}),tt=(e,t,n)=>(Uv(e,t,"access private method"),n);function R9(e,t){for(var n=0;nr[o]})}}}return Object.freeze(Object.defineProperty(e,Symbol.toStringTag,{value:"Module"}))}(function(){const t=document.createElement("link").relList;if(t&&t.supports&&t.supports("modulepreload"))return;for(const o of document.querySelectorAll('link[rel="modulepreload"]'))r(o);new MutationObserver(o=>{for(const i of o)if(i.type==="childList")for(const s of i.addedNodes)s.tagName==="LINK"&&s.rel==="modulepreload"&&r(s)}).observe(document,{childList:!0,subtree:!0});function n(o){const i={};return o.integrity&&(i.integrity=o.integrity),o.referrerPolicy&&(i.referrerPolicy=o.referrerPolicy),o.crossOrigin==="use-credentials"?i.credentials="include":o.crossOrigin==="anonymous"?i.credentials="omit":i.credentials="same-origin",i}function r(o){if(o.ep)return;o.ep=!0;const i=n(o);fetch(o.href,i)}})();var Qc=typeof globalThis<"u"?globalThis:typeof window<"u"?window:typeof global<"u"?global:typeof self<"u"?self:{};function pm(e){return e&&e.__esModule&&Object.prototype.hasOwnProperty.call(e,"default")?e.default:e}var _$={exports:{}},mm={},E$={exports:{}},it={};/** + * @license React + * react.production.min.js + * + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */var Pd=Symbol.for("react.element"),P9=Symbol.for("react.portal"),T9=Symbol.for("react.fragment"),k9=Symbol.for("react.strict_mode"),A9=Symbol.for("react.profiler"),M9=Symbol.for("react.provider"),O9=Symbol.for("react.context"),N9=Symbol.for("react.forward_ref"),D9=Symbol.for("react.suspense"),I9=Symbol.for("react.memo"),L9=Symbol.for("react.lazy"),H_=Symbol.iterator;function F9(e){return e===null||typeof e!="object"?null:(e=H_&&e[H_]||e["@@iterator"],typeof e=="function"?e:null)}var C$={isMounted:function(){return!1},enqueueForceUpdate:function(){},enqueueReplaceState:function(){},enqueueSetState:function(){}},$$=Object.assign,R$={};function xc(e,t,n){this.props=e,this.context=t,this.refs=R$,this.updater=n||C$}xc.prototype.isReactComponent={};xc.prototype.setState=function(e,t){if(typeof e!="object"&&typeof e!="function"&&e!=null)throw Error("setState(...): takes an object of state variables to update or a function which returns an object of state variables.");this.updater.enqueueSetState(this,e,t,"setState")};xc.prototype.forceUpdate=function(e){this.updater.enqueueForceUpdate(this,e,"forceUpdate")};function P$(){}P$.prototype=xc.prototype;function sw(e,t,n){this.props=e,this.context=t,this.refs=R$,this.updater=n||C$}var aw=sw.prototype=new P$;aw.constructor=sw;$$(aw,xc.prototype);aw.isPureReactComponent=!0;var K_=Array.isArray,T$=Object.prototype.hasOwnProperty,lw={current:null},k$={key:!0,ref:!0,__self:!0,__source:!0};function A$(e,t,n){var r,o={},i=null,s=null;if(t!=null)for(r in t.ref!==void 0&&(s=t.ref),t.key!==void 0&&(i=""+t.key),t)T$.call(t,r)&&!k$.hasOwnProperty(r)&&(o[r]=t[r]);var l=arguments.length-2;if(l===1)o.children=n;else if(1>>1,L=G[le];if(0>>1;leo(Ke,Q))Meo(me,Ke)?(G[le]=me,G[Me]=Q,le=Me):(G[le]=Ke,G[Ne]=Q,le=Ne);else if(Meo(me,Q))G[le]=me,G[Me]=Q,le=Me;else break e}}return Z}function o(G,Z){var 
Q=G.sortIndex-Z.sortIndex;return Q!==0?Q:G.id-Z.id}if(typeof performance=="object"&&typeof performance.now=="function"){var i=performance;e.unstable_now=function(){return i.now()}}else{var s=Date,l=s.now();e.unstable_now=function(){return s.now()-l}}var u=[],f=[],m=1,p=null,g=3,y=!1,x=!1,S=!1,E=typeof setTimeout=="function"?setTimeout:null,_=typeof clearTimeout=="function"?clearTimeout:null,b=typeof setImmediate<"u"?setImmediate:null;typeof navigator<"u"&&navigator.scheduling!==void 0&&navigator.scheduling.isInputPending!==void 0&&navigator.scheduling.isInputPending.bind(navigator.scheduling);function C(G){for(var Z=n(f);Z!==null;){if(Z.callback===null)r(f);else if(Z.startTime<=G)r(f),Z.sortIndex=Z.expirationTime,t(u,Z);else break;Z=n(f)}}function R(G){if(S=!1,C(G),!x)if(n(u)!==null)x=!0,V(k);else{var Z=n(f);Z!==null&&J(R,Z.startTime-G)}}function k(G,Z){x=!1,S&&(S=!1,_(I),I=-1),y=!0;var Q=g;try{for(C(Z),p=n(u);p!==null&&(!(p.expirationTime>Z)||G&&!ie());){var le=p.callback;if(typeof le=="function"){p.callback=null,g=p.priorityLevel;var L=le(p.expirationTime<=Z);Z=e.unstable_now(),typeof L=="function"?p.callback=L:p===n(u)&&r(u),C(Z)}else r(u);p=n(u)}if(p!==null)var ue=!0;else{var Ne=n(f);Ne!==null&&J(R,Ne.startTime-Z),ue=!1}return ue}finally{p=null,g=Q,y=!1}}var O=!1,A=null,I=-1,z=5,H=-1;function ie(){return!(e.unstable_now()-HG||125le?(G.sortIndex=Q,t(f,G),n(u)===null&&G===n(f)&&(S?(_(I),I=-1):S=!0,J(R,Q-le))):(G.sortIndex=L,t(u,G),x||y||(x=!0,V(k))),G},e.unstable_shouldYield=ie,e.unstable_wrapCallback=function(G){var Z=g;return function(){var Q=g;g=Z;try{return G.apply(this,arguments)}finally{g=Q}}}})(D$);N$.exports=D$;var Z9=N$.exports;/** + * @license React + * react-dom.production.min.js + * + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */var I$=d,wr=Z9;function de(e){for(var t="https://reactjs.org/docs/error-decoder.html?invariant="+e,n=1;n"u"||typeof window.document>"u"||typeof window.document.createElement>"u"),Z0=Object.prototype.hasOwnProperty,q9=/^[:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD][:A-Z_a-z\u00C0-\u00D6\u00D8-\u00F6\u00F8-\u02FF\u0370-\u037D\u037F-\u1FFF\u200C-\u200D\u2070-\u218F\u2C00-\u2FEF\u3001-\uD7FF\uF900-\uFDCF\uFDF0-\uFFFD\-.0-9\u00B7\u0300-\u036F\u203F-\u2040]*$/,Y_={},X_={};function Q9(e){return Z0.call(X_,e)?!0:Z0.call(Y_,e)?!1:q9.test(e)?X_[e]=!0:(Y_[e]=!0,!1)}function J9(e,t,n,r){if(n!==null&&n.type===0)return!1;switch(typeof t){case"function":case"symbol":return!0;case"boolean":return r?!1:n!==null?!n.acceptsBooleans:(e=e.toLowerCase().slice(0,5),e!=="data-"&&e!=="aria-");default:return!1}}function eD(e,t,n,r){if(t===null||typeof t>"u"||J9(e,t,n,r))return!0;if(r)return!1;if(n!==null)switch(n.type){case 3:return!t;case 4:return t===!1;case 5:return isNaN(t);case 6:return isNaN(t)||1>t}return!1}function Un(e,t,n,r,o,i,s){this.acceptsBooleans=t===2||t===3||t===4,this.attributeName=r,this.attributeNamespace=o,this.mustUseProperty=n,this.propertyName=e,this.type=t,this.sanitizeURL=i,this.removeEmptyString=s}var wn={};"children dangerouslySetInnerHTML defaultValue defaultChecked innerHTML suppressContentEditableWarning suppressHydrationWarning style".split(" ").forEach(function(e){wn[e]=new Un(e,0,!1,e,null,!1,!1)});[["acceptCharset","accept-charset"],["className","class"],["htmlFor","for"],["httpEquiv","http-equiv"]].forEach(function(e){var t=e[0];wn[t]=new Un(t,1,!1,e[1],null,!1,!1)});["contentEditable","draggable","spellCheck","value"].forEach(function(e){wn[e]=new Un(e,2,!1,e.toLowerCase(),null,!1,!1)});["autoReverse","externalResourcesRequired","focusable","preserveAlpha"].forEach(function(e){wn[e]=new Un(e,2,!1,e,null,!1,!1)});"allowFullScreen async autoFocus autoPlay controls default defer disabled disablePictureInPicture disableRemotePlayback formNoValidate hidden loop noModule noValidate open playsInline readOnly required reversed scoped seamless itemScope".split(" ").forEach(function(e){wn[e]=new Un(e,3,!1,e.toLowerCase(),null,!1,!1)});["checked","multiple","muted","selected"].forEach(function(e){wn[e]=new Un(e,3,!0,e,null,!1,!1)});["capture","download"].forEach(function(e){wn[e]=new Un(e,4,!1,e,null,!1,!1)});["cols","rows","size","span"].forEach(function(e){wn[e]=new Un(e,6,!1,e,null,!1,!1)});["rowSpan","start"].forEach(function(e){wn[e]=new Un(e,5,!1,e.toLowerCase(),null,!1,!1)});var uw=/[\-:]([a-z])/g;function dw(e){return e[1].toUpperCase()}"accent-height alignment-baseline arabic-form baseline-shift cap-height clip-path clip-rule color-interpolation color-interpolation-filters color-profile color-rendering dominant-baseline enable-background fill-opacity fill-rule flood-color flood-opacity font-family font-size font-size-adjust font-stretch font-style font-variant font-weight glyph-name glyph-orientation-horizontal glyph-orientation-vertical horiz-adv-x horiz-origin-x image-rendering letter-spacing lighting-color marker-end marker-mid marker-start overline-position overline-thickness paint-order panose-1 pointer-events rendering-intent shape-rendering stop-color stop-opacity strikethrough-position strikethrough-thickness stroke-dasharray stroke-dashoffset stroke-linecap stroke-linejoin stroke-miterlimit stroke-opacity stroke-width text-anchor text-decoration text-rendering 
underline-position underline-thickness unicode-bidi unicode-range units-per-em v-alphabetic v-hanging v-ideographic v-mathematical vector-effect vert-adv-y vert-origin-x vert-origin-y word-spacing writing-mode xmlns:xlink x-height".split(" ").forEach(function(e){var t=e.replace(uw,dw);wn[t]=new Un(t,1,!1,e,null,!1,!1)});"xlink:actuate xlink:arcrole xlink:role xlink:show xlink:title xlink:type".split(" ").forEach(function(e){var t=e.replace(uw,dw);wn[t]=new Un(t,1,!1,e,"http://www.w3.org/1999/xlink",!1,!1)});["xml:base","xml:lang","xml:space"].forEach(function(e){var t=e.replace(uw,dw);wn[t]=new Un(t,1,!1,e,"http://www.w3.org/XML/1998/namespace",!1,!1)});["tabIndex","crossOrigin"].forEach(function(e){wn[e]=new Un(e,1,!1,e.toLowerCase(),null,!1,!1)});wn.xlinkHref=new Un("xlinkHref",1,!1,"xlink:href","http://www.w3.org/1999/xlink",!0,!1);["src","href","action","formAction"].forEach(function(e){wn[e]=new Un(e,1,!1,e.toLowerCase(),null,!0,!0)});function fw(e,t,n,r){var o=wn.hasOwnProperty(t)?wn[t]:null;(o!==null?o.type!==0:r||!(2l||o[s]!==i[l]){var u=` +`+o[s].replace(" at new "," at ");return e.displayName&&u.includes("")&&(u=u.replace("",e.displayName)),u}while(1<=s&&0<=l);break}}}finally{Hv=!1,Error.prepareStackTrace=n}return(e=e?e.displayName||e.name:"")?gu(e):""}function tD(e){switch(e.tag){case 5:return gu(e.type);case 16:return gu("Lazy");case 13:return gu("Suspense");case 19:return gu("SuspenseList");case 0:case 2:case 15:return e=Kv(e.type,!1),e;case 11:return e=Kv(e.type.render,!1),e;case 1:return e=Kv(e.type,!0),e;default:return""}}function ey(e){if(e==null)return null;if(typeof e=="function")return e.displayName||e.name||null;if(typeof e=="string")return e;switch(e){case yl:return"Fragment";case vl:return"Portal";case q0:return"Profiler";case hw:return"StrictMode";case Q0:return"Suspense";case J0:return"SuspenseList"}if(typeof e=="object")switch(e.$$typeof){case j$:return(e.displayName||"Context")+".Consumer";case F$:return(e._context.displayName||"Context")+".Provider";case pw:var t=e.render;return e=e.displayName,e||(e=t.displayName||t.name||"",e=e!==""?"ForwardRef("+e+")":"ForwardRef"),e;case mw:return t=e.displayName||null,t!==null?t:ey(e.type)||"Memo";case ns:t=e._payload,e=e._init;try{return ey(e(t))}catch{}}return null}function nD(e){var t=e.type;switch(e.tag){case 24:return"Cache";case 9:return(t.displayName||"Context")+".Consumer";case 10:return(t._context.displayName||"Context")+".Provider";case 18:return"DehydratedFragment";case 11:return e=t.render,e=e.displayName||e.name||"",t.displayName||(e!==""?"ForwardRef("+e+")":"ForwardRef");case 7:return"Fragment";case 5:return t;case 4:return"Portal";case 3:return"Root";case 6:return"Text";case 16:return ey(t);case 8:return t===hw?"StrictMode":"Mode";case 22:return"Offscreen";case 12:return"Profiler";case 21:return"Scope";case 13:return"Suspense";case 19:return"SuspenseList";case 25:return"TracingMarker";case 1:case 0:case 17:case 2:case 14:case 15:if(typeof t=="function")return t.displayName||t.name||null;if(typeof t=="string")return t}return null}function As(e){switch(typeof e){case"boolean":case"number":case"string":case"undefined":return e;case"object":return e;default:return""}}function z$(e){var t=e.type;return(e=e.nodeName)&&e.toLowerCase()==="input"&&(t==="checkbox"||t==="radio")}function rD(e){var t=z$(e)?"checked":"value",n=Object.getOwnPropertyDescriptor(e.constructor.prototype,t),r=""+e[t];if(!e.hasOwnProperty(t)&&typeof n<"u"&&typeof n.get=="function"&&typeof n.set=="function"){var o=n.get,i=n.set;return 
Object.defineProperty(e,t,{configurable:!0,get:function(){return o.call(this)},set:function(s){r=""+s,i.call(this,s)}}),Object.defineProperty(e,t,{enumerable:n.enumerable}),{getValue:function(){return r},setValue:function(s){r=""+s},stopTracking:function(){e._valueTracker=null,delete e[t]}}}}function qf(e){e._valueTracker||(e._valueTracker=rD(e))}function U$(e){if(!e)return!1;var t=e._valueTracker;if(!t)return!0;var n=t.getValue(),r="";return e&&(r=z$(e)?e.checked?"true":"false":e.value),e=r,e!==n?(t.setValue(e),!0):!1}function up(e){if(e=e||(typeof document<"u"?document:void 0),typeof e>"u")return null;try{return e.activeElement||e.body}catch{return e.body}}function ty(e,t){var n=t.checked;return jt({},t,{defaultChecked:void 0,defaultValue:void 0,value:void 0,checked:n??e._wrapperState.initialChecked})}function q_(e,t){var n=t.defaultValue==null?"":t.defaultValue,r=t.checked!=null?t.checked:t.defaultChecked;n=As(t.value!=null?t.value:n),e._wrapperState={initialChecked:r,initialValue:n,controlled:t.type==="checkbox"||t.type==="radio"?t.checked!=null:t.value!=null}}function V$(e,t){t=t.checked,t!=null&&fw(e,"checked",t,!1)}function ny(e,t){V$(e,t);var n=As(t.value),r=t.type;if(n!=null)r==="number"?(n===0&&e.value===""||e.value!=n)&&(e.value=""+n):e.value!==""+n&&(e.value=""+n);else if(r==="submit"||r==="reset"){e.removeAttribute("value");return}t.hasOwnProperty("value")?ry(e,t.type,n):t.hasOwnProperty("defaultValue")&&ry(e,t.type,As(t.defaultValue)),t.checked==null&&t.defaultChecked!=null&&(e.defaultChecked=!!t.defaultChecked)}function Q_(e,t,n){if(t.hasOwnProperty("value")||t.hasOwnProperty("defaultValue")){var r=t.type;if(!(r!=="submit"&&r!=="reset"||t.value!==void 0&&t.value!==null))return;t=""+e._wrapperState.initialValue,n||t===e.value||(e.value=t),e.defaultValue=t}n=e.name,n!==""&&(e.name=""),e.defaultChecked=!!e._wrapperState.initialChecked,n!==""&&(e.name=n)}function ry(e,t,n){(t!=="number"||up(e.ownerDocument)!==e)&&(n==null?e.defaultValue=""+e._wrapperState.initialValue:e.defaultValue!==""+n&&(e.defaultValue=""+n))}var vu=Array.isArray;function kl(e,t,n,r){if(e=e.options,t){t={};for(var o=0;o"+t.valueOf().toString()+"",t=Qf.firstChild;e.firstChild;)e.removeChild(e.firstChild);for(;t.firstChild;)e.appendChild(t.firstChild)}});function ju(e,t){if(t){var n=e.firstChild;if(n&&n===e.lastChild&&n.nodeType===3){n.nodeValue=t;return}}e.textContent=t}var Eu={animationIterationCount:!0,aspectRatio:!0,borderImageOutset:!0,borderImageSlice:!0,borderImageWidth:!0,boxFlex:!0,boxFlexGroup:!0,boxOrdinalGroup:!0,columnCount:!0,columns:!0,flex:!0,flexGrow:!0,flexPositive:!0,flexShrink:!0,flexNegative:!0,flexOrder:!0,gridArea:!0,gridRow:!0,gridRowEnd:!0,gridRowSpan:!0,gridRowStart:!0,gridColumn:!0,gridColumnEnd:!0,gridColumnSpan:!0,gridColumnStart:!0,fontWeight:!0,lineClamp:!0,lineHeight:!0,opacity:!0,order:!0,orphans:!0,tabSize:!0,widows:!0,zIndex:!0,zoom:!0,fillOpacity:!0,floodOpacity:!0,stopOpacity:!0,strokeDasharray:!0,strokeDashoffset:!0,strokeMiterlimit:!0,strokeOpacity:!0,strokeWidth:!0},oD=["Webkit","ms","Moz","O"];Object.keys(Eu).forEach(function(e){oD.forEach(function(t){t=t+e.charAt(0).toUpperCase()+e.substring(1),Eu[t]=Eu[e]})});function G$(e,t,n){return t==null||typeof t=="boolean"||t===""?"":n||typeof t!="number"||t===0||Eu.hasOwnProperty(e)&&Eu[e]?(""+t).trim():t+"px"}function Y$(e,t){e=e.style;for(var n in t)if(t.hasOwnProperty(n)){var r=n.indexOf("--")===0,o=G$(n,t[n],r);n==="float"&&(n="cssFloat"),r?e.setProperty(n,o):e[n]=o}}var 
iD=jt({menuitem:!0},{area:!0,base:!0,br:!0,col:!0,embed:!0,hr:!0,img:!0,input:!0,keygen:!0,link:!0,meta:!0,param:!0,source:!0,track:!0,wbr:!0});function sy(e,t){if(t){if(iD[e]&&(t.children!=null||t.dangerouslySetInnerHTML!=null))throw Error(de(137,e));if(t.dangerouslySetInnerHTML!=null){if(t.children!=null)throw Error(de(60));if(typeof t.dangerouslySetInnerHTML!="object"||!("__html"in t.dangerouslySetInnerHTML))throw Error(de(61))}if(t.style!=null&&typeof t.style!="object")throw Error(de(62))}}function ay(e,t){if(e.indexOf("-")===-1)return typeof t.is=="string";switch(e){case"annotation-xml":case"color-profile":case"font-face":case"font-face-src":case"font-face-uri":case"font-face-format":case"font-face-name":case"missing-glyph":return!1;default:return!0}}var ly=null;function gw(e){return e=e.target||e.srcElement||window,e.correspondingUseElement&&(e=e.correspondingUseElement),e.nodeType===3?e.parentNode:e}var cy=null,Al=null,Ml=null;function tE(e){if(e=Ad(e)){if(typeof cy!="function")throw Error(de(280));var t=e.stateNode;t&&(t=xm(t),cy(e.stateNode,e.type,t))}}function X$(e){Al?Ml?Ml.push(e):Ml=[e]:Al=e}function Z$(){if(Al){var e=Al,t=Ml;if(Ml=Al=null,tE(e),t)for(e=0;e>>=0,e===0?32:31-(gD(e)/vD|0)|0}var Jf=64,eh=4194304;function yu(e){switch(e&-e){case 1:return 1;case 2:return 2;case 4:return 4;case 8:return 8;case 16:return 16;case 32:return 32;case 64:case 128:case 256:case 512:case 1024:case 2048:case 4096:case 8192:case 16384:case 32768:case 65536:case 131072:case 262144:case 524288:case 1048576:case 2097152:return e&4194240;case 4194304:case 8388608:case 16777216:case 33554432:case 67108864:return e&130023424;case 134217728:return 134217728;case 268435456:return 268435456;case 536870912:return 536870912;case 1073741824:return 1073741824;default:return e}}function pp(e,t){var n=e.pendingLanes;if(n===0)return 0;var r=0,o=e.suspendedLanes,i=e.pingedLanes,s=n&268435455;if(s!==0){var l=s&~o;l!==0?r=yu(l):(i&=s,i!==0&&(r=yu(i)))}else s=n&~o,s!==0?r=yu(s):i!==0&&(r=yu(i));if(r===0)return 0;if(t!==0&&t!==r&&!(t&o)&&(o=r&-r,i=t&-t,o>=i||o===16&&(i&4194240)!==0))return t;if(r&4&&(r|=n&16),t=e.entangledLanes,t!==0)for(e=e.entanglements,t&=r;0n;n++)t.push(e);return t}function Td(e,t,n){e.pendingLanes|=t,t!==536870912&&(e.suspendedLanes=0,e.pingedLanes=0),e=e.eventTimes,t=31-bo(t),e[t]=n}function bD(e,t){var n=e.pendingLanes&~t;e.pendingLanes=t,e.suspendedLanes=0,e.pingedLanes=0,e.expiredLanes&=t,e.mutableReadLanes&=t,e.entangledLanes&=t,t=e.entanglements;var r=e.eventTimes;for(e=e.expirationTimes;0=$u),uE=" ",dE=!1;function g5(e,t){switch(e){case"keyup":return XD.indexOf(t.keyCode)!==-1;case"keydown":return t.keyCode!==229;case"keypress":case"mousedown":case"focusout":return!0;default:return!1}}function v5(e){return e=e.detail,typeof e=="object"&&"data"in e?e.data:null}var wl=!1;function qD(e,t){switch(e){case"compositionend":return v5(t);case"keypress":return t.which!==32?null:(dE=!0,uE);case"textInput":return e=t.data,e===uE&&dE?null:e;default:return null}}function QD(e,t){if(wl)return e==="compositionend"||!Ew&&g5(e,t)?(e=p5(),Ih=bw=hs=null,wl=!1,e):null;switch(e){case"paste":return null;case"keypress":if(!(t.ctrlKey||t.altKey||t.metaKey)||t.ctrlKey&&t.altKey){if(t.char&&1=t)return{node:n,offset:t-e};e=r}e:{for(;n;){if(n.nextSibling){n=n.nextSibling;break e}n=n.parentNode}n=void 0}n=mE(n)}}function b5(e,t){return e&&t?e===t?!0:e&&e.nodeType===3?!1:t&&t.nodeType===3?b5(e,t.parentNode):"contains"in e?e.contains(t):e.compareDocumentPosition?!!(e.compareDocumentPosition(t)&16):!1:!1}function 
S5(){for(var e=window,t=up();t instanceof e.HTMLIFrameElement;){try{var n=typeof t.contentWindow.location.href=="string"}catch{n=!1}if(n)e=t.contentWindow;else break;t=up(e.document)}return t}function Cw(e){var t=e&&e.nodeName&&e.nodeName.toLowerCase();return t&&(t==="input"&&(e.type==="text"||e.type==="search"||e.type==="tel"||e.type==="url"||e.type==="password")||t==="textarea"||e.contentEditable==="true")}function aI(e){var t=S5(),n=e.focusedElem,r=e.selectionRange;if(t!==n&&n&&n.ownerDocument&&b5(n.ownerDocument.documentElement,n)){if(r!==null&&Cw(n)){if(t=r.start,e=r.end,e===void 0&&(e=t),"selectionStart"in n)n.selectionStart=t,n.selectionEnd=Math.min(e,n.value.length);else if(e=(t=n.ownerDocument||document)&&t.defaultView||window,e.getSelection){e=e.getSelection();var o=n.textContent.length,i=Math.min(r.start,o);r=r.end===void 0?i:Math.min(r.end,o),!e.extend&&i>r&&(o=r,r=i,i=o),o=gE(n,i);var s=gE(n,r);o&&s&&(e.rangeCount!==1||e.anchorNode!==o.node||e.anchorOffset!==o.offset||e.focusNode!==s.node||e.focusOffset!==s.offset)&&(t=t.createRange(),t.setStart(o.node,o.offset),e.removeAllRanges(),i>r?(e.addRange(t),e.extend(s.node,s.offset)):(t.setEnd(s.node,s.offset),e.addRange(t)))}}for(t=[],e=n;e=e.parentNode;)e.nodeType===1&&t.push({element:e,left:e.scrollLeft,top:e.scrollTop});for(typeof n.focus=="function"&&n.focus(),n=0;n=document.documentMode,xl=null,my=null,Pu=null,gy=!1;function vE(e,t,n){var r=n.window===n?n.document:n.nodeType===9?n:n.ownerDocument;gy||xl==null||xl!==up(r)||(r=xl,"selectionStart"in r&&Cw(r)?r={start:r.selectionStart,end:r.selectionEnd}:(r=(r.ownerDocument&&r.ownerDocument.defaultView||window).getSelection(),r={anchorNode:r.anchorNode,anchorOffset:r.anchorOffset,focusNode:r.focusNode,focusOffset:r.focusOffset}),Pu&&Hu(Pu,r)||(Pu=r,r=vp(my,"onSelect"),0_l||(e.current=Sy[_l],Sy[_l]=null,_l--)}function Rt(e,t){_l++,Sy[_l]=e.current,e.current=t}var Ms={},Rn=Fs(Ms),Qn=Fs(!1),Ta=Ms;function ec(e,t){var n=e.type.contextTypes;if(!n)return Ms;var r=e.stateNode;if(r&&r.__reactInternalMemoizedUnmaskedChildContext===t)return r.__reactInternalMemoizedMaskedChildContext;var o={},i;for(i in n)o[i]=t[i];return r&&(e=e.stateNode,e.__reactInternalMemoizedUnmaskedChildContext=t,e.__reactInternalMemoizedMaskedChildContext=o),o}function Jn(e){return e=e.childContextTypes,e!=null}function wp(){At(Qn),At(Rn)}function EE(e,t,n){if(Rn.current!==Ms)throw Error(de(168));Rt(Rn,t),Rt(Qn,n)}function A5(e,t,n){var r=e.stateNode;if(t=t.childContextTypes,typeof r.getChildContext!="function")return n;r=r.getChildContext();for(var o in r)if(!(o in t))throw Error(de(108,nD(e)||"Unknown",o));return jt({},n,r)}function xp(e){return e=(e=e.stateNode)&&e.__reactInternalMemoizedMergedChildContext||Ms,Ta=Rn.current,Rt(Rn,e),Rt(Qn,Qn.current),!0}function CE(e,t,n){var r=e.stateNode;if(!r)throw Error(de(169));n?(e=A5(e,t,Ta),r.__reactInternalMemoizedMergedChildContext=e,At(Qn),At(Rn),Rt(Rn,e)):At(Qn),Rt(Qn,n)}var vi=null,bm=!1,s0=!1;function M5(e){vi===null?vi=[e]:vi.push(e)}function wI(e){bm=!0,M5(e)}function js(){if(!s0&&vi!==null){s0=!0;var e=0,t=bt;try{var n=vi;for(bt=1;e>=s,o-=s,yi=1<<32-bo(t)+o|n<I?(z=A,A=null):z=A.sibling;var H=g(_,A,C[I],R);if(H===null){A===null&&(A=z);break}e&&A&&H.alternate===null&&t(_,A),b=i(H,b,I),O===null?k=H:O.sibling=H,O=H,A=z}if(I===C.length)return n(_,A),Nt&&ra(_,I),k;if(A===null){for(;II?(z=A,A=null):z=A.sibling;var ie=g(_,A,H.value,R);if(ie===null){A===null&&(A=z);break}e&&A&&ie.alternate===null&&t(_,A),b=i(ie,b,I),O===null?k=ie:O.sibling=ie,O=ie,A=z}if(H.done)return 
n(_,A),Nt&&ra(_,I),k;if(A===null){for(;!H.done;I++,H=C.next())H=p(_,H.value,R),H!==null&&(b=i(H,b,I),O===null?k=H:O.sibling=H,O=H);return Nt&&ra(_,I),k}for(A=r(_,A);!H.done;I++,H=C.next())H=y(A,_,I,H.value,R),H!==null&&(e&&H.alternate!==null&&A.delete(H.key===null?I:H.key),b=i(H,b,I),O===null?k=H:O.sibling=H,O=H);return e&&A.forEach(function(K){return t(_,K)}),Nt&&ra(_,I),k}function E(_,b,C,R){if(typeof C=="object"&&C!==null&&C.type===yl&&C.key===null&&(C=C.props.children),typeof C=="object"&&C!==null){switch(C.$$typeof){case Zf:e:{for(var k=C.key,O=b;O!==null;){if(O.key===k){if(k=C.type,k===yl){if(O.tag===7){n(_,O.sibling),b=o(O,C.props.children),b.return=_,_=b;break e}}else if(O.elementType===k||typeof k=="object"&&k!==null&&k.$$typeof===ns&&ME(k)===O.type){n(_,O.sibling),b=o(O,C.props),b.ref=ou(_,O,C),b.return=_,_=b;break e}n(_,O);break}else t(_,O);O=O.sibling}C.type===yl?(b=Pa(C.props.children,_.mode,R,C.key),b.return=_,_=b):(R=Wh(C.type,C.key,C.props,null,_.mode,R),R.ref=ou(_,b,C),R.return=_,_=R)}return s(_);case vl:e:{for(O=C.key;b!==null;){if(b.key===O)if(b.tag===4&&b.stateNode.containerInfo===C.containerInfo&&b.stateNode.implementation===C.implementation){n(_,b.sibling),b=o(b,C.children||[]),b.return=_,_=b;break e}else{n(_,b);break}else t(_,b);b=b.sibling}b=p0(C,_.mode,R),b.return=_,_=b}return s(_);case ns:return O=C._init,E(_,b,O(C._payload),R)}if(vu(C))return x(_,b,C,R);if(Jc(C))return S(_,b,C,R);ah(_,C)}return typeof C=="string"&&C!==""||typeof C=="number"?(C=""+C,b!==null&&b.tag===6?(n(_,b.sibling),b=o(b,C),b.return=_,_=b):(n(_,b),b=h0(C,_.mode,R),b.return=_,_=b),s(_)):n(_,b)}return E}var nc=B5(!0),z5=B5(!1),Md={},Yo=Fs(Md),Xu=Fs(Md),Zu=Fs(Md);function pa(e){if(e===Md)throw Error(de(174));return e}function Nw(e,t){switch(Rt(Zu,t),Rt(Xu,e),Rt(Yo,Md),e=t.nodeType,e){case 9:case 11:t=(t=t.documentElement)?t.namespaceURI:iy(null,"");break;default:e=e===8?t.parentNode:t,t=e.namespaceURI||null,e=e.tagName,t=iy(t,e)}At(Yo),Rt(Yo,t)}function rc(){At(Yo),At(Xu),At(Zu)}function U5(e){pa(Zu.current);var t=pa(Yo.current),n=iy(t,e.type);t!==n&&(Rt(Xu,e),Rt(Yo,n))}function Dw(e){Xu.current===e&&(At(Yo),At(Xu))}var It=Fs(0);function $p(e){for(var t=e;t!==null;){if(t.tag===13){var n=t.memoizedState;if(n!==null&&(n=n.dehydrated,n===null||n.data==="$?"||n.data==="$!"))return t}else if(t.tag===19&&t.memoizedProps.revealOrder!==void 0){if(t.flags&128)return t}else if(t.child!==null){t.child.return=t,t=t.child;continue}if(t===e)break;for(;t.sibling===null;){if(t.return===null||t.return===e)return null;t=t.return}t.sibling.return=t.return,t=t.sibling}return null}var a0=[];function Iw(){for(var e=0;en?n:4,e(!0);var r=l0.transition;l0.transition={};try{e(!1),t()}finally{bt=n,l0.transition=r}}function oR(){return Qr().memoizedState}function _I(e,t,n){var r=$s(e);if(n={lane:r,action:n,hasEagerState:!1,eagerState:null,next:null},iR(e))sR(t,n);else if(n=I5(e,t,n,r),n!==null){var o=jn();So(n,e,r,o),aR(n,t,r)}}function EI(e,t,n){var r=$s(e),o={lane:r,action:n,hasEagerState:!1,eagerState:null,next:null};if(iR(e))sR(t,o);else{var i=e.alternate;if(e.lanes===0&&(i===null||i.lanes===0)&&(i=t.lastRenderedReducer,i!==null))try{var s=t.lastRenderedState,l=i(s,n);if(o.hasEagerState=!0,o.eagerState=l,Eo(l,s)){var u=t.interleaved;u===null?(o.next=o,Mw(t)):(o.next=u.next,u.next=o),t.interleaved=o;return}}catch{}finally{}n=I5(e,t,o,r),n!==null&&(o=jn(),So(n,e,r,o),aR(n,t,r))}}function iR(e){var t=e.alternate;return e===Ft||t!==null&&t===Ft}function sR(e,t){Tu=Rp=!0;var 
n=e.pending;n===null?t.next=t:(t.next=n.next,n.next=t),e.pending=t}function aR(e,t,n){if(n&4194240){var r=t.lanes;r&=e.pendingLanes,n|=r,t.lanes=n,yw(e,n)}}var Pp={readContext:qr,useCallback:_n,useContext:_n,useEffect:_n,useImperativeHandle:_n,useInsertionEffect:_n,useLayoutEffect:_n,useMemo:_n,useReducer:_n,useRef:_n,useState:_n,useDebugValue:_n,useDeferredValue:_n,useTransition:_n,useMutableSource:_n,useSyncExternalStore:_n,useId:_n,unstable_isNewReconciler:!1},CI={readContext:qr,useCallback:function(e,t){return Mo().memoizedState=[e,t===void 0?null:t],e},useContext:qr,useEffect:NE,useImperativeHandle:function(e,t,n){return n=n!=null?n.concat([e]):null,Bh(4194308,4,J5.bind(null,t,e),n)},useLayoutEffect:function(e,t){return Bh(4194308,4,e,t)},useInsertionEffect:function(e,t){return Bh(4,2,e,t)},useMemo:function(e,t){var n=Mo();return t=t===void 0?null:t,e=e(),n.memoizedState=[e,t],e},useReducer:function(e,t,n){var r=Mo();return t=n!==void 0?n(t):t,r.memoizedState=r.baseState=t,e={pending:null,interleaved:null,lanes:0,dispatch:null,lastRenderedReducer:e,lastRenderedState:t},r.queue=e,e=e.dispatch=_I.bind(null,Ft,e),[r.memoizedState,e]},useRef:function(e){var t=Mo();return e={current:e},t.memoizedState=e},useState:OE,useDebugValue:zw,useDeferredValue:function(e){return Mo().memoizedState=e},useTransition:function(){var e=OE(!1),t=e[0];return e=SI.bind(null,e[1]),Mo().memoizedState=e,[t,e]},useMutableSource:function(){},useSyncExternalStore:function(e,t,n){var r=Ft,o=Mo();if(Nt){if(n===void 0)throw Error(de(407));n=n()}else{if(n=t(),dn===null)throw Error(de(349));Aa&30||H5(r,t,n)}o.memoizedState=n;var i={value:n,getSnapshot:t};return o.queue=i,NE(G5.bind(null,r,i,e),[e]),r.flags|=2048,Ju(9,K5.bind(null,r,i,n,t),void 0,null),n},useId:function(){var e=Mo(),t=dn.identifierPrefix;if(Nt){var n=wi,r=yi;n=(r&~(1<<32-bo(r)-1)).toString(32)+n,t=":"+t+"R"+n,n=qu++,0<\/script>",e=e.removeChild(e.firstChild)):typeof r.is=="string"?e=s.createElement(n,{is:r.is}):(e=s.createElement(n),n==="select"&&(s=e,r.multiple?s.multiple=!0:r.size&&(s.size=r.size))):e=s.createElementNS(e,n),e[jo]=t,e[Yu]=r,gR(e,t,!1,!1),t.stateNode=e;e:{switch(s=ay(n,r),n){case"dialog":kt("cancel",e),kt("close",e),o=r;break;case"iframe":case"object":case"embed":kt("load",e),o=r;break;case"video":case"audio":for(o=0;oic&&(t.flags|=128,r=!0,iu(i,!1),t.lanes=4194304)}else{if(!r)if(e=$p(s),e!==null){if(t.flags|=128,r=!0,n=e.updateQueue,n!==null&&(t.updateQueue=n,t.flags|=4),iu(i,!0),i.tail===null&&i.tailMode==="hidden"&&!s.alternate&&!Nt)return En(t),null}else 2*Gt()-i.renderingStartTime>ic&&n!==1073741824&&(t.flags|=128,r=!0,iu(i,!1),t.lanes=4194304);i.isBackwards?(s.sibling=t.child,t.child=s):(n=i.last,n!==null?n.sibling=s:t.child=s,i.last=s)}return i.tail!==null?(t=i.tail,i.rendering=t,i.tail=t.sibling,i.renderingStartTime=Gt(),t.sibling=null,n=It.current,Rt(It,r?n&1|2:n&1),t):(En(t),null);case 22:case 23:return Gw(),r=t.memoizedState!==null,e!==null&&e.memoizedState!==null!==r&&(t.flags|=8192),r&&t.mode&1?hr&1073741824&&(En(t),t.subtreeFlags&6&&(t.flags|=8192)):En(t),null;case 24:return null;case 25:return null}throw Error(de(156,t.tag))}function OI(e,t){switch(Rw(t),t.tag){case 1:return Jn(t.type)&&wp(),e=t.flags,e&65536?(t.flags=e&-65537|128,t):null;case 3:return rc(),At(Qn),At(Rn),Iw(),e=t.flags,e&65536&&!(e&128)?(t.flags=e&-65537|128,t):null;case 5:return Dw(t),null;case 13:if(At(It),e=t.memoizedState,e!==null&&e.dehydrated!==null){if(t.alternate===null)throw Error(de(340));tc()}return 
e=t.flags,e&65536?(t.flags=e&-65537|128,t):null;case 19:return At(It),null;case 4:return rc(),null;case 10:return Aw(t.type._context),null;case 22:case 23:return Gw(),null;case 24:return null;default:return null}}var ch=!1,$n=!1,NI=typeof WeakSet=="function"?WeakSet:Set,$e=null;function Rl(e,t){var n=e.ref;if(n!==null)if(typeof n=="function")try{n(null)}catch(r){Vt(e,t,r)}else n.current=null}function Ny(e,t,n){try{n()}catch(r){Vt(e,t,r)}}var VE=!1;function DI(e,t){if(vy=mp,e=S5(),Cw(e)){if("selectionStart"in e)var n={start:e.selectionStart,end:e.selectionEnd};else e:{n=(n=e.ownerDocument)&&n.defaultView||window;var r=n.getSelection&&n.getSelection();if(r&&r.rangeCount!==0){n=r.anchorNode;var o=r.anchorOffset,i=r.focusNode;r=r.focusOffset;try{n.nodeType,i.nodeType}catch{n=null;break e}var s=0,l=-1,u=-1,f=0,m=0,p=e,g=null;t:for(;;){for(var y;p!==n||o!==0&&p.nodeType!==3||(l=s+o),p!==i||r!==0&&p.nodeType!==3||(u=s+r),p.nodeType===3&&(s+=p.nodeValue.length),(y=p.firstChild)!==null;)g=p,p=y;for(;;){if(p===e)break t;if(g===n&&++f===o&&(l=s),g===i&&++m===r&&(u=s),(y=p.nextSibling)!==null)break;p=g,g=p.parentNode}p=y}n=l===-1||u===-1?null:{start:l,end:u}}else n=null}n=n||{start:0,end:0}}else n=null;for(yy={focusedElem:e,selectionRange:n},mp=!1,$e=t;$e!==null;)if(t=$e,e=t.child,(t.subtreeFlags&1028)!==0&&e!==null)e.return=t,$e=e;else for(;$e!==null;){t=$e;try{var x=t.alternate;if(t.flags&1024)switch(t.tag){case 0:case 11:case 15:break;case 1:if(x!==null){var S=x.memoizedProps,E=x.memoizedState,_=t.stateNode,b=_.getSnapshotBeforeUpdate(t.elementType===t.type?S:co(t.type,S),E);_.__reactInternalSnapshotBeforeUpdate=b}break;case 3:var C=t.stateNode.containerInfo;C.nodeType===1?C.textContent="":C.nodeType===9&&C.documentElement&&C.removeChild(C.documentElement);break;case 5:case 6:case 4:case 17:break;default:throw Error(de(163))}}catch(R){Vt(t,t.return,R)}if(e=t.sibling,e!==null){e.return=t.return,$e=e;break}$e=t.return}return x=VE,VE=!1,x}function ku(e,t,n){var r=t.updateQueue;if(r=r!==null?r.lastEffect:null,r!==null){var o=r=r.next;do{if((o.tag&e)===e){var i=o.destroy;o.destroy=void 0,i!==void 0&&Ny(t,n,i)}o=o.next}while(o!==r)}}function Em(e,t){if(t=t.updateQueue,t=t!==null?t.lastEffect:null,t!==null){var n=t=t.next;do{if((n.tag&e)===e){var r=n.create;n.destroy=r()}n=n.next}while(n!==t)}}function Dy(e){var t=e.ref;if(t!==null){var n=e.stateNode;switch(e.tag){case 5:e=n;break;default:e=n}typeof t=="function"?t(e):t.current=e}}function wR(e){var t=e.alternate;t!==null&&(e.alternate=null,wR(t)),e.child=null,e.deletions=null,e.sibling=null,e.tag===5&&(t=e.stateNode,t!==null&&(delete t[jo],delete t[Yu],delete t[by],delete t[vI],delete t[yI])),e.stateNode=null,e.return=null,e.dependencies=null,e.memoizedProps=null,e.memoizedState=null,e.pendingProps=null,e.stateNode=null,e.updateQueue=null}function xR(e){return e.tag===5||e.tag===3||e.tag===4}function WE(e){e:for(;;){for(;e.sibling===null;){if(e.return===null||xR(e.return))return null;e=e.return}for(e.sibling.return=e.return,e=e.sibling;e.tag!==5&&e.tag!==6&&e.tag!==18;){if(e.flags&2||e.child===null||e.tag===4)continue e;e.child.return=e,e=e.child}if(!(e.flags&2))return e.stateNode}}function Iy(e,t,n){var r=e.tag;if(r===5||r===6)e=e.stateNode,t?n.nodeType===8?n.parentNode.insertBefore(e,t):n.insertBefore(e,t):(n.nodeType===8?(t=n.parentNode,t.insertBefore(e,n)):(t=n,t.appendChild(e)),n=n._reactRootContainer,n!=null||t.onclick!==null||(t.onclick=yp));else if(r!==4&&(e=e.child,e!==null))for(Iy(e,t,n),e=e.sibling;e!==null;)Iy(e,t,n),e=e.sibling}function 
Ly(e,t,n){var r=e.tag;if(r===5||r===6)e=e.stateNode,t?n.insertBefore(e,t):n.appendChild(e);else if(r!==4&&(e=e.child,e!==null))for(Ly(e,t,n),e=e.sibling;e!==null;)Ly(e,t,n),e=e.sibling}var mn=null,ho=!1;function Xi(e,t,n){for(n=n.child;n!==null;)bR(e,t,n),n=n.sibling}function bR(e,t,n){if(Go&&typeof Go.onCommitFiberUnmount=="function")try{Go.onCommitFiberUnmount(gm,n)}catch{}switch(n.tag){case 5:$n||Rl(n,t);case 6:var r=mn,o=ho;mn=null,Xi(e,t,n),mn=r,ho=o,mn!==null&&(ho?(e=mn,n=n.stateNode,e.nodeType===8?e.parentNode.removeChild(n):e.removeChild(n)):mn.removeChild(n.stateNode));break;case 18:mn!==null&&(ho?(e=mn,n=n.stateNode,e.nodeType===8?i0(e.parentNode,n):e.nodeType===1&&i0(e,n),Vu(e)):i0(mn,n.stateNode));break;case 4:r=mn,o=ho,mn=n.stateNode.containerInfo,ho=!0,Xi(e,t,n),mn=r,ho=o;break;case 0:case 11:case 14:case 15:if(!$n&&(r=n.updateQueue,r!==null&&(r=r.lastEffect,r!==null))){o=r=r.next;do{var i=o,s=i.destroy;i=i.tag,s!==void 0&&(i&2||i&4)&&Ny(n,t,s),o=o.next}while(o!==r)}Xi(e,t,n);break;case 1:if(!$n&&(Rl(n,t),r=n.stateNode,typeof r.componentWillUnmount=="function"))try{r.props=n.memoizedProps,r.state=n.memoizedState,r.componentWillUnmount()}catch(l){Vt(n,t,l)}Xi(e,t,n);break;case 21:Xi(e,t,n);break;case 22:n.mode&1?($n=(r=$n)||n.memoizedState!==null,Xi(e,t,n),$n=r):Xi(e,t,n);break;default:Xi(e,t,n)}}function HE(e){var t=e.updateQueue;if(t!==null){e.updateQueue=null;var n=e.stateNode;n===null&&(n=e.stateNode=new NI),t.forEach(function(r){var o=WI.bind(null,e,r);n.has(r)||(n.add(r),r.then(o,o))})}}function ao(e,t){var n=t.deletions;if(n!==null)for(var r=0;ro&&(o=s),r&=~i}if(r=o,r=Gt()-r,r=(120>r?120:480>r?480:1080>r?1080:1920>r?1920:3e3>r?3e3:4320>r?4320:1960*LI(r/1960))-r,10e?16:e,ps===null)var r=!1;else{if(e=ps,ps=null,Ap=0,ft&6)throw Error(de(331));var o=ft;for(ft|=4,$e=e.current;$e!==null;){var i=$e,s=i.child;if($e.flags&16){var l=i.deletions;if(l!==null){for(var u=0;uGt()-Hw?Ra(e,0):Ww|=n),er(e,t)}function TR(e,t){t===0&&(e.mode&1?(t=eh,eh<<=1,!(eh&130023424)&&(eh=4194304)):t=1);var n=jn();e=Ci(e,t),e!==null&&(Td(e,t,n),er(e,n))}function VI(e){var t=e.memoizedState,n=0;t!==null&&(n=t.retryLane),TR(e,n)}function WI(e,t){var n=0;switch(e.tag){case 13:var r=e.stateNode,o=e.memoizedState;o!==null&&(n=o.retryLane);break;case 19:r=e.stateNode;break;default:throw Error(de(314))}r!==null&&r.delete(t),TR(e,n)}var kR;kR=function(e,t,n){if(e!==null)if(e.memoizedProps!==t.pendingProps||Qn.current)qn=!0;else{if(!(e.lanes&n)&&!(t.flags&128))return qn=!1,AI(e,t,n);qn=!!(e.flags&131072)}else qn=!1,Nt&&t.flags&1048576&&O5(t,Sp,t.index);switch(t.lanes=0,t.tag){case 2:var r=t.type;zh(e,t),e=t.pendingProps;var o=ec(t,Rn.current);Nl(t,n),o=Fw(null,t,r,e,o,n);var i=jw();return t.flags|=1,typeof o=="object"&&o!==null&&typeof o.render=="function"&&o.$$typeof===void 0?(t.tag=1,t.memoizedState=null,t.updateQueue=null,Jn(r)?(i=!0,xp(t)):i=!1,t.memoizedState=o.state!==null&&o.state!==void 0?o.state:null,Ow(t),o.updater=Sm,t.stateNode=o,o._reactInternals=t,Ry(t,r,e,n),t=ky(null,t,r,!0,i,n)):(t.tag=0,Nt&&i&&$w(t),In(null,t,o,n),t=t.child),t;case 16:r=t.elementType;e:{switch(zh(e,t),e=t.pendingProps,o=r._init,r=o(r._payload),t.type=r,o=t.tag=KI(r),e=co(r,e),o){case 0:t=Ty(null,t,r,e,n);break e;case 1:t=BE(null,t,r,e,n);break e;case 11:t=FE(null,t,r,e,n);break e;case 14:t=jE(null,t,r,co(r.type,e),n);break e}throw Error(de(306,r,""))}return t;case 0:return r=t.type,o=t.pendingProps,o=t.elementType===r?o:co(r,o),Ty(e,t,r,o,n);case 1:return 
r=t.type,o=t.pendingProps,o=t.elementType===r?o:co(r,o),BE(e,t,r,o,n);case 3:e:{if(hR(t),e===null)throw Error(de(387));r=t.pendingProps,i=t.memoizedState,o=i.element,L5(e,t),Cp(t,r,null,n);var s=t.memoizedState;if(r=s.element,i.isDehydrated)if(i={element:r,isDehydrated:!1,cache:s.cache,pendingSuspenseBoundaries:s.pendingSuspenseBoundaries,transitions:s.transitions},t.updateQueue.baseState=i,t.memoizedState=i,t.flags&256){o=oc(Error(de(423)),t),t=zE(e,t,r,n,o);break e}else if(r!==o){o=oc(Error(de(424)),t),t=zE(e,t,r,n,o);break e}else for(gr=_s(t.stateNode.containerInfo.firstChild),vr=t,Nt=!0,go=null,n=z5(t,null,r,n),t.child=n;n;)n.flags=n.flags&-3|4096,n=n.sibling;else{if(tc(),r===o){t=$i(e,t,n);break e}In(e,t,r,n)}t=t.child}return t;case 5:return U5(t),e===null&&Ey(t),r=t.type,o=t.pendingProps,i=e!==null?e.memoizedProps:null,s=o.children,wy(r,o)?s=null:i!==null&&wy(r,i)&&(t.flags|=32),fR(e,t),In(e,t,s,n),t.child;case 6:return e===null&&Ey(t),null;case 13:return pR(e,t,n);case 4:return Nw(t,t.stateNode.containerInfo),r=t.pendingProps,e===null?t.child=nc(t,null,r,n):In(e,t,r,n),t.child;case 11:return r=t.type,o=t.pendingProps,o=t.elementType===r?o:co(r,o),FE(e,t,r,o,n);case 7:return In(e,t,t.pendingProps,n),t.child;case 8:return In(e,t,t.pendingProps.children,n),t.child;case 12:return In(e,t,t.pendingProps.children,n),t.child;case 10:e:{if(r=t.type._context,o=t.pendingProps,i=t.memoizedProps,s=o.value,Rt(_p,r._currentValue),r._currentValue=s,i!==null)if(Eo(i.value,s)){if(i.children===o.children&&!Qn.current){t=$i(e,t,n);break e}}else for(i=t.child,i!==null&&(i.return=t);i!==null;){var l=i.dependencies;if(l!==null){s=i.child;for(var u=l.firstContext;u!==null;){if(u.context===r){if(i.tag===1){u=bi(-1,n&-n),u.tag=2;var f=i.updateQueue;if(f!==null){f=f.shared;var m=f.pending;m===null?u.next=u:(u.next=m.next,m.next=u),f.pending=u}}i.lanes|=n,u=i.alternate,u!==null&&(u.lanes|=n),Cy(i.return,n,t),l.lanes|=n;break}u=u.next}}else if(i.tag===10)s=i.type===t.type?null:i.child;else if(i.tag===18){if(s=i.return,s===null)throw Error(de(341));s.lanes|=n,l=s.alternate,l!==null&&(l.lanes|=n),Cy(s,n,t),s=i.sibling}else s=i.child;if(s!==null)s.return=i;else for(s=i;s!==null;){if(s===t){s=null;break}if(i=s.sibling,i!==null){i.return=s.return,s=i;break}s=s.return}i=s}In(e,t,o.children,n),t=t.child}return t;case 9:return o=t.type,r=t.pendingProps.children,Nl(t,n),o=qr(o),r=r(o),t.flags|=1,In(e,t,r,n),t.child;case 14:return r=t.type,o=co(r,t.pendingProps),o=co(r.type,o),jE(e,t,r,o,n);case 15:return uR(e,t,t.type,t.pendingProps,n);case 17:return r=t.type,o=t.pendingProps,o=t.elementType===r?o:co(r,o),zh(e,t),t.tag=1,Jn(r)?(e=!0,xp(t)):e=!1,Nl(t,n),j5(t,r,o),Ry(t,r,o,n),ky(null,t,r,!0,e,n);case 19:return mR(e,t,n);case 22:return dR(e,t,n)}throw Error(de(156,t.tag))};function AR(e,t){return r5(e,t)}function HI(e,t,n,r){this.tag=e,this.key=n,this.sibling=this.child=this.return=this.stateNode=this.type=this.elementType=null,this.index=0,this.ref=null,this.pendingProps=t,this.dependencies=this.memoizedState=this.updateQueue=this.memoizedProps=null,this.mode=r,this.subtreeFlags=this.flags=0,this.deletions=null,this.childLanes=this.lanes=0,this.alternate=null}function Kr(e,t,n,r){return new HI(e,t,n,r)}function Xw(e){return e=e.prototype,!(!e||!e.isReactComponent)}function KI(e){if(typeof e=="function")return Xw(e)?1:0;if(e!=null){if(e=e.$$typeof,e===pw)return 11;if(e===mw)return 14}return 2}function Rs(e,t){var n=e.alternate;return 
n===null?(n=Kr(e.tag,t,e.key,e.mode),n.elementType=e.elementType,n.type=e.type,n.stateNode=e.stateNode,n.alternate=e,e.alternate=n):(n.pendingProps=t,n.type=e.type,n.flags=0,n.subtreeFlags=0,n.deletions=null),n.flags=e.flags&14680064,n.childLanes=e.childLanes,n.lanes=e.lanes,n.child=e.child,n.memoizedProps=e.memoizedProps,n.memoizedState=e.memoizedState,n.updateQueue=e.updateQueue,t=e.dependencies,n.dependencies=t===null?null:{lanes:t.lanes,firstContext:t.firstContext},n.sibling=e.sibling,n.index=e.index,n.ref=e.ref,n}function Wh(e,t,n,r,o,i){var s=2;if(r=e,typeof e=="function")Xw(e)&&(s=1);else if(typeof e=="string")s=5;else e:switch(e){case yl:return Pa(n.children,o,i,t);case hw:s=8,o|=8;break;case q0:return e=Kr(12,n,t,o|2),e.elementType=q0,e.lanes=i,e;case Q0:return e=Kr(13,n,t,o),e.elementType=Q0,e.lanes=i,e;case J0:return e=Kr(19,n,t,o),e.elementType=J0,e.lanes=i,e;case B$:return $m(n,o,i,t);default:if(typeof e=="object"&&e!==null)switch(e.$$typeof){case F$:s=10;break e;case j$:s=9;break e;case pw:s=11;break e;case mw:s=14;break e;case ns:s=16,r=null;break e}throw Error(de(130,e==null?e:typeof e,""))}return t=Kr(s,n,t,o),t.elementType=e,t.type=r,t.lanes=i,t}function Pa(e,t,n,r){return e=Kr(7,e,r,t),e.lanes=n,e}function $m(e,t,n,r){return e=Kr(22,e,r,t),e.elementType=B$,e.lanes=n,e.stateNode={isHidden:!1},e}function h0(e,t,n){return e=Kr(6,e,null,t),e.lanes=n,e}function p0(e,t,n){return t=Kr(4,e.children!==null?e.children:[],e.key,t),t.lanes=n,t.stateNode={containerInfo:e.containerInfo,pendingChildren:null,implementation:e.implementation},t}function GI(e,t,n,r,o){this.tag=t,this.containerInfo=e,this.finishedWork=this.pingCache=this.current=this.pendingChildren=null,this.timeoutHandle=-1,this.callbackNode=this.pendingContext=this.context=null,this.callbackPriority=0,this.eventTimes=Yv(0),this.expirationTimes=Yv(-1),this.entangledLanes=this.finishedLanes=this.mutableReadLanes=this.expiredLanes=this.pingedLanes=this.suspendedLanes=this.pendingLanes=0,this.entanglements=Yv(0),this.identifierPrefix=r,this.onRecoverableError=o,this.mutableSourceEagerHydrationData=null}function Zw(e,t,n,r,o,i,s,l,u){return e=new GI(e,t,n,l,u),t===1?(t=1,i===!0&&(t|=8)):t=0,i=Kr(3,null,null,t),e.current=i,i.stateNode=e,i.memoizedState={element:r,isDehydrated:n,cache:null,transitions:null,pendingSuspenseBoundaries:null},Ow(i),e}function YI(e,t,n){var r=3"u"||typeof __REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE!="function"))try{__REACT_DEVTOOLS_GLOBAL_HOOK__.checkDCE(DR)}catch(e){console.error(e)}}DR(),O$.exports=Sr;var Bs=O$.exports;const JI=pm(Bs);var JE=Bs;X0.createRoot=JE.createRoot,X0.hydrateRoot=JE.hydrateRoot;var Od=class{constructor(){this.listeners=new Set,this.subscribe=this.subscribe.bind(this)}subscribe(e){return this.listeners.add(e),this.onSubscribe(),()=>{this.listeners.delete(e),this.onUnsubscribe()}}hasListeners(){return this.listeners.size>0}onSubscribe(){}onUnsubscribe(){}},sc=typeof window>"u"||"Deno"in window;function Lr(){}function eL(e,t){return typeof e=="function"?e(t):e}function Uy(e){return typeof e=="number"&&e>=0&&e!==1/0}function IR(e,t){return Math.max(e+(t||0)-Date.now(),0)}function eC(e,t){const{type:n="all",exact:r,fetchStatus:o,predicate:i,queryKey:s,stale:l}=e;if(s){if(r){if(t.queryHash!==ex(s,t.options))return!1}else if(!nd(t.queryKey,s))return!1}if(n!=="all"){const u=t.isActive();if(n==="active"&&!u||n==="inactive"&&u)return!1}return!(typeof l=="boolean"&&t.isStale()!==l||typeof o<"u"&&o!==t.state.fetchStatus||i&&!i(t))}function 
tC(e,t){const{exact:n,status:r,predicate:o,mutationKey:i}=e;if(i){if(!t.options.mutationKey)return!1;if(n){if(td(t.options.mutationKey)!==td(i))return!1}else if(!nd(t.options.mutationKey,i))return!1}return!(r&&t.state.status!==r||o&&!o(t))}function ex(e,t){return((t==null?void 0:t.queryKeyHashFn)||td)(e)}function td(e){return JSON.stringify(e,(t,n)=>Wy(n)?Object.keys(n).sort().reduce((r,o)=>(r[o]=n[o],r),{}):n)}function nd(e,t){return e===t?!0:typeof e!=typeof t?!1:e&&t&&typeof e=="object"&&typeof t=="object"?!Object.keys(t).some(n=>!nd(e[n],t[n])):!1}function LR(e,t){if(e===t)return e;const n=nC(e)&&nC(t);if(n||Wy(e)&&Wy(t)){const r=n?e.length:Object.keys(e).length,o=n?t:Object.keys(t),i=o.length,s=n?[]:{};let l=0;for(let u=0;u"u")return!0;const n=t.prototype;return!(!rC(n)||!n.hasOwnProperty("isPrototypeOf"))}function rC(e){return Object.prototype.toString.call(e)==="[object Object]"}function FR(e){return new Promise(t=>{setTimeout(t,e)})}function oC(e){FR(0).then(e)}function Hy(e,t,n){return typeof n.structuralSharing=="function"?n.structuralSharing(e,t):n.structuralSharing!==!1?LR(e,t):t}function tL(e,t,n=0){const r=[...e,t];return n&&r.length>n?r.slice(1):r}function nL(e,t,n=0){const r=[t,...e];return n&&r.length>n?r.slice(0,-1):r}var ya,ss,Fl,p$,rL=(p$=class extends Od{constructor(){super();ke(this,ya,void 0);ke(this,ss,void 0);ke(this,Fl,void 0);we(this,Fl,t=>{if(!sc&&window.addEventListener){const n=()=>t();return window.addEventListener("visibilitychange",n,!1),()=>{window.removeEventListener("visibilitychange",n)}}})}onSubscribe(){j(this,ss)||this.setEventListener(j(this,Fl))}onUnsubscribe(){var t;this.hasListeners()||((t=j(this,ss))==null||t.call(this),we(this,ss,void 0))}setEventListener(t){var n;we(this,Fl,t),(n=j(this,ss))==null||n.call(this),we(this,ss,t(r=>{typeof r=="boolean"?this.setFocused(r):this.onFocus()}))}setFocused(t){j(this,ya)!==t&&(we(this,ya,t),this.onFocus())}onFocus(){this.listeners.forEach(t=>{t()})}isFocused(){var t;return typeof j(this,ya)=="boolean"?j(this,ya):((t=globalThis.document)==null?void 0:t.visibilityState)!=="hidden"}},ya=new WeakMap,ss=new WeakMap,Fl=new WeakMap,p$),Np=new rL,jl,as,Bl,m$,oL=(m$=class extends Od{constructor(){super();ke(this,jl,!0);ke(this,as,void 0);ke(this,Bl,void 0);we(this,Bl,t=>{if(!sc&&window.addEventListener){const n=()=>t(!0),r=()=>t(!1);return window.addEventListener("online",n,!1),window.addEventListener("offline",r,!1),()=>{window.removeEventListener("online",n),window.removeEventListener("offline",r)}}})}onSubscribe(){j(this,as)||this.setEventListener(j(this,Bl))}onUnsubscribe(){var t;this.hasListeners()||((t=j(this,as))==null||t.call(this),we(this,as,void 0))}setEventListener(t){var n;we(this,Bl,t),(n=j(this,as))==null||n.call(this),we(this,as,t(this.setOnline.bind(this)))}setOnline(t){j(this,jl)!==t&&(we(this,jl,t),this.listeners.forEach(r=>{r(t)}))}isOnline(){return j(this,jl)}},jl=new WeakMap,as=new WeakMap,Bl=new WeakMap,m$),Dp=new oL;function iL(e){return Math.min(1e3*2**e,3e4)}function Am(e){return(e??"online")==="online"?Dp.isOnline():!0}var jR=class{constructor(e){this.revert=e==null?void 0:e.revert,this.silent=e==null?void 0:e.silent}};function m0(e){return e instanceof jR}function BR(e){let t=!1,n=0,r=!1,o,i,s;const l=new Promise((E,_)=>{i=E,s=_}),u=E=>{var _;r||(y(new jR(E)),(_=e.abort)==null||_.call(e))},f=()=>{t=!0},m=()=>{t=!1},p=()=>!Np.isFocused()||e.networkMode!=="always"&&!Dp.isOnline(),g=E=>{var _;r||(r=!0,(_=e.onSuccess)==null||_.call(e,E),o==null||o(),i(E))},y=E=>{var 
_;r||(r=!0,(_=e.onError)==null||_.call(e,E),o==null||o(),s(E))},x=()=>new Promise(E=>{var _;o=b=>{const C=r||!p();return C&&E(b),C},(_=e.onPause)==null||_.call(e)}).then(()=>{var E;o=void 0,r||(E=e.onContinue)==null||E.call(e)}),S=()=>{if(r)return;let E;try{E=e.fn()}catch(_){E=Promise.reject(_)}Promise.resolve(E).then(g).catch(_=>{var O;if(r)return;const b=e.retry??(sc?0:3),C=e.retryDelay??iL,R=typeof C=="function"?C(n,_):C,k=b===!0||typeof b=="number"&&n{if(p())return x()}).then(()=>{t?y(_):S()})})};return Am(e.networkMode)?S():x().then(S),{promise:l,cancel:u,continue:()=>(o==null?void 0:o())?l:Promise.resolve(),cancelRetry:f,continueRetry:m}}function sL(){let e=[],t=0,n=m=>{m()},r=m=>{m()};const o=m=>{let p;t++;try{p=m()}finally{t--,t||l()}return p},i=m=>{t?e.push(m):oC(()=>{n(m)})},s=m=>(...p)=>{i(()=>{m(...p)})},l=()=>{const m=e;e=[],m.length&&oC(()=>{r(()=>{m.forEach(p=>{n(p)})})})};return{batch:o,batchCalls:s,schedule:i,setNotifyFunction:m=>{n=m},setBatchNotifyFunction:m=>{r=m}}}var gn=sL(),wa,g$,zR=(g$=class{constructor(){ke(this,wa,void 0)}destroy(){this.clearGcTimeout()}scheduleGc(){this.clearGcTimeout(),Uy(this.gcTime)&&we(this,wa,setTimeout(()=>{this.optionalRemove()},this.gcTime))}updateGcTime(e){this.gcTime=Math.max(this.gcTime||0,e??(sc?1/0:5*60*1e3))}clearGcTimeout(){j(this,wa)&&(clearTimeout(j(this,wa)),we(this,wa,void 0))}},wa=new WeakMap,g$),zl,Ul,Nr,ls,Dr,cn,vd,xa,Vl,Hh,fo,pi,v$,aL=(v$=class extends zR{constructor(t){super();ke(this,Vl);ke(this,fo);ke(this,zl,void 0);ke(this,Ul,void 0);ke(this,Nr,void 0);ke(this,ls,void 0);ke(this,Dr,void 0);ke(this,cn,void 0);ke(this,vd,void 0);ke(this,xa,void 0);we(this,xa,!1),we(this,vd,t.defaultOptions),tt(this,Vl,Hh).call(this,t.options),we(this,cn,[]),we(this,Nr,t.cache),this.queryKey=t.queryKey,this.queryHash=t.queryHash,we(this,zl,t.state||lL(this.options)),this.state=j(this,zl),this.scheduleGc()}get meta(){return this.options.meta}optionalRemove(){!j(this,cn).length&&this.state.fetchStatus==="idle"&&j(this,Nr).remove(this)}setData(t,n){const r=Hy(this.state.data,t,this.options);return tt(this,fo,pi).call(this,{data:r,type:"success",dataUpdatedAt:n==null?void 0:n.updatedAt,manual:n==null?void 0:n.manual}),r}setState(t,n){tt(this,fo,pi).call(this,{type:"setState",state:t,setStateOptions:n})}cancel(t){var r;const n=j(this,ls);return(r=j(this,Dr))==null||r.cancel(t),n?n.then(Lr).catch(Lr):Promise.resolve()}destroy(){super.destroy(),this.cancel({silent:!0})}reset(){this.destroy(),this.setState(j(this,zl))}isActive(){return j(this,cn).some(t=>t.options.enabled!==!1)}isDisabled(){return this.getObserversCount()>0&&!this.isActive()}isStale(){return this.state.isInvalidated||!this.state.dataUpdatedAt||j(this,cn).some(t=>t.getCurrentResult().isStale)}isStaleByTime(t=0){return this.state.isInvalidated||!this.state.dataUpdatedAt||!IR(this.state.dataUpdatedAt,t)}onFocus(){var n;const t=j(this,cn).find(r=>r.shouldFetchOnWindowFocus());t==null||t.refetch({cancelRefetch:!1}),(n=j(this,Dr))==null||n.continue()}onOnline(){var n;const 
t=j(this,cn).find(r=>r.shouldFetchOnReconnect());t==null||t.refetch({cancelRefetch:!1}),(n=j(this,Dr))==null||n.continue()}addObserver(t){j(this,cn).includes(t)||(j(this,cn).push(t),this.clearGcTimeout(),j(this,Nr).notify({type:"observerAdded",query:this,observer:t}))}removeObserver(t){j(this,cn).includes(t)&&(we(this,cn,j(this,cn).filter(n=>n!==t)),j(this,cn).length||(j(this,Dr)&&(j(this,xa)?j(this,Dr).cancel({revert:!0}):j(this,Dr).cancelRetry()),this.scheduleGc()),j(this,Nr).notify({type:"observerRemoved",query:this,observer:t}))}getObserversCount(){return j(this,cn).length}invalidate(){this.state.isInvalidated||tt(this,fo,pi).call(this,{type:"invalidate"})}fetch(t,n){var f,m,p,g;if(this.state.fetchStatus!=="idle"){if(this.state.dataUpdatedAt&&(n!=null&&n.cancelRefetch))this.cancel({silent:!0});else if(j(this,ls))return(f=j(this,Dr))==null||f.continueRetry(),j(this,ls)}if(t&&tt(this,Vl,Hh).call(this,t),!this.options.queryFn){const y=j(this,cn).find(x=>x.options.queryFn);y&&tt(this,Vl,Hh).call(this,y.options)}const r=new AbortController,o={queryKey:this.queryKey,meta:this.meta},i=y=>{Object.defineProperty(y,"signal",{enumerable:!0,get:()=>(we(this,xa,!0),r.signal)})};i(o);const s=()=>this.options.queryFn?(we(this,xa,!1),this.options.persister?this.options.persister(this.options.queryFn,o,this):this.options.queryFn(o)):Promise.reject(new Error(`Missing queryFn: '${this.options.queryHash}'`)),l={fetchOptions:n,options:this.options,queryKey:this.queryKey,state:this.state,fetchFn:s};i(l),(m=this.options.behavior)==null||m.onFetch(l,this),we(this,Ul,this.state),(this.state.fetchStatus==="idle"||this.state.fetchMeta!==((p=l.fetchOptions)==null?void 0:p.meta))&&tt(this,fo,pi).call(this,{type:"fetch",meta:(g=l.fetchOptions)==null?void 0:g.meta});const u=y=>{var x,S,E,_;m0(y)&&y.silent||tt(this,fo,pi).call(this,{type:"error",error:y}),m0(y)||((S=(x=j(this,Nr).config).onError)==null||S.call(x,y,this),(_=(E=j(this,Nr).config).onSettled)==null||_.call(E,this.state.data,y,this)),this.isFetchingOptimistic||this.scheduleGc(),this.isFetchingOptimistic=!1};return we(this,Dr,BR({fn:l.fetchFn,abort:r.abort.bind(r),onSuccess:y=>{var x,S,E,_;if(typeof y>"u"){u(new Error(`${this.queryHash} data is undefined`));return}this.setData(y),(S=(x=j(this,Nr).config).onSuccess)==null||S.call(x,y,this),(_=(E=j(this,Nr).config).onSettled)==null||_.call(E,y,this.state.error,this),this.isFetchingOptimistic||this.scheduleGc(),this.isFetchingOptimistic=!1},onError:u,onFail:(y,x)=>{tt(this,fo,pi).call(this,{type:"failed",failureCount:y,error:x})},onPause:()=>{tt(this,fo,pi).call(this,{type:"pause"})},onContinue:()=>{tt(this,fo,pi).call(this,{type:"continue"})},retry:l.options.retry,retryDelay:l.options.retryDelay,networkMode:l.options.networkMode})),we(this,ls,j(this,Dr).promise),j(this,ls)}},zl=new WeakMap,Ul=new WeakMap,Nr=new WeakMap,ls=new WeakMap,Dr=new WeakMap,cn=new WeakMap,vd=new WeakMap,xa=new WeakMap,Vl=new WeakSet,Hh=function(t){this.options={...j(this,vd),...t},this.updateGcTime(this.options.gcTime)},fo=new WeakSet,pi=function(t){const 
n=r=>{switch(t.type){case"failed":return{...r,fetchFailureCount:t.failureCount,fetchFailureReason:t.error};case"pause":return{...r,fetchStatus:"paused"};case"continue":return{...r,fetchStatus:"fetching"};case"fetch":return{...r,fetchFailureCount:0,fetchFailureReason:null,fetchMeta:t.meta??null,fetchStatus:Am(this.options.networkMode)?"fetching":"paused",...!r.dataUpdatedAt&&{error:null,status:"pending"}};case"success":return{...r,data:t.data,dataUpdateCount:r.dataUpdateCount+1,dataUpdatedAt:t.dataUpdatedAt??Date.now(),error:null,isInvalidated:!1,status:"success",...!t.manual&&{fetchStatus:"idle",fetchFailureCount:0,fetchFailureReason:null}};case"error":const o=t.error;return m0(o)&&o.revert&&j(this,Ul)?{...j(this,Ul),fetchStatus:"idle"}:{...r,error:o,errorUpdateCount:r.errorUpdateCount+1,errorUpdatedAt:Date.now(),fetchFailureCount:r.fetchFailureCount+1,fetchFailureReason:o,fetchStatus:"idle",status:"error"};case"invalidate":return{...r,isInvalidated:!0};case"setState":return{...r,...t.state}}};this.state=n(this.state),gn.batch(()=>{j(this,cn).forEach(r=>{r.onQueryUpdate()}),j(this,Nr).notify({query:this,type:"updated",action:t})})},v$);function lL(e){const t=typeof e.initialData=="function"?e.initialData():e.initialData,n=typeof t<"u",r=n?typeof e.initialDataUpdatedAt=="function"?e.initialDataUpdatedAt():e.initialDataUpdatedAt:0;return{data:t,dataUpdateCount:0,dataUpdatedAt:n?r??Date.now():0,error:null,errorUpdateCount:0,errorUpdatedAt:0,fetchFailureCount:0,fetchFailureReason:null,fetchMeta:null,isInvalidated:!1,status:n?"success":"pending",fetchStatus:"idle"}}var Do,y$,cL=(y$=class extends Od{constructor(t={}){super();ke(this,Do,void 0);this.config=t,we(this,Do,new Map)}build(t,n,r){const o=n.queryKey,i=n.queryHash??ex(o,n);let s=this.get(i);return s||(s=new aL({cache:this,queryKey:o,queryHash:i,options:t.defaultQueryOptions(n),state:r,defaultOptions:t.getQueryDefaults(o)}),this.add(s)),s}add(t){j(this,Do).has(t.queryHash)||(j(this,Do).set(t.queryHash,t),this.notify({type:"added",query:t}))}remove(t){const n=j(this,Do).get(t.queryHash);n&&(t.destroy(),n===t&&j(this,Do).delete(t.queryHash),this.notify({type:"removed",query:t}))}clear(){gn.batch(()=>{this.getAll().forEach(t=>{this.remove(t)})})}get(t){return j(this,Do).get(t)}getAll(){return[...j(this,Do).values()]}find(t){const n={exact:!0,...t};return this.getAll().find(r=>eC(n,r))}findAll(t={}){const n=this.getAll();return Object.keys(t).length>0?n.filter(r=>eC(t,r)):n}notify(t){gn.batch(()=>{this.listeners.forEach(n=>{n(t)})})}onFocus(){gn.batch(()=>{this.getAll().forEach(t=>{t.onFocus()})})}onOnline(){gn.batch(()=>{this.getAll().forEach(t=>{t.onOnline()})})}},Do=new WeakMap,y$),Io,yd,fr,Wl,Lo,es,w$,uL=(w$=class extends zR{constructor(t){super();ke(this,Lo);ke(this,Io,void 0);ke(this,yd,void 0);ke(this,fr,void 0);ke(this,Wl,void 0);this.mutationId=t.mutationId,we(this,yd,t.defaultOptions),we(this,fr,t.mutationCache),we(this,Io,[]),this.state=t.state||dL(),this.setOptions(t.options),this.scheduleGc()}setOptions(t){this.options={...j(this,yd),...t},this.updateGcTime(this.options.gcTime)}get meta(){return this.options.meta}addObserver(t){j(this,Io).includes(t)||(j(this,Io).push(t),this.clearGcTimeout(),j(this,fr).notify({type:"observerAdded",mutation:this,observer:t}))}removeObserver(t){we(this,Io,j(this,Io).filter(n=>n!==t)),this.scheduleGc(),j(this,fr).notify({type:"observerRemoved",mutation:this,observer:t})}optionalRemove(){j(this,Io).length||(this.state.status==="pending"?this.scheduleGc():j(this,fr).remove(this))}continue(){var 
t;return((t=j(this,Wl))==null?void 0:t.continue())??this.execute(this.state.variables)}async execute(t){var o,i,s,l,u,f,m,p,g,y,x,S,E,_,b,C,R,k,O,A;const n=()=>(we(this,Wl,BR({fn:()=>this.options.mutationFn?this.options.mutationFn(t):Promise.reject(new Error("No mutationFn found")),onFail:(I,z)=>{tt(this,Lo,es).call(this,{type:"failed",failureCount:I,error:z})},onPause:()=>{tt(this,Lo,es).call(this,{type:"pause"})},onContinue:()=>{tt(this,Lo,es).call(this,{type:"continue"})},retry:this.options.retry??0,retryDelay:this.options.retryDelay,networkMode:this.options.networkMode})),j(this,Wl).promise),r=this.state.status==="pending";try{if(!r){tt(this,Lo,es).call(this,{type:"pending",variables:t}),await((i=(o=j(this,fr).config).onMutate)==null?void 0:i.call(o,t,this));const z=await((l=(s=this.options).onMutate)==null?void 0:l.call(s,t));z!==this.state.context&&tt(this,Lo,es).call(this,{type:"pending",context:z,variables:t})}const I=await n();return await((f=(u=j(this,fr).config).onSuccess)==null?void 0:f.call(u,I,t,this.state.context,this)),await((p=(m=this.options).onSuccess)==null?void 0:p.call(m,I,t,this.state.context)),await((y=(g=j(this,fr).config).onSettled)==null?void 0:y.call(g,I,null,this.state.variables,this.state.context,this)),await((S=(x=this.options).onSettled)==null?void 0:S.call(x,I,null,t,this.state.context)),tt(this,Lo,es).call(this,{type:"success",data:I}),I}catch(I){try{throw await((_=(E=j(this,fr).config).onError)==null?void 0:_.call(E,I,t,this.state.context,this)),await((C=(b=this.options).onError)==null?void 0:C.call(b,I,t,this.state.context)),await((k=(R=j(this,fr).config).onSettled)==null?void 0:k.call(R,void 0,I,this.state.variables,this.state.context,this)),await((A=(O=this.options).onSettled)==null?void 0:A.call(O,void 0,I,t,this.state.context)),I}finally{tt(this,Lo,es).call(this,{type:"error",error:I})}}}},Io=new WeakMap,yd=new WeakMap,fr=new WeakMap,Wl=new WeakMap,Lo=new WeakSet,es=function(t){const n=r=>{switch(t.type){case"failed":return{...r,failureCount:t.failureCount,failureReason:t.error};case"pause":return{...r,isPaused:!0};case"continue":return{...r,isPaused:!1};case"pending":return{...r,context:t.context,data:void 0,failureCount:0,failureReason:null,error:null,isPaused:!Am(this.options.networkMode),status:"pending",variables:t.variables,submittedAt:Date.now()};case"success":return{...r,data:t.data,failureCount:0,failureReason:null,error:null,status:"success",isPaused:!1};case"error":return{...r,data:void 0,error:t.error,failureCount:r.failureCount+1,failureReason:t.error,isPaused:!1,status:"error"}}};this.state=n(this.state),gn.batch(()=>{j(this,Io).forEach(r=>{r.onMutationUpdate(t)}),j(this,fr).notify({mutation:this,type:"updated",action:t})})},w$);function dL(){return{context:void 0,data:void 0,error:null,failureCount:0,failureReason:null,isPaused:!1,status:"idle",variables:void 0,submittedAt:0}}var Ir,wd,ba,x$,fL=(x$=class extends Od{constructor(t={}){super();ke(this,Ir,void 0);ke(this,wd,void 0);ke(this,ba,void 0);this.config=t,we(this,Ir,[]),we(this,wd,0)}build(t,n,r){const o=new uL({mutationCache:this,mutationId:++Yf(this,wd)._,options:t.defaultMutationOptions(n),state:r});return this.add(o),o}add(t){j(this,Ir).push(t),this.notify({type:"added",mutation:t})}remove(t){we(this,Ir,j(this,Ir).filter(n=>n!==t)),this.notify({type:"removed",mutation:t})}clear(){gn.batch(()=>{j(this,Ir).forEach(t=>{this.remove(t)})})}getAll(){return j(this,Ir)}find(t){const n={exact:!0,...t};return j(this,Ir).find(r=>tC(n,r))}findAll(t={}){return 
j(this,Ir).filter(n=>tC(t,n))}notify(t){gn.batch(()=>{this.listeners.forEach(n=>{n(t)})})}resumePausedMutations(){return we(this,ba,(j(this,ba)??Promise.resolve()).then(()=>{const t=j(this,Ir).filter(n=>n.state.isPaused);return gn.batch(()=>t.reduce((n,r)=>n.then(()=>r.continue().catch(Lr)),Promise.resolve()))}).then(()=>{we(this,ba,void 0)})),j(this,ba)}},Ir=new WeakMap,wd=new WeakMap,ba=new WeakMap,x$);function hL(e){return{onFetch:(t,n)=>{const r=async()=>{var x,S,E,_,b;const o=t.options,i=(E=(S=(x=t.fetchOptions)==null?void 0:x.meta)==null?void 0:S.fetchMore)==null?void 0:E.direction,s=((_=t.state.data)==null?void 0:_.pages)||[],l=((b=t.state.data)==null?void 0:b.pageParams)||[],u={pages:[],pageParams:[]};let f=!1;const m=C=>{Object.defineProperty(C,"signal",{enumerable:!0,get:()=>(t.signal.aborted?f=!0:t.signal.addEventListener("abort",()=>{f=!0}),t.signal)})},p=t.options.queryFn||(()=>Promise.reject(new Error(`Missing queryFn: '${t.options.queryHash}'`))),g=async(C,R,k)=>{if(f)return Promise.reject();if(R==null&&C.pages.length)return Promise.resolve(C);const O={queryKey:t.queryKey,pageParam:R,direction:k?"backward":"forward",meta:t.options.meta};m(O);const A=await p(O),{maxPages:I}=t.options,z=k?nL:tL;return{pages:z(C.pages,A,I),pageParams:z(C.pageParams,R,I)}};let y;if(i&&s.length){const C=i==="backward",R=C?pL:iC,k={pages:s,pageParams:l},O=R(o,k);y=await g(k,O,C)}else{y=await g(u,l[0]??o.initialPageParam);const C=e??s.length;for(let R=1;R{var o,i;return(i=(o=t.options).persister)==null?void 0:i.call(o,r,{queryKey:t.queryKey,meta:t.options.meta,signal:t.signal},n)}:t.fetchFn=r}}}function iC(e,{pages:t,pageParams:n}){const r=t.length-1;return e.getNextPageParam(t[r],t,n[r],n)}function pL(e,{pages:t,pageParams:n}){var r;return(r=e.getPreviousPageParam)==null?void 0:r.call(e,t[0],t,n[0],n)}var nn,cs,us,Hl,Kl,ds,Gl,Yl,b$,mL=(b$=class{constructor(e={}){ke(this,nn,void 0);ke(this,cs,void 0);ke(this,us,void 0);ke(this,Hl,void 0);ke(this,Kl,void 0);ke(this,ds,void 0);ke(this,Gl,void 0);ke(this,Yl,void 0);we(this,nn,e.queryCache||new cL),we(this,cs,e.mutationCache||new fL),we(this,us,e.defaultOptions||{}),we(this,Hl,new Map),we(this,Kl,new Map),we(this,ds,0)}mount(){Yf(this,ds)._++,j(this,ds)===1&&(we(this,Gl,Np.subscribe(()=>{Np.isFocused()&&(this.resumePausedMutations(),j(this,nn).onFocus())})),we(this,Yl,Dp.subscribe(()=>{Dp.isOnline()&&(this.resumePausedMutations(),j(this,nn).onOnline())})))}unmount(){var e,t;Yf(this,ds)._--,j(this,ds)===0&&((e=j(this,Gl))==null||e.call(this),we(this,Gl,void 0),(t=j(this,Yl))==null||t.call(this),we(this,Yl,void 0))}isFetching(e){return j(this,nn).findAll({...e,fetchStatus:"fetching"}).length}isMutating(e){return j(this,cs).findAll({...e,status:"pending"}).length}getQueryData(e){var t;return(t=j(this,nn).find({queryKey:e}))==null?void 0:t.state.data}ensureQueryData(e){const t=this.getQueryData(e.queryKey);return t!==void 0?Promise.resolve(t):this.fetchQuery(e)}getQueriesData(e){return this.getQueryCache().findAll(e).map(({queryKey:t,state:n})=>{const r=n.data;return[t,r]})}setQueryData(e,t,n){const r=j(this,nn).find({queryKey:e}),o=r==null?void 0:r.state.data,i=eL(t,o);if(typeof i>"u")return;const s=this.defaultQueryOptions({queryKey:e});return j(this,nn).build(this,s).setData(i,{...n,manual:!0})}setQueriesData(e,t,n){return gn.batch(()=>this.getQueryCache().findAll(e).map(({queryKey:r})=>[r,this.setQueryData(r,t,n)]))}getQueryState(e){var t;return(t=j(this,nn).find({queryKey:e}))==null?void 0:t.state}removeQueries(e){const 
t=j(this,nn);gn.batch(()=>{t.findAll(e).forEach(n=>{t.remove(n)})})}resetQueries(e,t){const n=j(this,nn),r={type:"active",...e};return gn.batch(()=>(n.findAll(e).forEach(o=>{o.reset()}),this.refetchQueries(r,t)))}cancelQueries(e={},t={}){const n={revert:!0,...t},r=gn.batch(()=>j(this,nn).findAll(e).map(o=>o.cancel(n)));return Promise.all(r).then(Lr).catch(Lr)}invalidateQueries(e={},t={}){return gn.batch(()=>{if(j(this,nn).findAll(e).forEach(r=>{r.invalidate()}),e.refetchType==="none")return Promise.resolve();const n={...e,type:e.refetchType??e.type??"active"};return this.refetchQueries(n,t)})}refetchQueries(e={},t){const n={...t,cancelRefetch:(t==null?void 0:t.cancelRefetch)??!0},r=gn.batch(()=>j(this,nn).findAll(e).filter(o=>!o.isDisabled()).map(o=>{let i=o.fetch(void 0,n);return n.throwOnError||(i=i.catch(Lr)),o.state.fetchStatus==="paused"?Promise.resolve():i}));return Promise.all(r).then(Lr)}fetchQuery(e){const t=this.defaultQueryOptions(e);typeof t.retry>"u"&&(t.retry=!1);const n=j(this,nn).build(this,t);return n.isStaleByTime(t.staleTime)?n.fetch(t):Promise.resolve(n.state.data)}prefetchQuery(e){return this.fetchQuery(e).then(Lr).catch(Lr)}fetchInfiniteQuery(e){return e.behavior=hL(e.pages),this.fetchQuery(e)}prefetchInfiniteQuery(e){return this.fetchInfiniteQuery(e).then(Lr).catch(Lr)}resumePausedMutations(){return j(this,cs).resumePausedMutations()}getQueryCache(){return j(this,nn)}getMutationCache(){return j(this,cs)}getDefaultOptions(){return j(this,us)}setDefaultOptions(e){we(this,us,e)}setQueryDefaults(e,t){j(this,Hl).set(td(e),{queryKey:e,defaultOptions:t})}getQueryDefaults(e){const t=[...j(this,Hl).values()];let n={};return t.forEach(r=>{nd(e,r.queryKey)&&(n={...n,...r.defaultOptions})}),n}setMutationDefaults(e,t){j(this,Kl).set(td(e),{mutationKey:e,defaultOptions:t})}getMutationDefaults(e){const t=[...j(this,Kl).values()];let n={};return t.forEach(r=>{nd(e,r.mutationKey)&&(n={...n,...r.defaultOptions})}),n}defaultQueryOptions(e){if(e!=null&&e._defaulted)return e;const t={...j(this,us).queries,...(e==null?void 0:e.queryKey)&&this.getQueryDefaults(e.queryKey),...e,_defaulted:!0};return t.queryHash||(t.queryHash=ex(t.queryKey,t)),typeof t.refetchOnReconnect>"u"&&(t.refetchOnReconnect=t.networkMode!=="always"),typeof t.throwOnError>"u"&&(t.throwOnError=!!t.suspense),typeof t.networkMode>"u"&&t.persister&&(t.networkMode="offlineFirst"),t}defaultMutationOptions(e){return e!=null&&e._defaulted?e:{...j(this,us).mutations,...(e==null?void 0:e.mutationKey)&&this.getMutationDefaults(e.mutationKey),...e,_defaulted:!0}}clear(){j(this,nn).clear(),j(this,cs).clear()}},nn=new WeakMap,cs=new WeakMap,us=new WeakMap,Hl=new WeakMap,Kl=new WeakMap,ds=new WeakMap,Gl=new WeakMap,Yl=new WeakMap,b$),Gn,$t,Xl,Cn,Sa,Zl,Fo,xd,ql,Ql,_a,Ea,fs,Ca,$a,xu,bd,Ky,Sd,Gy,_d,Yy,Ed,Xy,Cd,Zy,$d,qy,Rd,Qy,hm,UR,S$,gL=(S$=class extends Od{constructor(t,n){super();ke(this,$a);ke(this,bd);ke(this,Sd);ke(this,_d);ke(this,Ed);ke(this,Cd);ke(this,$d);ke(this,Rd);ke(this,hm);ke(this,Gn,void 0);ke(this,$t,void 0);ke(this,Xl,void 0);ke(this,Cn,void 0);ke(this,Sa,void 0);ke(this,Zl,void 0);ke(this,Fo,void 0);ke(this,xd,void 0);ke(this,ql,void 0);ke(this,Ql,void 0);ke(this,_a,void 0);ke(this,Ea,void 0);ke(this,fs,void 0);ke(this,Ca,void 0);we(this,$t,void 0),we(this,Xl,void 0),we(this,Cn,void 0),we(this,Ca,new 
Set),we(this,Gn,t),this.options=n,we(this,Fo,null),this.bindMethods(),this.setOptions(n)}bindMethods(){this.refetch=this.refetch.bind(this)}onSubscribe(){this.listeners.size===1&&(j(this,$t).addObserver(this),sC(j(this,$t),this.options)?tt(this,$a,xu).call(this):this.updateResult(),tt(this,Ed,Xy).call(this))}onUnsubscribe(){this.hasListeners()||this.destroy()}shouldFetchOnReconnect(){return Jy(j(this,$t),this.options,this.options.refetchOnReconnect)}shouldFetchOnWindowFocus(){return Jy(j(this,$t),this.options,this.options.refetchOnWindowFocus)}destroy(){this.listeners=new Set,tt(this,Cd,Zy).call(this),tt(this,$d,qy).call(this),j(this,$t).removeObserver(this)}setOptions(t,n){const r=this.options,o=j(this,$t);if(this.options=j(this,Gn).defaultQueryOptions(t),Vy(r,this.options)||j(this,Gn).getQueryCache().notify({type:"observerOptionsUpdated",query:j(this,$t),observer:this}),typeof this.options.enabled<"u"&&typeof this.options.enabled!="boolean")throw new Error("Expected enabled to be a boolean");this.options.queryKey||(this.options.queryKey=r.queryKey),tt(this,Rd,Qy).call(this);const i=this.hasListeners();i&&aC(j(this,$t),o,this.options,r)&&tt(this,$a,xu).call(this),this.updateResult(n),i&&(j(this,$t)!==o||this.options.enabled!==r.enabled||this.options.staleTime!==r.staleTime)&&tt(this,bd,Ky).call(this);const s=tt(this,Sd,Gy).call(this);i&&(j(this,$t)!==o||this.options.enabled!==r.enabled||s!==j(this,fs))&&tt(this,_d,Yy).call(this,s)}getOptimisticResult(t){const n=j(this,Gn).getQueryCache().build(j(this,Gn),t),r=this.createResult(n,t);return yL(this,r)&&(we(this,Cn,r),we(this,Zl,this.options),we(this,Sa,j(this,$t).state)),r}getCurrentResult(){return j(this,Cn)}trackResult(t){const n={};return Object.keys(t).forEach(r=>{Object.defineProperty(n,r,{configurable:!1,enumerable:!0,get:()=>(j(this,Ca).add(r),t[r])})}),n}getCurrentQuery(){return j(this,$t)}refetch({...t}={}){return this.fetch({...t})}fetchOptimistic(t){const n=j(this,Gn).defaultQueryOptions(t),r=j(this,Gn).getQueryCache().build(j(this,Gn),n);return r.isFetchingOptimistic=!0,r.fetch().then(()=>this.createResult(r,n))}fetch(t){return tt(this,$a,xu).call(this,{...t,cancelRefetch:t.cancelRefetch??!0}).then(()=>(this.updateResult(),j(this,Cn)))}createResult(t,n){var O;const r=j(this,$t),o=this.options,i=j(this,Cn),s=j(this,Sa),l=j(this,Zl),f=t!==r?t.state:j(this,Xl),{state:m}=t;let{error:p,errorUpdatedAt:g,fetchStatus:y,status:x}=m,S=!1,E;if(n._optimisticResults){const A=this.hasListeners(),I=!A&&sC(t,n),z=A&&aC(t,r,n,o);(I||z)&&(y=Am(t.options.networkMode)?"fetching":"paused",m.dataUpdatedAt||(x="pending")),n._optimisticResults==="isRestoring"&&(y="idle")}if(n.select&&typeof m.data<"u")if(i&&m.data===(s==null?void 0:s.data)&&n.select===j(this,xd))E=j(this,ql);else try{we(this,xd,n.select),E=n.select(m.data),E=Hy(i==null?void 0:i.data,E,n),we(this,ql,E),we(this,Fo,null)}catch(A){we(this,Fo,A)}else E=m.data;if(typeof n.placeholderData<"u"&&typeof E>"u"&&x==="pending"){let A;if(i!=null&&i.isPlaceholderData&&n.placeholderData===(l==null?void 0:l.placeholderData))A=i.data;else if(A=typeof n.placeholderData=="function"?n.placeholderData((O=j(this,Ql))==null?void 0:O.state.data,j(this,Ql)):n.placeholderData,n.select&&typeof A<"u")try{A=n.select(A),we(this,Fo,null)}catch(I){we(this,Fo,I)}typeof A<"u"&&(x="success",E=Hy(i==null?void 0:i.data,A,n),S=!0)}j(this,Fo)&&(p=j(this,Fo),E=j(this,ql),g=Date.now(),x="error");const 
_=y==="fetching",b=x==="pending",C=x==="error",R=b&&_;return{status:x,fetchStatus:y,isPending:b,isSuccess:x==="success",isError:C,isInitialLoading:R,isLoading:R,data:E,dataUpdatedAt:m.dataUpdatedAt,error:p,errorUpdatedAt:g,failureCount:m.fetchFailureCount,failureReason:m.fetchFailureReason,errorUpdateCount:m.errorUpdateCount,isFetched:m.dataUpdateCount>0||m.errorUpdateCount>0,isFetchedAfterMount:m.dataUpdateCount>f.dataUpdateCount||m.errorUpdateCount>f.errorUpdateCount,isFetching:_,isRefetching:_&&!b,isLoadingError:C&&m.dataUpdatedAt===0,isPaused:y==="paused",isPlaceholderData:S,isRefetchError:C&&m.dataUpdatedAt!==0,isStale:tx(t,n),refetch:this.refetch}}updateResult(t){const n=j(this,Cn),r=this.createResult(j(this,$t),this.options);if(we(this,Sa,j(this,$t).state),we(this,Zl,this.options),j(this,Sa).data!==void 0&&we(this,Ql,j(this,$t)),Vy(r,n))return;we(this,Cn,r);const o={},i=()=>{if(!n)return!0;const{notifyOnChangeProps:s}=this.options,l=typeof s=="function"?s():s;if(l==="all"||!l&&!j(this,Ca).size)return!0;const u=new Set(l??j(this,Ca));return this.options.throwOnError&&u.add("error"),Object.keys(j(this,Cn)).some(f=>{const m=f;return j(this,Cn)[m]!==n[m]&&u.has(m)})};(t==null?void 0:t.listeners)!==!1&&i()&&(o.listeners=!0),tt(this,hm,UR).call(this,{...o,...t})}onQueryUpdate(){this.updateResult(),this.hasListeners()&&tt(this,Ed,Xy).call(this)}},Gn=new WeakMap,$t=new WeakMap,Xl=new WeakMap,Cn=new WeakMap,Sa=new WeakMap,Zl=new WeakMap,Fo=new WeakMap,xd=new WeakMap,ql=new WeakMap,Ql=new WeakMap,_a=new WeakMap,Ea=new WeakMap,fs=new WeakMap,Ca=new WeakMap,$a=new WeakSet,xu=function(t){tt(this,Rd,Qy).call(this);let n=j(this,$t).fetch(this.options,t);return t!=null&&t.throwOnError||(n=n.catch(Lr)),n},bd=new WeakSet,Ky=function(){if(tt(this,Cd,Zy).call(this),sc||j(this,Cn).isStale||!Uy(this.options.staleTime))return;const n=IR(j(this,Cn).dataUpdatedAt,this.options.staleTime)+1;we(this,_a,setTimeout(()=>{j(this,Cn).isStale||this.updateResult()},n))},Sd=new WeakSet,Gy=function(){return(typeof this.options.refetchInterval=="function"?this.options.refetchInterval(j(this,$t)):this.options.refetchInterval)??!1},_d=new WeakSet,Yy=function(t){tt(this,$d,qy).call(this),we(this,fs,t),!(sc||this.options.enabled===!1||!Uy(j(this,fs))||j(this,fs)===0)&&we(this,Ea,setInterval(()=>{(this.options.refetchIntervalInBackground||Np.isFocused())&&tt(this,$a,xu).call(this)},j(this,fs)))},Ed=new WeakSet,Xy=function(){tt(this,bd,Ky).call(this),tt(this,_d,Yy).call(this,tt(this,Sd,Gy).call(this))},Cd=new WeakSet,Zy=function(){j(this,_a)&&(clearTimeout(j(this,_a)),we(this,_a,void 0))},$d=new WeakSet,qy=function(){j(this,Ea)&&(clearInterval(j(this,Ea)),we(this,Ea,void 0))},Rd=new WeakSet,Qy=function(){const t=j(this,Gn).getQueryCache().build(j(this,Gn),this.options);if(t===j(this,$t))return;const n=j(this,$t);we(this,$t,t),we(this,Xl,t.state),this.hasListeners()&&(n==null||n.removeObserver(this),t.addObserver(this))},hm=new WeakSet,UR=function(t){gn.batch(()=>{t.listeners&&this.listeners.forEach(n=>{n(j(this,Cn))}),j(this,Gn).getQueryCache().notify({query:j(this,$t),type:"observerResultsUpdated"})})},S$);function vL(e,t){return t.enabled!==!1&&!e.state.dataUpdatedAt&&!(e.state.status==="error"&&t.retryOnMount===!1)}function sC(e,t){return vL(e,t)||e.state.dataUpdatedAt>0&&Jy(e,t,t.refetchOnMount)}function Jy(e,t,n){if(t.enabled!==!1){const r=typeof n=="function"?n(e):n;return r==="always"||r!==!1&&tx(e,t)}return!1}function aC(e,t,n,r){return 
n.enabled!==!1&&(e!==t||r.enabled===!1)&&(!n.suspense||e.state.status!=="error")&&tx(e,n)}function tx(e,t){return e.isStaleByTime(t.staleTime)}function yL(e,t){return!Vy(e.getCurrentResult(),t)}var VR=d.createContext(void 0),wL=e=>{const t=d.useContext(VR);if(e)return e;if(!t)throw new Error("No QueryClient set, use QueryClientProvider to set one");return t},xL=({client:e,children:t})=>(d.useEffect(()=>(e.mount(),()=>{e.unmount()}),[e]),d.createElement(VR.Provider,{value:e},t)),WR=d.createContext(!1),bL=()=>d.useContext(WR);WR.Provider;function SL(){let e=!1;return{clearReset:()=>{e=!1},reset:()=>{e=!0},isReset:()=>e}}var _L=d.createContext(SL()),EL=()=>d.useContext(_L);function CL(e,t){return typeof e=="function"?e(...t):!!e}var $L=(e,t)=>{(e.suspense||e.throwOnError)&&(t.isReset()||(e.retryOnMount=!1))},RL=e=>{d.useEffect(()=>{e.clearReset()},[e])},PL=({result:e,errorResetBoundary:t,throwOnError:n,query:r})=>e.isError&&!t.isReset()&&!e.isFetching&&CL(n,[e.error,r]),TL=e=>{e.suspense&&typeof e.staleTime!="number"&&(e.staleTime=1e3)},kL=(e,t)=>(e==null?void 0:e.suspense)&&t.isPending,AL=(e,t,n)=>t.fetchOptimistic(e).catch(()=>{n.clearReset()});function ML(e,t,n){const r=wL(n),o=bL(),i=EL(),s=r.defaultQueryOptions(e);s._optimisticResults=o?"isRestoring":"optimistic",TL(s),$L(s,i),RL(i);const[l]=d.useState(()=>new t(r,s)),u=l.getOptimisticResult(s);if(d.useSyncExternalStore(d.useCallback(f=>{const m=o?()=>{}:l.subscribe(gn.batchCalls(f));return l.updateResult(),m},[l,o]),()=>l.getCurrentResult(),()=>l.getCurrentResult()),d.useEffect(()=>{l.setOptions(s,{listeners:!1})},[s,l]),kL(s,u))throw l.setOptions(s,{listeners:!1}),AL(s,l,i);if(PL({result:u,errorResetBoundary:i,throwOnError:s.throwOnError,query:l.getCurrentQuery()}))throw u.error;return s.notifyOnChangeProps?u:l.trackResult(u)}function OL(e,t){return ML(e,gL,t)}var Oo=(e=>(e.RemoveBG="RemoveBG",e.AnimeSeg="AnimeSeg",e.RealESRGAN="RealESRGAN",e.GFPGAN="GFPGAN",e.RestoreFormer="RestoreFormer",e.InteractiveSeg="InteractiveSeg",e))(Oo||{}),Ps=(e=>(e.NAME="name",e.CTIME="ctime",e.MTIME="mtime",e))(Ps||{}),Ou=(e=>(e.DESCENDING="desc",e.ASCENDING="asc",e))(Ou||{}),nx=(e=>(e.ddim="ddim",e.plms="plms",e))(nx||{}),rx=(e=>(e.INPAINT_NS="INPAINT_NS",e.INPAINT_TELEA="INPAINT_TELEA",e))(rx||{}),Ln=(e=>(e.x="x",e.y="y",e.xy="xy",e))(Ln||{}),ma=(e=>(e.text_guided="text-guided",e.shape_guided="shape-guided",e.context_aware="context-aware",e.object_remove="object-remove",e.outpainting="outpainting",e))(ma||{});function HR(e){var t,n,r="";if(typeof e=="string"||typeof e=="number")r+=e;else if(typeof e=="object")if(Array.isArray(e))for(t=0;tl(i)))==null?void 0:s.classGroupId}const lC=/^\[(.+)\]$/;function DL(e){if(lC.test(e)){const t=lC.exec(e)[1],n=t==null?void 0:t.substring(0,t.indexOf(":"));if(n)return"arbitrary.."+n}}function IL(e){const{theme:t,prefix:n}=e,r={nextPart:new Map,validators:[]};return FL(Object.entries(e.classGroups),n).forEach(([i,s])=>{e1(s,r,i,t)}),r}function e1(e,t,n,r){e.forEach(o=>{if(typeof o=="string"){const i=o===""?t:cC(t,o);i.classGroupId=n;return}if(typeof o=="function"){if(LL(o)){e1(o(r),t,n,r);return}t.validators.push({validator:o,classGroupId:n});return}Object.entries(o).forEach(([i,s])=>{e1(s,cC(t,i),n,r)})})}function cC(e,t){let n=e;return t.split(ox).forEach(r=>{n.nextPart.has(r)||n.nextPart.set(r,{nextPart:new Map,validators:[]}),n=n.nextPart.get(r)}),n}function LL(e){return e.isThemeGetter}function FL(e,t){return t?e.map(([n,r])=>{const o=r.map(i=>typeof i=="string"?t+i:typeof 
i=="object"?Object.fromEntries(Object.entries(i).map(([s,l])=>[t+s,l])):i);return[n,o]}):e}function jL(e){if(e<1)return{get:()=>{},set:()=>{}};let t=0,n=new Map,r=new Map;function o(i,s){n.set(i,s),t++,t>e&&(t=0,r=n,n=new Map)}return{get(i){let s=n.get(i);if(s!==void 0)return s;if((s=r.get(i))!==void 0)return o(i,s),s},set(i,s){n.has(i)?n.set(i,s):o(i,s)}}}const YR="!";function BL(e){const t=e.separator,n=t.length===1,r=t[0],o=t.length;return function(s){const l=[];let u=0,f=0,m;for(let S=0;Sf?m-f:void 0;return{modifiers:l,hasImportantModifier:g,baseClassName:y,maybePostfixModifierPosition:x}}}function zL(e){if(e.length<=1)return e;const t=[];let n=[];return e.forEach(r=>{r[0]==="["?(t.push(...n.sort(),r),n=[]):n.push(r)}),t.push(...n.sort()),t}function UL(e){return{cache:jL(e.cacheSize),splitModifiers:BL(e),...NL(e)}}const VL=/\s+/;function WL(e,t){const{splitModifiers:n,getClassGroupId:r,getConflictingClassGroupIds:o}=t,i=new Set;return e.trim().split(VL).map(s=>{const{modifiers:l,hasImportantModifier:u,baseClassName:f,maybePostfixModifierPosition:m}=n(s);let p=r(m?f.substring(0,m):f),g=!!m;if(!p){if(!m)return{isTailwindClass:!1,originalClassName:s};if(p=r(f),!p)return{isTailwindClass:!1,originalClassName:s};g=!1}const y=zL(l).join(":");return{isTailwindClass:!0,modifierId:u?y+YR:y,classGroupId:p,originalClassName:s,hasPostfixModifier:g}}).reverse().filter(s=>{if(!s.isTailwindClass)return!0;const{modifierId:l,classGroupId:u,hasPostfixModifier:f}=s,m=l+u;return i.has(m)?!1:(i.add(m),o(u,f).forEach(p=>i.add(l+p)),!0)}).reverse().map(s=>s.originalClassName).join(" ")}function HL(){let e=0,t,n,r="";for(;ep(m),e());return n=UL(f),r=n.cache.get,o=n.cache.set,i=l,l(u)}function l(u){const f=r(u);if(f)return f;const m=WL(u,n);return o(u,m),m}return function(){return i(HL.apply(null,arguments))}}function Tt(e){const t=n=>n[e]||[];return t.isThemeGetter=!0,t}const ZR=/^\[(?:([a-z-]+):)?(.+)\]$/i,GL=/^\d+\/\d+$/,YL=new Set(["px","full","screen"]),XL=/^(\d+(\.\d+)?)?(xs|sm|md|lg|xl)$/,ZL=/\d+(%|px|r?em|[sdl]?v([hwib]|min|max)|pt|pc|in|cm|mm|cap|ch|ex|r?lh|cq(w|h|i|b|min|max))|\b(calc|min|max|clamp)\(.+\)|^0$/,qL=/^-?((\d+)?\.?(\d+)[a-z]+|0)_-?((\d+)?\.?(\d+)[a-z]+|0)/,QL=/^(url|image|image-set|cross-fade|element|(repeating-)?(linear|radial|conic)-gradient)\(.+\)$/;function lo(e){return ga(e)||YL.has(e)||GL.test(e)}function Zi(e){return _c(e,"length",sF)}function ga(e){return!!e&&!Number.isNaN(Number(e))}function fh(e){return _c(e,"number",ga)}function au(e){return!!e&&Number.isInteger(Number(e))}function JL(e){return e.endsWith("%")&&ga(e.slice(0,-1))}function Je(e){return ZR.test(e)}function qi(e){return XL.test(e)}const eF=new Set(["length","size","percentage"]);function tF(e){return _c(e,eF,qR)}function nF(e){return _c(e,"position",qR)}const rF=new Set(["image","url"]);function oF(e){return _c(e,rF,lF)}function iF(e){return _c(e,"",aF)}function lu(){return!0}function _c(e,t,n){const r=ZR.exec(e);return r?r[1]?typeof t=="string"?r[1]===t:t.has(r[1]):n(r[2]):!1}function sF(e){return ZL.test(e)}function qR(){return!1}function aF(e){return qL.test(e)}function lF(e){return QL.test(e)}function cF(){const 
e=Tt("colors"),t=Tt("spacing"),n=Tt("blur"),r=Tt("brightness"),o=Tt("borderColor"),i=Tt("borderRadius"),s=Tt("borderSpacing"),l=Tt("borderWidth"),u=Tt("contrast"),f=Tt("grayscale"),m=Tt("hueRotate"),p=Tt("invert"),g=Tt("gap"),y=Tt("gradientColorStops"),x=Tt("gradientColorStopPositions"),S=Tt("inset"),E=Tt("margin"),_=Tt("opacity"),b=Tt("padding"),C=Tt("saturate"),R=Tt("scale"),k=Tt("sepia"),O=Tt("skew"),A=Tt("space"),I=Tt("translate"),z=()=>["auto","contain","none"],H=()=>["auto","hidden","clip","visible","scroll"],ie=()=>["auto",Je,t],K=()=>[Je,t],te=()=>["",lo,Zi],U=()=>["auto",ga,Je],re=()=>["bottom","center","left","left-bottom","left-top","right","right-bottom","right-top","top"],V=()=>["solid","dashed","dotted","double","none"],J=()=>["normal","multiply","screen","overlay","darken","lighten","color-dodge","color-burn","hard-light","soft-light","difference","exclusion","hue","saturation","color","luminosity","plus-lighter"],G=()=>["start","end","center","between","around","evenly","stretch"],Z=()=>["","0",Je],Q=()=>["auto","avoid","all","avoid-page","page","left","right","column"],le=()=>[ga,fh],L=()=>[ga,Je];return{cacheSize:500,separator:":",theme:{colors:[lu],spacing:[lo,Zi],blur:["none","",qi,Je],brightness:le(),borderColor:[e],borderRadius:["none","","full",qi,Je],borderSpacing:K(),borderWidth:te(),contrast:le(),grayscale:Z(),hueRotate:L(),invert:Z(),gap:K(),gradientColorStops:[e],gradientColorStopPositions:[JL,Zi],inset:ie(),margin:ie(),opacity:le(),padding:K(),saturate:le(),scale:le(),sepia:Z(),skew:L(),space:K(),translate:K()},classGroups:{aspect:[{aspect:["auto","square","video",Je]}],container:["container"],columns:[{columns:[qi]}],"break-after":[{"break-after":Q()}],"break-before":[{"break-before":Q()}],"break-inside":[{"break-inside":["auto","avoid","avoid-page","avoid-column"]}],"box-decoration":[{"box-decoration":["slice","clone"]}],box:[{box:["border","content"]}],display:["block","inline-block","inline","flex","inline-flex","table","inline-table","table-caption","table-cell","table-column","table-column-group","table-footer-group","table-header-group","table-row-group","table-row","flow-root","grid","inline-grid","contents","list-item","hidden"],float:[{float:["right","left","none"]}],clear:[{clear:["left","right","both","none"]}],isolation:["isolate","isolation-auto"],"object-fit":[{object:["contain","cover","fill","none","scale-down"]}],"object-position":[{object:[...re(),Je]}],overflow:[{overflow:H()}],"overflow-x":[{"overflow-x":H()}],"overflow-y":[{"overflow-y":H()}],overscroll:[{overscroll:z()}],"overscroll-x":[{"overscroll-x":z()}],"overscroll-y":[{"overscroll-y":z()}],position:["static","fixed","absolute","relative","sticky"],inset:[{inset:[S]}],"inset-x":[{"inset-x":[S]}],"inset-y":[{"inset-y":[S]}],start:[{start:[S]}],end:[{end:[S]}],top:[{top:[S]}],right:[{right:[S]}],bottom:[{bottom:[S]}],left:[{left:[S]}],visibility:["visible","invisible","collapse"],z:[{z:["auto",au,Je]}],basis:[{basis:ie()}],"flex-direction":[{flex:["row","row-reverse","col","col-reverse"]}],"flex-wrap":[{flex:["wrap","wrap-reverse","nowrap"]}],flex:[{flex:["1","auto","initial","none",Je]}],grow:[{grow:Z()}],shrink:[{shrink:Z()}],order:[{order:["first","last","none",au,Je]}],"grid-cols":[{"grid-cols":[lu]}],"col-start-end":[{col:["auto",{span:["full",au,Je]},Je]}],"col-start":[{"col-start":U()}],"col-end":[{"col-end":U()}],"grid-rows":[{"grid-rows":[lu]}],"row-start-end":[{row:["auto",{span:[au,Je]},Je]}],"row-start":[{"row-start":U()}],"row-end":[{"row-end":U()}],"grid-flow":[{"grid-flow
":["row","col","dense","row-dense","col-dense"]}],"auto-cols":[{"auto-cols":["auto","min","max","fr",Je]}],"auto-rows":[{"auto-rows":["auto","min","max","fr",Je]}],gap:[{gap:[g]}],"gap-x":[{"gap-x":[g]}],"gap-y":[{"gap-y":[g]}],"justify-content":[{justify:["normal",...G()]}],"justify-items":[{"justify-items":["start","end","center","stretch"]}],"justify-self":[{"justify-self":["auto","start","end","center","stretch"]}],"align-content":[{content:["normal",...G(),"baseline"]}],"align-items":[{items:["start","end","center","baseline","stretch"]}],"align-self":[{self:["auto","start","end","center","stretch","baseline"]}],"place-content":[{"place-content":[...G(),"baseline"]}],"place-items":[{"place-items":["start","end","center","baseline","stretch"]}],"place-self":[{"place-self":["auto","start","end","center","stretch"]}],p:[{p:[b]}],px:[{px:[b]}],py:[{py:[b]}],ps:[{ps:[b]}],pe:[{pe:[b]}],pt:[{pt:[b]}],pr:[{pr:[b]}],pb:[{pb:[b]}],pl:[{pl:[b]}],m:[{m:[E]}],mx:[{mx:[E]}],my:[{my:[E]}],ms:[{ms:[E]}],me:[{me:[E]}],mt:[{mt:[E]}],mr:[{mr:[E]}],mb:[{mb:[E]}],ml:[{ml:[E]}],"space-x":[{"space-x":[A]}],"space-x-reverse":["space-x-reverse"],"space-y":[{"space-y":[A]}],"space-y-reverse":["space-y-reverse"],w:[{w:["auto","min","max","fit",Je,t]}],"min-w":[{"min-w":["min","max","fit",Je,lo]}],"max-w":[{"max-w":["0","none","full","min","max","fit","prose",{screen:[qi]},qi,Je]}],h:[{h:[Je,t,"auto","min","max","fit"]}],"min-h":[{"min-h":["min","max","fit",lo,Je]}],"max-h":[{"max-h":[Je,t,"min","max","fit"]}],"font-size":[{text:["base",qi,Zi]}],"font-smoothing":["antialiased","subpixel-antialiased"],"font-style":["italic","not-italic"],"font-weight":[{font:["thin","extralight","light","normal","medium","semibold","bold","extrabold","black",fh]}],"font-family":[{font:[lu]}],"fvn-normal":["normal-nums"],"fvn-ordinal":["ordinal"],"fvn-slashed-zero":["slashed-zero"],"fvn-figure":["lining-nums","oldstyle-nums"],"fvn-spacing":["proportional-nums","tabular-nums"],"fvn-fraction":["diagonal-fractions","stacked-fractons"],tracking:[{tracking:["tighter","tight","normal","wide","wider","widest",Je]}],"line-clamp":[{"line-clamp":["none",ga,fh]}],leading:[{leading:["none","tight","snug","normal","relaxed","loose",lo,Je]}],"list-image":[{"list-image":["none",Je]}],"list-style-type":[{list:["none","disc","decimal",Je]}],"list-style-position":[{list:["inside","outside"]}],"placeholder-color":[{placeholder:[e]}],"placeholder-opacity":[{"placeholder-opacity":[_]}],"text-alignment":[{text:["left","center","right","justify","start","end"]}],"text-color":[{text:[e]}],"text-opacity":[{"text-opacity":[_]}],"text-decoration":["underline","overline","line-through","no-underline"],"text-decoration-style":[{decoration:[...V(),"wavy"]}],"text-decoration-thickness":[{decoration:["auto","from-font",lo,Zi]}],"underline-offset":[{"underline-offset":["auto",lo,Je]}],"text-decoration-color":[{decoration:[e]}],"text-transform":["uppercase","lowercase","capitalize","normal-case"],"text-overflow":["truncate","text-ellipsis","text-clip"],indent:[{indent:K()}],"vertical-align":[{align:["baseline","top","middle","bottom","text-top","text-bottom","sub","super",Je]}],whitespace:[{whitespace:["normal","nowrap","pre","pre-line","pre-wrap","break-spaces"]}],break:[{break:["normal","words","all","keep"]}],hyphens:[{hyphens:["none","manual","auto"]}],content:[{content:["none",Je]}],"bg-attachment":[{bg:["fixed","local","scroll"]}],"bg-clip":[{"bg-clip":["border","padding","content","text"]}],"bg-opacity":[{"bg-opacity":[_]}],"bg-origin":[{"bg-origin":["borde
r","padding","content"]}],"bg-position":[{bg:[...re(),nF]}],"bg-repeat":[{bg:["no-repeat",{repeat:["","x","y","round","space"]}]}],"bg-size":[{bg:["auto","cover","contain",tF]}],"bg-image":[{bg:["none",{"gradient-to":["t","tr","r","br","b","bl","l","tl"]},oF]}],"bg-color":[{bg:[e]}],"gradient-from-pos":[{from:[x]}],"gradient-via-pos":[{via:[x]}],"gradient-to-pos":[{to:[x]}],"gradient-from":[{from:[y]}],"gradient-via":[{via:[y]}],"gradient-to":[{to:[y]}],rounded:[{rounded:[i]}],"rounded-s":[{"rounded-s":[i]}],"rounded-e":[{"rounded-e":[i]}],"rounded-t":[{"rounded-t":[i]}],"rounded-r":[{"rounded-r":[i]}],"rounded-b":[{"rounded-b":[i]}],"rounded-l":[{"rounded-l":[i]}],"rounded-ss":[{"rounded-ss":[i]}],"rounded-se":[{"rounded-se":[i]}],"rounded-ee":[{"rounded-ee":[i]}],"rounded-es":[{"rounded-es":[i]}],"rounded-tl":[{"rounded-tl":[i]}],"rounded-tr":[{"rounded-tr":[i]}],"rounded-br":[{"rounded-br":[i]}],"rounded-bl":[{"rounded-bl":[i]}],"border-w":[{border:[l]}],"border-w-x":[{"border-x":[l]}],"border-w-y":[{"border-y":[l]}],"border-w-s":[{"border-s":[l]}],"border-w-e":[{"border-e":[l]}],"border-w-t":[{"border-t":[l]}],"border-w-r":[{"border-r":[l]}],"border-w-b":[{"border-b":[l]}],"border-w-l":[{"border-l":[l]}],"border-opacity":[{"border-opacity":[_]}],"border-style":[{border:[...V(),"hidden"]}],"divide-x":[{"divide-x":[l]}],"divide-x-reverse":["divide-x-reverse"],"divide-y":[{"divide-y":[l]}],"divide-y-reverse":["divide-y-reverse"],"divide-opacity":[{"divide-opacity":[_]}],"divide-style":[{divide:V()}],"border-color":[{border:[o]}],"border-color-x":[{"border-x":[o]}],"border-color-y":[{"border-y":[o]}],"border-color-t":[{"border-t":[o]}],"border-color-r":[{"border-r":[o]}],"border-color-b":[{"border-b":[o]}],"border-color-l":[{"border-l":[o]}],"divide-color":[{divide:[o]}],"outline-style":[{outline:["",...V()]}],"outline-offset":[{"outline-offset":[lo,Je]}],"outline-w":[{outline:[lo,Zi]}],"outline-color":[{outline:[e]}],"ring-w":[{ring:te()}],"ring-w-inset":["ring-inset"],"ring-color":[{ring:[e]}],"ring-opacity":[{"ring-opacity":[_]}],"ring-offset-w":[{"ring-offset":[lo,Zi]}],"ring-offset-color":[{"ring-offset":[e]}],shadow:[{shadow:["","inner","none",qi,iF]}],"shadow-color":[{shadow:[lu]}],opacity:[{opacity:[_]}],"mix-blend":[{"mix-blend":J()}],"bg-blend":[{"bg-blend":J()}],filter:[{filter:["","none"]}],blur:[{blur:[n]}],brightness:[{brightness:[r]}],contrast:[{contrast:[u]}],"drop-shadow":[{"drop-shadow":["","none",qi,Je]}],grayscale:[{grayscale:[f]}],"hue-rotate":[{"hue-rotate":[m]}],invert:[{invert:[p]}],saturate:[{saturate:[C]}],sepia:[{sepia:[k]}],"backdrop-filter":[{"backdrop-filter":["","none"]}],"backdrop-blur":[{"backdrop-blur":[n]}],"backdrop-brightness":[{"backdrop-brightness":[r]}],"backdrop-contrast":[{"backdrop-contrast":[u]}],"backdrop-grayscale":[{"backdrop-grayscale":[f]}],"backdrop-hue-rotate":[{"backdrop-hue-rotate":[m]}],"backdrop-invert":[{"backdrop-invert":[p]}],"backdrop-opacity":[{"backdrop-opacity":[_]}],"backdrop-saturate":[{"backdrop-saturate":[C]}],"backdrop-sepia":[{"backdrop-sepia":[k]}],"border-collapse":[{border:["collapse","separate"]}],"border-spacing":[{"border-spacing":[s]}],"border-spacing-x":[{"border-spacing-x":[s]}],"border-spacing-y":[{"border-spacing-y":[s]}],"table-layout":[{table:["auto","fixed"]}],caption:[{caption:["top","bottom"]}],transition:[{transition:["none","all","","colors","opacity","shadow","transform",Je]}],duration:[{duration:L()}],ease:[{ease:["linear","in","out","in-out",Je]}],delay:[{delay:L()}],animate:[{animate:["none","spin","pi
ng","pulse","bounce",Je]}],transform:[{transform:["","gpu","none"]}],scale:[{scale:[R]}],"scale-x":[{"scale-x":[R]}],"scale-y":[{"scale-y":[R]}],rotate:[{rotate:[au,Je]}],"translate-x":[{"translate-x":[I]}],"translate-y":[{"translate-y":[I]}],"skew-x":[{"skew-x":[O]}],"skew-y":[{"skew-y":[O]}],"transform-origin":[{origin:["center","top","top-right","right","bottom-right","bottom","bottom-left","left","top-left",Je]}],accent:[{accent:["auto",e]}],appearance:["appearance-none"],cursor:[{cursor:["auto","default","pointer","wait","text","move","help","not-allowed","none","context-menu","progress","cell","crosshair","vertical-text","alias","copy","no-drop","grab","grabbing","all-scroll","col-resize","row-resize","n-resize","e-resize","s-resize","w-resize","ne-resize","nw-resize","se-resize","sw-resize","ew-resize","ns-resize","nesw-resize","nwse-resize","zoom-in","zoom-out",Je]}],"caret-color":[{caret:[e]}],"pointer-events":[{"pointer-events":["none","auto"]}],resize:[{resize:["none","y","x",""]}],"scroll-behavior":[{scroll:["auto","smooth"]}],"scroll-m":[{"scroll-m":K()}],"scroll-mx":[{"scroll-mx":K()}],"scroll-my":[{"scroll-my":K()}],"scroll-ms":[{"scroll-ms":K()}],"scroll-me":[{"scroll-me":K()}],"scroll-mt":[{"scroll-mt":K()}],"scroll-mr":[{"scroll-mr":K()}],"scroll-mb":[{"scroll-mb":K()}],"scroll-ml":[{"scroll-ml":K()}],"scroll-p":[{"scroll-p":K()}],"scroll-px":[{"scroll-px":K()}],"scroll-py":[{"scroll-py":K()}],"scroll-ps":[{"scroll-ps":K()}],"scroll-pe":[{"scroll-pe":K()}],"scroll-pt":[{"scroll-pt":K()}],"scroll-pr":[{"scroll-pr":K()}],"scroll-pb":[{"scroll-pb":K()}],"scroll-pl":[{"scroll-pl":K()}],"snap-align":[{snap:["start","end","center","align-none"]}],"snap-stop":[{snap:["normal","always"]}],"snap-type":[{snap:["none","x","y","both"]}],"snap-strictness":[{snap:["mandatory","proximity"]}],touch:[{touch:["auto","none","manipulation"]}],"touch-x":[{"touch-pan":["x","left","right"]}],"touch-y":[{"touch-pan":["y","up","down"]}],"touch-pz":["touch-pinch-zoom"],select:[{select:["none","text","all","auto"]}],"will-change":[{"will-change":["auto","scroll","contents","transform",Je]}],fill:[{fill:[e,"none"]}],"stroke-w":[{stroke:[lo,Zi,fh]}],stroke:[{stroke:[e,"none"]}],sr:["sr-only","not-sr-only"]},conflictingClassGroups:{overflow:["overflow-x","overflow-y"],overscroll:["overscroll-x","overscroll-y"],inset:["inset-x","inset-y","start","end","top","right","bottom","left"],"inset-x":["right","left"],"inset-y":["top","bottom"],flex:["basis","grow","shrink"],gap:["gap-x","gap-y"],p:["px","py","ps","pe","pt","pr","pb","pl"],px:["pr","pl"],py:["pt","pb"],m:["mx","my","ms","me","mt","mr","mb","ml"],mx:["mr","ml"],my:["mt","mb"],"font-size":["leading"],"fvn-normal":["fvn-ordinal","fvn-slashed-zero","fvn-figure","fvn-spacing","fvn-fraction"],"fvn-ordinal":["fvn-normal"],"fvn-slashed-zero":["fvn-normal"],"fvn-figure":["fvn-normal"],"fvn-spacing":["fvn-normal"],"fvn-fraction":["fvn-normal"],rounded:["rounded-s","rounded-e","rounded-t","rounded-r","rounded-b","rounded-l","rounded-ss","rounded-se","rounded-ee","rounded-es","rounded-tl","rounded-tr","rounded-br","rounded-bl"],"rounded-s":["rounded-ss","rounded-es"],"rounded-e":["rounded-se","rounded-ee"],"rounded-t":["rounded-tl","rounded-tr"],"rounded-r":["rounded-tr","rounded-br"],"rounded-b":["rounded-br","rounded-bl"],"rounded-l":["rounded-tl","rounded-bl"],"border-spacing":["border-spacing-x","border-spacing-y"],"border-w":["border-w-s","border-w-e","border-w-t","border-w-r","border-w-b","border-w-l"],"border-w-x":["border-w-r","border-w-l"],"border-w
-y":["border-w-t","border-w-b"],"border-color":["border-color-t","border-color-r","border-color-b","border-color-l"],"border-color-x":["border-color-r","border-color-l"],"border-color-y":["border-color-t","border-color-b"],"scroll-m":["scroll-mx","scroll-my","scroll-ms","scroll-me","scroll-mt","scroll-mr","scroll-mb","scroll-ml"],"scroll-mx":["scroll-mr","scroll-ml"],"scroll-my":["scroll-mt","scroll-mb"],"scroll-p":["scroll-px","scroll-py","scroll-ps","scroll-pe","scroll-pt","scroll-pr","scroll-pb","scroll-pl"],"scroll-px":["scroll-pr","scroll-pl"],"scroll-py":["scroll-pt","scroll-pb"],touch:["touch-x","touch-y","touch-pz"],"touch-x":["touch"],"touch-y":["touch"],"touch-pz":["touch"]},conflictingClassGroupModifiers:{"font-size":["leading"]}}}const ix=KL(cF),uF=40,dF=3,QR=200,ms="inpaint",hh="diffusers_sd",uC="diffusers_sdxl",ph="diffusers_sd_inpaint",dC="diffusers_sdxl_inpaint",g0="diffusers_other",Kh="#ffcc00bb",fC="ldm",hC="cv2",JR="Fantasy-Studio/Paint-by-Example",fF="timbrooks/instruct-pix2pix",hF="Sanster/PowerPaint-V1-stable-diffusion-inpainting",pF="Sanster/AnyText",mF="out of frame, lowres, error, cropped, worst quality, low quality, jpeg artifacts, ugly, duplicate, morbid, mutilated, out of frame, mutation, deformed, blurry, dehydrated, bad anatomy, bad proportions, extra limbs, disfigured, gross proportions, malformed limbs, watermark, signature";function xe(...e){return ix(KR(e))}function gF(){async function e(r=""){return(await fetch(r,{method:"GET",cache:"no-cache"})).json()}const t=()=>{const r=document.location;e(r+"/flaskwebgui-keep-server-alive").then(i=>i)},n=3*1e3;t(),setInterval(t,n)}function pC(e){const t=e.split(",")[0].split(":")[1].split(";")[0],n=atob(e.split(",")[1]),r=[];for(let o=0;o{const o=e.src,i=e;i.onload=n,i.onerror=s=>{i.src=o,r(s)},i.src=t})}async function vF(e){const t=URL.createObjectURL(e),n=new Image;return await bu(n,t),n}function mC(e){return new Promise((t,n)=>{const r=new Image;r.addEventListener("load",()=>{t(r)}),r.addEventListener("error",o=>{n(o)}),r.src=e.toDataURL()})}function yF(e){return new Promise((t,n)=>{const r=new FileReader;r.onload=()=>{const o=new Image;o.onload=()=>{t(o)},o.onerror=()=>{n("无法加载图像。")},o.src=r.result},r.onerror=()=>{n("无法读取文件。")},r.readAsDataURL(e)})}function Nu(e,t,n){return fetch(e).then(function(r){return r.arrayBuffer()}).then(function(r){return new File([r],t,{type:n})})}async function wF(){try{const{state:e}=await navigator.permissions.query({name:"clipboard-write"});return e==="granted"}catch{return!1}}function xF(e,t){return new Promise((n,r)=>e.toBlob(async o=>{o?n(o):r(new Error("Expected toBlob() to be defined"))},t))}const bF=async e=>{const t=[new ClipboardItem({[e.type]:e})];await navigator.clipboard.write(t)};function gC(e){return e.nativeEvent.button===2}function vC(e){return e.nativeEvent.button===1}async function SF(e){const t=await xF(e,"image/png");try{await bF(t)}catch{console.log("Copy image failed!")}}function _F(e,t){const n=document.createElement("a");n.href=e,n.download=t,n.dispatchEvent(new MouseEvent("click",{bubbles:!0,cancelable:!0,view:window})),setTimeout(()=>{n.remove()},100)}function v0(e){const t=e.nativeEvent;if("touches"in e){const n=e.target.getBoundingClientRect(),o=e.touches[0];return{x:(o.clientX-n.x)/n.width*o.target.offsetWidth,y:(o.clientY-n.y)/n.height*o.target.offsetHeight}}return{x:t.offsetX,y:t.offsetY}}function 
eP(e,t,n=Kh){e.strokeStyle=n,e.lineCap="round",e.lineJoin="round",t.forEach(r=>{!(r!=null&&r.pts.length)||!r.size||(e.lineWidth=r.size,e.beginPath(),e.moveTo(r.pts[0].x,r.pts[0].y),r.pts.forEach(o=>e.lineTo(o.x,o.y)),e.stroke())})}const Gh=(e,t,n,r=[],o="white")=>{const i=document.createElement("canvas");i.width=e,i.height=t;const s=i.getContext("2d");if(!s)throw new Error("could not retrieve mask canvas");return r.forEach(l=>{s.drawImage(l,0,0,e,t)}),n.forEach(l=>{eP(s,l,o)}),i},Du=e=>new Promise((t,n)=>{const r=new FileReader;r.onload=o=>{var s;const i=(s=o.target)==null?void 0:s.result;t(i)},r.onerror=o=>{n(o)},r.readAsDataURL(e)});function tP(e,t){return function(){return e.apply(t,arguments)}}const{toString:EF}=Object.prototype,{getPrototypeOf:sx}=Object,Mm=(e=>t=>{const n=EF.call(t);return e[n]||(e[n]=n.slice(8,-1).toLowerCase())})(Object.create(null)),ni=e=>(e=e.toLowerCase(),t=>Mm(t)===e),Om=e=>t=>typeof t===e,{isArray:Ec}=Array,rd=Om("undefined");function CF(e){return e!==null&&!rd(e)&&e.constructor!==null&&!rd(e.constructor)&&Zr(e.constructor.isBuffer)&&e.constructor.isBuffer(e)}const nP=ni("ArrayBuffer");function $F(e){let t;return typeof ArrayBuffer<"u"&&ArrayBuffer.isView?t=ArrayBuffer.isView(e):t=e&&e.buffer&&nP(e.buffer),t}const RF=Om("string"),Zr=Om("function"),rP=Om("number"),Nm=e=>e!==null&&typeof e=="object",PF=e=>e===!0||e===!1,Yh=e=>{if(Mm(e)!=="object")return!1;const t=sx(e);return(t===null||t===Object.prototype||Object.getPrototypeOf(t)===null)&&!(Symbol.toStringTag in e)&&!(Symbol.iterator in e)},TF=ni("Date"),kF=ni("File"),AF=ni("Blob"),MF=ni("FileList"),OF=e=>Nm(e)&&Zr(e.pipe),NF=e=>{let t;return e&&(typeof FormData=="function"&&e instanceof FormData||Zr(e.append)&&((t=Mm(e))==="formdata"||t==="object"&&Zr(e.toString)&&e.toString()==="[object FormData]"))},DF=ni("URLSearchParams"),IF=e=>e.trim?e.trim():e.replace(/^[\s\uFEFF\xA0]+|[\s\uFEFF\xA0]+$/g,"");function Nd(e,t,{allOwnKeys:n=!1}={}){if(e===null||typeof e>"u")return;let r,o;if(typeof e!="object"&&(e=[e]),Ec(e))for(r=0,o=e.length;r0;)if(o=n[r],t===o.toLowerCase())return o;return null}const iP=typeof globalThis<"u"?globalThis:typeof self<"u"?self:typeof window<"u"?window:global,sP=e=>!rd(e)&&e!==iP;function t1(){const{caseless:e}=sP(this)&&this||{},t={},n=(r,o)=>{const i=e&&oP(t,o)||o;Yh(t[i])&&Yh(r)?t[i]=t1(t[i],r):Yh(r)?t[i]=t1({},r):Ec(r)?t[i]=r.slice():t[i]=r};for(let r=0,o=arguments.length;r(Nd(t,(o,i)=>{n&&Zr(o)?e[i]=tP(o,n):e[i]=o},{allOwnKeys:r}),e),FF=e=>(e.charCodeAt(0)===65279&&(e=e.slice(1)),e),jF=(e,t,n,r)=>{e.prototype=Object.create(t.prototype,r),e.prototype.constructor=e,Object.defineProperty(e,"super",{value:t.prototype}),n&&Object.assign(e.prototype,n)},BF=(e,t,n,r)=>{let o,i,s;const l={};if(t=t||{},e==null)return t;do{for(o=Object.getOwnPropertyNames(e),i=o.length;i-- >0;)s=o[i],(!r||r(s,e,t))&&!l[s]&&(t[s]=e[s],l[s]=!0);e=n!==!1&&sx(e)}while(e&&(!n||n(e,t))&&e!==Object.prototype);return t},zF=(e,t,n)=>{e=String(e),(n===void 0||n>e.length)&&(n=e.length),n-=t.length;const r=e.indexOf(t,n);return r!==-1&&r===n},UF=e=>{if(!e)return null;if(Ec(e))return e;let t=e.length;if(!rP(t))return null;const n=new Array(t);for(;t-- >0;)n[t]=e[t];return n},VF=(e=>t=>e&&t instanceof e)(typeof Uint8Array<"u"&&sx(Uint8Array)),WF=(e,t)=>{const r=(e&&e[Symbol.iterator]).call(e);let o;for(;(o=r.next())&&!o.done;){const i=o.value;t.call(e,i[0],i[1])}},HF=(e,t)=>{let n;const r=[];for(;(n=e.exec(t))!==null;)r.push(n);return 
r},KF=ni("HTMLFormElement"),GF=e=>e.toLowerCase().replace(/[-_\s]([a-z\d])(\w*)/g,function(n,r,o){return r.toUpperCase()+o}),yC=(({hasOwnProperty:e})=>(t,n)=>e.call(t,n))(Object.prototype),YF=ni("RegExp"),aP=(e,t)=>{const n=Object.getOwnPropertyDescriptors(e),r={};Nd(n,(o,i)=>{let s;(s=t(o,i,e))!==!1&&(r[i]=s||o)}),Object.defineProperties(e,r)},XF=e=>{aP(e,(t,n)=>{if(Zr(e)&&["arguments","caller","callee"].indexOf(n)!==-1)return!1;const r=e[n];if(Zr(r)){if(t.enumerable=!1,"writable"in t){t.writable=!1;return}t.set||(t.set=()=>{throw Error("Can not rewrite read-only method '"+n+"'")})}})},ZF=(e,t)=>{const n={},r=o=>{o.forEach(i=>{n[i]=!0})};return Ec(e)?r(e):r(String(e).split(t)),n},qF=()=>{},QF=(e,t)=>(e=+e,Number.isFinite(e)?e:t),y0="abcdefghijklmnopqrstuvwxyz",wC="0123456789",lP={DIGIT:wC,ALPHA:y0,ALPHA_DIGIT:y0+y0.toUpperCase()+wC},JF=(e=16,t=lP.ALPHA_DIGIT)=>{let n="";const{length:r}=t;for(;e--;)n+=t[Math.random()*r|0];return n};function ej(e){return!!(e&&Zr(e.append)&&e[Symbol.toStringTag]==="FormData"&&e[Symbol.iterator])}const tj=e=>{const t=new Array(10),n=(r,o)=>{if(Nm(r)){if(t.indexOf(r)>=0)return;if(!("toJSON"in r)){t[o]=r;const i=Ec(r)?[]:{};return Nd(r,(s,l)=>{const u=n(s,o+1);!rd(u)&&(i[l]=u)}),t[o]=void 0,i}}return r};return n(e,0)},nj=ni("AsyncFunction"),rj=e=>e&&(Nm(e)||Zr(e))&&Zr(e.then)&&Zr(e.catch),ae={isArray:Ec,isArrayBuffer:nP,isBuffer:CF,isFormData:NF,isArrayBufferView:$F,isString:RF,isNumber:rP,isBoolean:PF,isObject:Nm,isPlainObject:Yh,isUndefined:rd,isDate:TF,isFile:kF,isBlob:AF,isRegExp:YF,isFunction:Zr,isStream:OF,isURLSearchParams:DF,isTypedArray:VF,isFileList:MF,forEach:Nd,merge:t1,extend:LF,trim:IF,stripBOM:FF,inherits:jF,toFlatObject:BF,kindOf:Mm,kindOfTest:ni,endsWith:zF,toArray:UF,forEachEntry:WF,matchAll:HF,isHTMLForm:KF,hasOwnProperty:yC,hasOwnProp:yC,reduceDescriptors:aP,freezeMethods:XF,toObjectSet:ZF,toCamelCase:GF,noop:qF,toFiniteNumber:QF,findKey:oP,global:iP,isContextDefined:sP,ALPHABET:lP,generateString:JF,isSpecCompliantForm:ej,toJSONObject:tj,isAsyncFn:nj,isThenable:rj};function ct(e,t,n,r,o){Error.call(this),Error.captureStackTrace?Error.captureStackTrace(this,this.constructor):this.stack=new Error().stack,this.message=e,this.name="AxiosError",t&&(this.code=t),n&&(this.config=n),r&&(this.request=r),o&&(this.response=o)}ae.inherits(ct,Error,{toJSON:function(){return{message:this.message,name:this.name,description:this.description,number:this.number,fileName:this.fileName,lineNumber:this.lineNumber,columnNumber:this.columnNumber,stack:this.stack,config:ae.toJSONObject(this.config),code:this.code,status:this.response&&this.response.status?this.response.status:null}}});const cP=ct.prototype,uP={};["ERR_BAD_OPTION_VALUE","ERR_BAD_OPTION","ECONNABORTED","ETIMEDOUT","ERR_NETWORK","ERR_FR_TOO_MANY_REDIRECTS","ERR_DEPRECATED","ERR_BAD_RESPONSE","ERR_BAD_REQUEST","ERR_CANCELED","ERR_NOT_SUPPORT","ERR_INVALID_URL"].forEach(e=>{uP[e]={value:e}});Object.defineProperties(ct,uP);Object.defineProperty(cP,"isAxiosError",{value:!0});ct.from=(e,t,n,r,o,i)=>{const s=Object.create(cP);return ae.toFlatObject(e,s,function(u){return u!==Error.prototype},l=>l!=="isAxiosError"),ct.call(s,e.message,t,n,r,o),s.cause=e,s.name=e.name,i&&Object.assign(s,i),s};const oj=null;function n1(e){return ae.isPlainObject(e)||ae.isArray(e)}function dP(e){return ae.endsWith(e,"[]")?e.slice(0,-2):e}function xC(e,t,n){return e?e.concat(t).map(function(o,i){return o=dP(o),!n&&i?"["+o+"]":o}).join(n?".":""):t}function ij(e){return ae.isArray(e)&&!e.some(n1)}const 
sj=ae.toFlatObject(ae,{},null,function(t){return/^is[A-Z]/.test(t)});function Dm(e,t,n){if(!ae.isObject(e))throw new TypeError("target must be an object");t=t||new FormData,n=ae.toFlatObject(n,{metaTokens:!0,dots:!1,indexes:!1},!1,function(S,E){return!ae.isUndefined(E[S])});const r=n.metaTokens,o=n.visitor||m,i=n.dots,s=n.indexes,u=(n.Blob||typeof Blob<"u"&&Blob)&&ae.isSpecCompliantForm(t);if(!ae.isFunction(o))throw new TypeError("visitor must be a function");function f(x){if(x===null)return"";if(ae.isDate(x))return x.toISOString();if(!u&&ae.isBlob(x))throw new ct("Blob is not supported. Use a Buffer instead.");return ae.isArrayBuffer(x)||ae.isTypedArray(x)?u&&typeof Blob=="function"?new Blob([x]):Buffer.from(x):x}function m(x,S,E){let _=x;if(x&&!E&&typeof x=="object"){if(ae.endsWith(S,"{}"))S=r?S:S.slice(0,-2),x=JSON.stringify(x);else if(ae.isArray(x)&&ij(x)||(ae.isFileList(x)||ae.endsWith(S,"[]"))&&(_=ae.toArray(x)))return S=dP(S),_.forEach(function(C,R){!(ae.isUndefined(C)||C===null)&&t.append(s===!0?xC([S],R,i):s===null?S:S+"[]",f(C))}),!1}return n1(x)?!0:(t.append(xC(E,S,i),f(x)),!1)}const p=[],g=Object.assign(sj,{defaultVisitor:m,convertValue:f,isVisitable:n1});function y(x,S){if(!ae.isUndefined(x)){if(p.indexOf(x)!==-1)throw Error("Circular reference detected in "+S.join("."));p.push(x),ae.forEach(x,function(_,b){(!(ae.isUndefined(_)||_===null)&&o.call(t,_,ae.isString(b)?b.trim():b,S,g))===!0&&y(_,S?S.concat(b):[b])}),p.pop()}}if(!ae.isObject(e))throw new TypeError("data must be an object");return y(e),t}function bC(e){const t={"!":"%21","'":"%27","(":"%28",")":"%29","~":"%7E","%20":"+","%00":"\0"};return encodeURIComponent(e).replace(/[!'()~]|%20|%00/g,function(r){return t[r]})}function ax(e,t){this._pairs=[],e&&Dm(e,this,t)}const fP=ax.prototype;fP.append=function(t,n){this._pairs.push([t,n])};fP.toString=function(t){const n=t?function(r){return t.call(this,r,bC)}:bC;return this._pairs.map(function(o){return n(o[0])+"="+n(o[1])},"").join("&")};function aj(e){return encodeURIComponent(e).replace(/%3A/gi,":").replace(/%24/g,"$").replace(/%2C/gi,",").replace(/%20/g,"+").replace(/%5B/gi,"[").replace(/%5D/gi,"]")}function hP(e,t,n){if(!t)return e;const r=n&&n.encode||aj,o=n&&n.serialize;let i;if(o?i=o(t,n):i=ae.isURLSearchParams(t)?t.toString():new ax(t,n).toString(r),i){const s=e.indexOf("#");s!==-1&&(e=e.slice(0,s)),e+=(e.indexOf("?")===-1?"?":"&")+i}return e}class lj{constructor(){this.handlers=[]}use(t,n,r){return this.handlers.push({fulfilled:t,rejected:n,synchronous:r?r.synchronous:!1,runWhen:r?r.runWhen:null}),this.handlers.length-1}eject(t){this.handlers[t]&&(this.handlers[t]=null)}clear(){this.handlers&&(this.handlers=[])}forEach(t){ae.forEach(this.handlers,function(r){r!==null&&t(r)})}}const SC=lj,pP={silentJSONParsing:!0,forcedJSONParsing:!0,clarifyTimeoutError:!1},cj=typeof URLSearchParams<"u"?URLSearchParams:ax,uj=typeof FormData<"u"?FormData:null,dj=typeof Blob<"u"?Blob:null,fj={isBrowser:!0,classes:{URLSearchParams:cj,FormData:uj,Blob:dj},protocols:["http","https","file","blob","url","data"]},mP=typeof window<"u"&&typeof document<"u",hj=(e=>mP&&["ReactNative","NativeScript","NS"].indexOf(e)<0)(typeof navigator<"u"&&navigator.product),pj=typeof WorkerGlobalScope<"u"&&self instanceof WorkerGlobalScope&&typeof self.importScripts=="function",mj=Object.freeze(Object.defineProperty({__proto__:null,hasBrowserEnv:mP,hasStandardBrowserEnv:hj,hasStandardBrowserWebWorkerEnv:pj},Symbol.toStringTag,{value:"Module"})),Wo={...mj,...fj};function gj(e,t){return Dm(e,new 
Wo.classes.URLSearchParams,Object.assign({visitor:function(n,r,o,i){return Wo.isNode&&ae.isBuffer(n)?(this.append(r,n.toString("base64")),!1):i.defaultVisitor.apply(this,arguments)}},t))}function vj(e){return ae.matchAll(/\w+|\[(\w*)]/g,e).map(t=>t[0]==="[]"?"":t[1]||t[0])}function yj(e){const t={},n=Object.keys(e);let r;const o=n.length;let i;for(r=0;r<o;r++)i=n[r],t[i]=e[i];return t}function gP(e){function t(n,r,o,i){let s=n[i++];if(s==="__proto__")return!0;const l=Number.isFinite(+s),u=i>=n.length;return s=!s&&ae.isArray(o)?o.length:s,u?(ae.hasOwnProp(o,s)?o[s]=[o[s],r]:o[s]=r,!l):((!o[s]||!ae.isObject(o[s]))&&(o[s]=[]),t(n,r,o[s],i)&&ae.isArray(o[s])&&(o[s]=yj(o[s])),!l)}if(ae.isFormData(e)&&ae.isFunction(e.entries)){const n={};return ae.forEachEntry(e,(r,o)=>{t(vj(r),o,n,0)}),n}return null}function wj(e,t,n){if(ae.isString(e))try{return(t||JSON.parse)(e),ae.trim(e)}catch(r){if(r.name!=="SyntaxError")throw r}return(n||JSON.stringify)(e)}const lx={transitional:pP,adapter:["xhr","http"],transformRequest:[function(t,n){const r=n.getContentType()||"",o=r.indexOf("application/json")>-1,i=ae.isObject(t);if(i&&ae.isHTMLForm(t)&&(t=new FormData(t)),ae.isFormData(t))return o&&o?JSON.stringify(gP(t)):t;if(ae.isArrayBuffer(t)||ae.isBuffer(t)||ae.isStream(t)||ae.isFile(t)||ae.isBlob(t))return t;if(ae.isArrayBufferView(t))return t.buffer;if(ae.isURLSearchParams(t))return n.setContentType("application/x-www-form-urlencoded;charset=utf-8",!1),t.toString();let l;if(i){if(r.indexOf("application/x-www-form-urlencoded")>-1)return gj(t,this.formSerializer).toString();if((l=ae.isFileList(t))||r.indexOf("multipart/form-data")>-1){const u=this.env&&this.env.FormData;return Dm(l?{"files[]":t}:t,u&&new u,this.formSerializer)}}return i||o?(n.setContentType("application/json",!1),wj(t)):t}],transformResponse:[function(t){const n=this.transitional||lx.transitional,r=n&&n.forcedJSONParsing,o=this.responseType==="json";if(t&&ae.isString(t)&&(r&&!this.responseType||o)){const s=!(n&&n.silentJSONParsing)&&o;try{return JSON.parse(t)}catch(l){if(s)throw l.name==="SyntaxError"?ct.from(l,ct.ERR_BAD_RESPONSE,this,null,this.response):l}}return t}],timeout:0,xsrfCookieName:"XSRF-TOKEN",xsrfHeaderName:"X-XSRF-TOKEN",maxContentLength:-1,maxBodyLength:-1,env:{FormData:Wo.classes.FormData,Blob:Wo.classes.Blob},validateStatus:function(t){return t>=200&&t<300},headers:{common:{Accept:"application/json, text/plain, */*","Content-Type":void 0}}};ae.forEach(["delete","get","head","post","put","patch"],e=>{lx.headers[e]={}});const cx=lx,xj=ae.toObjectSet(["age","authorization","content-length","content-type","etag","expires","from","host","if-modified-since","if-unmodified-since","last-modified","location","max-forwards","proxy-authorization","referer","retry-after","user-agent"]),bj=e=>{const t={};let n,r,o;return e&&e.split(` +
`).forEach(function(s){o=s.indexOf(":"),n=s.substring(0,o).trim().toLowerCase(),r=s.substring(o+1).trim(),!(!n||t[n]&&xj[n])&&(n==="set-cookie"?t[n]?t[n].push(r):t[n]=[r]:t[n]=t[n]?t[n]+", "+r:r)}),t},_C=Symbol("internals");function cu(e){return e&&String(e).trim().toLowerCase()}function Xh(e){return e===!1||e==null?e:ae.isArray(e)?e.map(Xh):String(e)}function Sj(e){const t=Object.create(null),n=/([^\s,;=]+)\s*(?:=\s*([^,;]+))?/g;let r;for(;r=n.exec(e);)t[r[1]]=r[2];return t}const _j=e=>/^[-_a-zA-Z0-9^`|~,!#$%&'*+.]+$/.test(e.trim());function w0(e,t,n,r,o){if(ae.isFunction(r))return r.call(this,t,n);if(o&&(t=n),!!ae.isString(t)){if(ae.isString(r))return t.indexOf(r)!==-1;if(ae.isRegExp(r))return r.test(t)}}function Ej(e){return e.trim().toLowerCase().replace(/([a-z\d])(\w*)/g,(t,n,r)=>n.toUpperCase()+r)}function Cj(e,t){const n=ae.toCamelCase(" 
"+t);["get","set","has"].forEach(r=>{Object.defineProperty(e,r+n,{value:function(o,i,s){return this[r].call(this,t,o,i,s)},configurable:!0})})}class Im{constructor(t){t&&this.set(t)}set(t,n,r){const o=this;function i(l,u,f){const m=cu(u);if(!m)throw new Error("header name must be a non-empty string");const p=ae.findKey(o,m);(!p||o[p]===void 0||f===!0||f===void 0&&o[p]!==!1)&&(o[p||u]=Xh(l))}const s=(l,u)=>ae.forEach(l,(f,m)=>i(f,m,u));return ae.isPlainObject(t)||t instanceof this.constructor?s(t,n):ae.isString(t)&&(t=t.trim())&&!_j(t)?s(bj(t),n):t!=null&&i(n,t,r),this}get(t,n){if(t=cu(t),t){const r=ae.findKey(this,t);if(r){const o=this[r];if(!n)return o;if(n===!0)return Sj(o);if(ae.isFunction(n))return n.call(this,o,r);if(ae.isRegExp(n))return n.exec(o);throw new TypeError("parser must be boolean|regexp|function")}}}has(t,n){if(t=cu(t),t){const r=ae.findKey(this,t);return!!(r&&this[r]!==void 0&&(!n||w0(this,this[r],r,n)))}return!1}delete(t,n){const r=this;let o=!1;function i(s){if(s=cu(s),s){const l=ae.findKey(r,s);l&&(!n||w0(r,r[l],l,n))&&(delete r[l],o=!0)}}return ae.isArray(t)?t.forEach(i):i(t),o}clear(t){const n=Object.keys(this);let r=n.length,o=!1;for(;r--;){const i=n[r];(!t||w0(this,this[i],i,t,!0))&&(delete this[i],o=!0)}return o}normalize(t){const n=this,r={};return ae.forEach(this,(o,i)=>{const s=ae.findKey(r,i);if(s){n[s]=Xh(o),delete n[i];return}const l=t?Ej(i):String(i).trim();l!==i&&delete n[i],n[l]=Xh(o),r[l]=!0}),this}concat(...t){return this.constructor.concat(this,...t)}toJSON(t){const n=Object.create(null);return ae.forEach(this,(r,o)=>{r!=null&&r!==!1&&(n[o]=t&&ae.isArray(r)?r.join(", "):r)}),n}[Symbol.iterator](){return Object.entries(this.toJSON())[Symbol.iterator]()}toString(){return Object.entries(this.toJSON()).map(([t,n])=>t+": "+n).join(` +`)}get[Symbol.toStringTag](){return"AxiosHeaders"}static from(t){return t instanceof this?t:new this(t)}static concat(t,...n){const r=new this(t);return n.forEach(o=>r.set(o)),r}static accessor(t){const r=(this[_C]=this[_C]={accessors:{}}).accessors,o=this.prototype;function i(s){const l=cu(s);r[l]||(Cj(o,s),r[l]=!0)}return ae.isArray(t)?t.forEach(i):i(t),this}}Im.accessor(["Content-Type","Content-Length","Accept","Accept-Encoding","User-Agent","Authorization"]);ae.reduceDescriptors(Im.prototype,({value:e},t)=>{let n=t[0].toUpperCase()+t.slice(1);return{get:()=>e,set(r){this[n]=r}}});ae.freezeMethods(Im);const Si=Im;function x0(e,t){const n=this||cx,r=t||n,o=Si.from(r.headers);let i=r.data;return ae.forEach(e,function(l){i=l.call(n,i,o.normalize(),t?t.status:void 0)}),o.normalize(),i}function vP(e){return!!(e&&e.__CANCEL__)}function Dd(e,t,n){ct.call(this,e??"canceled",ct.ERR_CANCELED,t,n),this.name="CanceledError"}ae.inherits(Dd,ct,{__CANCEL__:!0});function $j(e,t,n){const r=n.config.validateStatus;!n.status||!r||r(n.status)?e(n):t(new ct("Request failed with status code "+n.status,[ct.ERR_BAD_REQUEST,ct.ERR_BAD_RESPONSE][Math.floor(n.status/100)-4],n.config,n.request,n))}const Rj=Wo.hasStandardBrowserEnv?{write(e,t,n,r,o,i){const s=[e+"="+encodeURIComponent(t)];ae.isNumber(n)&&s.push("expires="+new Date(n).toGMTString()),ae.isString(r)&&s.push("path="+r),ae.isString(o)&&s.push("domain="+o),i===!0&&s.push("secure"),document.cookie=s.join("; ")},read(e){const t=document.cookie.match(new RegExp("(^|;\\s*)("+e+")=([^;]*)"));return t?decodeURIComponent(t[3]):null},remove(e){this.write(e,"",Date.now()-864e5)}}:{write(){},read(){return null},remove(){}};function Pj(e){return/^([a-z][a-z\d+\-.]*:)?\/\//i.test(e)}function 
Tj(e,t){return t?e.replace(/\/+$/,"")+"/"+t.replace(/^\/+/,""):e}function yP(e,t){return e&&!Pj(t)?Tj(e,t):t}const kj=Wo.hasStandardBrowserEnv?function(){const t=/(msie|trident)/i.test(navigator.userAgent),n=document.createElement("a");let r;function o(i){let s=i;return t&&(n.setAttribute("href",s),s=n.href),n.setAttribute("href",s),{href:n.href,protocol:n.protocol?n.protocol.replace(/:$/,""):"",host:n.host,search:n.search?n.search.replace(/^\?/,""):"",hash:n.hash?n.hash.replace(/^#/,""):"",hostname:n.hostname,port:n.port,pathname:n.pathname.charAt(0)==="/"?n.pathname:"/"+n.pathname}}return r=o(window.location.href),function(s){const l=ae.isString(s)?o(s):s;return l.protocol===r.protocol&&l.host===r.host}}():function(){return function(){return!0}}();function Aj(e){const t=/^([-+\w]{1,25})(:?\/\/|:)/.exec(e);return t&&t[1]||""}function Mj(e,t){e=e||10;const n=new Array(e),r=new Array(e);let o=0,i=0,s;return t=t!==void 0?t:1e3,function(u){const f=Date.now(),m=r[i];s||(s=f),n[o]=u,r[o]=f;let p=i,g=0;for(;p!==o;)g+=n[p++],p=p%e;if(o=(o+1)%e,o===i&&(i=(i+1)%e),f-s<t)return;const y=m&&f-m;return y?Math.round(g*1e3/y):void 0}}function EC(e,t){let n=0;const r=Mj(50,250);return o=>{const i=o.loaded,s=o.lengthComputable?o.total:void 0,l=i-n,u=r(l),f=i<=s;n=i;const m={loaded:i,total:s,progress:s?i/s:void 0,bytes:l,rate:u||void 0,estimated:u&&s&&f?(s-i)/u:void 0,event:o};m[t?"download":"upload"]=!0,e(m)}}const Oj=typeof XMLHttpRequest<"u",Nj=Oj&&function(e){return new Promise(function(n,r){let o=e.data;const i=Si.from(e.headers).normalize();let{responseType:s,withXSRFToken:l}=e,u;function f(){e.cancelToken&&e.cancelToken.unsubscribe(u),e.signal&&e.signal.removeEventListener("abort",u)}let m;if(ae.isFormData(o)){if(Wo.hasStandardBrowserEnv||Wo.hasStandardBrowserWebWorkerEnv)i.setContentType(!1);else if((m=i.getContentType())!==!1){const[S,...E]=m?m.split(";").map(_=>_.trim()).filter(Boolean):[];i.setContentType([S||"multipart/form-data",...E].join("; "))}}let p=new XMLHttpRequest;if(e.auth){const S=e.auth.username||"",E=e.auth.password?unescape(encodeURIComponent(e.auth.password)):"";i.set("Authorization","Basic "+btoa(S+":"+E))}const g=yP(e.baseURL,e.url);p.open(e.method.toUpperCase(),hP(g,e.params,e.paramsSerializer),!0),p.timeout=e.timeout;function y(){if(!p)return;const S=Si.from("getAllResponseHeaders"in p&&p.getAllResponseHeaders()),_={data:!s||s==="text"||s==="json"?p.responseText:p.response,status:p.status,statusText:p.statusText,headers:S,config:e,request:p};$j(function(C){n(C),f()},function(C){r(C),f()},_),p=null}if("onloadend"in p?p.onloadend=y:p.onreadystatechange=function(){!p||p.readyState!==4||p.status===0&&!(p.responseURL&&p.responseURL.indexOf("file:")===0)||setTimeout(y)},p.onabort=function(){p&&(r(new ct("Request aborted",ct.ECONNABORTED,e,p)),p=null)},p.onerror=function(){r(new ct("Network Error",ct.ERR_NETWORK,e,p)),p=null},p.ontimeout=function(){let E=e.timeout?"timeout of "+e.timeout+"ms exceeded":"timeout exceeded";const _=e.transitional||pP;e.timeoutErrorMessage&&(E=e.timeoutErrorMessage),r(new ct(E,_.clarifyTimeoutError?ct.ETIMEDOUT:ct.ECONNABORTED,e,p)),p=null},Wo.hasStandardBrowserEnv&&(l&&ae.isFunction(l)&&(l=l(e)),l||l!==!1&&kj(g))){const S=e.xsrfHeaderName&&e.xsrfCookieName&&Rj.read(e.xsrfCookieName);S&&i.set(e.xsrfHeaderName,S)}o===void 0&&i.setContentType(null),"setRequestHeader"in p&&ae.forEach(i.toJSON(),function(E,_){p.setRequestHeader(_,E)}),ae.isUndefined(e.withCredentials)||(p.withCredentials=!!e.withCredentials),s&&s!=="json"&&(p.responseType=e.responseType),typeof e.onDownloadProgress=="function"&&p.addEventListener("progress",EC(e.onDownloadProgress,!0)),typeof 
e.onUploadProgress=="function"&&p.upload&&p.upload.addEventListener("progress",EC(e.onUploadProgress)),(e.cancelToken||e.signal)&&(u=S=>{p&&(r(!S||S.type?new Dd(null,e,p):S),p.abort(),p=null)},e.cancelToken&&e.cancelToken.subscribe(u),e.signal&&(e.signal.aborted?u():e.signal.addEventListener("abort",u)));const x=Aj(g);if(x&&Wo.protocols.indexOf(x)===-1){r(new ct("Unsupported protocol "+x+":",ct.ERR_BAD_REQUEST,e));return}p.send(o||null)})},r1={http:oj,xhr:Nj};ae.forEach(r1,(e,t)=>{if(e){try{Object.defineProperty(e,"name",{value:t})}catch{}Object.defineProperty(e,"adapterName",{value:t})}});const CC=e=>`- ${e}`,Dj=e=>ae.isFunction(e)||e===null||e===!1,wP={getAdapter:e=>{e=ae.isArray(e)?e:[e];const{length:t}=e;let n,r;const o={};for(let i=0;i`adapter ${l} `+(u===!1?"is not supported by the environment":"is not available in the build"));let s=t?i.length>1?`since : +`+i.map(CC).join(` +`):" "+CC(i[0]):"as no adapter specified";throw new ct("There is no suitable adapter to dispatch the request "+s,"ERR_NOT_SUPPORT")}return r},adapters:r1};function b0(e){if(e.cancelToken&&e.cancelToken.throwIfRequested(),e.signal&&e.signal.aborted)throw new Dd(null,e)}function $C(e){return b0(e),e.headers=Si.from(e.headers),e.data=x0.call(e,e.transformRequest),["post","put","patch"].indexOf(e.method)!==-1&&e.headers.setContentType("application/x-www-form-urlencoded",!1),wP.getAdapter(e.adapter||cx.adapter)(e).then(function(r){return b0(e),r.data=x0.call(e,e.transformResponse,r),r.headers=Si.from(r.headers),r},function(r){return vP(r)||(b0(e),r&&r.response&&(r.response.data=x0.call(e,e.transformResponse,r.response),r.response.headers=Si.from(r.response.headers))),Promise.reject(r)})}const RC=e=>e instanceof Si?e.toJSON():e;function ac(e,t){t=t||{};const n={};function r(f,m,p){return ae.isPlainObject(f)&&ae.isPlainObject(m)?ae.merge.call({caseless:p},f,m):ae.isPlainObject(m)?ae.merge({},m):ae.isArray(m)?m.slice():m}function o(f,m,p){if(ae.isUndefined(m)){if(!ae.isUndefined(f))return r(void 0,f,p)}else return r(f,m,p)}function i(f,m){if(!ae.isUndefined(m))return r(void 0,m)}function s(f,m){if(ae.isUndefined(m)){if(!ae.isUndefined(f))return r(void 0,f)}else return r(void 0,m)}function l(f,m,p){if(p in t)return r(f,m);if(p in e)return r(void 0,f)}const u={url:i,method:i,data:i,baseURL:s,transformRequest:s,transformResponse:s,paramsSerializer:s,timeout:s,timeoutMessage:s,withCredentials:s,withXSRFToken:s,adapter:s,responseType:s,xsrfCookieName:s,xsrfHeaderName:s,onUploadProgress:s,onDownloadProgress:s,decompress:s,maxContentLength:s,maxBodyLength:s,beforeRedirect:s,transport:s,httpAgent:s,httpsAgent:s,cancelToken:s,socketPath:s,responseEncoding:s,validateStatus:l,headers:(f,m)=>o(RC(f),RC(m),!0)};return ae.forEach(Object.keys(Object.assign({},e,t)),function(m){const p=u[m]||o,g=p(e[m],t[m],m);ae.isUndefined(g)&&p!==l||(n[m]=g)}),n}const xP="1.6.2",ux={};["object","boolean","number","function","string","symbol"].forEach((e,t)=>{ux[e]=function(r){return typeof r===e||"a"+(t<1?"n ":" ")+e}});const PC={};ux.transitional=function(t,n,r){function o(i,s){return"[Axios v"+xP+"] Transitional option '"+i+"'"+s+(r?". "+r:"")}return(i,s,l)=>{if(t===!1)throw new ct(o(s," has been removed"+(n?" 
in "+n:"")),ct.ERR_DEPRECATED);return n&&!PC[s]&&(PC[s]=!0,console.warn(o(s," has been deprecated since v"+n+" and will be removed in the near future"))),t?t(i,s,l):!0}};function Ij(e,t,n){if(typeof e!="object")throw new ct("options must be an object",ct.ERR_BAD_OPTION_VALUE);const r=Object.keys(e);let o=r.length;for(;o-- >0;){const i=r[o],s=t[i];if(s){const l=e[i],u=l===void 0||s(l,i,e);if(u!==!0)throw new ct("option "+i+" must be "+u,ct.ERR_BAD_OPTION_VALUE);continue}if(n!==!0)throw new ct("Unknown option "+i,ct.ERR_BAD_OPTION)}}const o1={assertOptions:Ij,validators:ux},Qi=o1.validators;class Ip{constructor(t){this.defaults=t,this.interceptors={request:new SC,response:new SC}}request(t,n){typeof t=="string"?(n=n||{},n.url=t):n=t||{},n=ac(this.defaults,n);const{transitional:r,paramsSerializer:o,headers:i}=n;r!==void 0&&o1.assertOptions(r,{silentJSONParsing:Qi.transitional(Qi.boolean),forcedJSONParsing:Qi.transitional(Qi.boolean),clarifyTimeoutError:Qi.transitional(Qi.boolean)},!1),o!=null&&(ae.isFunction(o)?n.paramsSerializer={serialize:o}:o1.assertOptions(o,{encode:Qi.function,serialize:Qi.function},!0)),n.method=(n.method||this.defaults.method||"get").toLowerCase();let s=i&&ae.merge(i.common,i[n.method]);i&&ae.forEach(["delete","get","head","post","put","patch","common"],x=>{delete i[x]}),n.headers=Si.concat(s,i);const l=[];let u=!0;this.interceptors.request.forEach(function(S){typeof S.runWhen=="function"&&S.runWhen(n)===!1||(u=u&&S.synchronous,l.unshift(S.fulfilled,S.rejected))});const f=[];this.interceptors.response.forEach(function(S){f.push(S.fulfilled,S.rejected)});let m,p=0,g;if(!u){const x=[$C.bind(this),void 0];for(x.unshift.apply(x,l),x.push.apply(x,f),g=x.length,m=Promise.resolve(n);p{if(!r._listeners)return;let i=r._listeners.length;for(;i-- >0;)r._listeners[i](o);r._listeners=null}),this.promise.then=o=>{let i;const s=new Promise(l=>{r.subscribe(l),i=l}).then(o);return s.cancel=function(){r.unsubscribe(i)},s},t(function(i,s,l){r.reason||(r.reason=new Dd(i,s,l),n(r.reason))})}throwIfRequested(){if(this.reason)throw this.reason}subscribe(t){if(this.reason){t(this.reason);return}this._listeners?this._listeners.push(t):this._listeners=[t]}unsubscribe(t){if(!this._listeners)return;const n=this._listeners.indexOf(t);n!==-1&&this._listeners.splice(n,1)}static source(){let t;return{token:new dx(function(o){t=o}),cancel:t}}}const Lj=dx;function Fj(e){return function(n){return e.apply(null,n)}}function jj(e){return ae.isObject(e)&&e.isAxiosError===!0}const 
i1={Continue:100,SwitchingProtocols:101,Processing:102,EarlyHints:103,Ok:200,Created:201,Accepted:202,NonAuthoritativeInformation:203,NoContent:204,ResetContent:205,PartialContent:206,MultiStatus:207,AlreadyReported:208,ImUsed:226,MultipleChoices:300,MovedPermanently:301,Found:302,SeeOther:303,NotModified:304,UseProxy:305,Unused:306,TemporaryRedirect:307,PermanentRedirect:308,BadRequest:400,Unauthorized:401,PaymentRequired:402,Forbidden:403,NotFound:404,MethodNotAllowed:405,NotAcceptable:406,ProxyAuthenticationRequired:407,RequestTimeout:408,Conflict:409,Gone:410,LengthRequired:411,PreconditionFailed:412,PayloadTooLarge:413,UriTooLong:414,UnsupportedMediaType:415,RangeNotSatisfiable:416,ExpectationFailed:417,ImATeapot:418,MisdirectedRequest:421,UnprocessableEntity:422,Locked:423,FailedDependency:424,TooEarly:425,UpgradeRequired:426,PreconditionRequired:428,TooManyRequests:429,RequestHeaderFieldsTooLarge:431,UnavailableForLegalReasons:451,InternalServerError:500,NotImplemented:501,BadGateway:502,ServiceUnavailable:503,GatewayTimeout:504,HttpVersionNotSupported:505,VariantAlsoNegotiates:506,InsufficientStorage:507,LoopDetected:508,NotExtended:510,NetworkAuthenticationRequired:511};Object.entries(i1).forEach(([e,t])=>{i1[t]=e});const Bj=i1;function bP(e){const t=new Zh(e),n=tP(Zh.prototype.request,t);return ae.extend(n,Zh.prototype,t,{allOwnKeys:!0}),ae.extend(n,t,null,{allOwnKeys:!0}),n.create=function(o){return bP(ac(e,o))},n}const Qt=bP(cx);Qt.Axios=Zh;Qt.CanceledError=Dd;Qt.CancelToken=Lj;Qt.isCancel=vP;Qt.VERSION=xP;Qt.toFormData=Dm;Qt.AxiosError=ct;Qt.Cancel=Qt.CanceledError;Qt.all=function(t){return Promise.all(t)};Qt.spread=Fj;Qt.isAxiosError=jj;Qt.mergeConfig=ac;Qt.AxiosHeaders=Si;Qt.formToJSON=e=>gP(ae.isHTMLForm(e)?new FormData(e):e);Qt.getAdapter=wP.getAdapter;Qt.HttpStatusCode=Bj;Qt.default=Qt;const zj=Qt,Ni="/api/v1",Cc=zj.create({baseURL:Ni});async function Uj(e,t,n,r,o,i=null){const s=await Du(e),l=await Du(o),u=i?await Du(i):null,f=await fetch(`${Ni}/inpaint`,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({image:s,mask:l,ldm_steps:t.ldmSteps,ldm_sampler:t.ldmSampler,zits_wireframe:t.zitsWireframe,cv2_flag:t.cv2Flag,cv2_radius:t.cv2Radius,hd_strategy:"Crop",hd_strategy_crop_triger_size:640,hd_strategy_crop_margin:128,hd_trategy_resize_imit:2048,prompt:t.prompt,negative_prompt:t.negativePrompt,use_croper:t.showCropper,croper_x:n.x,croper_y:n.y,croper_height:n.height,croper_width:n.width,use_extender:t.showExtender,extender_x:r.x,extender_y:r.y,extender_height:r.height,extender_width:r.width,sd_mask_blur:t.sdMaskBlur,sd_strength:t.sdStrength,sd_steps:t.sdSteps,sd_guidance_scale:t.sdGuidanceScale,sd_sampler:t.sdSampler,sd_seed:t.seedFixed?t.seed:-1,sd_match_histograms:t.sdMatchHistograms,sd_lcm_lora:t.enableLCMLora,paint_by_example_example_image:u,p2p_image_guidance_scale:t.p2pImageGuidanceScale,enable_controlnet:t.enableControlnet,controlnet_conditioning_scale:t.controlnetConditioningScale,controlnet_method:t.controlnetMethod?t.controlnetMethod:"",enable_brushnet:t.enableBrushNet,brushnet_method:t.brushnetMethod?t.brushnetMethod:"",brushnet_conditioning_scale:t.brushnetConditioningScale,enable_powerpaint_v2:t.enablePowerPaintV2,powerpaint_task:t.showExtender?ma.outpainting:t.powerpaintTask})});if(f.ok){const p=await f.blob();return{blob:URL.createObjectURL(p),seed:f.headers.get("X-Seed")}}const m=await f.json();throw new Error(`Something went wrong: ${m.errors}`)}async function SP(){return(await Cc.get("/server-config")).data}async function 
Vj(e){return(await Cc.post("/model",{name:e})).data}async function S0(e,t){return Cc.post("/switch_plugin_model",{plugin_name:e,model_name:t})}async function Wj(){return(await Cc.get("/model")).data}async function _P(e,t,n,r,o){const i=await Du(n),l=await fetch(`${Ni}/${e?"run_plugin_gen_mask":"run_plugin_gen_image"}`,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({name:t,image:i,scale:r,clicks:o})});if(l.ok){const f=await l.blob();return{blob:URL.createObjectURL(f)}}const u=await l.json();throw new Error(u)}async function Hj(e,t){const n=await fetch(`${Ni}/media_file?tab=${e}&filename=${encodeURIComponent(t)}`,{method:"GET"});if(n.ok){const o=await n.blob();return new File([o],t,{type:n.headers.get("Content-Type")??"image/png"})}const r=await n.json();throw new Error(r.errors)}async function Kj(e,t){const n=await fetch(`${Ni}/media_file?tab=${e}&filename=${encodeURIComponent(t)}`,{method:"GET"});if(n.ok)return await n.blob();const r=await n.json();throw new Error(r.errors)}async function Gj(e){return(await Cc.get("medias",{params:{tab:e}})).data}async function Yj(e,t,n){const r=await Nu(e.src,t,n),o=new FormData;o.append("file",r);try{const i=await fetch(`${Ni}/save_image`,{method:"POST",body:o});if(!i.ok){const s=await i.text();throw new Error(s)}}catch(i){throw new Error(`Something went wrong: ${i}`)}}async function Xj(e){const t=new FormData;return t.append("file",e),(await Cc.post("/gen-info",t)).data}async function Zj(e,t,n){const r=await Du(e),o=await fetch(`${Ni}/adjust_mask`,{method:"POST",headers:{"Content-Type":"application/json"},body:JSON.stringify({mask:r,operate:t,kernel_size:n})});if(o.ok)return await o.blob();const i=await o.json();throw new Error(i)}function qj(){const[e,t]=d.useState(null),n=d.useCallback(()=>{const r=new Headers;r.append("pragma","no-cache"),r.append("cache-control","no-cache"),fetch(`${Ni}/inputimage`,{headers:r}).then(async o=>{var l;if(!o.ok)return;const i=(l=o.headers.get("content-disposition"))==null?void 0:l.split("filename=")[1].split(";")[0],s=await o.blob();if(s&&s.type.startsWith("image")){const u=new File([s],i!==void 0?i:"inputImage");t(u)}}).catch(o=>{console.log(o)})},[t]);return d.useEffect(()=>{n()},[n]),e}function Co(e,t){if(e==null)return{};var n={},r=Object.keys(e),o,i;for(i=0;i<r.length;i++)o=r[i],!(t.indexOf(o)>=0)&&(n[o]=e[o]);return n}var Qj=["color"],Jj=d.forwardRef(function(e,t){var n=e.color,r=n===void 0?"currentColor":n,o=Co(e,Qj);return d.createElement("svg",Object.assign({width:"15",height:"15",viewBox:"0 0 15 15",fill:"none",xmlns:"http://www.w3.org/2000/svg"},o,{ref:t}),d.createElement("path",{d:"M4.93179 5.43179C4.75605 5.60753 4.75605 5.89245 4.93179 6.06819C5.10753 6.24392 5.39245 6.24392 5.56819 6.06819L7.49999 4.13638L9.43179 6.06819C9.60753 6.24392 9.89245 6.24392 10.0682 6.06819C10.2439 5.89245 10.2439 5.60753 10.0682 5.43179L7.81819 3.18179C7.73379 3.0974 7.61933 3.04999 7.49999 3.04999C7.38064 3.04999 7.26618 3.0974 7.18179 3.18179L4.93179 5.43179ZM10.0682 9.56819C10.2439 9.39245 10.2439 9.10753 10.0682 8.93179C9.89245 8.75606 9.60753 8.75606 9.43179 8.93179L7.49999 10.8636L5.56819 8.93179C5.39245 8.75606 5.10753 8.75606 4.93179 8.93179C4.75605 9.10753 4.75605 9.39245 4.93179 9.56819L7.18179 11.8182C7.35753 11.9939 7.64245 11.9939 7.81819 11.8182L10.0682 9.56819Z",fill:r,fillRule:"evenodd",clipRule:"evenodd"}))}),eB=["color"],EP=d.forwardRef(function(e,t){var n=e.color,r=n===void 0?"currentColor":n,o=Co(e,eB);return d.createElement("svg",Object.assign({width:"15",height:"15",viewBox:"0 0 15 
15",fill:"none",xmlns:"http://www.w3.org/2000/svg"},o,{ref:t}),d.createElement("path",{d:"M11.4669 3.72684C11.7558 3.91574 11.8369 4.30308 11.648 4.59198L7.39799 11.092C7.29783 11.2452 7.13556 11.3467 6.95402 11.3699C6.77247 11.3931 6.58989 11.3355 6.45446 11.2124L3.70446 8.71241C3.44905 8.48022 3.43023 8.08494 3.66242 7.82953C3.89461 7.57412 4.28989 7.55529 4.5453 7.78749L6.75292 9.79441L10.6018 3.90792C10.7907 3.61902 11.178 3.53795 11.4669 3.72684Z",fill:r,fillRule:"evenodd",clipRule:"evenodd"}))}),tB=["color"],nB=d.forwardRef(function(e,t){var n=e.color,r=n===void 0?"currentColor":n,o=Co(e,tB);return d.createElement("svg",Object.assign({width:"15",height:"15",viewBox:"0 0 15 15",fill:"none",xmlns:"http://www.w3.org/2000/svg"},o,{ref:t}),d.createElement("path",{d:"M3.13523 6.15803C3.3241 5.95657 3.64052 5.94637 3.84197 6.13523L7.5 9.56464L11.158 6.13523C11.3595 5.94637 11.6759 5.95657 11.8648 6.15803C12.0536 6.35949 12.0434 6.67591 11.842 6.86477L7.84197 10.6148C7.64964 10.7951 7.35036 10.7951 7.15803 10.6148L3.15803 6.86477C2.95657 6.67591 2.94637 6.35949 3.13523 6.15803Z",fill:r,fillRule:"evenodd",clipRule:"evenodd"}))}),rB=["color"],oB=d.forwardRef(function(e,t){var n=e.color,r=n===void 0?"currentColor":n,o=Co(e,rB);return d.createElement("svg",Object.assign({width:"15",height:"15",viewBox:"0 0 15 15",fill:"none",xmlns:"http://www.w3.org/2000/svg"},o,{ref:t}),d.createElement("path",{d:"M6.1584 3.13508C6.35985 2.94621 6.67627 2.95642 6.86514 3.15788L10.6151 7.15788C10.7954 7.3502 10.7954 7.64949 10.6151 7.84182L6.86514 11.8418C6.67627 12.0433 6.35985 12.0535 6.1584 11.8646C5.95694 11.6757 5.94673 11.3593 6.1356 11.1579L9.565 7.49985L6.1356 3.84182C5.94673 3.64036 5.95694 3.32394 6.1584 3.13508Z",fill:r,fillRule:"evenodd",clipRule:"evenodd"}))}),iB=["color"],sB=d.forwardRef(function(e,t){var n=e.color,r=n===void 0?"currentColor":n,o=Co(e,iB);return d.createElement("svg",Object.assign({width:"15",height:"15",viewBox:"0 0 15 15",fill:"none",xmlns:"http://www.w3.org/2000/svg"},o,{ref:t}),d.createElement("path",{d:"M3.13523 8.84197C3.3241 9.04343 3.64052 9.05363 3.84197 8.86477L7.5 5.43536L11.158 8.86477C11.3595 9.05363 11.6759 9.04343 11.8648 8.84197C12.0536 8.64051 12.0434 8.32409 11.842 8.13523L7.84197 4.38523C7.64964 4.20492 7.35036 4.20492 7.15803 4.38523L3.15803 8.13523C2.95657 8.32409 2.94637 8.64051 3.13523 8.84197Z",fill:r,fillRule:"evenodd",clipRule:"evenodd"}))}),aB=["color"],CP=d.forwardRef(function(e,t){var n=e.color,r=n===void 0?"currentColor":n,o=Co(e,aB);return d.createElement("svg",Object.assign({width:"15",height:"15",viewBox:"0 0 15 15",fill:"none",xmlns:"http://www.w3.org/2000/svg"},o,{ref:t}),d.createElement("path",{d:"M11.7816 4.03157C12.0062 3.80702 12.0062 3.44295 11.7816 3.2184C11.5571 2.99385 11.193 2.99385 10.9685 3.2184L7.50005 6.68682L4.03164 3.2184C3.80708 2.99385 3.44301 2.99385 3.21846 3.2184C2.99391 3.44295 2.99391 3.80702 3.21846 4.03157L6.68688 7.49999L3.21846 10.9684C2.99391 11.193 2.99391 11.557 3.21846 11.7816C3.44301 12.0061 3.80708 12.0061 4.03164 11.7816L7.50005 8.31316L10.9685 11.7816C11.193 12.0061 11.5571 12.0061 11.7816 11.7816C12.0062 11.557 12.0062 11.193 11.7816 10.9684L8.31322 7.49999L11.7816 4.03157Z",fill:r,fillRule:"evenodd",clipRule:"evenodd"}))}),lB=["color"],cB=d.forwardRef(function(e,t){var n=e.color,r=n===void 0?"currentColor":n,o=Co(e,lB);return d.createElement("svg",Object.assign({width:"15",height:"15",viewBox:"0 0 15 15",fill:"none",xmlns:"http://www.w3.org/2000/svg"},o,{ref:t}),d.createElement("path",{d:"M9.875 7.5C9.875 8.81168 
8.81168 9.875 7.5 9.875C6.18832 9.875 5.125 8.81168 5.125 7.5C5.125 6.18832 6.18832 5.125 7.5 5.125C8.81168 5.125 9.875 6.18832 9.875 7.5Z",fill:r}))}),uB=["color"],dB=d.forwardRef(function(e,t){var n=e.color,r=n===void 0?"currentColor":n,o=Co(e,uB);return d.createElement("svg",Object.assign({width:"15",height:"15",viewBox:"0 0 15 15",fill:"none",xmlns:"http://www.w3.org/2000/svg"},o,{ref:t}),d.createElement("path",{d:"M10 6.5C10 8.433 8.433 10 6.5 10C4.567 10 3 8.433 3 6.5C3 4.567 4.567 3 6.5 3C8.433 3 10 4.567 10 6.5ZM9.30884 10.0159C8.53901 10.6318 7.56251 11 6.5 11C4.01472 11 2 8.98528 2 6.5C2 4.01472 4.01472 2 6.5 2C8.98528 2 11 4.01472 11 6.5C11 7.56251 10.6318 8.53901 10.0159 9.30884L12.8536 12.1464C13.0488 12.3417 13.0488 12.6583 12.8536 12.8536C12.6583 13.0488 12.3417 13.0488 12.1464 12.8536L9.30884 10.0159Z",fill:r,fillRule:"evenodd",clipRule:"evenodd"}))}),fB=["color"],hB=d.forwardRef(function(e,t){var n=e.color,r=n===void 0?"currentColor":n,o=Co(e,fB);return d.createElement("svg",Object.assign({width:"15",height:"15",viewBox:"0 0 15 15",fill:"none",xmlns:"http://www.w3.org/2000/svg"},o,{ref:t}),d.createElement("path",{d:"M3.24182 2.32181C3.3919 2.23132 3.5784 2.22601 3.73338 2.30781L12.7334 7.05781C12.8974 7.14436 13 7.31457 13 7.5C13 7.68543 12.8974 7.85564 12.7334 7.94219L3.73338 12.6922C3.5784 12.774 3.3919 12.7687 3.24182 12.6782C3.09175 12.5877 3 12.4252 3 12.25V2.75C3 2.57476 3.09175 2.4123 3.24182 2.32181ZM4 3.57925V11.4207L11.4288 7.5L4 3.57925Z",fill:r,fillRule:"evenodd",clipRule:"evenodd"}))}),pB=["color"],mB=d.forwardRef(function(e,t){var n=e.color,r=n===void 0?"currentColor":n,o=Co(e,pB);return d.createElement("svg",Object.assign({width:"15",height:"15",viewBox:"0 0 15 15",fill:"none",xmlns:"http://www.w3.org/2000/svg"},o,{ref:t}),d.createElement("path",{d:"M7 2H1.5C1.22386 2 1 2.22386 1 2.5V7H7V2ZM8 2V7H14V2.5C14 2.22386 13.7761 2 13.5 2H8ZM7 8H1V12.5C1 12.7761 1.22386 13 1.5 13H7V8ZM8 13V8H14V12.5C14 12.7761 13.7761 13 13.5 13H8ZM1.5 1C0.671573 1 0 1.67157 0 2.5V12.5C0 13.3284 0.671573 14 1.5 14H13.5C14.3284 14 15 13.3284 15 12.5V2.5C15 1.67157 14.3284 1 13.5 1H1.5Z",fill:r,fillRule:"evenodd",clipRule:"evenodd"}))}),gB=["color"],vB=d.forwardRef(function(e,t){var n=e.color,r=n===void 0?"currentColor":n,o=Co(e,gB);return d.createElement("svg",Object.assign({width:"15",height:"15",viewBox:"0 0 15 15",fill:"none",xmlns:"http://www.w3.org/2000/svg"},o,{ref:t}),d.createElement("path",{d:"M1.5 2H13.5C13.7761 2 14 2.22386 14 2.5V7H1V2.5C1 2.22386 1.22386 2 1.5 2ZM1 8V12.5C1 12.7761 1.22386 13 1.5 13H13.5C13.7761 13 14 12.7761 14 12.5V8H1ZM0 2.5C0 1.67157 0.671573 1 1.5 1H13.5C14.3284 1 15 1.67157 15 2.5V12.5C15 13.3284 14.3284 14 13.5 14H1.5C0.671573 14 0 13.3284 0 12.5V2.5Z",fill:r,fillRule:"evenodd",clipRule:"evenodd"}))});function Y(){return Y=Object.assign?Object.assign.bind():function(e){for(var t=1;t<arguments.length;t++){var n=arguments[t];for(var r in n)Object.prototype.hasOwnProperty.call(n,r)&&(e[r]=n[r])}return e},Y.apply(this,arguments)}function yB(e,t){typeof e=="function"?e(t):e!=null&&(e.current=t)}function Lm(...e){return t=>e.forEach(n=>yB(n,t))}function Ve(...e){return d.useCallback(Lm(...e),e)}const Qo=d.forwardRef((e,t)=>{const{children:n,...r}=e,o=d.Children.toArray(n),i=o.find(wB);if(i){const s=i.props.children,l=o.map(u=>u===i?d.Children.count(s)>1?d.Children.only(null):d.isValidElement(s)?s.props.children:null:u);return d.createElement(s1,Y({},r,{ref:t}),d.isValidElement(s)?d.cloneElement(s,void 0,l):null)}return d.createElement(s1,Y({},r,{ref:t}),n)});Qo.displayName="Slot";const s1=d.forwardRef((e,t)=>{const{children:n,...r}=e;return 
d.isValidElement(n)?d.cloneElement(n,{...xB(r,n.props),ref:t?Lm(t,n.ref):n.ref}):d.Children.count(n)>1?d.Children.only(null):null});s1.displayName="SlotClone";const fx=({children:e})=>d.createElement(d.Fragment,null,e);function wB(e){return d.isValidElement(e)&&e.type===fx}function xB(e,t){const n={...t};for(const r in t){const o=e[r],i=t[r];/^on[A-Z]/.test(r)?o&&i?n[r]=(...l)=>{i(...l),o(...l)}:o&&(n[r]=o):r==="style"?n[r]={...o,...i}:r==="className"&&(n[r]=[o,i].filter(Boolean).join(" "))}return{...e,...n}}const TC=e=>typeof e=="boolean"?"".concat(e):e===0?"0":e,kC=KR,Fm=(e,t)=>n=>{var r;if((t==null?void 0:t.variants)==null)return kC(e,n==null?void 0:n.class,n==null?void 0:n.className);const{variants:o,defaultVariants:i}=t,s=Object.keys(o).map(f=>{const m=n==null?void 0:n[f],p=i==null?void 0:i[f];if(m===null)return null;const g=TC(m)||TC(p);return o[f][g]}),l=n&&Object.entries(n).reduce((f,m)=>{let[p,g]=m;return g===void 0||(f[p]=g),f},{}),u=t==null||(r=t.compoundVariants)===null||r===void 0?void 0:r.reduce((f,m)=>{let{class:p,className:g,...y}=m;return Object.entries(y).every(x=>{let[S,E]=x;return Array.isArray(E)?E.includes({...i,...l}[S]):{...i,...l}[S]===E})?[...f,p,g]:f},[]);return kC(e,s,u,n==null?void 0:n.class,n==null?void 0:n.className)};var bB={VITE_BACKEND:"http://127.0.0.1:8080",BASE_URL:"/",MODE:"production",DEV:!1,PROD:!0,SSR:!1};function SB(e,t){let n;try{n=e()}catch{return}return{getItem:o=>{var i;const s=u=>u===null?null:JSON.parse(u,t==null?void 0:t.reviver),l=(i=n.getItem(o))!=null?i:null;return l instanceof Promise?l.then(s):s(l)},setItem:(o,i)=>n.setItem(o,JSON.stringify(i,t==null?void 0:t.replacer)),removeItem:o=>n.removeItem(o)}}const od=e=>t=>{try{const n=e(t);return n instanceof Promise?n:{then(r){return od(r)(n)},catch(r){return this}}}catch(n){return{then(r){return this},catch(r){return od(r)(n)}}}},_B=(e,t)=>(n,r,o)=>{let i={getStorage:()=>localStorage,serialize:JSON.stringify,deserialize:JSON.parse,partialize:E=>E,version:0,merge:(E,_)=>({..._,...E}),...t},s=!1;const l=new Set,u=new Set;let f;try{f=i.getStorage()}catch{}if(!f)return e((...E)=>{console.warn(`[zustand persist middleware] Unable to update item '${i.name}', the given storage is currently unavailable.`),n(...E)},r,o);const m=od(i.serialize),p=()=>{const E=i.partialize({...r()});let _;const b=m({state:E,version:i.version}).then(C=>f.setItem(i.name,C)).catch(C=>{_=C});if(_)throw _;return b},g=o.setState;o.setState=(E,_)=>{g(E,_),p()};const y=e((...E)=>{n(...E),p()},r,o);let x;const S=()=>{var E;if(!f)return;s=!1,l.forEach(b=>b(r()));const _=((E=i.onRehydrateStorage)==null?void 0:E.call(i,r()))||void 0;return od(f.getItem.bind(f))(i.name).then(b=>{if(b)return i.deserialize(b)}).then(b=>{if(b)if(typeof b.version=="number"&&b.version!==i.version){if(i.migrate)return i.migrate(b.state,b.version);console.error("State loaded from storage couldn't be migrated since no migrate function was provided")}else return b.state}).then(b=>{var C;return x=i.merge(b,(C=r())!=null?C:y),n(x,!0),p()}).then(()=>{_==null||_(x,void 0),s=!0,u.forEach(b=>b(x))}).catch(b=>{_==null||_(void 0,b)})};return o.persist={setOptions:E=>{i={...i,...E},E.getStorage&&(f=E.getStorage())},clearStorage:()=>{f==null||f.removeItem(i.name)},getOptions:()=>i,rehydrate:()=>S(),hasHydrated:()=>s,onHydrate:E=>(l.add(E),()=>{l.delete(E)}),onFinishHydration:E=>(u.add(E),()=>{u.delete(E)})},S(),x||y},EB=(e,t)=>(n,r,o)=>{let i={storage:SB(()=>localStorage),partialize:S=>S,version:0,merge:(S,E)=>({...E,...S}),...t},s=!1;const l=new Set,u=new Set;let 
f=i.storage;if(!f)return e((...S)=>{console.warn(`[zustand persist middleware] Unable to update item '${i.name}', the given storage is currently unavailable.`),n(...S)},r,o);const m=()=>{const S=i.partialize({...r()});return f.setItem(i.name,{state:S,version:i.version})},p=o.setState;o.setState=(S,E)=>{p(S,E),m()};const g=e((...S)=>{n(...S),m()},r,o);let y;const x=()=>{var S,E;if(!f)return;s=!1,l.forEach(b=>{var C;return b((C=r())!=null?C:g)});const _=((E=i.onRehydrateStorage)==null?void 0:E.call(i,(S=r())!=null?S:g))||void 0;return od(f.getItem.bind(f))(i.name).then(b=>{if(b)if(typeof b.version=="number"&&b.version!==i.version){if(i.migrate)return i.migrate(b.state,b.version);console.error("State loaded from storage couldn't be migrated since no migrate function was provided")}else return b.state}).then(b=>{var C;return y=i.merge(b,(C=r())!=null?C:g),n(y,!0),m()}).then(()=>{_==null||_(y,void 0),y=r(),s=!0,u.forEach(b=>b(y))}).catch(b=>{_==null||_(void 0,b)})};return o.persist={setOptions:S=>{i={...i,...S},S.storage&&(f=S.storage)},clearStorage:()=>{f==null||f.removeItem(i.name)},getOptions:()=>i,rehydrate:()=>x(),hasHydrated:()=>s,onHydrate:S=>(l.add(S),()=>{l.delete(S)}),onFinishHydration:S=>(u.add(S),()=>{u.delete(S)})},i.skipHydration||x(),y||g},CB=(e,t)=>"getStorage"in t||"serialize"in t||"deserialize"in t?((bB?"production":void 0)!=="production"&&console.warn("[DEPRECATED] `getStorage`, `serialize` and `deserialize` options are deprecated. Use `storage` option instead."),_B(e,t)):EB(e,t),$B=CB;function RB(e,t){if(Object.is(e,t))return!0;if(typeof e!="object"||e===null||typeof t!="object"||t===null)return!1;if(e instanceof Map&&t instanceof Map){if(e.size!==t.size)return!1;for(const[r,o]of e)if(!Object.is(o,t.get(r)))return!1;return!0}if(e instanceof Set&&t instanceof Set){if(e.size!==t.size)return!1;for(const r of e)if(!t.has(r))return!1;return!0}const n=Object.keys(e);if(n.length!==Object.keys(t).length)return!1;for(let r=0;r{t(n,r,e)}):e.forEach((n,r)=>t(r,n,e))}function jm(e){const t=e[xr];return t?t.type_:Array.isArray(e)?1:Bm(e)?2:zm(e)?3:0}function a1(e,t){return jm(e)===2?e.has(t):Object.prototype.hasOwnProperty.call(e,t)}function PP(e,t,n){const r=jm(e);r===2?e.set(t,n):r===3?e.add(n):e[t]=n}function TB(e,t){return e===t?e!==0||1/e===1/t:e!==e&&t!==t}function Bm(e){return e instanceof Map}function zm(e){return e instanceof Set}function ia(e){return e.copy_||e.base_}function l1(e,t){if(Bm(e))return new Map(e);if(zm(e))return new Set(e);if(Array.isArray(e))return Array.prototype.slice.call(e);if(!t&&RP(e))return lc(e)?{...e}:Object.assign(Object.create(null),e);const n=Object.getOwnPropertyDescriptors(e);delete n[xr];let r=Reflect.ownKeys(n);for(let o=0;o1&&(e.set=e.add=e.clear=e.delete=kB),Object.freeze(e),t&&id(e,(n,r)=>hx(r,!0))),e}function kB(){vo(2)}function Um(e){return Object.isFrozen(e)}var AB={};function Da(e){const t=AB[e];return t||vo(0,e),t}var sd;function TP(){return sd}function MB(e,t){return{drafts_:[],parent_:e,immer_:t,canAutoFreeze_:!0,unfinalizedDrafts_:0}}function MC(e,t){t&&(Da("Patches"),e.patches_=[],e.inversePatches_=[],e.patchListener_=t)}function c1(e){u1(e),e.drafts_.forEach(OB),e.drafts_=null}function u1(e){e===sd&&(sd=e.parent_)}function OC(e){return sd=MB(sd,e)}function OB(e){const t=e[xr];t.type_===0||t.type_===1?t.revoke_():t.revoked_=!0}function NC(e,t){t.unfinalizedDrafts_=t.drafts_.length;const n=t.drafts_[0];return e!==void 
0&&e!==n?(n[xr].modified_&&(c1(t),vo(4)),Na(e)&&(e=Lp(t,e),t.parent_||Fp(t,e)),t.patches_&&Da("Patches").generateReplacementPatches_(n[xr].base_,e,t.patches_,t.inversePatches_)):e=Lp(t,n,[]),c1(t),t.patches_&&t.patchListener_(t.patches_,t.inversePatches_),e!==$P?e:void 0}function Lp(e,t,n){if(Um(t))return t;const r=t[xr];if(!r)return id(t,(o,i)=>DC(e,r,t,o,i,n)),t;if(r.scope_!==e)return t;if(!r.modified_)return Fp(e,r.base_,!0),r.base_;if(!r.finalized_){r.finalized_=!0,r.scope_.unfinalizedDrafts_--;const o=r.copy_;let i=o,s=!1;r.type_===3&&(i=new Set(o),o.clear(),s=!0),id(i,(l,u)=>DC(e,r,o,l,u,n,s)),Fp(e,o,!1),n&&e.patches_&&Da("Patches").generatePatches_(r,n,e.patches_,e.inversePatches_)}return r.copy_}function DC(e,t,n,r,o,i,s){if(cc(o)){const l=i&&t&&t.type_!==3&&!a1(t.assigned_,r)?i.concat(r):void 0,u=Lp(e,o,l);if(PP(n,r,u),cc(u))e.canAutoFreeze_=!1;else return}else s&&n.add(o);if(Na(o)&&!Um(o)){if(!e.immer_.autoFreeze_&&e.unfinalizedDrafts_<1)return;Lp(e,o),(!t||!t.scope_.parent_)&&Fp(e,o)}}function Fp(e,t,n=!1){!e.parent_&&e.immer_.autoFreeze_&&e.canAutoFreeze_&&hx(t,n)}function NB(e,t){const n=Array.isArray(e),r={type_:n?1:0,scope_:t?t.scope_:TP(),modified_:!1,finalized_:!1,assigned_:{},parent_:t,base_:e,draft_:null,copy_:null,revoke_:null,isManual_:!1};let o=r,i=px;n&&(o=[r],i=ad);const{revoke:s,proxy:l}=Proxy.revocable(o,i);return r.draft_=l,r.revoke_=s,l}var px={get(e,t){if(t===xr)return e;const n=ia(e);if(!a1(n,t))return DB(e,n,t);const r=n[t];return e.finalized_||!Na(r)?r:r===_0(e.base_,t)?(E0(e),e.copy_[t]=f1(r,e)):r},has(e,t){return t in ia(e)},ownKeys(e){return Reflect.ownKeys(ia(e))},set(e,t,n){const r=kP(ia(e),t);if(r!=null&&r.set)return r.set.call(e.draft_,n),!0;if(!e.modified_){const o=_0(ia(e),t),i=o==null?void 0:o[xr];if(i&&i.base_===n)return e.copy_[t]=n,e.assigned_[t]=!1,!0;if(TB(n,o)&&(n!==void 0||a1(e.base_,t)))return!0;E0(e),d1(e)}return e.copy_[t]===n&&(n!==void 0||t in e.copy_)||Number.isNaN(n)&&Number.isNaN(e.copy_[t])||(e.copy_[t]=n,e.assigned_[t]=!0),!0},deleteProperty(e,t){return _0(e.base_,t)!==void 0||t in e.base_?(e.assigned_[t]=!1,E0(e),d1(e)):delete e.assigned_[t],e.copy_&&delete e.copy_[t],!0},getOwnPropertyDescriptor(e,t){const n=ia(e),r=Reflect.getOwnPropertyDescriptor(n,t);return r&&{writable:!0,configurable:e.type_!==1||t!=="length",enumerable:r.enumerable,value:n[t]}},defineProperty(){vo(11)},getPrototypeOf(e){return lc(e.base_)},setPrototypeOf(){vo(12)}},ad={};id(px,(e,t)=>{ad[e]=function(){return arguments[0]=arguments[0][0],t.apply(this,arguments)}});ad.deleteProperty=function(e,t){return ad.set.call(this,e,t,void 0)};ad.set=function(e,t,n){return px.set.call(this,e[0],t,n,e[0])};function _0(e,t){const n=e[xr];return(n?ia(n):e)[t]}function DB(e,t,n){var o;const r=kP(t,n);return r?"value"in r?r.value:(o=r.get)==null?void 0:o.call(e.draft_):void 0}function kP(e,t){if(!(t in e))return;let n=lc(e);for(;n;){const r=Object.getOwnPropertyDescriptor(n,t);if(r)return r;n=lc(n)}}function d1(e){e.modified_||(e.modified_=!0,e.parent_&&d1(e.parent_))}function E0(e){e.copy_||(e.copy_=l1(e.base_,e.scope_.immer_.useStrictShallowCopy_))}var IB=class{constructor(e){this.autoFreeze_=!0,this.useStrictShallowCopy_=!1,this.produce=(t,n,r)=>{if(typeof t=="function"&&typeof n!="function"){const i=n;n=t;const s=this;return function(u=i,...f){return s.produce(u,m=>n.call(this,m,...f))}}typeof n!="function"&&vo(6),r!==void 0&&typeof r!="function"&&vo(7);let o;if(Na(t)){const i=OC(this),s=f1(t,void 0);let l=!0;try{o=n(s),l=!1}finally{l?c1(i):u1(i)}return 
MC(i,r),NC(o,i)}else if(!t||typeof t!="object"){if(o=n(t),o===void 0&&(o=t),o===$P&&(o=void 0),this.autoFreeze_&&hx(o,!0),r){const i=[],s=[];Da("Patches").generateReplacementPatches_(t,o,i,s),r(i,s)}return o}else vo(1,t)},this.produceWithPatches=(t,n)=>{if(typeof t=="function")return(s,...l)=>this.produceWithPatches(s,u=>t(u,...l));let r,o;return[this.produce(t,n,(s,l)=>{r=s,o=l}),r,o]},typeof(e==null?void 0:e.autoFreeze)=="boolean"&&this.setAutoFreeze(e.autoFreeze),typeof(e==null?void 0:e.useStrictShallowCopy)=="boolean"&&this.setUseStrictShallowCopy(e.useStrictShallowCopy)}createDraft(e){Na(e)||vo(8),cc(e)&&(e=LB(e));const t=OC(this),n=f1(e,void 0);return n[xr].isManual_=!0,u1(t),n}finishDraft(e,t){const n=e&&e[xr];(!n||!n.isManual_)&&vo(9);const{scope_:r}=n;return MC(r,t),NC(void 0,r)}setAutoFreeze(e){this.autoFreeze_=e}setUseStrictShallowCopy(e){this.useStrictShallowCopy_=e}applyPatches(e,t){let n;for(n=t.length-1;n>=0;n--){const o=t[n];if(o.path.length===0&&o.op==="replace"){e=o.value;break}}n>-1&&(t=t.slice(n+1));const r=Da("Patches").applyPatches_;return cc(e)?r(e,t):this.produce(e,o=>r(o,t))}};function f1(e,t){const n=Bm(e)?Da("MapSet").proxyMap_(e,t):zm(e)?Da("MapSet").proxySet_(e,t):NB(e,t);return(t?t.scope_:TP()).drafts_.push(n),n}function LB(e){return cc(e)||vo(10,e),AP(e)}function AP(e){if(!Na(e)||Um(e))return e;const t=e[xr];let n;if(t){if(!t.modified_)return t.base_;t.finalized_=!0,n=l1(e,t.scope_.immer_.useStrictShallowCopy_)}else n=l1(e,!0);return id(n,(r,o)=>{PP(n,r,AP(o))}),t&&(t.finalized_=!1),n}var br=new IB,FB=br.produce;br.produceWithPatches.bind(br);br.setAutoFreeze.bind(br);br.setUseStrictShallowCopy.bind(br);br.applyPatches.bind(br);br.createDraft.bind(br);br.finishDraft.bind(br);const jB=e=>(t,n,r)=>(r.setState=(o,i,...s)=>{const l=typeof o=="function"?FB(o):o;return t(l,i,...s)},e(r.setState,n,r)),BB=jB;var MP={exports:{}},OP={},NP={exports:{}},DP={};/** + * @license React + * use-sync-external-store-shim.production.min.js + * + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. + */var uc=d;function zB(e,t){return e===t&&(e!==0||1/e===1/t)||e!==e&&t!==t}var UB=typeof Object.is=="function"?Object.is:zB,VB=uc.useState,WB=uc.useEffect,HB=uc.useLayoutEffect,KB=uc.useDebugValue;function GB(e,t){var n=t(),r=VB({inst:{value:n,getSnapshot:t}}),o=r[0].inst,i=r[1];return HB(function(){o.value=n,o.getSnapshot=t,C0(o)&&i({inst:o})},[e,n,t]),WB(function(){return C0(o)&&i({inst:o}),e(function(){C0(o)&&i({inst:o})})},[e]),KB(n),n}function C0(e){var t=e.getSnapshot;e=e.value;try{var n=t();return!UB(e,n)}catch{return!0}}function YB(e,t){return t()}var XB=typeof window>"u"||typeof window.document>"u"||typeof window.document.createElement>"u"?YB:GB;DP.useSyncExternalStore=uc.useSyncExternalStore!==void 0?uc.useSyncExternalStore:XB;NP.exports=DP;var ZB=NP.exports;/** + * @license React + * use-sync-external-store-shim/with-selector.production.min.js + * + * Copyright (c) Facebook, Inc. and its affiliates. + * + * This source code is licensed under the MIT license found in the + * LICENSE file in the root directory of this source tree. 
+ */var Vm=d,qB=ZB;function QB(e,t){return e===t&&(e!==0||1/e===1/t)||e!==e&&t!==t}var JB=typeof Object.is=="function"?Object.is:QB,ez=qB.useSyncExternalStore,tz=Vm.useRef,nz=Vm.useEffect,rz=Vm.useMemo,oz=Vm.useDebugValue;OP.useSyncExternalStoreWithSelector=function(e,t,n,r,o){var i=tz(null);if(i.current===null){var s={hasValue:!1,value:null};i.current=s}else s=i.current;i=rz(function(){function u(y){if(!f){if(f=!0,m=y,y=r(y),o!==void 0&&s.hasValue){var x=s.value;if(o(x,y))return p=x}return p=y}if(x=p,JB(m,y))return x;var S=r(y);return o!==void 0&&o(x,S)?x:(m=y,p=S)}var f=!1,m,p,g=n===void 0?null:n;return[function(){return u(t())},g===null?void 0:function(){return u(g())}]},[t,n,r,o]);var l=ez(e,i[0],i[1]);return nz(function(){s.hasValue=!0,s.value=l},[l]),oz(l),l};MP.exports=OP;var iz=MP.exports;const sz=pm(iz);var az={VITE_BACKEND:"http://127.0.0.1:8080",BASE_URL:"/",MODE:"production",DEV:!1,PROD:!0,SSR:!1};const IC=e=>{let t;const n=new Set,r=(u,f)=>{const m=typeof u=="function"?u(t):u;if(!Object.is(m,t)){const p=t;t=f??typeof m!="object"?m:Object.assign({},t,m),n.forEach(g=>g(t,p))}},o=()=>t,l={setState:r,getState:o,subscribe:u=>(n.add(u),()=>n.delete(u)),destroy:()=>{(az?"production":void 0)!=="production"&&console.warn("[DEPRECATED] The `destroy` method will be unsupported in a future version. Instead use unsubscribe function returned by subscribe. Everything will be garbage-collected if store is garbage-collected."),n.clear()}};return t=e(r,o,l),l},lz=e=>e?IC(e):IC,{useDebugValue:cz}=Be,{useSyncExternalStoreWithSelector:uz}=sz;function dz(e,t=e.getState,n){const r=uz(e.subscribe,e.getState,e.getServerState||e.getState,t,n);return cz(r),r}const LC=(e,t)=>{const n=lz(e),r=(o,i=t)=>dz(n,o,i);return Object.assign(r,n),r},fz=(e,t)=>e?LC(e,t):LC,hz=1,pz=1e6;let $0=0;function mz(){return $0=($0+1)%Number.MAX_VALUE,$0.toString()}const R0=new Map,FC=e=>{if(R0.has(e))return;const t=setTimeout(()=>{R0.delete(e),Iu({type:"REMOVE_TOAST",toastId:e})},pz);R0.set(e,t)},gz=(e,t)=>{switch(t.type){case"ADD_TOAST":return{...e,toasts:[t.toast,...e.toasts].slice(0,hz)};case"UPDATE_TOAST":return{...e,toasts:e.toasts.map(n=>n.id===t.toast.id?{...n,...t.toast}:n)};case"DISMISS_TOAST":{const{toastId:n}=t;return n?FC(n):e.toasts.forEach(r=>{FC(r.id)}),{...e,toasts:e.toasts.map(r=>r.id===n||n===void 0?{...r,open:!1}:r)}}case"REMOVE_TOAST":return t.toastId===void 0?{...e,toasts:[]}:{...e,toasts:e.toasts.filter(n=>n.id!==t.toastId)}}},qh=[];let Qh={toasts:[]};function Iu(e){Qh=gz(Qh,e),qh.forEach(t=>{t(Qh)})}function hl({...e}){const t=mz(),n=o=>Iu({type:"UPDATE_TOAST",toast:{...o,id:t}}),r=()=>Iu({type:"DISMISS_TOAST",toastId:t});return Iu({type:"ADD_TOAST",toast:{...e,id:t,open:!0,onOpenChange:o=>{o||r()}}}),{id:t,dismiss:r,update:n}}function Id(){const[e,t]=d.useState(Qh);return d.useEffect(()=>(qh.push(t),()=>{const n=qh.indexOf(t);n>-1&&qh.splice(n,1)}),[e]),{...e,toast:hl,dismiss:n=>Iu({type:"DISMISS_TOAST",toastId:n})}}const 
na={file:null,paintByExampleFile:null,customMask:null,imageHeight:0,imageWidth:0,isInpainting:!1,isPluginRunning:!1,isAdjustingMask:!1,disableShortCuts:!1,windowSize:{height:600,width:800},editorState:{baseBrushSize:uF,brushSizeScale:1,renders:[],extraMasks:[],prevExtraMasks:[],temporaryMasks:[],lineGroups:[],lastLineGroup:[],curLineGroup:[],redoRenders:[],redoCurLines:[],redoLineGroups:[]},interactiveSegState:{isInteractiveSeg:!1,tmpInteractiveSegMask:null,clicks:[]},cropperState:{x:0,y:0,width:512,height:512},extenderState:{x:0,y:0,width:512,height:512},isCropperExtenderResizing:!1,fileManagerState:{sortBy:Ps.CTIME,sortOrder:Ou.DESCENDING,layout:"masonry",searchText:"",inputDirectory:"",outputDirectory:""},serverConfig:{plugins:[],modelInfos:[],removeBGModel:"briaai/RMBG-1.4",removeBGModels:[],realesrganModel:"realesr-general-x4v3",realesrganModels:[],interactiveSegModel:"vit_b",interactiveSegModels:[],enableFileManager:!1,enableAutoSaving:!1,enableControlnet:!1,controlnetMethod:"lllyasviel/control_v11p_sd15_canny",disableModelSwitch:!1,isDesktop:!1,samplers:["DPM++ 2M SDE Karras"]},settings:{model:{name:"lama",path:"lama",model_type:"inpaint",support_controlnet:!1,support_brushnet:!1,support_strength:!1,support_outpainting:!1,support_powerpaint_v2:!1,controlnets:[],brushnets:[],support_lcm_lora:!1,is_single_file_diffusers:!1,need_prompt:!1},showCropper:!1,showExtender:!1,extenderDirection:Ln.xy,enableDownloadMask:!1,enableManualInpainting:!1,enableUploadMask:!1,enableAutoExtractPrompt:!0,ldmSteps:30,ldmSampler:nx.ddim,zitsWireframe:!0,cv2Radius:5,cv2Flag:rx.INPAINT_NS,prompt:"",negativePrompt:mF,seed:42,seedFixed:!1,sdMaskBlur:12,sdStrength:1,sdSteps:50,sdGuidanceScale:7.5,sdSampler:"DPM++ 2M",sdMatchHistograms:!1,sdScale:1,p2pImageGuidanceScale:1.5,enableControlnet:!1,controlnetMethod:"lllyasviel/control_v11p_sd15_canny",controlnetConditioningScale:.4,enableBrushNet:!1,brushnetMethod:"random_mask",brushnetConditioningScale:1,enableLCMLora:!1,enablePowerPaintV2:!1,powerpaintTask:ma.text_guided,adjustMaskKernelSize:12}},xt=fz()($B(BB((e,t)=>({...na,showPrevMask:async()=>{if(t().settings.showExtender)return;const{lastLineGroup:n,curLineGroup:r,prevExtraMasks:o,extraMasks:i}=t().editorState;if(r.length!==0||i.length!==0)return;const{imageWidth:s,imageHeight:l}=t(),u=Gh(s,l,[n],o,Kh);try{const f=await mC(u);e(m=>{m.editorState.temporaryMasks.push(f)})}catch(f){console.error(f);return}},hidePrevMask:()=>{e(n=>{n.editorState.temporaryMasks=[]})},getCurrentTargetFile:async()=>{const n=t().file,r=t().editorState.renders;let o=n;if(r.length>0){const i=r[r.length-1];o=await Nu(i.currentSrc,n.name,n.type)}return o},runInpainting:async()=>{const{isInpainting:n,file:r,paintByExampleFile:o,imageWidth:i,imageHeight:s,settings:l,cropperState:u,extenderState:f}=t();if(n||r===null||t().settings.model.support_outpainting&&l.showExtender&&f.x===0&&f.y===0&&f.height===s&&f.width===i)return;const{lastLineGroup:m,curLineGroup:p,lineGroups:g,renders:y,prevExtraMasks:x,extraMasks:S}=t().editorState,E=p.length===0&&S.length===0&&!l.showExtender;let _=[],b=[];if(E===!0?(b=m,_=x):(b=p,_=S),b.length===0&&_===null&&!l.showExtender){hl({variant:"destructive",description:"Please draw mask on picture"});return}const C=[...g,b];e(O=>{O.isInpainting=!0});let R=r;if(E===!0){if(y.length>1){const O=y[y.length-2];R=await Nu(O.currentSrc,r.name,r.type)}}else if(y.length>0){const O=y[y.length-1];R=await Nu(O.currentSrc,r.name,r.type)}const k=Gh(i,s,[b],_,Kh);if(E){const O=await 
mC(k);e(A=>{A.editorState.temporaryMasks=[O]})}try{const O=await Uj(R,l,u,f,pC(k.toDataURL()),o),{blob:A,seed:I}=O;I&&t().setSeed(parseInt(I,10));const z=new Image;await bu(z,A);const H=[...y,z];t().setImageSize(z.width,z.height),t().updateEditorState({renders:H,lineGroups:C,lastLineGroup:b,curLineGroup:[],extraMasks:[],prevExtraMasks:_})}catch(O){hl({variant:"destructive",description:O.message?O.message:O.toString()})}t().resetRedoState(),e(O=>{O.isInpainting=!1,O.editorState.temporaryMasks=[]})},runRenderablePlugin:async(n,r,o={upscale:1})=>{const{renders:i,lineGroups:s}=t().editorState;e(l=>{l.isPluginRunning=!0});try{const l=new Date,u=await t().getCurrentTargetFile(),f=await _P(n,r,u,o.upscale),{blob:m}=f;if(n){const y=new Image;await bu(y,m),e(x=>{x.editorState.extraMasks.push(y)})}else{const y=new Image;await bu(y,m),t().setImageSize(y.width,y.height);const x=[...i,y],S=[...s,[]];t().updateEditorState({renders:x,lineGroups:S})}const g=new Date().getTime()-l.getTime();hl({description:`Run ${r} successfully in ${g/1e3}s`})}catch(l){hl({variant:"destructive",description:l.message?l.message:l.toString()})}e(l=>{l.isPluginRunning=!1})},updateEditorState:n=>{e(r=>{r.editorState={...r.editorState,...n}})},cleanCurLineGroup:()=>{t().updateEditorState({curLineGroup:[]})},handleCanvasMouseDown:n=>{let r=[];const o=t();o.runMannually()&&(r=[...o.editorState.curLineGroup]),r.push({size:o.getBrushSize(),pts:[n]}),e(i=>{i.editorState.curLineGroup=r})},handleCanvasMouseMove:n=>{e(r=>{const o=r.editorState.curLineGroup;o.length&&o[o.length-1].pts.push(n)})},runMannually:()=>{const n=t();return n.settings.enableManualInpainting||n.settings.model.model_type!==ms},getIsProcessing:()=>t().isInpainting||t().isPluginRunning||t().isAdjustingMask,isSD:()=>t().settings.model.model_type!==ms,undoDisabled:()=>{const n=t().editorState;if(n.renders.length>0)return!1;if(t().runMannually()){if(n.curLineGroup.length===0)return!0}else if(n.renders.length===0)return!0;return!1},undo:()=>{t().runMannually()&&t().editorState.curLineGroup.length!==0?e(n=>{const r=n.editorState;if(r.curLineGroup.length===0)return;r.lastLineGroup=[];const o=r.curLineGroup.pop();r.redoCurLines.push(o)}):e(n=>{const r=n.editorState;if(r.renders.length===0||r.lineGroups.length===0)return;const o=r.lineGroups.pop();r.redoLineGroups.push(o),r.redoCurLines=[],r.curLineGroup=[];const i=r.renders.pop();r.redoRenders.push(i)})},redoDisabled:()=>{const n=t().editorState;if(n.redoRenders.length>0)return!1;if(t().runMannually()){if(n.redoCurLines.length===0)return!0}else if(n.redoRenders.length===0)return!0;return!1},redo:()=>{t().runMannually()&&t().editorState.redoCurLines.length!==0?e(n=>{const r=n.editorState;if(r.redoCurLines.length===0)return;const o=r.redoCurLines.pop();r.curLineGroup.push(o)}):e(n=>{const r=n.editorState;if(r.redoRenders.length===0||r.redoLineGroups.length===0)return;const o=r.redoLineGroups.pop();r.lineGroups.push(o),r.curLineGroup=[];const i=r.redoRenders.pop();r.renders.push(i)})},resetRedoState:()=>{e(n=>{n.editorState.redoCurLines=[],n.editorState.redoLineGroups=[],n.editorState.redoRenders=[]})},updateAppState:n=>{e(()=>n)},getBrushSize:()=>t().editorState.baseBrushSize*t().editorState.brushSizeScale,showPromptInput:()=>{const n=t().settings.model;return 
n.model_type!==ms&&n.name!==JR},setServerConfig:n=>{e(r=>{r.serverConfig=n,r.settings.enableControlnet=n.enableControlnet,r.settings.controlnetMethod=n.controlnetMethod})},updateSettings:n=>{e(r=>{r.settings={...r.settings,...n}})},updateEnablePowerPaintV2:n=>{t().updateSettings({enablePowerPaintV2:n}),n&&t().updateSettings({enableBrushNet:!1,enableControlnet:!1,enableLCMLora:!1})},updateEnableBrushNet:n=>{t().updateSettings({enableBrushNet:n}),n&&t().updateSettings({enablePowerPaintV2:!1,enableControlnet:!1,enableLCMLora:!1})},updateEnableControlnet(n){t().updateSettings({enableControlnet:n}),n&&t().updateSettings({enablePowerPaintV2:!1,enableBrushNet:!1})},updateLCMLora(n){t().updateSettings({enableLCMLora:n}),n&&t().updateSettings({enablePowerPaintV2:!1,enableBrushNet:!1})},setModel:n=>{e(r=>{r.settings.model=n,n.support_controlnet&&!n.controlnets.includes(r.settings.controlnetMethod)&&(r.settings.controlnetMethod=n.controlnets[0])})},updateFileManagerState:n=>{e(r=>{r.fileManagerState={...r.fileManagerState,...n}})},updateInteractiveSegState:n=>{e(r=>({...r,interactiveSegState:{...r.interactiveSegState,...n}}))},resetInteractiveSegState:()=>{t().updateInteractiveSegState(na.interactiveSegState)},handleInteractiveSegAccept:()=>{e(n=>{n.interactiveSegState.tmpInteractiveSegMask&&n.editorState.extraMasks.push(n.interactiveSegState.tmpInteractiveSegMask),n.interactiveSegState={...na.interactiveSegState}})},handleFileManagerMaskSelect:async n=>{const r=new Image;await bu(r,URL.createObjectURL(n)),e(o=>{o.editorState.extraMasks.push(r)}),t().runInpainting()},setIsInpainting:n=>e(r=>{r.isInpainting=n}),setFile:async n=>{if(t().settings.enableAutoExtractPrompt)try{const r=await Xj(n);r.prompt&&e(o=>{o.settings.prompt=r.prompt}),r.negative_prompt&&e(o=>{o.settings.negativePrompt=r.negative_prompt})}catch(r){hl({variant:"destructive",description:r.message?r.message:r.toString()})}e(r=>{r.file=n,r.interactiveSegState=na.interactiveSegState,r.editorState=na.editorState,r.cropperState=na.cropperState})},setCustomFile:n=>e(r=>{r.customMask=n}),setBaseBrushSize:n=>e(r=>{r.editorState.baseBrushSize=n}),decreaseBaseBrushSize:()=>{const n=t().editorState.baseBrushSize;let r=n;n>10&&(r=n-10),n<=10&&n>0&&(r=n-3),t().setBaseBrushSize(r)},increaseBaseBrushSize:()=>{const n=t().editorState.baseBrushSize,r=Math.min(n+10,QR);t().setBaseBrushSize(r)},setImageSize:(n,r)=>{e(o=>{o.imageWidth=n,o.imageHeight=r,o.editorState.brushSizeScale=Math.max(Math.min(n,r),512)/512}),t().resetExtender(n,r)},setCropperX:n=>e(r=>{r.cropperState.x=n}),setCropperY:n=>e(r=>{r.cropperState.y=n}),setCropperWidth:n=>e(r=>{r.cropperState.width=n}),setCropperHeight:n=>e(r=>{r.cropperState.height=n}),setExtenderX:n=>e(r=>{r.extenderState.x=n}),setExtenderY:n=>e(r=>{r.extenderState.y=n}),setExtenderWidth:n=>e(r=>{r.extenderState.width=n}),setExtenderHeight:n=>e(r=>{r.extenderState.height=n}),setIsCropperExtenderResizing:n=>e(r=>{r.isCropperExtenderResizing=n}),updateExtenderDirection:n=>{console.log(`updateExtenderDirection: ${JSON.stringify(t().extenderState)}`),e(r=>{r.settings.extenderDirection=n,r.extenderState.x=0,r.extenderState.y=0,r.extenderState.width=r.imageWidth,r.extenderState.height=r.imageHeight}),t().updateExtenderByBuiltIn(n,1.5)},updateExtenderByBuiltIn:(n,r)=>{const o={...na.extenderState};let{x:i,y:s,width:l,height:u}=o;const{imageWidth:f,imageHeight:m}=t();switch(l=f,u=m,n){case Ln.x:i=-Math.ceil(f*(r-1)/2),l=Math.ceil(f*r);break;case Ln.y:s=-Math.ceil(m*(r-1)/2),u=Math.ceil(m*r);break;case 
Ln.xy:i=-Math.ceil(f*(r-1)/2),s=-Math.ceil(m*(r-1)/2),l=Math.ceil(f*r),u=Math.ceil(m*r);break}e(p=>{p.extenderState.x=i,p.extenderState.y=s,p.extenderState.width=l,p.extenderState.height=u})},resetExtender:(n,r)=>{e(o=>{o.extenderState.x=0,o.extenderState.y=0,o.extenderState.width=n,o.extenderState.height=r})},setSeed:n=>e(r=>{r.settings.seed=n}),adjustMask:async n=>{const{imageWidth:r,imageHeight:o}=t(),{curLineGroup:i,extraMasks:s}=t().editorState,{adjustMaskKernelSize:l}=t().settings;if(i.length===0&&s.length===0)return;e(g=>{g.isAdjustingMask=!0});const u=Gh(r,o,[i],s,Kh),f=pC(u.toDataURL()),m=await Zj(f,n,l),p=await vF(m);e(g=>{g.editorState.extraMasks=[p],g.editorState.curLineGroup=[]}),e(g=>{g.isAdjustingMask=!1})},clearMask:()=>{e(n=>{n.editorState.extraMasks=[],n.editorState.curLineGroup=[]})}})),{name:"ZUSTAND_STATE",version:2,partialize:e=>Object.fromEntries(Object.entries(e).filter(([t])=>["fileManagerState","settings"].includes(t)))}),RB),Wm=d.forwardRef(({className:e,type:t,...n},r)=>{const o=xt(l=>l.updateAppState),i=()=>{o({disableShortCuts:!0})},s=()=>{o({disableShortCuts:!1})};return v.jsx("input",{type:t,className:xe("flex h-8 w-full rounded-md border border-input bg-transparent px-3 py-1 text-sm shadow-sm transition-colors file:border-0 file:bg-transparent file:text-sm file:font-medium placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:cursor-not-allowed disabled:opacity-50",e),ref:r,autoComplete:"off",tabIndex:-1,onFocus:i,onBlur:s,...n})});Wm.displayName="Input";const No=d.forwardRef(({numberValue:e,allowFloat:t,onNumberValueChange:n,className:r,...o},i)=>{const[s,l]=d.useState(e.toString());d.useEffect(()=>{s!==e.toString()+"."&&l(e.toString())},[e]);const u=f=>{let p=f.target.value;if(t){if(p=p.replace(/[^0-9.]/g,"").replace(/(\..*?)\..*/g,"$1"),p.length===0){n(0);return}n(parseFloat(p))}else{if(p=p.replace(/\D/g,""),p.length===0){n(0);return}n(parseInt(p,10))}l(p)};return v.jsx(Wm,{ref:i,value:s,onInput:u,className:xe("text-center h-7 px-1",r),...o})});function fe(e,t,{checkForDefaultPrevented:n=!0}={}){return function(o){if(e==null||e(o),n===!1||!o.defaultPrevented)return t==null?void 0:t(o)}}function vz(e,t){const n=d.createContext(t);function r(i){const{children:s,...l}=i,u=d.useMemo(()=>l,Object.values(l));return d.createElement(n.Provider,{value:u},s)}function o(i){const s=d.useContext(n);if(s)return s;if(t!==void 0)return t;throw new Error(`\`${i}\` must be used within \`${e}\``)}return r.displayName=e+"Provider",[r,o]}function Tn(e,t=[]){let n=[];function r(i,s){const l=d.createContext(s),u=n.length;n=[...n,s];function f(p){const{scope:g,children:y,...x}=p,S=(g==null?void 0:g[e][u])||l,E=d.useMemo(()=>x,Object.values(x));return d.createElement(S.Provider,{value:E},y)}function m(p,g){const y=(g==null?void 0:g[e][u])||l,x=d.useContext(y);if(x)return x;if(s!==void 0)return s;throw new Error(`\`${p}\` must be used within \`${i}\``)}return f.displayName=i+"Provider",[f,m]}const o=()=>{const i=n.map(s=>d.createContext(s));return function(l){const u=(l==null?void 0:l[e])||i;return d.useMemo(()=>({[`__scope${e}`]:{...l,[e]:u}}),[l,u])}};return o.scopeName=e,[r,yz(o,...t)]}function yz(...e){const t=e[0];if(e.length===1)return t;const n=()=>{const r=e.map(o=>({useScope:o(),scopeName:o.scopeName}));return function(i){const s=r.reduce((l,{useScope:u,scopeName:f})=>{const p=u(i)[`__scope${f}`];return{...l,...p}},{});return d.useMemo(()=>({[`__scope${t.scopeName}`]:s}),[s])}};return 
n.scopeName=t.scopeName,n}const wz=["a","button","div","form","h2","h3","img","input","label","li","nav","ol","p","span","svg","ul"],Ae=wz.reduce((e,t)=>{const n=d.forwardRef((r,o)=>{const{asChild:i,...s}=r,l=i?Qo:t;return d.useEffect(()=>{window[Symbol.for("radix-ui")]=!0},[]),d.createElement(l,Y({},s,{ref:o}))});return n.displayName=`Primitive.${t}`,{...e,[t]:n}},{});function mx(e,t){e&&Bs.flushSync(()=>e.dispatchEvent(t))}function Lt(e){const t=d.useRef(e);return d.useEffect(()=>{t.current=e}),d.useMemo(()=>(...n)=>{var r;return(r=t.current)===null||r===void 0?void 0:r.call(t,...n)},[])}function xz(e,t=globalThis==null?void 0:globalThis.document){const n=Lt(e);d.useEffect(()=>{const r=o=>{o.key==="Escape"&&n(o)};return t.addEventListener("keydown",r),()=>t.removeEventListener("keydown",r)},[n,t])}const h1="dismissableLayer.update",bz="dismissableLayer.pointerDownOutside",Sz="dismissableLayer.focusOutside";let jC;const IP=d.createContext({layers:new Set,layersWithOutsidePointerEventsDisabled:new Set,branches:new Set}),$c=d.forwardRef((e,t)=>{var n;const{disableOutsidePointerEvents:r=!1,onEscapeKeyDown:o,onPointerDownOutside:i,onFocusOutside:s,onInteractOutside:l,onDismiss:u,...f}=e,m=d.useContext(IP),[p,g]=d.useState(null),y=(n=p==null?void 0:p.ownerDocument)!==null&&n!==void 0?n:globalThis==null?void 0:globalThis.document,[,x]=d.useState({}),S=Ve(t,I=>g(I)),E=Array.from(m.layers),[_]=[...m.layersWithOutsidePointerEventsDisabled].slice(-1),b=E.indexOf(_),C=p?E.indexOf(p):-1,R=m.layersWithOutsidePointerEventsDisabled.size>0,k=C>=b,O=Ez(I=>{const z=I.target,H=[...m.branches].some(ie=>ie.contains(z));!k||H||(i==null||i(I),l==null||l(I),I.defaultPrevented||u==null||u())},y),A=Cz(I=>{const z=I.target;[...m.branches].some(ie=>ie.contains(z))||(s==null||s(I),l==null||l(I),I.defaultPrevented||u==null||u())},y);return xz(I=>{C===m.layers.size-1&&(o==null||o(I),!I.defaultPrevented&&u&&(I.preventDefault(),u()))},y),d.useEffect(()=>{if(p)return r&&(m.layersWithOutsidePointerEventsDisabled.size===0&&(jC=y.body.style.pointerEvents,y.body.style.pointerEvents="none"),m.layersWithOutsidePointerEventsDisabled.add(p)),m.layers.add(p),BC(),()=>{r&&m.layersWithOutsidePointerEventsDisabled.size===1&&(y.body.style.pointerEvents=jC)}},[p,y,r,m]),d.useEffect(()=>()=>{p&&(m.layers.delete(p),m.layersWithOutsidePointerEventsDisabled.delete(p),BC())},[p,m]),d.useEffect(()=>{const I=()=>x({});return document.addEventListener(h1,I),()=>document.removeEventListener(h1,I)},[]),d.createElement(Ae.div,Y({},f,{ref:S,style:{pointerEvents:R?k?"auto":"none":void 0,...e.style},onFocusCapture:fe(e.onFocusCapture,A.onFocusCapture),onBlurCapture:fe(e.onBlurCapture,A.onBlurCapture),onPointerDownCapture:fe(e.onPointerDownCapture,O.onPointerDownCapture)}))}),_z=d.forwardRef((e,t)=>{const n=d.useContext(IP),r=d.useRef(null),o=Ve(t,r);return d.useEffect(()=>{const i=r.current;if(i)return n.branches.add(i),()=>{n.branches.delete(i)}},[n.branches]),d.createElement(Ae.div,Y({},e,{ref:o}))});function Ez(e,t=globalThis==null?void 0:globalThis.document){const n=Lt(e),r=d.useRef(!1),o=d.useRef(()=>{});return d.useEffect(()=>{const i=l=>{if(l.target&&!r.current){let m=function(){LP(bz,n,f,{discrete:!0})};var u=m;const f={originalEvent:l};l.pointerType==="touch"?(t.removeEventListener("click",o.current),o.current=m,t.addEventListener("click",o.current,{once:!0})):m()}else 
t.removeEventListener("click",o.current);r.current=!1},s=window.setTimeout(()=>{t.addEventListener("pointerdown",i)},0);return()=>{window.clearTimeout(s),t.removeEventListener("pointerdown",i),t.removeEventListener("click",o.current)}},[t,n]),{onPointerDownCapture:()=>r.current=!0}}function Cz(e,t=globalThis==null?void 0:globalThis.document){const n=Lt(e),r=d.useRef(!1);return d.useEffect(()=>{const o=i=>{i.target&&!r.current&&LP(Sz,n,{originalEvent:i},{discrete:!1})};return t.addEventListener("focusin",o),()=>t.removeEventListener("focusin",o)},[t,n]),{onFocusCapture:()=>r.current=!0,onBlurCapture:()=>r.current=!1}}function BC(){const e=new CustomEvent(h1);document.dispatchEvent(e)}function LP(e,t,n,{discrete:r}){const o=n.originalEvent.target,i=new CustomEvent(e,{bubbles:!1,cancelable:!0,detail:n});t&&o.addEventListener(e,t,{once:!0}),r?mx(o,i):o.dispatchEvent(i)}const $z=$c,Rz=_z,Pn=globalThis!=null&&globalThis.document?d.useLayoutEffect:()=>{},Pz=V9.useId||(()=>{});let Tz=0;function tr(e){const[t,n]=d.useState(Pz());return Pn(()=>{e||n(r=>r??String(Tz++))},[e]),e||(t?`radix-${t}`:"")}const kz=["top","right","bottom","left"],Os=Math.min,pr=Math.max,jp=Math.round,mh=Math.floor,Ns=e=>({x:e,y:e}),Az={left:"right",right:"left",bottom:"top",top:"bottom"},Mz={start:"end",end:"start"};function p1(e,t,n){return pr(e,Os(t,n))}function Ri(e,t){return typeof e=="function"?e(t):e}function Pi(e){return e.split("-")[0]}function Rc(e){return e.split("-")[1]}function gx(e){return e==="x"?"y":"x"}function vx(e){return e==="y"?"height":"width"}function Pc(e){return["top","bottom"].includes(Pi(e))?"y":"x"}function yx(e){return gx(Pc(e))}function Oz(e,t,n){n===void 0&&(n=!1);const r=Rc(e),o=yx(e),i=vx(o);let s=o==="x"?r===(n?"end":"start")?"right":"left":r==="start"?"bottom":"top";return t.reference[i]>t.floating[i]&&(s=Bp(s)),[s,Bp(s)]}function Nz(e){const t=Bp(e);return[m1(e),t,m1(t)]}function m1(e){return e.replace(/start|end/g,t=>Mz[t])}function Dz(e,t,n){const r=["left","right"],o=["right","left"],i=["top","bottom"],s=["bottom","top"];switch(e){case"top":case"bottom":return n?t?o:r:t?r:o;case"left":case"right":return t?i:s;default:return[]}}function Iz(e,t,n,r){const o=Rc(e);let i=Dz(Pi(e),n==="start",r);return o&&(i=i.map(s=>s+"-"+o),t&&(i=i.concat(i.map(m1)))),i}function Bp(e){return e.replace(/left|right|bottom|top/g,t=>Az[t])}function Lz(e){return{top:0,right:0,bottom:0,left:0,...e}}function FP(e){return typeof e!="number"?Lz(e):{top:e,right:e,bottom:e,left:e}}function zp(e){return{...e,top:e.y,left:e.x,right:e.x+e.width,bottom:e.y+e.height}}function zC(e,t,n){let{reference:r,floating:o}=e;const i=Pc(t),s=yx(t),l=vx(s),u=Pi(t),f=i==="y",m=r.x+r.width/2-o.width/2,p=r.y+r.height/2-o.height/2,g=r[l]/2-o[l]/2;let y;switch(u){case"top":y={x:m,y:r.y-o.height};break;case"bottom":y={x:m,y:r.y+r.height};break;case"right":y={x:r.x+r.width,y:p};break;case"left":y={x:r.x-o.width,y:p};break;default:y={x:r.x,y:r.y}}switch(Rc(t)){case"start":y[s]-=g*(n&&f?-1:1);break;case"end":y[s]+=g*(n&&f?-1:1);break}return y}const Fz=async(e,t,n)=>{const{placement:r="bottom",strategy:o="absolute",middleware:i=[],platform:s}=n,l=i.filter(Boolean),u=await(s.isRTL==null?void 0:s.isRTL(t));let f=await s.getElementRects({reference:e,floating:t,strategy:o}),{x:m,y:p}=zC(f,r,u),g=r,y={},x=0;for(let S=0;S({name:"arrow",options:e,async fn(t){const{x:n,y:r,placement:o,rects:i,platform:s,elements:l,middlewareData:u}=t,{element:f,padding:m=0}=Ri(e,t)||{};if(f==null)return{};const p=FP(m),g={x:n,y:r},y=yx(o),x=vx(y),S=await 
s.getDimensions(f),E=y==="y",_=E?"top":"left",b=E?"bottom":"right",C=E?"clientHeight":"clientWidth",R=i.reference[x]+i.reference[y]-g[y]-i.floating[x],k=g[y]-i.reference[y],O=await(s.getOffsetParent==null?void 0:s.getOffsetParent(f));let A=O?O[C]:0;(!A||!await(s.isElement==null?void 0:s.isElement(O)))&&(A=l.floating[C]||i.floating[x]);const I=R/2-k/2,z=A/2-S[x]/2-1,H=Os(p[_],z),ie=Os(p[b],z),K=H,te=A-S[x]-ie,U=A/2-S[x]/2+I,re=p1(K,U,te),V=!u.arrow&&Rc(o)!=null&&U!=re&&i.reference[x]/2-(UK<=0)){var z,H;const K=(((z=i.flip)==null?void 0:z.index)||0)+1,te=k[K];if(te)return{data:{index:K,overflows:I},reset:{placement:te}};let U=(H=I.filter(re=>re.overflows[0]<=0).sort((re,V)=>re.overflows[1]-V.overflows[1])[0])==null?void 0:H.placement;if(!U)switch(y){case"bestFit":{var ie;const re=(ie=I.map(V=>[V.placement,V.overflows.filter(J=>J>0).reduce((J,G)=>J+G,0)]).sort((V,J)=>V[1]-J[1])[0])==null?void 0:ie[0];re&&(U=re);break}case"initialPlacement":U=l;break}if(o!==U)return{reset:{placement:U}}}return{}}}};function VC(e,t){return{top:e.top-t.height,right:e.right-t.width,bottom:e.bottom-t.height,left:e.left-t.width}}function WC(e){return kz.some(t=>e[t]>=0)}const Bz=function(e){return e===void 0&&(e={}),{name:"hide",options:e,async fn(t){const{rects:n}=t,{strategy:r="referenceHidden",...o}=Ri(e,t);switch(r){case"referenceHidden":{const i=await ld(t,{...o,elementContext:"reference"}),s=VC(i,n.reference);return{data:{referenceHiddenOffsets:s,referenceHidden:WC(s)}}}case"escaped":{const i=await ld(t,{...o,altBoundary:!0}),s=VC(i,n.floating);return{data:{escapedOffsets:s,escaped:WC(s)}}}default:return{}}}}};async function zz(e,t){const{placement:n,platform:r,elements:o}=e,i=await(r.isRTL==null?void 0:r.isRTL(o.floating)),s=Pi(n),l=Rc(n),u=Pc(n)==="y",f=["left","top"].includes(s)?-1:1,m=i&&u?-1:1,p=Ri(t,e);let{mainAxis:g,crossAxis:y,alignmentAxis:x}=typeof p=="number"?{mainAxis:p,crossAxis:0,alignmentAxis:null}:{mainAxis:0,crossAxis:0,alignmentAxis:null,...p};return l&&typeof x=="number"&&(y=l==="end"?x*-1:x),u?{x:y*m,y:g*f}:{x:g*f,y:y*m}}const Uz=function(e){return e===void 0&&(e=0),{name:"offset",options:e,async fn(t){const{x:n,y:r}=t,o=await zz(t,e);return{x:n+o.x,y:r+o.y,data:o}}}},Vz=function(e){return e===void 0&&(e={}),{name:"shift",options:e,async fn(t){const{x:n,y:r,placement:o}=t,{mainAxis:i=!0,crossAxis:s=!1,limiter:l={fn:E=>{let{x:_,y:b}=E;return{x:_,y:b}}},...u}=Ri(e,t),f={x:n,y:r},m=await ld(t,u),p=Pc(Pi(o)),g=gx(p);let y=f[g],x=f[p];if(i){const E=g==="y"?"top":"left",_=g==="y"?"bottom":"right",b=y+m[E],C=y-m[_];y=p1(b,y,C)}if(s){const E=p==="y"?"top":"left",_=p==="y"?"bottom":"right",b=x+m[E],C=x-m[_];x=p1(b,x,C)}const S=l.fn({...t,[g]:y,[p]:x});return{...S,data:{x:S.x-n,y:S.y-r}}}}},Wz=function(e){return e===void 0&&(e={}),{options:e,fn(t){const{x:n,y:r,placement:o,rects:i,middlewareData:s}=t,{offset:l=0,mainAxis:u=!0,crossAxis:f=!0}=Ri(e,t),m={x:n,y:r},p=Pc(o),g=gx(p);let y=m[g],x=m[p];const S=Ri(l,t),E=typeof S=="number"?{mainAxis:S,crossAxis:0}:{mainAxis:0,crossAxis:0,...S};if(u){const C=g==="y"?"height":"width",R=i.reference[g]-i.floating[C]+E.mainAxis,k=i.reference[g]+i.reference[C]-E.mainAxis;yk&&(y=k)}if(f){var _,b;const C=g==="y"?"width":"height",R=["top","left"].includes(Pi(o)),k=i.reference[p]-i.floating[C]+(R&&((_=s.offset)==null?void 0:_[p])||0)+(R?0:E.crossAxis),O=i.reference[p]+i.reference[C]+(R?0:((b=s.offset)==null?void 0:b[p])||0)-(R?E.crossAxis:0);xO&&(x=O)}return{[g]:y,[p]:x}}}},Hz=function(e){return e===void 0&&(e={}),{name:"size",options:e,async 
fn(t){const{placement:n,rects:r,platform:o,elements:i}=t,{apply:s=()=>{},...l}=Ri(e,t),u=await ld(t,l),f=Pi(n),m=Rc(n),p=Pc(n)==="y",{width:g,height:y}=r.floating;let x,S;f==="top"||f==="bottom"?(x=f,S=m===(await(o.isRTL==null?void 0:o.isRTL(i.floating))?"start":"end")?"left":"right"):(S=f,x=m==="end"?"top":"bottom");const E=y-u[x],_=g-u[S],b=!t.middlewareData.shift;let C=E,R=_;if(p){const O=g-u.left-u.right;R=m||b?Os(_,O):O}else{const O=y-u.top-u.bottom;C=m||b?Os(E,O):O}if(b&&!m){const O=pr(u.left,0),A=pr(u.right,0),I=pr(u.top,0),z=pr(u.bottom,0);p?R=g-2*(O!==0||A!==0?O+A:pr(u.left,u.right)):C=y-2*(I!==0||z!==0?I+z:pr(u.top,u.bottom))}await s({...t,availableWidth:R,availableHeight:C});const k=await o.getDimensions(i.floating);return g!==k.width||y!==k.height?{reset:{rects:!0}}:{}}}};function Ds(e){return jP(e)?(e.nodeName||"").toLowerCase():"#document"}function yr(e){var t;return(e==null||(t=e.ownerDocument)==null?void 0:t.defaultView)||window}function Di(e){var t;return(t=(jP(e)?e.ownerDocument:e.document)||window.document)==null?void 0:t.documentElement}function jP(e){return e instanceof Node||e instanceof yr(e).Node}function Ti(e){return e instanceof Element||e instanceof yr(e).Element}function Jo(e){return e instanceof HTMLElement||e instanceof yr(e).HTMLElement}function HC(e){return typeof ShadowRoot>"u"?!1:e instanceof ShadowRoot||e instanceof yr(e).ShadowRoot}function Ld(e){const{overflow:t,overflowX:n,overflowY:r,display:o}=Jr(e);return/auto|scroll|overlay|hidden|clip/.test(t+r+n)&&!["inline","contents"].includes(o)}function Kz(e){return["table","td","th"].includes(Ds(e))}function wx(e){const t=xx(),n=Jr(e);return n.transform!=="none"||n.perspective!=="none"||(n.containerType?n.containerType!=="normal":!1)||!t&&(n.backdropFilter?n.backdropFilter!=="none":!1)||!t&&(n.filter?n.filter!=="none":!1)||["transform","perspective","filter"].some(r=>(n.willChange||"").includes(r))||["paint","layout","strict","content"].some(r=>(n.contain||"").includes(r))}function Gz(e){let t=dc(e);for(;Jo(t)&&!Hm(t);){if(wx(t))return t;t=dc(t)}return null}function xx(){return typeof CSS>"u"||!CSS.supports?!1:CSS.supports("-webkit-backdrop-filter","none")}function Hm(e){return["html","body","#document"].includes(Ds(e))}function Jr(e){return yr(e).getComputedStyle(e)}function Km(e){return Ti(e)?{scrollLeft:e.scrollLeft,scrollTop:e.scrollTop}:{scrollLeft:e.pageXOffset,scrollTop:e.pageYOffset}}function dc(e){if(Ds(e)==="html")return e;const t=e.assignedSlot||e.parentNode||HC(e)&&e.host||Di(e);return HC(t)?t.host:t}function BP(e){const t=dc(e);return Hm(t)?e.ownerDocument?e.ownerDocument.body:e.body:Jo(t)&&Ld(t)?t:BP(t)}function cd(e,t,n){var r;t===void 0&&(t=[]),n===void 0&&(n=!0);const o=BP(e),i=o===((r=e.ownerDocument)==null?void 0:r.body),s=yr(o);return i?t.concat(s,s.visualViewport||[],Ld(o)?o:[],s.frameElement&&n?cd(s.frameElement):[]):t.concat(o,cd(o,[],n))}function zP(e){const t=Jr(e);let n=parseFloat(t.width)||0,r=parseFloat(t.height)||0;const o=Jo(e),i=o?e.offsetWidth:n,s=o?e.offsetHeight:r,l=jp(n)!==i||jp(r)!==s;return l&&(n=i,r=s),{width:n,height:r,$:l}}function bx(e){return Ti(e)?e:e.contextElement}function Il(e){const t=bx(e);if(!Jo(t))return Ns(1);const n=t.getBoundingClientRect(),{width:r,height:o,$:i}=zP(t);let s=(i?jp(n.width):n.width)/r,l=(i?jp(n.height):n.height)/o;return(!s||!Number.isFinite(s))&&(s=1),(!l||!Number.isFinite(l))&&(l=1),{x:s,y:l}}const Yz=Ns(0);function UP(e){const t=yr(e);return!xx()||!t.visualViewport?Yz:{x:t.visualViewport.offsetLeft,y:t.visualViewport.offsetTop}}function 
Xz(e,t,n){return t===void 0&&(t=!1),!n||t&&n!==yr(e)?!1:t}function Ia(e,t,n,r){t===void 0&&(t=!1),n===void 0&&(n=!1);const o=e.getBoundingClientRect(),i=bx(e);let s=Ns(1);t&&(r?Ti(r)&&(s=Il(r)):s=Il(e));const l=Xz(i,n,r)?UP(i):Ns(0);let u=(o.left+l.x)/s.x,f=(o.top+l.y)/s.y,m=o.width/s.x,p=o.height/s.y;if(i){const g=yr(i),y=r&&Ti(r)?yr(r):r;let x=g.frameElement;for(;x&&r&&y!==g;){const S=Il(x),E=x.getBoundingClientRect(),_=Jr(x),b=E.left+(x.clientLeft+parseFloat(_.paddingLeft))*S.x,C=E.top+(x.clientTop+parseFloat(_.paddingTop))*S.y;u*=S.x,f*=S.y,m*=S.x,p*=S.y,u+=b,f+=C,x=yr(x).frameElement}}return zp({width:m,height:p,x:u,y:f})}function Zz(e){let{rect:t,offsetParent:n,strategy:r}=e;const o=Jo(n),i=Di(n);if(n===i)return t;let s={scrollLeft:0,scrollTop:0},l=Ns(1);const u=Ns(0);if((o||!o&&r!=="fixed")&&((Ds(n)!=="body"||Ld(i))&&(s=Km(n)),Jo(n))){const f=Ia(n);l=Il(n),u.x=f.x+n.clientLeft,u.y=f.y+n.clientTop}return{width:t.width*l.x,height:t.height*l.y,x:t.x*l.x-s.scrollLeft*l.x+u.x,y:t.y*l.y-s.scrollTop*l.y+u.y}}function qz(e){return Array.from(e.getClientRects())}function VP(e){return Ia(Di(e)).left+Km(e).scrollLeft}function Qz(e){const t=Di(e),n=Km(e),r=e.ownerDocument.body,o=pr(t.scrollWidth,t.clientWidth,r.scrollWidth,r.clientWidth),i=pr(t.scrollHeight,t.clientHeight,r.scrollHeight,r.clientHeight);let s=-n.scrollLeft+VP(e);const l=-n.scrollTop;return Jr(r).direction==="rtl"&&(s+=pr(t.clientWidth,r.clientWidth)-o),{width:o,height:i,x:s,y:l}}function Jz(e,t){const n=yr(e),r=Di(e),o=n.visualViewport;let i=r.clientWidth,s=r.clientHeight,l=0,u=0;if(o){i=o.width,s=o.height;const f=xx();(!f||f&&t==="fixed")&&(l=o.offsetLeft,u=o.offsetTop)}return{width:i,height:s,x:l,y:u}}function eU(e,t){const n=Ia(e,!0,t==="fixed"),r=n.top+e.clientTop,o=n.left+e.clientLeft,i=Jo(e)?Il(e):Ns(1),s=e.clientWidth*i.x,l=e.clientHeight*i.y,u=o*i.x,f=r*i.y;return{width:s,height:l,x:u,y:f}}function KC(e,t,n){let r;if(t==="viewport")r=Jz(e,n);else if(t==="document")r=Qz(Di(e));else if(Ti(t))r=eU(t,n);else{const o=UP(e);r={...t,x:t.x-o.x,y:t.y-o.y}}return zp(r)}function WP(e,t){const n=dc(e);return n===t||!Ti(n)||Hm(n)?!1:Jr(n).position==="fixed"||WP(n,t)}function tU(e,t){const n=t.get(e);if(n)return n;let r=cd(e,[],!1).filter(l=>Ti(l)&&Ds(l)!=="body"),o=null;const i=Jr(e).position==="fixed";let s=i?dc(e):e;for(;Ti(s)&&!Hm(s);){const l=Jr(s),u=wx(s);!u&&l.position==="fixed"&&(o=null),(i?!u&&!o:!u&&l.position==="static"&&!!o&&["absolute","fixed"].includes(o.position)||Ld(s)&&!u&&WP(e,s))?r=r.filter(m=>m!==s):o=l,s=dc(s)}return t.set(e,r),r}function nU(e){let{element:t,boundary:n,rootBoundary:r,strategy:o}=e;const s=[...n==="clippingAncestors"?tU(t,this._c):[].concat(n),r],l=s[0],u=s.reduce((f,m)=>{const p=KC(t,m,o);return f.top=pr(p.top,f.top),f.right=Os(p.right,f.right),f.bottom=Os(p.bottom,f.bottom),f.left=pr(p.left,f.left),f},KC(t,l,o));return{width:u.right-u.left,height:u.bottom-u.top,x:u.left,y:u.top}}function rU(e){return zP(e)}function oU(e,t,n){const r=Jo(t),o=Di(t),i=n==="fixed",s=Ia(e,!0,i,t);let l={scrollLeft:0,scrollTop:0};const u=Ns(0);if(r||!r&&!i)if((Ds(t)!=="body"||Ld(o))&&(l=Km(t)),r){const f=Ia(t,!0,i,t);u.x=f.x+t.clientLeft,u.y=f.y+t.clientTop}else o&&(u.x=VP(o));return{x:s.left+l.scrollLeft-u.x,y:s.top+l.scrollTop-u.y,width:s.width,height:s.height}}function GC(e,t){return!Jo(e)||Jr(e).position==="fixed"?null:t?t(e):e.offsetParent}function HP(e,t){const n=yr(e);if(!Jo(e))return n;let r=GC(e,t);for(;r&&Kz(r)&&Jr(r).position==="static";)r=GC(r,t);return 
r&&(Ds(r)==="html"||Ds(r)==="body"&&Jr(r).position==="static"&&!wx(r))?n:r||Gz(e)||n}const iU=async function(e){let{reference:t,floating:n,strategy:r}=e;const o=this.getOffsetParent||HP,i=this.getDimensions;return{reference:oU(t,await o(n),r),floating:{x:0,y:0,...await i(n)}}};function sU(e){return Jr(e).direction==="rtl"}const aU={convertOffsetParentRelativeRectToViewportRelativeRect:Zz,getDocumentElement:Di,getClippingRect:nU,getOffsetParent:HP,getElementRects:iU,getClientRects:qz,getDimensions:rU,getScale:Il,isElement:Ti,isRTL:sU};function lU(e,t){let n=null,r;const o=Di(e);function i(){clearTimeout(r),n&&n.disconnect(),n=null}function s(l,u){l===void 0&&(l=!1),u===void 0&&(u=1),i();const{left:f,top:m,width:p,height:g}=e.getBoundingClientRect();if(l||t(),!p||!g)return;const y=mh(m),x=mh(o.clientWidth-(f+p)),S=mh(o.clientHeight-(m+g)),E=mh(f),b={rootMargin:-y+"px "+-x+"px "+-S+"px "+-E+"px",threshold:pr(0,Os(1,u))||1};let C=!0;function R(k){const O=k[0].intersectionRatio;if(O!==u){if(!C)return s();O?s(!1,O):r=setTimeout(()=>{s(!1,1e-7)},100)}C=!1}try{n=new IntersectionObserver(R,{...b,root:o.ownerDocument})}catch{n=new IntersectionObserver(R,b)}n.observe(e)}return s(!0),i}function cU(e,t,n,r){r===void 0&&(r={});const{ancestorScroll:o=!0,ancestorResize:i=!0,elementResize:s=typeof ResizeObserver=="function",layoutShift:l=typeof IntersectionObserver=="function",animationFrame:u=!1}=r,f=bx(e),m=o||i?[...f?cd(f):[],...cd(t)]:[];m.forEach(_=>{o&&_.addEventListener("scroll",n,{passive:!0}),i&&_.addEventListener("resize",n)});const p=f&&l?lU(f,n):null;let g=-1,y=null;s&&(y=new ResizeObserver(_=>{let[b]=_;b&&b.target===f&&y&&(y.unobserve(t),cancelAnimationFrame(g),g=requestAnimationFrame(()=>{y&&y.observe(t)})),n()}),f&&!u&&y.observe(f),y.observe(t));let x,S=u?Ia(e):null;u&&E();function E(){const _=Ia(e);S&&(_.x!==S.x||_.y!==S.y||_.width!==S.width||_.height!==S.height)&&n(),S=_,x=requestAnimationFrame(E)}return n(),()=>{m.forEach(_=>{o&&_.removeEventListener("scroll",n),i&&_.removeEventListener("resize",n)}),p&&p(),y&&y.disconnect(),y=null,u&&cancelAnimationFrame(x)}}const uU=(e,t,n)=>{const r=new Map,o={platform:aU,...n},i={...o.platform,_c:r};return Fz(e,t,{...o,platform:i})},dU=e=>{function t(n){return{}.hasOwnProperty.call(n,"current")}return{name:"arrow",options:e,fn(n){const{element:r,padding:o}=typeof e=="function"?e(n):e;return r&&t(r)?r.current!=null?UC({element:r.current,padding:o}).fn(n):{}:r?UC({element:r,padding:o}).fn(n):{}}}};var Jh=typeof document<"u"?d.useLayoutEffect:d.useEffect;function Up(e,t){if(e===t)return!0;if(typeof e!=typeof t)return!1;if(typeof e=="function"&&e.toString()===t.toString())return!0;let n,r,o;if(e&&t&&typeof e=="object"){if(Array.isArray(e)){if(n=e.length,n!=t.length)return!1;for(r=n;r--!==0;)if(!Up(e[r],t[r]))return!1;return!0}if(o=Object.keys(e),n=o.length,n!==Object.keys(t).length)return!1;for(r=n;r--!==0;)if(!{}.hasOwnProperty.call(t,o[r]))return!1;for(r=n;r--!==0;){const i=o[r];if(!(i==="_owner"&&e.$$typeof)&&!Up(e[i],t[i]))return!1}return!0}return e!==e&&t!==t}function KP(e){return typeof window>"u"?1:(e.ownerDocument.defaultView||window).devicePixelRatio||1}function YC(e,t){const n=KP(e);return Math.round(t*n)/n}function XC(e){const t=d.useRef(e);return Jh(()=>{t.current=e}),t}function fU(e){e===void 
0&&(e={});const{placement:t="bottom",strategy:n="absolute",middleware:r=[],platform:o,elements:{reference:i,floating:s}={},transform:l=!0,whileElementsMounted:u,open:f}=e,[m,p]=d.useState({x:0,y:0,strategy:n,placement:t,middlewareData:{},isPositioned:!1}),[g,y]=d.useState(r);Up(g,r)||y(r);const[x,S]=d.useState(null),[E,_]=d.useState(null),b=d.useCallback(V=>{V!=O.current&&(O.current=V,S(V))},[S]),C=d.useCallback(V=>{V!==A.current&&(A.current=V,_(V))},[_]),R=i||x,k=s||E,O=d.useRef(null),A=d.useRef(null),I=d.useRef(m),z=XC(u),H=XC(o),ie=d.useCallback(()=>{if(!O.current||!A.current)return;const V={placement:t,strategy:n,middleware:g};H.current&&(V.platform=H.current),uU(O.current,A.current,V).then(J=>{const G={...J,isPositioned:!0};K.current&&!Up(I.current,G)&&(I.current=G,Bs.flushSync(()=>{p(G)}))})},[g,t,n,H]);Jh(()=>{f===!1&&I.current.isPositioned&&(I.current.isPositioned=!1,p(V=>({...V,isPositioned:!1})))},[f]);const K=d.useRef(!1);Jh(()=>(K.current=!0,()=>{K.current=!1}),[]),Jh(()=>{if(R&&(O.current=R),k&&(A.current=k),R&&k){if(z.current)return z.current(R,k,ie);ie()}},[R,k,ie,z]);const te=d.useMemo(()=>({reference:O,floating:A,setReference:b,setFloating:C}),[b,C]),U=d.useMemo(()=>({reference:R,floating:k}),[R,k]),re=d.useMemo(()=>{const V={position:n,left:0,top:0};if(!U.floating)return V;const J=YC(U.floating,m.x),G=YC(U.floating,m.y);return l?{...V,transform:"translate("+J+"px, "+G+"px)",...KP(U.floating)>=1.5&&{willChange:"transform"}}:{position:n,left:J,top:G}},[n,l,U.floating,m.x,m.y]);return d.useMemo(()=>({...m,update:ie,refs:te,elements:U,floatingStyles:re}),[m,ie,te,U,re])}function Sx(e){const[t,n]=d.useState(void 0);return Pn(()=>{if(e){n({width:e.offsetWidth,height:e.offsetHeight});const r=new ResizeObserver(o=>{if(!Array.isArray(o)||!o.length)return;const i=o[0];let s,l;if("borderBoxSize"in i){const u=i.borderBoxSize,f=Array.isArray(u)?u[0]:u;s=f.inlineSize,l=f.blockSize}else s=e.offsetWidth,l=e.offsetHeight;n({width:s,height:l})});return r.observe(e,{box:"border-box"}),()=>r.unobserve(e)}else n(void 0)},[e]),t}const GP="Popper",[YP,zs]=Tn(GP),[hU,XP]=YP(GP),pU=e=>{const{__scopePopper:t,children:n}=e,[r,o]=d.useState(null);return d.createElement(hU,{scope:t,anchor:r,onAnchorChange:o},n)},mU="PopperAnchor",gU=d.forwardRef((e,t)=>{const{__scopePopper:n,virtualRef:r,...o}=e,i=XP(mU,n),s=d.useRef(null),l=Ve(t,s);return d.useEffect(()=>{i.onAnchorChange((r==null?void 0:r.current)||s.current)}),r?null:d.createElement(Ae.div,Y({},o,{ref:l}))}),ZP="PopperContent",[vU,ute]=YP(ZP),yU=d.forwardRef((e,t)=>{var n,r,o,i,s,l,u,f;const{__scopePopper:m,side:p="bottom",sideOffset:g=0,align:y="center",alignOffset:x=0,arrowPadding:S=0,avoidCollisions:E=!0,collisionBoundary:_=[],collisionPadding:b=0,sticky:C="partial",hideWhenDetached:R=!1,updatePositionStrategy:k="optimized",onPlaced:O,...A}=e,I=XP(ZP,m),[z,H]=d.useState(null),ie=Ve(t,Yt=>H(Yt)),[K,te]=d.useState(null),U=Sx(K),re=(n=U==null?void 0:U.width)!==null&&n!==void 0?n:0,V=(r=U==null?void 0:U.height)!==null&&r!==void 0?r:0,J=p+(y!=="center"?"-"+y:""),G=typeof b=="number"?b:{top:0,right:0,bottom:0,left:0,...b},Z=Array.isArray(_)?_:[_],Q=Z.length>0,le={padding:G,boundary:Z.filter(wU),altBoundary:Q},{refs:L,floatingStyles:ue,placement:Ne,isPositioned:Ke,middlewareData:Me}=fU({strategy:"fixed",placement:J,whileElementsMounted:(...Yt)=>cU(...Yt,{animationFrame:k==="always"}),elements:{reference:I.anchor},middleware:[Uz({mainAxis:g+V,alignmentAxis:x}),E&&Vz({mainAxis:!0,crossAxis:!1,limiter:C==="partial"?Wz():void 
0,...le}),E&&jz({...le}),Hz({...le,apply:({elements:Yt,rects:rr,availableWidth:Jt,availableHeight:Li})=>{const{width:N,height:X}=rr.reference,ee=Yt.floating.style;ee.setProperty("--radix-popper-available-width",`${Jt}px`),ee.setProperty("--radix-popper-available-height",`${Li}px`),ee.setProperty("--radix-popper-anchor-width",`${N}px`),ee.setProperty("--radix-popper-anchor-height",`${X}px`)}}),K&&dU({element:K,padding:S}),xU({arrowWidth:re,arrowHeight:V}),R&&Bz({strategy:"referenceHidden",...le})]}),[me,be]=qP(Ne),Ee=Lt(O);Pn(()=>{Ke&&(Ee==null||Ee())},[Ke,Ee]);const Oe=(o=Me.arrow)===null||o===void 0?void 0:o.x,Ie=(i=Me.arrow)===null||i===void 0?void 0:i.y,ze=((s=Me.arrow)===null||s===void 0?void 0:s.centerOffset)!==0,[ht,st]=d.useState();return Pn(()=>{z&&st(window.getComputedStyle(z).zIndex)},[z]),d.createElement("div",{ref:L.setFloating,"data-radix-popper-content-wrapper":"",style:{...ue,transform:Ke?ue.transform:"translate(0, -200%)",minWidth:"max-content",zIndex:ht,"--radix-popper-transform-origin":[(l=Me.transformOrigin)===null||l===void 0?void 0:l.x,(u=Me.transformOrigin)===null||u===void 0?void 0:u.y].join(" ")},dir:e.dir},d.createElement(vU,{scope:m,placedSide:me,onArrowChange:te,arrowX:Oe,arrowY:Ie,shouldHideArrow:ze},d.createElement(Ae.div,Y({"data-side":me,"data-align":be},A,{ref:ie,style:{...A.style,animation:Ke?void 0:"none",opacity:(f=Me.hide)!==null&&f!==void 0&&f.referenceHidden?0:void 0}}))))});function wU(e){return e!==null}const xU=e=>({name:"transformOrigin",options:e,fn(t){var n,r,o,i,s;const{placement:l,rects:u,middlewareData:f}=t,p=((n=f.arrow)===null||n===void 0?void 0:n.centerOffset)!==0,g=p?0:e.arrowWidth,y=p?0:e.arrowHeight,[x,S]=qP(l),E={start:"0%",center:"50%",end:"100%"}[S],_=((r=(o=f.arrow)===null||o===void 0?void 0:o.x)!==null&&r!==void 0?r:0)+g/2,b=((i=(s=f.arrow)===null||s===void 0?void 0:s.y)!==null&&i!==void 0?i:0)+y/2;let C="",R="";return x==="bottom"?(C=p?E:`${_}px`,R=`${-y}px`):x==="top"?(C=p?E:`${_}px`,R=`${u.floating.height+y}px`):x==="right"?(C=`${-y}px`,R=p?E:`${b}px`):x==="left"&&(C=`${u.floating.width+y}px`,R=p?E:`${b}px`),{data:{x:C,y:R}}}});function qP(e){const[t,n="center"]=e.split("-");return[t,n]}const Fd=pU,Gm=gU,Ym=yU,jd=d.forwardRef((e,t)=>{var n;const{container:r=globalThis==null||(n=globalThis.document)===null||n===void 0?void 0:n.body,...o}=e;return r?JI.createPortal(d.createElement(Ae.div,Y({},o,{ref:t})),r):null});function bU(e,t){return d.useReducer((n,r)=>{const o=t[n][r];return o??n},e)}const xn=e=>{const{present:t,children:n}=e,r=SU(t),o=typeof n=="function"?n({present:r.isPresent}):d.Children.only(n),i=Ve(r.ref,o.ref);return typeof n=="function"||r.isPresent?d.cloneElement(o,{ref:i}):null};xn.displayName="Presence";function SU(e){const[t,n]=d.useState(),r=d.useRef({}),o=d.useRef(e),i=d.useRef("none"),s=e?"mounted":"unmounted",[l,u]=bU(s,{mounted:{UNMOUNT:"unmounted",ANIMATION_OUT:"unmountSuspended"},unmountSuspended:{MOUNT:"mounted",ANIMATION_END:"unmounted"},unmounted:{MOUNT:"mounted"}});return d.useEffect(()=>{const f=gh(r.current);i.current=l==="mounted"?f:"none"},[l]),Pn(()=>{const f=r.current,m=o.current;if(m!==e){const g=i.current,y=gh(f);e?u("MOUNT"):y==="none"||(f==null?void 0:f.display)==="none"?u("UNMOUNT"):u(m&&g!==y?"ANIMATION_OUT":"UNMOUNT"),o.current=e}},[e,u]),Pn(()=>{if(t){const f=p=>{const y=gh(r.current).includes(p.animationName);p.target===t&&y&&Bs.flushSync(()=>u("ANIMATION_END"))},m=p=>{p.target===t&&(i.current=gh(r.current))};return 
t.addEventListener("animationstart",m),t.addEventListener("animationcancel",f),t.addEventListener("animationend",f),()=>{t.removeEventListener("animationstart",m),t.removeEventListener("animationcancel",f),t.removeEventListener("animationend",f)}}else u("ANIMATION_END")},[t,u]),{isPresent:["mounted","unmountSuspended"].includes(l),ref:d.useCallback(f=>{f&&(r.current=getComputedStyle(f)),n(f)},[])}}function gh(e){return(e==null?void 0:e.animationName)||"none"}function eo({prop:e,defaultProp:t,onChange:n=()=>{}}){const[r,o]=_U({defaultProp:t,onChange:n}),i=e!==void 0,s=i?e:r,l=Lt(n),u=d.useCallback(f=>{if(i){const p=typeof f=="function"?f(e):f;p!==e&&l(p)}else o(f)},[i,e,o,l]);return[s,u]}function _U({defaultProp:e,onChange:t}){const n=d.useState(e),[r]=n,o=d.useRef(r),i=Lt(t);return d.useEffect(()=>{o.current!==r&&(i(r),o.current=r)},[r,o,i]),n}const Xm=d.forwardRef((e,t)=>d.createElement(Ae.span,Y({},e,{ref:t,style:{position:"absolute",border:0,width:1,height:1,padding:0,margin:-1,overflow:"hidden",clip:"rect(0, 0, 0, 0)",whiteSpace:"nowrap",wordWrap:"normal",...e.style}}))),EU=Xm,[Zm,dte]=Tn("Tooltip",[zs]),_x=zs(),CU="TooltipProvider",$U=700,g1="tooltip.open",[RU,Ex]=Zm(CU),PU=e=>{const{__scopeTooltip:t,delayDuration:n=$U,skipDelayDuration:r=300,disableHoverableContent:o=!1,children:i}=e,[s,l]=d.useState(!0),u=d.useRef(!1),f=d.useRef(0);return d.useEffect(()=>{const m=f.current;return()=>window.clearTimeout(m)},[]),d.createElement(RU,{scope:t,isOpenDelayed:s,delayDuration:n,onOpen:d.useCallback(()=>{window.clearTimeout(f.current),l(!1)},[]),onClose:d.useCallback(()=>{window.clearTimeout(f.current),f.current=window.setTimeout(()=>l(!0),r)},[r]),isPointerInTransitRef:u,onPointerInTransitChange:d.useCallback(m=>{u.current=m},[]),disableHoverableContent:o},i)},Cx="Tooltip",[TU,qm]=Zm(Cx),kU=e=>{const{__scopeTooltip:t,children:n,open:r,defaultOpen:o=!1,onOpenChange:i,disableHoverableContent:s,delayDuration:l}=e,u=Ex(Cx,e.__scopeTooltip),f=_x(t),[m,p]=d.useState(null),g=tr(),y=d.useRef(0),x=s??u.disableHoverableContent,S=l??u.delayDuration,E=d.useRef(!1),[_=!1,b]=eo({prop:r,defaultProp:o,onChange:A=>{A?(u.onOpen(),document.dispatchEvent(new CustomEvent(g1))):u.onClose(),i==null||i(A)}}),C=d.useMemo(()=>_?E.current?"delayed-open":"instant-open":"closed",[_]),R=d.useCallback(()=>{window.clearTimeout(y.current),E.current=!1,b(!0)},[b]),k=d.useCallback(()=>{window.clearTimeout(y.current),b(!1)},[b]),O=d.useCallback(()=>{window.clearTimeout(y.current),y.current=window.setTimeout(()=>{E.current=!0,b(!0)},S)},[S,b]);return d.useEffect(()=>()=>window.clearTimeout(y.current),[]),d.createElement(Fd,f,d.createElement(TU,{scope:t,contentId:g,open:_,stateAttribute:C,trigger:m,onTriggerChange:p,onTriggerEnter:d.useCallback(()=>{u.isOpenDelayed?O():R()},[u.isOpenDelayed,O,R]),onTriggerLeave:d.useCallback(()=>{x?k():window.clearTimeout(y.current)},[k,x]),onOpen:R,onClose:k,disableHoverableContent:x},n))},ZC="TooltipTrigger",AU=d.forwardRef((e,t)=>{const{__scopeTooltip:n,...r}=e,o=qm(ZC,n),i=Ex(ZC,n),s=_x(n),l=d.useRef(null),u=Ve(t,l,o.onTriggerChange),f=d.useRef(!1),m=d.useRef(!1),p=d.useCallback(()=>f.current=!1,[]);return d.useEffect(()=>()=>document.removeEventListener("pointerup",p),[p]),d.createElement(Gm,Y({asChild:!0},s),d.createElement(Ae.button,Y({"aria-describedby":o.open?o.contentId:void 
0,"data-state":o.stateAttribute},r,{ref:u,onPointerMove:fe(e.onPointerMove,g=>{g.pointerType!=="touch"&&!m.current&&!i.isPointerInTransitRef.current&&(o.onTriggerEnter(),m.current=!0)}),onPointerLeave:fe(e.onPointerLeave,()=>{o.onTriggerLeave(),m.current=!1}),onPointerDown:fe(e.onPointerDown,()=>{f.current=!0,document.addEventListener("pointerup",p,{once:!0})}),onFocus:fe(e.onFocus,()=>{f.current||o.onOpen()}),onBlur:fe(e.onBlur,o.onClose),onClick:fe(e.onClick,o.onClose)})))}),MU="TooltipPortal",[fte,OU]=Zm(MU,{forceMount:void 0}),ud="TooltipContent",NU=d.forwardRef((e,t)=>{const n=OU(ud,e.__scopeTooltip),{forceMount:r=n.forceMount,side:o="top",...i}=e,s=qm(ud,e.__scopeTooltip);return d.createElement(xn,{present:r||s.open},s.disableHoverableContent?d.createElement(QP,Y({side:o},i,{ref:t})):d.createElement(DU,Y({side:o},i,{ref:t})))}),DU=d.forwardRef((e,t)=>{const n=qm(ud,e.__scopeTooltip),r=Ex(ud,e.__scopeTooltip),o=d.useRef(null),i=Ve(t,o),[s,l]=d.useState(null),{trigger:u,onClose:f}=n,m=o.current,{onPointerInTransitChange:p}=r,g=d.useCallback(()=>{l(null),p(!1)},[p]),y=d.useCallback((x,S)=>{const E=x.currentTarget,_={x:x.clientX,y:x.clientY},b=LU(_,E.getBoundingClientRect()),C=FU(_,b),R=jU(S.getBoundingClientRect()),k=zU([...C,...R]);l(k),p(!0)},[p]);return d.useEffect(()=>()=>g(),[g]),d.useEffect(()=>{if(u&&m){const x=E=>y(E,m),S=E=>y(E,u);return u.addEventListener("pointerleave",x),m.addEventListener("pointerleave",S),()=>{u.removeEventListener("pointerleave",x),m.removeEventListener("pointerleave",S)}}},[u,m,y,g]),d.useEffect(()=>{if(s){const x=S=>{const E=S.target,_={x:S.clientX,y:S.clientY},b=(u==null?void 0:u.contains(E))||(m==null?void 0:m.contains(E)),C=!BU(_,s);b?g():C&&(g(),f())};return document.addEventListener("pointermove",x),()=>document.removeEventListener("pointermove",x)}},[u,m,s,f,g]),d.createElement(QP,Y({},e,{ref:i}))}),[IU,hte]=Zm(Cx,{isInside:!1}),QP=d.forwardRef((e,t)=>{const{__scopeTooltip:n,children:r,"aria-label":o,onEscapeKeyDown:i,onPointerDownOutside:s,...l}=e,u=qm(ud,n),f=_x(n),{onClose:m}=u;return d.useEffect(()=>(document.addEventListener(g1,m),()=>document.removeEventListener(g1,m)),[m]),d.useEffect(()=>{if(u.trigger){const p=g=>{const y=g.target;y!=null&&y.contains(u.trigger)&&m()};return window.addEventListener("scroll",p,{capture:!0}),()=>window.removeEventListener("scroll",p,{capture:!0})}},[u.trigger,m]),d.createElement($c,{asChild:!0,disableOutsidePointerEvents:!1,onEscapeKeyDown:i,onPointerDownOutside:s,onFocusOutside:p=>p.preventDefault(),onDismiss:m},d.createElement(Ym,Y({"data-state":u.stateAttribute},f,l,{ref:t,style:{...l.style,"--radix-tooltip-content-transform-origin":"var(--radix-popper-transform-origin)","--radix-tooltip-content-available-width":"var(--radix-popper-available-width)","--radix-tooltip-content-available-height":"var(--radix-popper-available-height)","--radix-tooltip-trigger-width":"var(--radix-popper-anchor-width)","--radix-tooltip-trigger-height":"var(--radix-popper-anchor-height)"}}),d.createElement(fx,null,r),d.createElement(IU,{scope:n,isInside:!0},d.createElement(EU,{id:u.contentId,role:"tooltip"},o||r))))});function LU(e,t){const n=Math.abs(t.top-e.y),r=Math.abs(t.bottom-e.y),o=Math.abs(t.right-e.x),i=Math.abs(t.left-e.x);switch(Math.min(n,r,o,i)){case i:return"left";case o:return"right";case n:return"top";case r:return"bottom";default:throw new Error("unreachable")}}function FU(e,t,n=5){const 
r=[];switch(t){case"top":r.push({x:e.x-n,y:e.y+n},{x:e.x+n,y:e.y+n});break;case"bottom":r.push({x:e.x-n,y:e.y-n},{x:e.x+n,y:e.y-n});break;case"left":r.push({x:e.x+n,y:e.y-n},{x:e.x+n,y:e.y+n});break;case"right":r.push({x:e.x-n,y:e.y-n},{x:e.x-n,y:e.y+n});break}return r}function jU(e){const{top:t,right:n,bottom:r,left:o}=e;return[{x:o,y:t},{x:n,y:t},{x:n,y:r},{x:o,y:r}]}function BU(e,t){const{x:n,y:r}=e;let o=!1;for(let i=0,s=t.length-1;ir!=m>r&&n<(f-l)*(r-u)/(m-u)+l&&(o=!o)}return o}function zU(e){const t=e.slice();return t.sort((n,r)=>n.xr.x?1:n.yr.y?1:0),UU(t)}function UU(e){if(e.length<=1)return e.slice();const t=[];for(let r=0;r=2;){const i=t[t.length-1],s=t[t.length-2];if((i.x-s.x)*(o.y-s.y)>=(i.y-s.y)*(o.x-s.x))t.pop();else break}t.push(o)}t.pop();const n=[];for(let r=e.length-1;r>=0;r--){const o=e[r];for(;n.length>=2;){const i=n[n.length-1],s=n[n.length-2];if((i.x-s.x)*(o.y-s.y)>=(i.y-s.y)*(o.x-s.x))n.pop();else break}n.push(o)}return n.pop(),t.length===1&&n.length===1&&t[0].x===n[0].x&&t[0].y===n[0].y?t:t.concat(n)}const VU=PU,WU=kU,HU=AU,JP=NU,KU=VU,eT=WU,tT=HU,$x=d.forwardRef(({className:e,sideOffset:t=4,...n},r)=>v.jsx(JP,{ref:r,sideOffset:t,className:xe("z-50 border overflow-hidden rounded-md bg-background text-foreground px-3 py-1.5 text-xs animate-in fade-in-0 zoom-in-95 data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=closed]:zoom-out-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2",e),...n}));$x.displayName=JP.displayName;const Rx=Fm("inline-flex items-center justify-center whitespace-nowrap rounded-md text-sm font-medium transition-colors focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:pointer-events-none disabled:opacity-50",{variants:{variant:{default:"bg-primary text-primary-foreground shadow hover:bg-primary/90",destructive:"bg-destructive text-destructive-foreground shadow-sm hover:bg-destructive/90",outline:"border border-input bg-transparent shadow-sm hover:bg-accent hover:text-accent-foreground",secondary:"bg-secondary text-secondary-foreground shadow-sm hover:bg-secondary/80",ghost:"hover:bg-accent hover:text-accent-foreground",link:"text-primary underline-offset-4 hover:underline"},size:{default:"h-9 px-4 py-2",sm:"h-8 rounded-md px-3 text-xs",lg:"h-10 rounded-md px-8",icon:"h-9 w-9"}},defaultVariants:{variant:"default",size:"default"}}),vn=d.forwardRef(({className:e,variant:t,size:n,asChild:r=!1,...o},i)=>{const s=r?Qo:"button";return v.jsx(s,{className:xe(Rx({variant:t,size:n,className:e}),"outline-none cursor-default select-none"),ref:i,tabIndex:-1,...o})});vn.displayName="Button";const Zn=d.forwardRef(({tooltip:e,children:t,...n},r)=>v.jsxs(eT,{children:[v.jsx(tT,{asChild:!0,children:v.jsx(vn,{variant:"ghost",size:"icon",...n,ref:r,tabIndex:-1,className:"cursor-default bg-background",children:v.jsx("div",{className:"icon-button-icon-wrapper",children:t})})}),v.jsx($x,{children:v.jsx("p",{children:e})})]})),nT=e=>{const{onFileUpload:t,children:n,...r}=e,[o]=d.useState(`file-upload-${Math.random().toString()}`),i=s=>{var u;const l=(u=s.currentTarget.files)==null?void 0:u[0];l&&t(l)};return v.jsxs(v.Fragment,{children:[v.jsx("label",{htmlFor:o,children:v.jsx(Zn,{...r,asChild:!0,children:n})}),v.jsx(Wm,{style:{display:"none"},id:o,name:o,type:"file",onChange:i,accept:"image/png, image/jpeg"})]})};function GU(e,t){const[n,r]=d.useState(e);return d.useEffect(()=>{const 
o=setTimeout(()=>{r(e)},t);return()=>{clearTimeout(o)}},[e,t]),n}function rT(e){const[t,n]=d.useState(()=>typeof e=="boolean"?e:!!e),r=d.useCallback(o=>n(typeof o=="boolean"?o:i=>!i),[]);return[t,r]}const P0="focusScope.autoFocusOnMount",T0="focusScope.autoFocusOnUnmount",qC={bubbles:!1,cancelable:!0},Qm=d.forwardRef((e,t)=>{const{loop:n=!1,trapped:r=!1,onMountAutoFocus:o,onUnmountAutoFocus:i,...s}=e,[l,u]=d.useState(null),f=Lt(o),m=Lt(i),p=d.useRef(null),g=Ve(t,S=>u(S)),y=d.useRef({paused:!1,pause(){this.paused=!0},resume(){this.paused=!1}}).current;d.useEffect(()=>{if(r){let b=function(O){if(y.paused||!l)return;const A=O.target;l.contains(A)?p.current=A:ts(p.current,{select:!0})},C=function(O){if(y.paused||!l)return;const A=O.relatedTarget;A!==null&&(l.contains(A)||ts(p.current,{select:!0}))},R=function(O){if(document.activeElement===document.body)for(const I of O)I.removedNodes.length>0&&ts(l)};var S=b,E=C,_=R;document.addEventListener("focusin",b),document.addEventListener("focusout",C);const k=new MutationObserver(R);return l&&k.observe(l,{childList:!0,subtree:!0}),()=>{document.removeEventListener("focusin",b),document.removeEventListener("focusout",C),k.disconnect()}}},[r,l,y.paused]),d.useEffect(()=>{if(l){JC.add(y);const S=document.activeElement;if(!l.contains(S)){const _=new CustomEvent(P0,qC);l.addEventListener(P0,f),l.dispatchEvent(_),_.defaultPrevented||(YU(JU(oT(l)),{select:!0}),document.activeElement===S&&ts(l))}return()=>{l.removeEventListener(P0,f),setTimeout(()=>{const _=new CustomEvent(T0,qC);l.addEventListener(T0,m),l.dispatchEvent(_),_.defaultPrevented||ts(S??document.body,{select:!0}),l.removeEventListener(T0,m),JC.remove(y)},0)}}},[l,f,m,y]);const x=d.useCallback(S=>{if(!n&&!r||y.paused)return;const E=S.key==="Tab"&&!S.altKey&&!S.ctrlKey&&!S.metaKey,_=document.activeElement;if(E&&_){const b=S.currentTarget,[C,R]=XU(b);C&&R?!S.shiftKey&&_===R?(S.preventDefault(),n&&ts(C,{select:!0})):S.shiftKey&&_===C&&(S.preventDefault(),n&&ts(R,{select:!0})):_===b&&S.preventDefault()}},[n,r,y.paused]);return d.createElement(Ae.div,Y({tabIndex:-1},s,{ref:g,onKeyDown:x}))});function YU(e,{select:t=!1}={}){const n=document.activeElement;for(const r of e)if(ts(r,{select:t}),document.activeElement!==n)return}function XU(e){const t=oT(e),n=QC(t,e),r=QC(t.reverse(),e);return[n,r]}function oT(e){const t=[],n=document.createTreeWalker(e,NodeFilter.SHOW_ELEMENT,{acceptNode:r=>{const o=r.tagName==="INPUT"&&r.type==="hidden";return r.disabled||r.hidden||o?NodeFilter.FILTER_SKIP:r.tabIndex>=0?NodeFilter.FILTER_ACCEPT:NodeFilter.FILTER_SKIP}});for(;n.nextNode();)t.push(n.currentNode);return t}function QC(e,t){for(const n of e)if(!ZU(n,{upTo:t}))return n}function ZU(e,{upTo:t}){if(getComputedStyle(e).visibility==="hidden")return!0;for(;e;){if(t!==void 0&&e===t)return!1;if(getComputedStyle(e).display==="none")return!0;e=e.parentElement}return!1}function qU(e){return e instanceof HTMLInputElement&&"select"in e}function ts(e,{select:t=!1}={}){if(e&&e.focus){const n=document.activeElement;e.focus({preventScroll:!0}),e!==n&&qU(e)&&t&&e.select()}}const JC=QU();function QU(){let e=[];return{add(t){const n=e[0];t!==n&&(n==null||n.pause()),e=e2(e,t),e.unshift(t)},remove(t){var n;e=e2(e,t),(n=e[0])===null||n===void 0||n.resume()}}}function e2(e,t){const n=[...e],r=n.indexOf(t);return r!==-1&&n.splice(r,1),n}function JU(e){return e.filter(t=>t.tagName!=="A")}let k0=0;function Jm(){d.useEffect(()=>{var e,t;const n=document.querySelectorAll("[data-radix-focus-guard]");return 
document.body.insertAdjacentElement("afterbegin",(e=n[0])!==null&&e!==void 0?e:t2()),document.body.insertAdjacentElement("beforeend",(t=n[1])!==null&&t!==void 0?t:t2()),k0++,()=>{k0===1&&document.querySelectorAll("[data-radix-focus-guard]").forEach(r=>r.remove()),k0--}},[])}function t2(){const e=document.createElement("span");return e.setAttribute("data-radix-focus-guard",""),e.tabIndex=0,e.style.cssText="outline: none; opacity: 0; position: fixed; pointer-events: none",e}var Bo=function(){return Bo=Object.assign||function(t){for(var n,r=1,o=arguments.length;r"u")return gV;var t=vV(e),n=document.documentElement.clientWidth,r=window.innerWidth;return{left:t[0],top:t[1],right:t[2],gap:Math.max(0,r-n+t[2]-t[0])}},wV=lT(),xV=function(e,t,n,r){var o=e.left,i=e.top,s=e.right,l=e.gap;return n===void 0&&(n="margin"),` + .`.concat(tV,` { + overflow: hidden `).concat(r,`; + padding-right: `).concat(l,"px ").concat(r,`; + } + body { + overflow: hidden `).concat(r,`; + overscroll-behavior: contain; + `).concat([t&&"position: relative ".concat(r,";"),n==="margin"&&` + padding-left: `.concat(o,`px; + padding-top: `).concat(i,`px; + padding-right: `).concat(s,`px; + margin-left:0; + margin-top:0; + margin-right: `).concat(l,"px ").concat(r,`; + `),n==="padding"&&"padding-right: ".concat(l,"px ").concat(r,";")].filter(Boolean).join(""),` + } + + .`).concat(ep,` { + right: `).concat(l,"px ").concat(r,`; + } + + .`).concat(tp,` { + margin-right: `).concat(l,"px ").concat(r,`; + } + + .`).concat(ep," .").concat(ep,` { + right: 0 `).concat(r,`; + } + + .`).concat(tp," .").concat(tp,` { + margin-right: 0 `).concat(r,`; + } + + body { + `).concat(nV,": ").concat(l,`px; + } +`)},bV=function(e){var t=e.noRelative,n=e.noImportant,r=e.gapMode,o=r===void 0?"margin":r,i=d.useMemo(function(){return yV(o)},[o]);return d.createElement(wV,{styles:xV(i,!t,o,n?"":"!important")})},v1=!1;if(typeof window<"u")try{var vh=Object.defineProperty({},"passive",{get:function(){return v1=!0,!0}});window.addEventListener("test",vh,vh),window.removeEventListener("test",vh,vh)}catch{v1=!1}var sl=v1?{passive:!1}:!1,SV=function(e){return e.tagName==="TEXTAREA"},cT=function(e,t){var n=window.getComputedStyle(e);return n[t]!=="hidden"&&!(n.overflowY===n.overflowX&&!SV(e)&&n[t]==="visible")},_V=function(e){return cT(e,"overflowY")},EV=function(e){return cT(e,"overflowX")},r2=function(e,t){var n=t;do{typeof ShadowRoot<"u"&&n instanceof ShadowRoot&&(n=n.host);var r=uT(e,n);if(r){var o=dT(e,n),i=o[1],s=o[2];if(i>s)return!0}n=n.parentNode}while(n&&n!==document.body);return!1},CV=function(e){var t=e.scrollTop,n=e.scrollHeight,r=e.clientHeight;return[t,n,r]},$V=function(e){var t=e.scrollLeft,n=e.scrollWidth,r=e.clientWidth;return[t,n,r]},uT=function(e,t){return e==="v"?_V(t):EV(t)},dT=function(e,t){return e==="v"?CV(t):$V(t)},RV=function(e,t){return e==="h"&&t==="rtl"?-1:1},PV=function(e,t,n,r,o){var i=RV(e,window.getComputedStyle(t).direction),s=i*r,l=n.target,u=t.contains(l),f=!1,m=s>0,p=0,g=0;do{var y=dT(e,l),x=y[0],S=y[1],E=y[2],_=S-E-i*x;(x||_)&&uT(e,l)&&(p+=_,g+=x),l=l.parentNode}while(!u&&l!==document.body||u&&(t.contains(l)||t===l));return(m&&(o&&p===0||!o&&s>p)||!m&&(o&&g===0||!o&&-s>g))&&(f=!0),f},yh=function(e){return"changedTouches"in e?[e.changedTouches[0].clientX,e.changedTouches[0].clientY]:[0,0]},o2=function(e){return[e.deltaX,e.deltaY]},i2=function(e){return e&&"current"in e?e.current:e},TV=function(e,t){return e[0]===t[0]&&e[1]===t[1]},kV=function(e){return` + .block-interactivity-`.concat(e,` {pointer-events: none;} + 
.allow-interactivity-`).concat(e,` {pointer-events: all;} +`)},AV=0,al=[];function MV(e){var t=d.useRef([]),n=d.useRef([0,0]),r=d.useRef(),o=d.useState(AV++)[0],i=d.useState(function(){return lT()})[0],s=d.useRef(e);d.useEffect(function(){s.current=e},[e]),d.useEffect(function(){if(e.inert){document.body.classList.add("block-interactivity-".concat(o));var S=eV([e.lockRef.current],(e.shards||[]).map(i2),!0).filter(Boolean);return S.forEach(function(E){return E.classList.add("allow-interactivity-".concat(o))}),function(){document.body.classList.remove("block-interactivity-".concat(o)),S.forEach(function(E){return E.classList.remove("allow-interactivity-".concat(o))})}}},[e.inert,e.lockRef.current,e.shards]);var l=d.useCallback(function(S,E){if("touches"in S&&S.touches.length===2)return!s.current.allowPinchZoom;var _=yh(S),b=n.current,C="deltaX"in S?S.deltaX:b[0]-_[0],R="deltaY"in S?S.deltaY:b[1]-_[1],k,O=S.target,A=Math.abs(C)>Math.abs(R)?"h":"v";if("touches"in S&&A==="h"&&O.type==="range")return!1;var I=r2(A,O);if(!I)return!0;if(I?k=A:(k=A==="v"?"h":"v",I=r2(A,O)),!I)return!1;if(!r.current&&"changedTouches"in S&&(C||R)&&(r.current=k),!k)return!0;var z=r.current||k;return PV(z,E,S,z==="h"?C:R,!0)},[]),u=d.useCallback(function(S){var E=S;if(!(!al.length||al[al.length-1]!==i)){var _="deltaY"in E?o2(E):yh(E),b=t.current.filter(function(k){return k.name===E.type&&k.target===E.target&&TV(k.delta,_)})[0];if(b&&b.should){E.cancelable&&E.preventDefault();return}if(!b){var C=(s.current.shards||[]).map(i2).filter(Boolean).filter(function(k){return k.contains(E.target)}),R=C.length>0?l(E,C[0]):!s.current.noIsolation;R&&E.cancelable&&E.preventDefault()}}},[]),f=d.useCallback(function(S,E,_,b){var C={name:S,delta:E,target:_,should:b};t.current.push(C),setTimeout(function(){t.current=t.current.filter(function(R){return R!==C})},1)},[]),m=d.useCallback(function(S){n.current=yh(S),r.current=void 0},[]),p=d.useCallback(function(S){f(S.type,o2(S),S.target,l(S,e.lockRef.current))},[]),g=d.useCallback(function(S){f(S.type,yh(S),S.target,l(S,e.lockRef.current))},[]);d.useEffect(function(){return al.push(i),e.setCallbacks({onScrollCapture:p,onWheelCapture:p,onTouchMoveCapture:g}),document.addEventListener("wheel",u,sl),document.addEventListener("touchmove",u,sl),document.addEventListener("touchstart",m,sl),function(){al=al.filter(function(S){return S!==i}),document.removeEventListener("wheel",u,sl),document.removeEventListener("touchmove",u,sl),document.removeEventListener("touchstart",m,sl)}},[]);var y=e.removeScrollBar,x=e.inert;return d.createElement(d.Fragment,null,x?d.createElement(i,{styles:kV(o)}):null,y?d.createElement(bV,{gapMode:"margin"}):null)}const OV=cV(aT,MV);var fT=d.forwardRef(function(e,t){return d.createElement(eg,Bo({},e,{ref:t,sideCar:OV}))});fT.classNames=eg.classNames;const tg=fT;var NV=function(e){if(typeof document>"u")return null;var t=Array.isArray(e)?e[0]:e;return t.ownerDocument.body},ll=new WeakMap,wh=new WeakMap,xh={},O0=0,hT=function(e){return e&&(e.host||hT(e.parentNode))},DV=function(e,t){return t.map(function(n){if(e.contains(n))return n;var r=hT(n);return r&&e.contains(r)?r:(console.error("aria-hidden",n,"in not contained inside",e,". 
Doing nothing"),null)}).filter(function(n){return!!n})},IV=function(e,t,n,r){var o=DV(t,Array.isArray(e)?e:[e]);xh[n]||(xh[n]=new WeakMap);var i=xh[n],s=[],l=new Set,u=new Set(o),f=function(p){!p||l.has(p)||(l.add(p),f(p.parentNode))};o.forEach(f);var m=function(p){!p||u.has(p)||Array.prototype.forEach.call(p.children,function(g){if(l.has(g))m(g);else{var y=g.getAttribute(r),x=y!==null&&y!=="false",S=(ll.get(g)||0)+1,E=(i.get(g)||0)+1;ll.set(g,S),i.set(g,E),s.push(g),S===1&&x&&wh.set(g,!0),E===1&&g.setAttribute(n,"true"),x||g.setAttribute(r,"true")}})};return m(t),l.clear(),O0++,function(){s.forEach(function(p){var g=ll.get(p)-1,y=i.get(p)-1;ll.set(p,g),i.set(p,y),g||(wh.has(p)||p.removeAttribute(r),wh.delete(p)),y||p.removeAttribute(n)}),O0--,O0||(ll=new WeakMap,ll=new WeakMap,wh=new WeakMap,xh={})}},ng=function(e,t,n){n===void 0&&(n="data-aria-hidden");var r=Array.from(Array.isArray(e)?e:[e]),o=t||NV(e);return o?(r.push.apply(r,Array.from(o.querySelectorAll("[aria-live]"))),IV(r,o,n,"aria-hidden")):function(){return null}};const pT="Dialog",[mT,gT]=Tn(pT),[LV,$o]=mT(pT),FV=e=>{const{__scopeDialog:t,children:n,open:r,defaultOpen:o,onOpenChange:i,modal:s=!0}=e,l=d.useRef(null),u=d.useRef(null),[f=!1,m]=eo({prop:r,defaultProp:o,onChange:i});return d.createElement(LV,{scope:t,triggerRef:l,contentRef:u,contentId:tr(),titleId:tr(),descriptionId:tr(),open:f,onOpenChange:m,onOpenToggle:d.useCallback(()=>m(p=>!p),[m]),modal:s},n)},jV="DialogTrigger",vT=d.forwardRef((e,t)=>{const{__scopeDialog:n,...r}=e,o=$o(jV,n),i=Ve(t,o.triggerRef);return d.createElement(Ae.button,Y({type:"button","aria-haspopup":"dialog","aria-expanded":o.open,"aria-controls":o.contentId,"data-state":Px(o.open)},r,{ref:i,onClick:fe(e.onClick,o.onOpenToggle)}))}),yT="DialogPortal",[BV,wT]=mT(yT,{forceMount:void 0}),zV=e=>{const{__scopeDialog:t,forceMount:n,children:r,container:o}=e,i=$o(yT,t);return d.createElement(BV,{scope:t,forceMount:n},d.Children.map(r,s=>d.createElement(xn,{present:n||i.open},d.createElement(jd,{asChild:!0,container:o},s))))},y1="DialogOverlay",UV=d.forwardRef((e,t)=>{const n=wT(y1,e.__scopeDialog),{forceMount:r=n.forceMount,...o}=e,i=$o(y1,e.__scopeDialog);return i.modal?d.createElement(xn,{present:r||i.open},d.createElement(VV,Y({},o,{ref:t}))):null}),VV=d.forwardRef((e,t)=>{const{__scopeDialog:n,...r}=e,o=$o(y1,n);return d.createElement(tg,{as:Qo,allowPinchZoom:!0,shards:[o.contentRef]},d.createElement(Ae.div,Y({"data-state":Px(o.open)},r,{ref:t,style:{pointerEvents:"auto",...r.style}})))}),fc="DialogContent",WV=d.forwardRef((e,t)=>{const n=wT(fc,e.__scopeDialog),{forceMount:r=n.forceMount,...o}=e,i=$o(fc,e.__scopeDialog);return d.createElement(xn,{present:r||i.open},i.modal?d.createElement(HV,Y({},o,{ref:t})):d.createElement(KV,Y({},o,{ref:t})))}),HV=d.forwardRef((e,t)=>{const n=$o(fc,e.__scopeDialog),r=d.useRef(null),o=Ve(t,n.contentRef,r);return d.useEffect(()=>{const i=r.current;if(i)return ng(i)},[]),d.createElement(xT,Y({},e,{ref:o,trapFocus:n.open,disableOutsidePointerEvents:!0,onCloseAutoFocus:fe(e.onCloseAutoFocus,i=>{var s;i.preventDefault(),(s=n.triggerRef.current)===null||s===void 0||s.focus()}),onPointerDownOutside:fe(e.onPointerDownOutside,i=>{const s=i.detail.originalEvent,l=s.button===0&&s.ctrlKey===!0;(s.button===2||l)&&i.preventDefault()}),onFocusOutside:fe(e.onFocusOutside,i=>i.preventDefault())}))}),KV=d.forwardRef((e,t)=>{const n=$o(fc,e.__scopeDialog),r=d.useRef(!1),o=d.useRef(!1);return 
d.createElement(xT,Y({},e,{ref:t,trapFocus:!1,disableOutsidePointerEvents:!1,onCloseAutoFocus:i=>{var s;if((s=e.onCloseAutoFocus)===null||s===void 0||s.call(e,i),!i.defaultPrevented){var l;r.current||(l=n.triggerRef.current)===null||l===void 0||l.focus(),i.preventDefault()}r.current=!1,o.current=!1},onInteractOutside:i=>{var s,l;(s=e.onInteractOutside)===null||s===void 0||s.call(e,i),i.defaultPrevented||(r.current=!0,i.detail.originalEvent.type==="pointerdown"&&(o.current=!0));const u=i.target;((l=n.triggerRef.current)===null||l===void 0?void 0:l.contains(u))&&i.preventDefault(),i.detail.originalEvent.type==="focusin"&&o.current&&i.preventDefault()}}))}),xT=d.forwardRef((e,t)=>{const{__scopeDialog:n,trapFocus:r,onOpenAutoFocus:o,onCloseAutoFocus:i,...s}=e,l=$o(fc,n),u=d.useRef(null),f=Ve(t,u);return Jm(),d.createElement(d.Fragment,null,d.createElement(Qm,{asChild:!0,loop:!0,trapped:r,onMountAutoFocus:o,onUnmountAutoFocus:i},d.createElement($c,Y({role:"dialog",id:l.contentId,"aria-describedby":l.descriptionId,"aria-labelledby":l.titleId,"data-state":Px(l.open)},s,{ref:f,onDismiss:()=>l.onOpenChange(!1)}))),!1)}),bT="DialogTitle",GV=d.forwardRef((e,t)=>{const{__scopeDialog:n,...r}=e,o=$o(bT,n);return d.createElement(Ae.h2,Y({id:o.titleId},r,{ref:t}))}),YV="DialogDescription",XV=d.forwardRef((e,t)=>{const{__scopeDialog:n,...r}=e,o=$o(YV,n);return d.createElement(Ae.p,Y({id:o.descriptionId},r,{ref:t}))}),ZV="DialogClose",qV=d.forwardRef((e,t)=>{const{__scopeDialog:n,...r}=e,o=$o(ZV,n);return d.createElement(Ae.button,Y({type:"button"},r,{ref:t,onClick:fe(e.onClick,()=>o.onOpenChange(!1))}))});function Px(e){return e?"open":"closed"}const QV="DialogTitleWarning",[JV,pte]=vz(QV,{contentName:fc,titleName:bT,docsSlug:"dialog"}),Tx=FV,ST=vT,kx=zV,Bd=UV,zd=WV,Ud=GV,Vd=XV,Ax=qV,Mx=Tx,_T=ST,eW=kx,ET=d.forwardRef(({className:e,...t},n)=>v.jsx(Bd,{ref:n,className:xe("fixed inset-0 z-50 bg-background/80 backdrop-blur-sm data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0",e),...t}));ET.displayName=Bd.displayName;const rg=d.forwardRef(({className:e,children:t,...n},r)=>v.jsxs(eW,{children:[v.jsx(ET,{}),v.jsxs(zd,{ref:r,className:xe("fixed left-[50%] top-[50%] z-50 flex flex-col w-full max-w-lg translate-x-[-50%] translate-y-[-50%] gap-4 border bg-background p-6 shadow-lg duration-200 data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[state=closed]:slide-out-to-left-1/2 data-[state=closed]:slide-out-to-top-[48%] data-[state=open]:slide-in-from-left-1/2 data-[state=open]:slide-in-from-top-[48%] sm:rounded-lg",e,"outline-none"),onCloseAutoFocus:o=>o.preventDefault(),...n,children:[t,v.jsxs(Ax,{className:"absolute right-4 top-4 rounded-sm opacity-70 ring-offset-background transition-opacity hover:opacity-100 focus:outline-none focus:ring-2 focus:ring-ring focus:ring-offset-2 disabled:pointer-events-none data-[state=open]:bg-accent data-[state=open]:text-muted-foreground",children:[v.jsx(CP,{className:"h-4 w-4"}),v.jsx("span",{className:"sr-only",children:"Close"})]})]})]}));rg.displayName=zd.displayName;const CT=({className:e,...t})=>v.jsx("div",{className:xe("flex flex-col space-y-1.5 text-center sm:text-left",e),...t});CT.displayName="DialogHeader";const og=d.forwardRef(({className:e,...t},n)=>v.jsx(Ud,{ref:n,className:xe("text-2xl font-semibold leading-none 
tracking-tight",e),...t}));og.displayName=Ud.displayName;const tW=d.forwardRef(({className:e,...t},n)=>v.jsx(Vd,{ref:n,className:xe("text-sm text-muted-foreground",e),...t}));tW.displayName=Vd.displayName;function w1(){return w1=Object.assign?Object.assign.bind():function(e){for(var t=1;t'),!0):t?e.some(function(n){return t.includes(n)})||e.includes("*"):!0}var cW=function(t,n,r){r===void 0&&(r=!1);var o=n.alt,i=n.meta,s=n.mod,l=n.shift,u=n.ctrl,f=n.keys,m=t.key,p=t.code,g=t.ctrlKey,y=t.metaKey,x=t.shiftKey,S=t.altKey,E=gs(p),_=m.toLowerCase();if(!r){if(o===!S&&_!=="alt"||l===!x&&_!=="shift")return!1;if(s){if(!y&&!g)return!1}else if(i===!y&&_!=="meta"&&_!=="os"||u===!g&&_!=="ctrl"&&_!=="control")return!1}return f&&f.length===1&&(f.includes(_)||f.includes(E))?!0:f?oW(f):!f},uW=d.createContext(void 0),dW=function(){return d.useContext(uW)};function kT(e,t){return e&&t&&typeof e=="object"&&typeof t=="object"?Object.keys(e).length===Object.keys(t).length&&Object.keys(e).reduce(function(n,r){return n&&kT(e[r],t[r])},!0):e===t}var fW=d.createContext({hotkeys:[],enabledScopes:[],toggleScope:function(){},enableScope:function(){},disableScope:function(){}}),hW=function(){return d.useContext(fW)};function pW(e){var t=d.useRef(void 0);return kT(t.current,e)||(t.current=e),t.current}var s2=function(t){t.stopPropagation(),t.preventDefault(),t.stopImmediatePropagation()},mW=typeof window<"u"?d.useLayoutEffect:d.useEffect;function gW(e,t,n,r){var o=d.useRef(null),i=d.useRef(!1),s=n instanceof Array?r instanceof Array?void 0:r:n,l=Ox(e)?e.join(s==null?void 0:s.splitKey):e,u=n instanceof Array?n:r instanceof Array?r:void 0,f=d.useCallback(t,u??[]),m=d.useRef(f);u?m.current=f:m.current=t;var p=pW(s),g=hW(),y=g.enabledScopes,x=dW();return mW(function(){if(!((p==null?void 0:p.enabled)===!1||!lW(y,p==null?void 0:p.scopes))){var S=function(R,k){var O;if(k===void 0&&(k=!1),!(aW(R)&&!TT(R,p==null?void 0:p.enableOnFormTags))&&!(p!=null&&p.ignoreEventWhen!=null&&p.ignoreEventWhen(R))){if(o.current!==null&&document.activeElement!==o.current&&!o.current.contains(document.activeElement)){s2(R);return}(O=R.target)!=null&&O.isContentEditable&&!(p!=null&&p.enableOnContentEditable)||N0(l,p==null?void 0:p.splitKey).forEach(function(A){var I,z=D0(A,p==null?void 0:p.combinationKey);if(cW(R,z,p==null?void 0:p.ignoreModifiers)||(I=z.keys)!=null&&I.includes("*")){if(k&&i.current)return;if(iW(R,z,p==null?void 0:p.preventDefault),!sW(R,z,p==null?void 0:p.enabled)){s2(R);return}m.current(R,z),k||(i.current=!0)}})}},E=function(R){R.key!==void 0&&(RT(gs(R.code)),((p==null?void 0:p.keydown)===void 0&&(p==null?void 0:p.keyup)!==!0||p!=null&&p.keydown)&&S(R))},_=function(R){R.key!==void 0&&(PT(gs(R.code)),i.current=!1,p!=null&&p.keyup&&S(R,!0))},b=o.current||(s==null?void 0:s.document)||document;return b.addEventListener("keyup",_),b.addEventListener("keydown",E),x&&N0(l,p==null?void 0:p.splitKey).forEach(function(C){return x.addHotkey(D0(C,p==null?void 0:p.combinationKey,p==null?void 0:p.description))}),function(){b.removeEventListener("keyup",_),b.removeEventListener("keydown",E),x&&N0(l,p==null?void 0:p.splitKey).forEach(function(C){return x.removeHotkey(D0(C,p==null?void 0:p.combinationKey,p==null?void 0:p.description))})}}},[l,p,y]),o}const Yn=(e,t,n)=>{const r=xt(i=>i.disableShortCuts);return gW(e,t,{enabled:!r},n)};function dr(e){const{content:t,keys:n}=e;return v.jsxs("div",{className:"flex justify-between",children:[v.jsx("div",{children:t}),v.jsx("div",{className:"flex 
gap-[8px]",children:n.map(r=>v.jsx("div",{className:"border px-2 py-1 rounded-lg",children:r},r))})]})}const vW=function(){return/macintosh|mac os x/i.test(navigator.userAgent)},bh=()=>vW()?"Cmd":"Ctrl";function yW(){const[e,t]=rT(!1);return Yn("h",()=>{t()}),v.jsxs(Mx,{open:e,onOpenChange:t,children:[v.jsx(_T,{asChild:!0}),v.jsx(rg,{children:v.jsxs(CT,{children:[v.jsx(og,{children:"Hotkeys"}),v.jsxs("div",{className:"flex gap-2 flex-col pt-4",children:[v.jsx(dr,{content:"Pan",keys:["Space + Drag"]}),v.jsx(dr,{content:"Reset Zoom/Pan",keys:["Esc"]}),v.jsx(dr,{content:"Decrease Brush Size",keys:["["]}),v.jsx(dr,{content:"Increase Brush Size",keys:["]"]}),v.jsx(dr,{content:"View Original Image",keys:["Hold Tab"]}),v.jsx(dr,{content:"Undo",keys:[bh(),"Z"]}),v.jsx(dr,{content:"Redo",keys:[bh(),"Shift","Z"]}),v.jsx(dr,{content:"Copy Result",keys:[bh(),"C"]}),v.jsx(dr,{content:"Paste Image",keys:[bh(),"V"]}),v.jsx(dr,{content:"Trigger Manually Inpainting",keys:["Shift","R"]}),v.jsx(dr,{content:"Toggle Hotkeys Dialog",keys:["H"]}),v.jsx(dr,{content:"Toggle Settings Dialog",keys:["S"]}),v.jsx(dr,{content:"Toggle File Manager",keys:["F"]})]})]})})]})}function Nx(e){const[t]=d.useState(new Image),[n,r]=d.useState(!1);return d.useEffect(()=>{if(e)return t.onload=()=>{r(!0)},r(!1),t.src=URL.createObjectURL(e),()=>{t.onload=null}},[e,t]),[t,n]}const AT="Popover",[MT,mte]=Tn(AT,[zs]),Dx=zs(),[wW,Tc]=MT(AT),xW=e=>{const{__scopePopover:t,children:n,open:r,defaultOpen:o,onOpenChange:i,modal:s=!1}=e,l=Dx(t),u=d.useRef(null),[f,m]=d.useState(!1),[p=!1,g]=eo({prop:r,defaultProp:o,onChange:i});return d.createElement(Fd,l,d.createElement(wW,{scope:t,contentId:tr(),triggerRef:u,open:p,onOpenChange:g,onOpenToggle:d.useCallback(()=>g(y=>!y),[g]),hasCustomAnchor:f,onCustomAnchorAdd:d.useCallback(()=>m(!0),[]),onCustomAnchorRemove:d.useCallback(()=>m(!1),[]),modal:s},n))},bW="PopoverTrigger",SW=d.forwardRef((e,t)=>{const{__scopePopover:n,...r}=e,o=Tc(bW,n),i=Dx(n),s=Ve(t,o.triggerRef),l=d.createElement(Ae.button,Y({type:"button","aria-haspopup":"dialog","aria-expanded":o.open,"aria-controls":o.contentId,"data-state":DT(o.open)},r,{ref:s,onClick:fe(e.onClick,o.onOpenToggle)}));return o.hasCustomAnchor?l:d.createElement(Gm,Y({asChild:!0},i),l)}),OT="PopoverPortal",[_W,EW]=MT(OT,{forceMount:void 0}),CW=e=>{const{__scopePopover:t,forceMount:n,children:r,container:o}=e,i=Tc(OT,t);return d.createElement(_W,{scope:t,forceMount:n},d.createElement(xn,{present:n||i.open},d.createElement(jd,{asChild:!0,container:o},r)))},dd="PopoverContent",$W=d.forwardRef((e,t)=>{const n=EW(dd,e.__scopePopover),{forceMount:r=n.forceMount,...o}=e,i=Tc(dd,e.__scopePopover);return d.createElement(xn,{present:r||i.open},i.modal?d.createElement(RW,Y({},o,{ref:t})):d.createElement(PW,Y({},o,{ref:t})))}),RW=d.forwardRef((e,t)=>{const n=Tc(dd,e.__scopePopover),r=d.useRef(null),o=Ve(t,r),i=d.useRef(!1);return d.useEffect(()=>{const s=r.current;if(s)return ng(s)},[]),d.createElement(tg,{as:Qo,allowPinchZoom:!0},d.createElement(NT,Y({},e,{ref:o,trapFocus:n.open,disableOutsidePointerEvents:!0,onCloseAutoFocus:fe(e.onCloseAutoFocus,s=>{var l;s.preventDefault(),i.current||(l=n.triggerRef.current)===null||l===void 0||l.focus()}),onPointerDownOutside:fe(e.onPointerDownOutside,s=>{const l=s.detail.originalEvent,u=l.button===0&&l.ctrlKey===!0,f=l.button===2||u;i.current=f},{checkForDefaultPrevented:!1}),onFocusOutside:fe(e.onFocusOutside,s=>s.preventDefault(),{checkForDefaultPrevented:!1})})))}),PW=d.forwardRef((e,t)=>{const 
n=Tc(dd,e.__scopePopover),r=d.useRef(!1),o=d.useRef(!1);return d.createElement(NT,Y({},e,{ref:t,trapFocus:!1,disableOutsidePointerEvents:!1,onCloseAutoFocus:i=>{var s;if((s=e.onCloseAutoFocus)===null||s===void 0||s.call(e,i),!i.defaultPrevented){var l;r.current||(l=n.triggerRef.current)===null||l===void 0||l.focus(),i.preventDefault()}r.current=!1,o.current=!1},onInteractOutside:i=>{var s,l;(s=e.onInteractOutside)===null||s===void 0||s.call(e,i),i.defaultPrevented||(r.current=!0,i.detail.originalEvent.type==="pointerdown"&&(o.current=!0));const u=i.target;((l=n.triggerRef.current)===null||l===void 0?void 0:l.contains(u))&&i.preventDefault(),i.detail.originalEvent.type==="focusin"&&o.current&&i.preventDefault()}}))}),NT=d.forwardRef((e,t)=>{const{__scopePopover:n,trapFocus:r,onOpenAutoFocus:o,onCloseAutoFocus:i,disableOutsidePointerEvents:s,onEscapeKeyDown:l,onPointerDownOutside:u,onFocusOutside:f,onInteractOutside:m,...p}=e,g=Tc(dd,n),y=Dx(n);return Jm(),d.createElement(Qm,{asChild:!0,loop:!0,trapped:r,onMountAutoFocus:o,onUnmountAutoFocus:i},d.createElement($c,{asChild:!0,disableOutsidePointerEvents:s,onInteractOutside:m,onEscapeKeyDown:l,onPointerDownOutside:u,onFocusOutside:f,onDismiss:()=>g.onOpenChange(!1)},d.createElement(Ym,Y({"data-state":DT(g.open),role:"dialog",id:g.contentId},y,p,{ref:t,style:{...p.style,"--radix-popover-content-transform-origin":"var(--radix-popper-transform-origin)","--radix-popover-content-available-width":"var(--radix-popper-available-width)","--radix-popover-content-available-height":"var(--radix-popper-available-height)","--radix-popover-trigger-width":"var(--radix-popper-anchor-width)","--radix-popover-trigger-height":"var(--radix-popper-anchor-height)"}}))))});function DT(e){return e?"open":"closed"}const TW=xW,kW=SW,AW=CW,IT=$W,MW=TW,OW=kW,LT=d.forwardRef(({className:e,align:t="center",sideOffset:n=4,...r},o)=>v.jsx(AW,{children:v.jsx(IT,{ref:o,align:t,tabIndex:-1,sideOffset:n,className:xe("z-50 w-72 rounded-md border bg-popover p-4 text-popover-foreground shadow-md outline-none data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2",e),...r})}));LT.displayName=IT.displayName;function NW(){var e=d.useRef(!0);return e.current?(e.current=!1,!0):e.current}var DW=function(e,t){var n=NW();d.useEffect(function(){if(!n)return e()},t)};const IW=DW;var LW=function(){};function Ix(e){for(var t=[],n=1;n{const r=xt(s=>s.updateAppState),o=()=>{r({disableShortCuts:!0})},i=()=>{r({disableShortCuts:!1})};return v.jsx("textarea",{className:xe("flex min-h-[60px] w-full rounded-md border border-input bg-transparent px-3 py-2 text-sm shadow-sm placeholder:text-muted-foreground focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring disabled:cursor-not-allowed disabled:opacity-50","overflow-auto",e),tabIndex:-1,ref:n,onFocus:o,onBlur:i,...t})});jx.displayName="Textarea";const oH=()=>{const[e,t,n,r,o,i]=xt(x=>[x.getIsProcessing(),x.settings.prompt,x.updateSettings,x.runInpainting,x.showPrevMask,x.hidePrevMask]),[s,l]=Fx(!1),u=d.useRef(null);FT(u,()=>{u!=null&&u.current&&u.current.blur()});const f=x=>{x.preventDefault(),x.stopPropagation();const S=x.target;n({prompt:S.value})},m=()=>{e||r()},p=x=>{x.key==="Enter"&&x.ctrlKey&&t.length!==0&&m()},g=()=>{o()},y=()=>{i()};return 
v.jsx("div",{className:"flex gap-4 relative w-full justify-center h-full",children:v.jsxs("div",{className:"absolute flex gap-4",children:[v.jsx(jx,{ref:u,placeholder:"I want to repaint of...",className:xe(s?"focus:overflow-y-auto":"overflow-y-hidden","min-h-[32px] h-[32px] overflow-x-hidden focus:h-[120px] overflow-y-hidden transition-[height] w-[500px] py-1 px-3 bg-background resize-none"),style:{scrollbarGutter:"stable"},value:t,onInput:f,onKeyUp:p,onTransitionEnd:l}),v.jsx(vn,{size:"sm",onClick:m,disabled:e,onMouseEnter:g,onMouseLeave:y,children:"Paint"})]})})};var iH={xmlns:"http://www.w3.org/2000/svg",width:24,height:24,viewBox:"0 0 24 24",fill:"none",stroke:"currentColor",strokeWidth:2,strokeLinecap:"round",strokeLinejoin:"round"};const sH=e=>e.replace(/([a-z0-9])([A-Z])/g,"$1-$2").toLowerCase(),kn=(e,t)=>{const n=d.forwardRef(({color:r="currentColor",size:o=24,strokeWidth:i=2,absoluteStrokeWidth:s,children:l,...u},f)=>d.createElement("svg",{ref:f,...iH,width:o,height:o,stroke:r,strokeWidth:s?Number(i)*24/Number(o):i,className:`lucide lucide-${sH(e)}`,...u},[...t.map(([m,p])=>d.createElement(m,p)),...(Array.isArray(l)?l:[l])||[]]));return n.displayName=`${e}`,n},aH=kn("Blocks",[["rect",{width:"7",height:"7",x:"14",y:"3",rx:"1",key:"6d4xhi"}],["path",{d:"M10 21V8a1 1 0 0 0-1-1H4a1 1 0 0 0-1 1v12a1 1 0 0 0 1 1h12a1 1 0 0 0 1-1v-5a1 1 0 0 0-1-1H3",key:"1fpvtg"}]]),lH=kn("ChevronLeft",[["path",{d:"m15 18-6-6 6-6",key:"1wnfg3"}]]),cH=kn("ChevronRight",[["path",{d:"m9 18 6-6-6-6",key:"mthhwq"}]]),uH=kn("Download",[["path",{d:"M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4",key:"ih7n3h"}],["polyline",{points:"7 10 12 15 17 10",key:"2ggqvy"}],["line",{x1:"12",x2:"12",y1:"15",y2:"3",key:"1vk2je"}]]),dH=kn("Eraser",[["path",{d:"m7 21-4.3-4.3c-1-1-1-2.5 0-3.4l9.6-9.6c1-1 2.5-1 3.4 0l5.6 5.6c1 1 1 2.5 0 3.4L13 21",key:"182aya"}],["path",{d:"M22 21H7",key:"t4ddhn"}],["path",{d:"m5 11 9 9",key:"1mo9qw"}]]),fH=kn("Expand",[["path",{d:"m21 21-6-6m6 6v-4.8m0 4.8h-4.8",key:"1c15vz"}],["path",{d:"M3 16.2V21m0 0h4.8M3 21l6-6",key:"1fsnz2"}],["path",{d:"M21 7.8V3m0 0h-4.8M21 3l-6 6",key:"hawz9i"}],["path",{d:"M3 7.8V3m0 0h4.8M3 3l6 6",key:"u9ee12"}]]),hH=kn("Eye",[["path",{d:"M2 12s3-7 10-7 10 7 10 7-3 7-10 7-10-7-10-7Z",key:"rwhkz3"}],["circle",{cx:"12",cy:"12",r:"3",key:"1v7zrd"}]]),pH=kn("FolderClosed",[["path",{d:"M20 20a2 2 0 0 0 2-2V8a2 2 0 0 0-2-2h-7.9a2 2 0 0 1-1.69-.9L9.6 3.9A2 2 0 0 0 7.93 3H4a2 2 0 0 0-2 2v13a2 2 0 0 0 2 2Z",key:"1kt360"}],["path",{d:"M2 10h20",key:"1ir3d8"}]]),jT=kn("Fullscreen",[["path",{d:"M3 7V5a2 2 0 0 1 2-2h2",key:"aa7l1z"}],["path",{d:"M17 3h2a2 2 0 0 1 2 2v2",key:"4qcy5o"}],["path",{d:"M21 17v2a2 2 0 0 1-2 2h-2",key:"6vwrx8"}],["path",{d:"M7 21H5a2 2 0 0 1-2-2v-2",key:"ioqczr"}],["rect",{width:"10",height:"8",x:"7",y:"8",rx:"1",key:"vys8me"}]]),mH=kn("MousePointerClick",[["path",{d:"m9 9 5 12 1.8-5.2L21 14Z",key:"1b76lo"}],["path",{d:"M7.2 2.2 8 5.1",key:"1cfko1"}],["path",{d:"m5.1 8-2.9-.8",key:"1go3kf"}],["path",{d:"M14 4.1 12 6",key:"ita8i4"}],["path",{d:"m6 12-1.9 2",key:"mnht97"}]]),gH=kn("Redo",[["path",{d:"M21 7v6h-6",key:"3ptur4"}],["path",{d:"M3 17a9 9 0 0 1 9-9 9 9 0 0 1 6 2.3l3 2.7",key:"1kgawr"}]]),vH=kn("Settings",[["path",{d:"M12.22 2h-.44a2 2 0 0 0-2 2v.18a2 2 0 0 1-1 1.73l-.43.25a2 2 0 0 1-2 0l-.15-.08a2 2 0 0 0-2.73.73l-.22.38a2 2 0 0 0 .73 2.73l.15.1a2 2 0 0 1 1 1.72v.51a2 2 0 0 1-1 1.74l-.15.09a2 2 0 0 0-.73 2.73l.22.38a2 2 0 0 0 2.73.73l.15-.08a2 2 0 0 1 2 0l.43.25a2 2 0 0 1 1 1.73V20a2 2 0 0 0 2 2h.44a2 2 0 0 0 2-2v-.18a2 2 0 0 1 1-1.73l.43-.25a2 2 0 0 
1 2 0l.15.08a2 2 0 0 0 2.73-.73l.22-.39a2 2 0 0 0-.73-2.73l-.15-.08a2 2 0 0 1-1-1.74v-.5a2 2 0 0 1 1-1.74l.15-.09a2 2 0 0 0 .73-2.73l-.22-.38a2 2 0 0 0-2.73-.73l-.15.08a2 2 0 0 1-2 0l-.43-.25a2 2 0 0 1-1-1.73V4a2 2 0 0 0-2-2z",key:"1qme2f"}],["circle",{cx:"12",cy:"12",r:"3",key:"1v7zrd"}]]),u2=kn("Slice",[["path",{d:"m8 14-6 6h9v-3",key:"zo3j9a"}],["path",{d:"M18.37 3.63 8 14l3 3L21.37 6.63a2.12 2.12 0 1 0-3-3Z",key:"1dzx0j"}]]),d2=kn("Smile",[["circle",{cx:"12",cy:"12",r:"10",key:"1mglay"}],["path",{d:"M8 14s1.5 2 4 2 4-2 4-2",key:"1y1vjs"}],["line",{x1:"9",x2:"9.01",y1:"9",y2:"9",key:"yxxnd0"}],["line",{x1:"15",x2:"15.01",y1:"9",y2:"9",key:"1p4y9e"}]]),yH=kn("Undo",[["path",{d:"M3 7v6h6",key:"1v2h90"}],["path",{d:"M21 17a9 9 0 0 0-9-9 9 9 0 0 0-6 2.3L3 13",key:"1r6uu6"}]]),BT=kn("Upload",[["path",{d:"M21 15v4a2 2 0 0 1-2 2H5a2 2 0 0 1-2-2v-4",key:"ih7n3h"}],["polyline",{points:"17 8 12 3 7 8",key:"t8dd8p"}],["line",{x1:"12",x2:"12",y1:"3",y2:"15",key:"widbto"}]]);var Vp={exports:{}};/** + * @license + * Lodash + * Copyright OpenJS Foundation and other contributors + * Released under MIT license + * Based on Underscore.js 1.8.3 + * Copyright Jeremy Ashkenas, DocumentCloud and Investigative Reporters & Editors + */Vp.exports;(function(e,t){(function(){var n,r="4.17.21",o=200,i="Unsupported core-js use. Try https://npms.io/search?q=ponyfill.",s="Expected a function",l="Invalid `variable` option passed into `_.template`",u="__lodash_hash_undefined__",f=500,m="__lodash_placeholder__",p=1,g=2,y=4,x=1,S=2,E=1,_=2,b=4,C=8,R=16,k=32,O=64,A=128,I=256,z=512,H=30,ie="...",K=800,te=16,U=1,re=2,V=3,J=1/0,G=9007199254740991,Z=17976931348623157e292,Q=NaN,le=4294967295,L=le-1,ue=le>>>1,Ne=[["ary",A],["bind",E],["bindKey",_],["curry",C],["curryRight",R],["flip",z],["partial",k],["partialRight",O],["rearg",I]],Ke="[object Arguments]",Me="[object Array]",me="[object AsyncFunction]",be="[object Boolean]",Ee="[object Date]",Oe="[object DOMException]",Ie="[object Error]",ze="[object Function]",ht="[object GeneratorFunction]",st="[object Map]",Yt="[object Number]",rr="[object Null]",Jt="[object Object]",Li="[object Promise]",N="[object Proxy]",X="[object RegExp]",ee="[object Set]",Se="[object String]",pe="[object Symbol]",he="[object Undefined]",Te="[object WeakMap]",ut="[object WeakSet]",gt="[object ArrayBuffer]",An="[object DataView]",Mn="[object Float32Array]",Fi="[object Float64Array]",Ro="[object Int8Array]",Cr="[object Int16Array]",Ks="[object Int32Array]",ri="[object Uint8Array]",ji="[object Uint8ClampedArray]",or="[object Uint16Array]",Zd="[object Uint32Array]",$g=/\b__p \+= '';/g,qd=/\b(__p \+=) '' \+/g,Rg=/(__e\(.*?\)|\b__t\)) \+\n'';/g,Qd=/&(?:amp|lt|gt|quot|#39);/g,Nc=/[&<>"']/g,Jd=RegExp(Qd.source),ef=RegExp(Nc.source),tf=/<%-([\s\S]+?)%>/g,nf=/<%([\s\S]+?)%>/g,Va=/<%=([\s\S]+?)%>/g,Pg=/\.|\[(?:[^[\]]*|(["'])(?:(?!\1)[^\\]|\\.)*?\1)\]/,Dc=/^\w*$/,rf=/[^.[\]]+|\[(?:(-?\d+(?:\.\d+)?)|(["'])((?:(?!\2)[^\\]|\\.)*?)\2)\]|(?=(?:\.|\[\])(?:\.|\[\]|$))/g,Wa=/[\\^$.*+?()[\]{}|]/g,Tg=RegExp(Wa.source),Ic=/^\s+/,kg=/\s/,Ag=/\{(?:\n\/\* \[wrapped with .+\] \*\/)?\n?/,W=/\{\n\/\* \[wrapped with (.+)\] \*/,We=/,? 
& /,dt=/[^\x00-\x2f\x3a-\x40\x5b-\x60\x7b-\x7f]+/g,no=/[()=,{}\[\]\/\s]/,an=/\\(\\)?/g,of=/\$\{([^\\}]*(?:\\.[^\\}]*)*)\}/g,wb=/\w*$/,hM=/^[-+]0x[0-9a-f]+$/i,pM=/^0b[01]+$/i,mM=/^\[object .+?Constructor\]$/,gM=/^0o[0-7]+$/i,vM=/^(?:0|[1-9]\d*)$/,yM=/[\xc0-\xd6\xd8-\xf6\xf8-\xff\u0100-\u017f]/g,sf=/($^)/,wM=/['\n\r\u2028\u2029\\]/g,af="\\ud800-\\udfff",xM="\\u0300-\\u036f",bM="\\ufe20-\\ufe2f",SM="\\u20d0-\\u20ff",xb=xM+bM+SM,bb="\\u2700-\\u27bf",Sb="a-z\\xdf-\\xf6\\xf8-\\xff",_M="\\xac\\xb1\\xd7\\xf7",EM="\\x00-\\x2f\\x3a-\\x40\\x5b-\\x60\\x7b-\\xbf",CM="\\u2000-\\u206f",$M=" \\t\\x0b\\f\\xa0\\ufeff\\n\\r\\u2028\\u2029\\u1680\\u180e\\u2000\\u2001\\u2002\\u2003\\u2004\\u2005\\u2006\\u2007\\u2008\\u2009\\u200a\\u202f\\u205f\\u3000",_b="A-Z\\xc0-\\xd6\\xd8-\\xde",Eb="\\ufe0e\\ufe0f",Cb=_M+EM+CM+$M,Mg="['’]",RM="["+af+"]",$b="["+Cb+"]",lf="["+xb+"]",Rb="\\d+",PM="["+bb+"]",Pb="["+Sb+"]",Tb="[^"+af+Cb+Rb+bb+Sb+_b+"]",Og="\\ud83c[\\udffb-\\udfff]",TM="(?:"+lf+"|"+Og+")",kb="[^"+af+"]",Ng="(?:\\ud83c[\\udde6-\\uddff]){2}",Dg="[\\ud800-\\udbff][\\udc00-\\udfff]",Ha="["+_b+"]",Ab="\\u200d",Mb="(?:"+Pb+"|"+Tb+")",kM="(?:"+Ha+"|"+Tb+")",Ob="(?:"+Mg+"(?:d|ll|m|re|s|t|ve))?",Nb="(?:"+Mg+"(?:D|LL|M|RE|S|T|VE))?",Db=TM+"?",Ib="["+Eb+"]?",AM="(?:"+Ab+"(?:"+[kb,Ng,Dg].join("|")+")"+Ib+Db+")*",MM="\\d*(?:1st|2nd|3rd|(?![123])\\dth)(?=\\b|[A-Z_])",OM="\\d*(?:1ST|2ND|3RD|(?![123])\\dTH)(?=\\b|[a-z_])",Lb=Ib+Db+AM,NM="(?:"+[PM,Ng,Dg].join("|")+")"+Lb,DM="(?:"+[kb+lf+"?",lf,Ng,Dg,RM].join("|")+")",IM=RegExp(Mg,"g"),LM=RegExp(lf,"g"),Ig=RegExp(Og+"(?="+Og+")|"+DM+Lb,"g"),FM=RegExp([Ha+"?"+Pb+"+"+Ob+"(?="+[$b,Ha,"$"].join("|")+")",kM+"+"+Nb+"(?="+[$b,Ha+Mb,"$"].join("|")+")",Ha+"?"+Mb+"+"+Ob,Ha+"+"+Nb,OM,MM,Rb,NM].join("|"),"g"),jM=RegExp("["+Ab+af+xb+Eb+"]"),BM=/[a-z][A-Z]|[A-Z]{2}[a-z]|[0-9][a-zA-Z]|[a-zA-Z][0-9]|[^a-zA-Z0-9 ]/,zM=["Array","Buffer","DataView","Date","Error","Float32Array","Float64Array","Function","Int8Array","Int16Array","Int32Array","Map","Math","Object","Promise","RegExp","Set","String","Symbol","TypeError","Uint8Array","Uint8ClampedArray","Uint16Array","Uint32Array","WeakMap","_","clearTimeout","isFinite","parseInt","setTimeout"],UM=-1,Pt={};Pt[Mn]=Pt[Fi]=Pt[Ro]=Pt[Cr]=Pt[Ks]=Pt[ri]=Pt[ji]=Pt[or]=Pt[Zd]=!0,Pt[Ke]=Pt[Me]=Pt[gt]=Pt[be]=Pt[An]=Pt[Ee]=Pt[Ie]=Pt[ze]=Pt[st]=Pt[Yt]=Pt[Jt]=Pt[X]=Pt[ee]=Pt[Se]=Pt[Te]=!1;var Ct={};Ct[Ke]=Ct[Me]=Ct[gt]=Ct[An]=Ct[be]=Ct[Ee]=Ct[Mn]=Ct[Fi]=Ct[Ro]=Ct[Cr]=Ct[Ks]=Ct[st]=Ct[Yt]=Ct[Jt]=Ct[X]=Ct[ee]=Ct[Se]=Ct[pe]=Ct[ri]=Ct[ji]=Ct[or]=Ct[Zd]=!0,Ct[Ie]=Ct[ze]=Ct[Te]=!1;var 
VM={À:"A",Á:"A",Â:"A",Ã:"A",Ä:"A",Å:"A",à:"a",á:"a",â:"a",ã:"a",ä:"a",å:"a",Ç:"C",ç:"c",Ð:"D",ð:"d",È:"E",É:"E",Ê:"E",Ë:"E",è:"e",é:"e",ê:"e",ë:"e",Ì:"I",Í:"I",Î:"I",Ï:"I",ì:"i",í:"i",î:"i",ï:"i",Ñ:"N",ñ:"n",Ò:"O",Ó:"O",Ô:"O",Õ:"O",Ö:"O",Ø:"O",ò:"o",ó:"o",ô:"o",õ:"o",ö:"o",ø:"o",Ù:"U",Ú:"U",Û:"U",Ü:"U",ù:"u",ú:"u",û:"u",ü:"u",Ý:"Y",ý:"y",ÿ:"y",Æ:"Ae",æ:"ae",Þ:"Th",þ:"th",ß:"ss",Ā:"A",Ă:"A",Ą:"A",ā:"a",ă:"a",ą:"a",Ć:"C",Ĉ:"C",Ċ:"C",Č:"C",ć:"c",ĉ:"c",ċ:"c",č:"c",Ď:"D",Đ:"D",ď:"d",đ:"d",Ē:"E",Ĕ:"E",Ė:"E",Ę:"E",Ě:"E",ē:"e",ĕ:"e",ė:"e",ę:"e",ě:"e",Ĝ:"G",Ğ:"G",Ġ:"G",Ģ:"G",ĝ:"g",ğ:"g",ġ:"g",ģ:"g",Ĥ:"H",Ħ:"H",ĥ:"h",ħ:"h",Ĩ:"I",Ī:"I",Ĭ:"I",Į:"I",İ:"I",ĩ:"i",ī:"i",ĭ:"i",į:"i",ı:"i",Ĵ:"J",ĵ:"j",Ķ:"K",ķ:"k",ĸ:"k",Ĺ:"L",Ļ:"L",Ľ:"L",Ŀ:"L",Ł:"L",ĺ:"l",ļ:"l",ľ:"l",ŀ:"l",ł:"l",Ń:"N",Ņ:"N",Ň:"N",Ŋ:"N",ń:"n",ņ:"n",ň:"n",ŋ:"n",Ō:"O",Ŏ:"O",Ő:"O",ō:"o",ŏ:"o",ő:"o",Ŕ:"R",Ŗ:"R",Ř:"R",ŕ:"r",ŗ:"r",ř:"r",Ś:"S",Ŝ:"S",Ş:"S",Š:"S",ś:"s",ŝ:"s",ş:"s",š:"s",Ţ:"T",Ť:"T",Ŧ:"T",ţ:"t",ť:"t",ŧ:"t",Ũ:"U",Ū:"U",Ŭ:"U",Ů:"U",Ű:"U",Ų:"U",ũ:"u",ū:"u",ŭ:"u",ů:"u",ű:"u",ų:"u",Ŵ:"W",ŵ:"w",Ŷ:"Y",ŷ:"y",Ÿ:"Y",Ź:"Z",Ż:"Z",Ž:"Z",ź:"z",ż:"z",ž:"z",IJ:"IJ",ij:"ij",Œ:"Oe",œ:"oe",ʼn:"'n",ſ:"s"},WM={"&":"&","<":"<",">":">",'"':""","'":"'"},HM={"&":"&","<":"<",">":">",""":'"',"'":"'"},KM={"\\":"\\","'":"'","\n":"n","\r":"r","\u2028":"u2028","\u2029":"u2029"},GM=parseFloat,YM=parseInt,Fb=typeof Qc=="object"&&Qc&&Qc.Object===Object&&Qc,XM=typeof self=="object"&&self&&self.Object===Object&&self,fn=Fb||XM||Function("return this")(),Lg=t&&!t.nodeType&&t,Gs=Lg&&!0&&e&&!e.nodeType&&e,jb=Gs&&Gs.exports===Lg,Fg=jb&&Fb.process,$r=function(){try{var F=Gs&&Gs.require&&Gs.require("util").types;return F||Fg&&Fg.binding&&Fg.binding("util")}catch{}}(),Bb=$r&&$r.isArrayBuffer,zb=$r&&$r.isDate,Ub=$r&&$r.isMap,Vb=$r&&$r.isRegExp,Wb=$r&&$r.isSet,Hb=$r&&$r.isTypedArray;function ir(F,ne,q){switch(q.length){case 0:return F.call(ne);case 1:return F.call(ne,q[0]);case 2:return F.call(ne,q[0],q[1]);case 3:return F.call(ne,q[0],q[1],q[2])}return F.apply(ne,q)}function ZM(F,ne,q,Ce){for(var Ge=-1,pt=F==null?0:F.length;++Ge-1}function jg(F,ne,q){for(var Ce=-1,Ge=F==null?0:F.length;++Ce-1;);return q}function Jb(F,ne){for(var q=F.length;q--&&Ka(ne,F[q],0)>-1;);return q}function i6(F,ne){for(var q=F.length,Ce=0;q--;)F[q]===ne&&++Ce;return Ce}var s6=Vg(VM),a6=Vg(WM);function l6(F){return"\\"+KM[F]}function c6(F,ne){return F==null?n:F[ne]}function Ga(F){return jM.test(F)}function u6(F){return BM.test(F)}function d6(F){for(var ne,q=[];!(ne=F.next()).done;)q.push(ne.value);return q}function Gg(F){var ne=-1,q=Array(F.size);return F.forEach(function(Ce,Ge){q[++ne]=[Ge,Ce]}),q}function eS(F,ne){return function(q){return F(ne(q))}}function Ui(F,ne){for(var q=-1,Ce=F.length,Ge=0,pt=[];++q-1}function Q6(a,c){var h=this.__data__,w=$f(h,a);return w<0?(++this.size,h.push([a,c])):h[w][1]=c,this}oi.prototype.clear=Y6,oi.prototype.delete=X6,oi.prototype.get=Z6,oi.prototype.has=q6,oi.prototype.set=Q6;function ii(a){var c=-1,h=a==null?0:a.length;for(this.clear();++c=c?a:c)),a}function kr(a,c,h,w,$,T){var M,D=c&p,B=c&g,oe=c&y;if(h&&(M=$?h(a,w,$,T):h(a)),M!==n)return M;if(!Dt(a))return a;var se=Ye(a);if(se){if(M=nO(a),!D)return Vn(a,M)}else{var ce=Sn(a),ve=ce==ze||ce==ht;if(Yi(a))return IS(a,D);if(ce==Jt||ce==Ke||ve&&!$){if(M=B||ve?{}:t_(a),!D)return B?H7(a,p7(M,a)):W7(a,fS(M,a))}else{if(!Ct[ce])return $?a:{};M=rO(a,ce,D)}}T||(T=new oo);var Pe=T.get(a);if(Pe)return 
Pe;T.set(a,M),k_(a)?a.forEach(function(je){M.add(kr(je,c,h,je,a,T))}):P_(a)&&a.forEach(function(je,nt){M.set(nt,kr(je,c,h,nt,a,T))});var Fe=oe?B?wv:yv:B?Hn:ln,Qe=se?n:Fe(a);return Rr(Qe||a,function(je,nt){Qe&&(nt=je,je=a[nt]),Vc(M,nt,kr(je,c,h,nt,a,T))}),M}function m7(a){var c=ln(a);return function(h){return hS(h,a,c)}}function hS(a,c,h){var w=h.length;if(a==null)return!w;for(a=St(a);w--;){var $=h[w],T=c[$],M=a[$];if(M===n&&!($ in a)||!T(M))return!1}return!0}function pS(a,c,h){if(typeof a!="function")throw new Pr(s);return Zc(function(){a.apply(n,h)},c)}function Wc(a,c,h,w){var $=-1,T=cf,M=!0,D=a.length,B=[],oe=c.length;if(!D)return B;h&&(c=Mt(c,sr(h))),w?(T=jg,M=!1):c.length>=o&&(T=Lc,M=!1,c=new Zs(c));e:for(;++$$?0:$+h),w=w===n||w>$?$:qe(w),w<0&&(w+=$),w=h>w?0:M_(w);h0&&h(D)?c>1?hn(D,c-1,h,w,$):zi($,D):w||($[$.length]=D)}return $}var ev=US(),vS=US(!0);function Po(a,c){return a&&ev(a,c,ln)}function tv(a,c){return a&&vS(a,c,ln)}function Pf(a,c){return Bi(c,function(h){return ui(a[h])})}function Qs(a,c){c=Ki(c,a);for(var h=0,w=c.length;a!=null&&hc}function y7(a,c){return a!=null&&yt.call(a,c)}function w7(a,c){return a!=null&&c in St(a)}function x7(a,c,h){return a>=bn(c,h)&&a=120&&se.length>=120)?new Zs(M&&se):n}se=a[0];var ce=-1,ve=D[0];e:for(;++ce<$&&oe.length-1;)D!==a&&wf.call(D,B,1),wf.call(a,B,1);return a}function PS(a,c){for(var h=a?c.length:0,w=h-1;h--;){var $=c[h];if(h==w||$!==T){var T=$;ci($)?wf.call(a,$,1):dv(a,$)}}return a}function lv(a,c){return a+Sf(lS()*(c-a+1))}function O7(a,c,h,w){for(var $=-1,T=tn(bf((c-a)/(h||1)),0),M=q(T);T--;)M[w?T:++$]=a,a+=h;return M}function cv(a,c){var h="";if(!a||c<1||c>G)return h;do c%2&&(h+=a),c=Sf(c/2),c&&(a+=a);while(c);return h}function et(a,c){return $v(o_(a,c,Kn),a+"")}function N7(a){return dS(ol(a))}function D7(a,c){var h=ol(a);return jf(h,qs(c,0,h.length))}function Gc(a,c,h,w){if(!Dt(a))return a;c=Ki(c,a);for(var $=-1,T=c.length,M=T-1,D=a;D!=null&&++$$?0:$+c),h=h>$?$:h,h<0&&(h+=$),$=c>h?0:h-c>>>0,c>>>=0;for(var T=q($);++w<$;)T[w]=a[w+c];return T}function F7(a,c){var h;return Wi(a,function(w,$,T){return h=c(w,$,T),!h}),!!h}function kf(a,c,h){var w=0,$=a==null?w:a.length;if(typeof c=="number"&&c===c&&$<=ue){for(;w<$;){var T=w+$>>>1,M=a[T];M!==null&&!lr(M)&&(h?M<=c:M=o){var oe=c?null:X7(a);if(oe)return df(oe);M=!1,$=Lc,B=new Zs}else B=c?[]:D;e:for(;++w=w?a:Ar(a,c,h)}var DS=$6||function(a){return fn.clearTimeout(a)};function IS(a,c){if(c)return a.slice();var h=a.length,w=rS?rS(h):new a.constructor(h);return a.copy(w),w}function mv(a){var c=new a.constructor(a.byteLength);return new vf(c).set(new vf(a)),c}function B7(a,c){var h=c?mv(a.buffer):a.buffer;return new a.constructor(h,a.byteOffset,a.byteLength)}function z7(a){var c=new a.constructor(a.source,wb.exec(a));return c.lastIndex=a.lastIndex,c}function U7(a){return Uc?St(Uc.call(a)):{}}function LS(a,c){var h=c?mv(a.buffer):a.buffer;return new a.constructor(h,a.byteOffset,a.length)}function FS(a,c){if(a!==c){var h=a!==n,w=a===null,$=a===a,T=lr(a),M=c!==n,D=c===null,B=c===c,oe=lr(c);if(!D&&!oe&&!T&&a>c||T&&M&&B&&!D&&!oe||w&&M&&B||!h&&B||!$)return 1;if(!w&&!T&&!oe&&a=D)return B;var oe=h[w];return B*(oe=="desc"?-1:1)}}return a.index-c.index}function jS(a,c,h,w){for(var $=-1,T=a.length,M=h.length,D=-1,B=c.length,oe=tn(T-M,0),se=q(B+oe),ce=!w;++D1?h[$-1]:n,M=$>2?h[2]:n;for(T=a.length>3&&typeof T=="function"?($--,T):n,M&&Nn(h[0],h[1],M)&&(T=$<3?n:T,$=1),c=St(c);++w<$;){var D=h[w];D&&a(c,D,w,T)}return c})}function zS(a,c){return function(h,w){if(h==null)return h;if(!Wn(h))return a(h,w);for(var 
$=h.length,T=c?$:-1,M=St(h);(c?T--:++T<$)&&w(M[T],T,M)!==!1;);return h}}function US(a){return function(c,h,w){for(var $=-1,T=St(c),M=w(c),D=M.length;D--;){var B=M[a?D:++$];if(h(T[B],B,T)===!1)break}return c}}function K7(a,c,h){var w=c&E,$=Yc(a);function T(){var M=this&&this!==fn&&this instanceof T?$:a;return M.apply(w?h:this,arguments)}return T}function VS(a){return function(c){c=vt(c);var h=Ga(c)?ro(c):n,w=h?h[0]:c.charAt(0),$=h?Gi(h,1).join(""):c.slice(1);return w[a]()+$}}function tl(a){return function(c){return Bg(z_(B_(c).replace(IM,"")),a,"")}}function Yc(a){return function(){var c=arguments;switch(c.length){case 0:return new a;case 1:return new a(c[0]);case 2:return new a(c[0],c[1]);case 3:return new a(c[0],c[1],c[2]);case 4:return new a(c[0],c[1],c[2],c[3]);case 5:return new a(c[0],c[1],c[2],c[3],c[4]);case 6:return new a(c[0],c[1],c[2],c[3],c[4],c[5]);case 7:return new a(c[0],c[1],c[2],c[3],c[4],c[5],c[6])}var h=Ja(a.prototype),w=a.apply(h,c);return Dt(w)?w:h}}function G7(a,c,h){var w=Yc(a);function $(){for(var T=arguments.length,M=q(T),D=T,B=nl($);D--;)M[D]=arguments[D];var oe=T<3&&M[0]!==B&&M[T-1]!==B?[]:Ui(M,B);if(T-=oe.length,T-1?$[T?c[M]:M]:n}}function HS(a){return li(function(c){var h=c.length,w=h,$=Tr.prototype.thru;for(a&&c.reverse();w--;){var T=c[w];if(typeof T!="function")throw new Pr(s);if($&&!M&&Lf(T)=="wrapper")var M=new Tr([],!0)}for(w=M?w:h;++w1&&at.reverse(),se&&BD))return!1;var oe=T.get(a),se=T.get(c);if(oe&&se)return oe==c&&se==a;var ce=-1,ve=!0,Pe=h&S?new Zs:n;for(T.set(a,c),T.set(c,a);++ce1?"& ":"")+c[w],c=c.join(h>2?", ":" "),a.replace(Ag,`{ +/* [wrapped with `+c+`] */ +`)}function iO(a){return Ye(a)||ta(a)||!!(sS&&a&&a[sS])}function ci(a,c){var h=typeof a;return c=c??G,!!c&&(h=="number"||h!="symbol"&&vM.test(a))&&a>-1&&a%1==0&&a0){if(++c>=K)return arguments[0]}else c=0;return a.apply(n,arguments)}}function jf(a,c){var h=-1,w=a.length,$=w-1;for(c=c===n?w:c;++h1?a[c-1]:n;return h=typeof h=="function"?(a.pop(),h):n,g_(a,h)});function v_(a){var c=P(a);return c.__chain__=!0,c}function gN(a,c){return c(a),a}function Bf(a,c){return c(a)}var vN=li(function(a){var c=a.length,h=c?a[0]:0,w=this.__wrapped__,$=function(T){return Jg(T,a)};return c>1||this.__actions__.length||!(w instanceof ot)||!ci(h)?this.thru($):(w=w.slice(h,+h+(c?1:0)),w.__actions__.push({func:Bf,args:[$],thisArg:n}),new Tr(w,this.__chain__).thru(function(T){return c&&!T.length&&T.push(n),T}))});function yN(){return v_(this)}function wN(){return new Tr(this.value(),this.__chain__)}function xN(){this.__values__===n&&(this.__values__=A_(this.value()));var a=this.__index__>=this.__values__.length,c=a?n:this.__values__[this.__index__++];return{done:a,value:c}}function bN(){return this}function SN(a){for(var c,h=this;h instanceof Cf;){var w=u_(h);w.__index__=0,w.__values__=n,c?$.__wrapped__=w:c=w;var $=w;h=h.__wrapped__}return $.__wrapped__=a,c}function _N(){var a=this.__wrapped__;if(a instanceof ot){var c=a;return this.__actions__.length&&(c=new ot(this)),c=c.reverse(),c.__actions__.push({func:Bf,args:[Rv],thisArg:n}),new Tr(c,this.__chain__)}return this.thru(Rv)}function EN(){return OS(this.__wrapped__,this.__actions__)}var CN=Mf(function(a,c,h){yt.call(a,h)?++a[h]:si(a,h,1)});function $N(a,c,h){var w=Ye(a)?Kb:g7;return h&&Nn(a,c,h)&&(c=n),w(a,Le(c,3))}function RN(a,c){var h=Ye(a)?Bi:gS;return h(a,Le(c,3))}var PN=WS(d_),TN=WS(f_);function kN(a,c){return hn(zf(a,c),1)}function AN(a,c){return hn(zf(a,c),J)}function MN(a,c,h){return h=h===n?1:qe(h),hn(zf(a,c),h)}function y_(a,c){var h=Ye(a)?Rr:Wi;return 
h(a,Le(c,3))}function w_(a,c){var h=Ye(a)?qM:mS;return h(a,Le(c,3))}var ON=Mf(function(a,c,h){yt.call(a,h)?a[h].push(c):si(a,h,[c])});function NN(a,c,h,w){a=Wn(a)?a:ol(a),h=h&&!w?qe(h):0;var $=a.length;return h<0&&(h=tn($+h,0)),Kf(a)?h<=$&&a.indexOf(c,h)>-1:!!$&&Ka(a,c,h)>-1}var DN=et(function(a,c,h){var w=-1,$=typeof c=="function",T=Wn(a)?q(a.length):[];return Wi(a,function(M){T[++w]=$?ir(c,M,h):Hc(M,c,h)}),T}),IN=Mf(function(a,c,h){si(a,h,c)});function zf(a,c){var h=Ye(a)?Mt:SS;return h(a,Le(c,3))}function LN(a,c,h,w){return a==null?[]:(Ye(c)||(c=c==null?[]:[c]),h=w?n:h,Ye(h)||(h=h==null?[]:[h]),$S(a,c,h))}var FN=Mf(function(a,c,h){a[h?0:1].push(c)},function(){return[[],[]]});function jN(a,c,h){var w=Ye(a)?Bg:Zb,$=arguments.length<3;return w(a,Le(c,4),h,$,Wi)}function BN(a,c,h){var w=Ye(a)?QM:Zb,$=arguments.length<3;return w(a,Le(c,4),h,$,mS)}function zN(a,c){var h=Ye(a)?Bi:gS;return h(a,Wf(Le(c,3)))}function UN(a){var c=Ye(a)?dS:N7;return c(a)}function VN(a,c,h){(h?Nn(a,c,h):c===n)?c=1:c=qe(c);var w=Ye(a)?d7:D7;return w(a,c)}function WN(a){var c=Ye(a)?f7:L7;return c(a)}function HN(a){if(a==null)return 0;if(Wn(a))return Kf(a)?Ya(a):a.length;var c=Sn(a);return c==st||c==ee?a.size:iv(a).length}function KN(a,c,h){var w=Ye(a)?zg:F7;return h&&Nn(a,c,h)&&(c=n),w(a,Le(c,3))}var GN=et(function(a,c){if(a==null)return[];var h=c.length;return h>1&&Nn(a,c[0],c[1])?c=[]:h>2&&Nn(c[0],c[1],c[2])&&(c=[c[0]]),$S(a,hn(c,1),[])}),Uf=R6||function(){return fn.Date.now()};function YN(a,c){if(typeof c!="function")throw new Pr(s);return a=qe(a),function(){if(--a<1)return c.apply(this,arguments)}}function x_(a,c,h){return c=h?n:c,c=a&&c==null?a.length:c,ai(a,A,n,n,n,n,c)}function b_(a,c){var h;if(typeof c!="function")throw new Pr(s);return a=qe(a),function(){return--a>0&&(h=c.apply(this,arguments)),a<=1&&(c=n),h}}var Tv=et(function(a,c,h){var w=E;if(h.length){var $=Ui(h,nl(Tv));w|=k}return ai(a,w,c,h,$)}),S_=et(function(a,c,h){var w=E|_;if(h.length){var $=Ui(h,nl(S_));w|=k}return ai(c,w,a,h,$)});function __(a,c,h){c=h?n:c;var w=ai(a,C,n,n,n,n,n,c);return w.placeholder=__.placeholder,w}function E_(a,c,h){c=h?n:c;var w=ai(a,R,n,n,n,n,n,c);return w.placeholder=E_.placeholder,w}function C_(a,c,h){var w,$,T,M,D,B,oe=0,se=!1,ce=!1,ve=!0;if(typeof a!="function")throw new Pr(s);c=Or(c)||0,Dt(h)&&(se=!!h.leading,ce="maxWait"in h,T=ce?tn(Or(h.maxWait)||0,c):T,ve="trailing"in h?!!h.trailing:ve);function Pe(Kt){var so=w,fi=$;return w=$=n,oe=Kt,M=a.apply(fi,so),M}function Fe(Kt){return oe=Kt,D=Zc(nt,c),se?Pe(Kt):M}function Qe(Kt){var so=Kt-B,fi=Kt-oe,W_=c-so;return ce?bn(W_,T-fi):W_}function je(Kt){var so=Kt-B,fi=Kt-oe;return B===n||so>=c||so<0||ce&&fi>=T}function nt(){var Kt=Uf();if(je(Kt))return at(Kt);D=Zc(nt,Qe(Kt))}function at(Kt){return D=n,ve&&w?Pe(Kt):(w=$=n,M)}function cr(){D!==n&&DS(D),oe=0,w=B=$=D=n}function Dn(){return D===n?M:at(Uf())}function ur(){var Kt=Uf(),so=je(Kt);if(w=arguments,$=this,B=Kt,so){if(D===n)return Fe(B);if(ce)return DS(D),D=Zc(nt,c),Pe(B)}return D===n&&(D=Zc(nt,c)),M}return ur.cancel=cr,ur.flush=Dn,ur}var XN=et(function(a,c){return pS(a,1,c)}),ZN=et(function(a,c,h){return pS(a,Or(c)||0,h)});function qN(a){return ai(a,z)}function Vf(a,c){if(typeof a!="function"||c!=null&&typeof c!="function")throw new Pr(s);var h=function(){var w=arguments,$=c?c.apply(this,w):w[0],T=h.cache;if(T.has($))return T.get($);var M=a.apply(this,w);return h.cache=T.set($,M)||T,M};return h.cache=new(Vf.Cache||ii),h}Vf.Cache=ii;function Wf(a){if(typeof a!="function")throw new Pr(s);return function(){var 
c=arguments;switch(c.length){case 0:return!a.call(this);case 1:return!a.call(this,c[0]);case 2:return!a.call(this,c[0],c[1]);case 3:return!a.call(this,c[0],c[1],c[2])}return!a.apply(this,c)}}function QN(a){return b_(2,a)}var JN=j7(function(a,c){c=c.length==1&&Ye(c[0])?Mt(c[0],sr(Le())):Mt(hn(c,1),sr(Le()));var h=c.length;return et(function(w){for(var $=-1,T=bn(w.length,h);++$=c}),ta=wS(function(){return arguments}())?wS:function(a){return Bt(a)&&yt.call(a,"callee")&&!iS.call(a,"callee")},Ye=q.isArray,p4=Bb?sr(Bb):S7;function Wn(a){return a!=null&&Hf(a.length)&&!ui(a)}function Ht(a){return Bt(a)&&Wn(a)}function m4(a){return a===!0||a===!1||Bt(a)&&On(a)==be}var Yi=T6||zv,g4=zb?sr(zb):_7;function v4(a){return Bt(a)&&a.nodeType===1&&!qc(a)}function y4(a){if(a==null)return!0;if(Wn(a)&&(Ye(a)||typeof a=="string"||typeof a.splice=="function"||Yi(a)||rl(a)||ta(a)))return!a.length;var c=Sn(a);if(c==st||c==ee)return!a.size;if(Xc(a))return!iv(a).length;for(var h in a)if(yt.call(a,h))return!1;return!0}function w4(a,c){return Kc(a,c)}function x4(a,c,h){h=typeof h=="function"?h:n;var w=h?h(a,c):n;return w===n?Kc(a,c,n,h):!!w}function Av(a){if(!Bt(a))return!1;var c=On(a);return c==Ie||c==Oe||typeof a.message=="string"&&typeof a.name=="string"&&!qc(a)}function b4(a){return typeof a=="number"&&aS(a)}function ui(a){if(!Dt(a))return!1;var c=On(a);return c==ze||c==ht||c==me||c==N}function R_(a){return typeof a=="number"&&a==qe(a)}function Hf(a){return typeof a=="number"&&a>-1&&a%1==0&&a<=G}function Dt(a){var c=typeof a;return a!=null&&(c=="object"||c=="function")}function Bt(a){return a!=null&&typeof a=="object"}var P_=Ub?sr(Ub):C7;function S4(a,c){return a===c||ov(a,c,bv(c))}function _4(a,c,h){return h=typeof h=="function"?h:n,ov(a,c,bv(c),h)}function E4(a){return T_(a)&&a!=+a}function C4(a){if(lO(a))throw new Ge(i);return xS(a)}function $4(a){return a===null}function R4(a){return a==null}function T_(a){return typeof a=="number"||Bt(a)&&On(a)==Yt}function qc(a){if(!Bt(a)||On(a)!=Jt)return!1;var c=yf(a);if(c===null)return!0;var h=yt.call(c,"constructor")&&c.constructor;return typeof h=="function"&&h instanceof h&&pf.call(h)==_6}var Mv=Vb?sr(Vb):$7;function P4(a){return R_(a)&&a>=-G&&a<=G}var k_=Wb?sr(Wb):R7;function Kf(a){return typeof a=="string"||!Ye(a)&&Bt(a)&&On(a)==Se}function lr(a){return typeof a=="symbol"||Bt(a)&&On(a)==pe}var rl=Hb?sr(Hb):P7;function T4(a){return a===n}function k4(a){return Bt(a)&&Sn(a)==Te}function A4(a){return Bt(a)&&On(a)==ut}var M4=If(sv),O4=If(function(a,c){return a<=c});function A_(a){if(!a)return[];if(Wn(a))return Kf(a)?ro(a):Vn(a);if(Fc&&a[Fc])return d6(a[Fc]());var c=Sn(a),h=c==st?Gg:c==ee?df:ol;return h(a)}function di(a){if(!a)return a===0?a:0;if(a=Or(a),a===J||a===-J){var c=a<0?-1:1;return c*Z}return a===a?a:0}function qe(a){var c=di(a),h=c%1;return c===c?h?c-h:c:0}function M_(a){return a?qs(qe(a),0,le):0}function Or(a){if(typeof a=="number")return a;if(lr(a))return Q;if(Dt(a)){var c=typeof a.valueOf=="function"?a.valueOf():a;a=Dt(c)?c+"":c}if(typeof a!="string")return a===0?a:+a;a=qb(a);var h=pM.test(a);return h||gM.test(a)?YM(a.slice(2),h?2:8):hM.test(a)?Q:+a}function O_(a){return To(a,Hn(a))}function N4(a){return a?qs(qe(a),-G,G):a===0?a:0}function vt(a){return a==null?"":ar(a)}var D4=el(function(a,c){if(Xc(c)||Wn(c)){To(c,ln(c),a);return}for(var h in c)yt.call(c,h)&&Vc(a,h,c[h])}),N_=el(function(a,c){To(c,Hn(c),a)}),Gf=el(function(a,c,h,w){To(c,Hn(c),a,w)}),I4=el(function(a,c,h,w){To(c,ln(c),a,w)}),L4=li(Jg);function F4(a,c){var h=Ja(a);return c==null?h:fS(h,c)}var 
j4=et(function(a,c){a=St(a);var h=-1,w=c.length,$=w>2?c[2]:n;for($&&Nn(c[0],c[1],$)&&(w=1);++h1),T}),To(a,wv(a),h),w&&(h=kr(h,p|g|y,Z7));for(var $=c.length;$--;)dv(h,c[$]);return h});function r8(a,c){return I_(a,Wf(Le(c)))}var o8=li(function(a,c){return a==null?{}:A7(a,c)});function I_(a,c){if(a==null)return{};var h=Mt(wv(a),function(w){return[w]});return c=Le(c),RS(a,h,function(w,$){return c(w,$[0])})}function i8(a,c,h){c=Ki(c,a);var w=-1,$=c.length;for($||($=1,a=n);++w<$;){var T=a==null?n:a[ko(c[w])];T===n&&(w=$,T=h),a=ui(T)?T.call(a):T}return a}function s8(a,c,h){return a==null?a:Gc(a,c,h)}function a8(a,c,h,w){return w=typeof w=="function"?w:n,a==null?a:Gc(a,c,h,w)}var L_=XS(ln),F_=XS(Hn);function l8(a,c,h){var w=Ye(a),$=w||Yi(a)||rl(a);if(c=Le(c,4),h==null){var T=a&&a.constructor;$?h=w?new T:[]:Dt(a)?h=ui(T)?Ja(yf(a)):{}:h={}}return($?Rr:Po)(a,function(M,D,B){return c(h,M,D,B)}),h}function c8(a,c){return a==null?!0:dv(a,c)}function u8(a,c,h){return a==null?a:MS(a,c,pv(h))}function d8(a,c,h,w){return w=typeof w=="function"?w:n,a==null?a:MS(a,c,pv(h),w)}function ol(a){return a==null?[]:Kg(a,ln(a))}function f8(a){return a==null?[]:Kg(a,Hn(a))}function h8(a,c,h){return h===n&&(h=c,c=n),h!==n&&(h=Or(h),h=h===h?h:0),c!==n&&(c=Or(c),c=c===c?c:0),qs(Or(a),c,h)}function p8(a,c,h){return c=di(c),h===n?(h=c,c=0):h=di(h),a=Or(a),x7(a,c,h)}function m8(a,c,h){if(h&&typeof h!="boolean"&&Nn(a,c,h)&&(c=h=n),h===n&&(typeof c=="boolean"?(h=c,c=n):typeof a=="boolean"&&(h=a,a=n)),a===n&&c===n?(a=0,c=1):(a=di(a),c===n?(c=a,a=0):c=di(c)),a>c){var w=a;a=c,c=w}if(h||a%1||c%1){var $=lS();return bn(a+$*(c-a+GM("1e-"+(($+"").length-1))),c)}return lv(a,c)}var g8=tl(function(a,c,h){return c=c.toLowerCase(),a+(h?j_(c):c)});function j_(a){return Dv(vt(a).toLowerCase())}function B_(a){return a=vt(a),a&&a.replace(yM,s6).replace(LM,"")}function v8(a,c,h){a=vt(a),c=ar(c);var w=a.length;h=h===n?w:qs(qe(h),0,w);var $=h;return h-=c.length,h>=0&&a.slice(h,$)==c}function y8(a){return a=vt(a),a&&ef.test(a)?a.replace(Nc,a6):a}function w8(a){return a=vt(a),a&&Tg.test(a)?a.replace(Wa,"\\$&"):a}var x8=tl(function(a,c,h){return a+(h?"-":"")+c.toLowerCase()}),b8=tl(function(a,c,h){return a+(h?" ":"")+c.toLowerCase()}),S8=VS("toLowerCase");function _8(a,c,h){a=vt(a),c=qe(c);var w=c?Ya(a):0;if(!c||w>=c)return a;var $=(c-w)/2;return Df(Sf($),h)+a+Df(bf($),h)}function E8(a,c,h){a=vt(a),c=qe(c);var w=c?Ya(a):0;return c&&w>>0,h?(a=vt(a),a&&(typeof c=="string"||c!=null&&!Mv(c))&&(c=ar(c),!c&&Ga(a))?Gi(ro(a),0,h):a.split(c,h)):[]}var A8=tl(function(a,c,h){return a+(h?" ":"")+Dv(c)});function M8(a,c,h){return a=vt(a),h=h==null?0:qs(qe(h),0,a.length),c=ar(c),a.slice(h,h+c.length)==c}function O8(a,c,h){var w=P.templateSettings;h&&Nn(a,c,h)&&(c=n),a=vt(a),c=Gf({},c,w,ZS);var $=Gf({},c.imports,w.imports,ZS),T=ln($),M=Kg($,T),D,B,oe=0,se=c.interpolate||sf,ce="__p += '",ve=Yg((c.escape||sf).source+"|"+se.source+"|"+(se===Va?of:sf).source+"|"+(c.evaluate||sf).source+"|$","g"),Pe="//# sourceURL="+(yt.call(c,"sourceURL")?(c.sourceURL+"").replace(/\s/g," "):"lodash.templateSources["+ ++UM+"]")+` +`;a.replace(ve,function(je,nt,at,cr,Dn,ur){return at||(at=cr),ce+=a.slice(oe,ur).replace(wM,l6),nt&&(D=!0,ce+=`' + +__e(`+nt+`) + +'`),Dn&&(B=!0,ce+=`'; +`+Dn+`; +__p += '`),at&&(ce+=`' + +((__t = (`+at+`)) == null ? 
'' : __t) + +'`),oe=ur+je.length,je}),ce+=`'; +`;var Fe=yt.call(c,"variable")&&c.variable;if(!Fe)ce=`with (obj) { +`+ce+` +} +`;else if(no.test(Fe))throw new Ge(l);ce=(B?ce.replace($g,""):ce).replace(qd,"$1").replace(Rg,"$1;"),ce="function("+(Fe||"obj")+`) { +`+(Fe?"":`obj || (obj = {}); +`)+"var __t, __p = ''"+(D?", __e = _.escape":"")+(B?`, __j = Array.prototype.join; +function print() { __p += __j.call(arguments, '') } +`:`; +`)+ce+`return __p +}`;var Qe=U_(function(){return pt(T,Pe+"return "+ce).apply(n,M)});if(Qe.source=ce,Av(Qe))throw Qe;return Qe}function N8(a){return vt(a).toLowerCase()}function D8(a){return vt(a).toUpperCase()}function I8(a,c,h){if(a=vt(a),a&&(h||c===n))return qb(a);if(!a||!(c=ar(c)))return a;var w=ro(a),$=ro(c),T=Qb(w,$),M=Jb(w,$)+1;return Gi(w,T,M).join("")}function L8(a,c,h){if(a=vt(a),a&&(h||c===n))return a.slice(0,tS(a)+1);if(!a||!(c=ar(c)))return a;var w=ro(a),$=Jb(w,ro(c))+1;return Gi(w,0,$).join("")}function F8(a,c,h){if(a=vt(a),a&&(h||c===n))return a.replace(Ic,"");if(!a||!(c=ar(c)))return a;var w=ro(a),$=Qb(w,ro(c));return Gi(w,$).join("")}function j8(a,c){var h=H,w=ie;if(Dt(c)){var $="separator"in c?c.separator:$;h="length"in c?qe(c.length):h,w="omission"in c?ar(c.omission):w}a=vt(a);var T=a.length;if(Ga(a)){var M=ro(a);T=M.length}if(h>=T)return a;var D=h-Ya(w);if(D<1)return w;var B=M?Gi(M,0,D).join(""):a.slice(0,D);if($===n)return B+w;if(M&&(D+=B.length-D),Mv($)){if(a.slice(D).search($)){var oe,se=B;for($.global||($=Yg($.source,vt(wb.exec($))+"g")),$.lastIndex=0;oe=$.exec(se);)var ce=oe.index;B=B.slice(0,ce===n?D:ce)}}else if(a.indexOf(ar($),D)!=D){var ve=B.lastIndexOf($);ve>-1&&(B=B.slice(0,ve))}return B+w}function B8(a){return a=vt(a),a&&Jd.test(a)?a.replace(Qd,m6):a}var z8=tl(function(a,c,h){return a+(h?" ":"")+c.toUpperCase()}),Dv=VS("toUpperCase");function z_(a,c,h){return a=vt(a),c=h?n:c,c===n?u6(a)?y6(a):t6(a):a.match(c)||[]}var U_=et(function(a,c){try{return ir(a,n,c)}catch(h){return Av(h)?h:new Ge(h)}}),U8=li(function(a,c){return Rr(c,function(h){h=ko(h),si(a,h,Tv(a[h],a))}),a});function V8(a){var c=a==null?0:a.length,h=Le();return a=c?Mt(a,function(w){if(typeof w[1]!="function")throw new Pr(s);return[h(w[0]),w[1]]}):[],et(function(w){for(var $=-1;++$G)return[];var h=le,w=bn(a,le);c=Le(c),a-=le;for(var $=Hg(w,c);++h0||c<0)?new ot(h):(a<0?h=h.takeRight(-a):a&&(h=h.drop(a)),c!==n&&(c=qe(c),h=c<0?h.dropRight(-c):h.take(c-a)),h)},ot.prototype.takeRightWhile=function(a){return this.reverse().takeWhile(a).reverse()},ot.prototype.toArray=function(){return this.take(le)},Po(ot.prototype,function(a,c){var h=/^(?:filter|find|map|reject)|While$/.test(c),w=/^(?:head|last)$/.test(c),$=P[w?"take"+(c=="last"?"Right":""):c],T=w||/^find/.test(c);$&&(P.prototype[c]=function(){var M=this.__wrapped__,D=w?[1]:arguments,B=M instanceof ot,oe=D[0],se=B||Ye(M),ce=function(nt){var at=$.apply(P,zi([nt],D));return w&&ve?at[0]:at};se&&h&&typeof oe=="function"&&oe.length!=1&&(B=se=!1);var ve=this.__chain__,Pe=!!this.__actions__.length,Fe=T&&!ve,Qe=B&&!Pe;if(!T&&se){M=Qe?M:new ot(this);var je=a.apply(M,D);return je.__actions__.push({func:Bf,args:[ce],thisArg:n}),new Tr(je,ve)}return Fe&&Qe?a.apply(this,D):(je=this.thru(ce),Fe?w?je.value()[0]:je.value():je)})}),Rr(["pop","push","shift","sort","splice","unshift"],function(a){var c=ff[a],h=/^(?:push|sort|unshift)$/.test(a)?"tap":"thru",w=/^(?:pop|shift)$/.test(a);P.prototype[a]=function(){var $=arguments;if(w&&!this.__chain__){var T=this.value();return c.apply(Ye(T)?T:[],$)}return this[h](function(M){return 
c.apply(Ye(M)?M:[],$)})}}),Po(ot.prototype,function(a,c){var h=P[c];if(h){var w=h.name+"";yt.call(Qa,w)||(Qa[w]=[]),Qa[w].push({name:c,func:h})}}),Qa[Of(n,_).name]=[{name:"wrapper",func:n}],ot.prototype.clone=B6,ot.prototype.reverse=z6,ot.prototype.value=U6,P.prototype.at=vN,P.prototype.chain=yN,P.prototype.commit=wN,P.prototype.next=xN,P.prototype.plant=SN,P.prototype.reverse=_N,P.prototype.toJSON=P.prototype.valueOf=P.prototype.value=EN,P.prototype.first=P.prototype.head,Fc&&(P.prototype[Fc]=bN),P},Xa=w6();Gs?((Gs.exports=Xa)._=Xa,Lg._=Xa):fn._=Xa}).call(Qc)})(Vp,Vp.exports);var wH=Vp.exports;const xH=pm(wH);function Gr({width:e,height:t}){return e/t}function Wp(e,t=0){const n=10**t;return Math.round((e+Number.EPSILON)*n)/n}function bH(e){return(t,n)=>e(n)-e(t)}class SH{constructor(t){this.comparator=t,this.heap=[],this.n=0}greater(t,n){return this.comparator(this.heap[t],this.heap[n])<0}swap(t,n){const r=this.heap[t];this.heap[t]=this.heap[n],this.heap[n]=r}swim(t){let n=t,r=Math.floor(n/2);for(;n>1&&this.greater(r,n);)this.swap(r,n),n=r,r=Math.floor(n/2)}sink(t){let n=t,r=n*2;for(;r<=this.n&&(rl.weight));for(s.push({id:t,weight:0});s.size()>0;){const{id:l,weight:u}=s.pop();if(!o.has(l)){const f=e(l);o.add(l),f.forEach((m,p)=>{const g=u+m,y=r.get(p),x=i.get(p);(x===void 0||x>g&&(x/g>1.005||y!==void 0&&yMath.min(Gr(i),o),Number.MAX_VALUE);return Wp(n/t/r)+2}function zT(e,t,n,r){const o=t-(e.length-1)*n-2*r*e.length,i=e.reduce((s,l)=>s+Gr(l),0);return o/i}function RH(e,t,n,r,o,i,s){const l=e.slice(t,n),u=zT(l,r,i,s);return u>0?(u-o)**2*l.length:void 0}function PH({photos:e,layoutOptions:t,targetRowHeight:n,limitNodeSearch:r,rowConstraints:o}){return i=>{var s,l;const{containerWidth:u,spacing:f,padding:m}=t,p=new Map;p.set(i,0);const g=(s=o==null?void 0:o.minPhotos)!=null?s:1,y=Math.min(r,(l=o==null?void 0:o.maxPhotos)!=null?l:1/0);for(let x=i+g;xy);x+=1){const S=RH(e,i,x,u,n,f,m);if(S===void 0)break;p.set(x,S)}return p}}function TH({photos:e,layoutOptions:t}){const{spacing:n,padding:r,containerWidth:o,targetRowHeight:i,rowConstraints:s}=t,l=$H({photos:e,containerWidth:o,targetRowHeight:i}),u=PH({photos:e,layoutOptions:t,targetRowHeight:i,limitNodeSearch:l,rowConstraints:s}),f=CH(u,0,e.length);if(f===void 0)return;const m=[];for(let p=1;p({photo:x,index:S})).slice(f[p-1],f[p]),y=zT(g.map(({photo:x})=>x),o,n,r);m.push(g.map(({photo:x,index:S},E)=>({photo:x,layout:{height:y,width:y*Gr(x),index:S,photoIndex:E,photosCount:g.length}})))}return m}function ig(...e){return[...e].filter(t=>!!t).join(" ")}function UT(e,{width:t,photosCount:n},{spacing:r,padding:o,containerWidth:i}){const s=r*(n-1)+2*o*n;return`calc((${e} - ${s}px) / ${Wp((i-s)/t,5)})`}function kH(e,t){return t.layout!=="rows"?`calc(100% - ${2*t.padding}px)`:UT("100%",e,t)}function f2(e,t,n){var r,o;return UT((o=(r=e.match(/calc\((.*)\)/))==null?void 0:r[1])!=null?o:e,t,n)}function AH(e,t,n){let r,o;const i=e.srcSet||e.images;return i&&i.length>0&&(r=i.concat(i.find(({width:s})=>s===e.width)?[]:[{src:e.src,width:e.width,height:e.height}]).sort((s,l)=>s.width-l.width).map(s=>`${s.src} ${s.width}w`).join(", ")),n.sizes?o=(n.sizes.sizes||[]).map(({viewport:s,size:l})=>`${s} ${f2(l,t,n)}`).concat(f2(n.sizes.size,t,n)).join(", "):r&&(o=`${Math.ceil(t.width/n.containerWidth*100)}vw`),{srcSet:r,sizes:o}}function Bx(e){var t,n;const{photo:r,layout:o,layoutOptions:i,imageProps:{style:s,className:l,...u}={},renderPhoto:f}=e,{onClick:m}=i,p={display:"block",boxSizing:"content-box",width:kH(o,i),height:"auto",aspectRatio:`${r.width} / 
${r.height}`,...i.padding?{padding:`${i.padding}px`}:null,...(i.layout==="columns"||i.layout==="masonry")&&o.photoIndex{m({event:E,photo:r,index:o.index})}:void 0,y={src:r.src,alt:(t=r.alt)!=null?t:"",title:r.title,onClick:g,style:p,className:ig("react-photo-album--photo",l),loading:"lazy",decoding:"async",...AH(r,o,i),...u},x=E=>{const{src:_,alt:b,srcSet:C,sizes:R,style:k,...O}=y;return d.createElement("img",{alt:b,...C?{srcSet:C,sizes:R}:null,src:_,style:E!=null&&E.wrapped?{display:"block",width:"100%",height:"100%"}:k,...O})},S=(({display:E,boxSizing:_,width:b,aspectRatio:C,padding:R,marginBottom:k,cursor:O})=>({display:E,boxSizing:_,width:b,aspectRatio:C,padding:R,marginBottom:k,cursor:O}))(p);return d.createElement(d.Fragment,null,(n=f==null?void 0:f({photo:r,layout:o,layoutOptions:i,imageProps:y,renderDefaultPhoto:x,wrapperStyle:S}))!=null?n:x())}function MH({rowContainerProps:e,children:t}){return d.createElement("div",{...e},t)}function OH(e){const{layoutOptions:t,rowIndex:n,rowsCount:r,renderRowContainer:o,rowContainerProps:{style:i,className:s,...l}={},children:u}=e,f={className:ig("react-photo-album--row",s),style:{display:"flex",flexDirection:"row",flexWrap:"nowrap",alignItems:"flex-start",justifyContent:"space-between",...nd.createElement(OH,{key:`row-${f}`,layoutOptions:n,rowIndex:f,rowsCount:l.length,renderRowContainer:o,rowContainerProps:s},u.map(({photo:m,layout:p})=>d.createElement(Bx,{key:m.key||m.src,photo:m,layout:p,layoutOptions:n,renderPhoto:r,imageProps:i}))))):null}function DH(e,t,n,r){const o=new Map,i=new Set;i.add(n);for(let s=0;s{const f=s>0?o.get(u)[s].weight:0;e(u).forEach(({neighbor:m,weight:p})=>{let g=o.get(m);g||(g=[],o.set(m,g));const y=f+p,x=g[s+1];(!x||x.weight>y&&(x.weight/y>1.0001||u0;i-=1)o=e.get(o)[i].node,r.push(o);return r.reverse()}function LH(e,t,n,r){return IH(DH(e,t,n,r),t,r)}function FH({photos:e,spacing:t,padding:n,targetColumnWidth:r,targetColumnHeight:o}){return i=>{const s=[],l=o*1.5;let u=r/Gr(e[i])+2*n;for(let f=i+1;fl||f===e.length));f+=1)u+=r/Gr(e[f])+t+2*n;return s}}function h2({path:e,photos:t,containerWidth:n,columnsGaps:r,columnsRatios:o,spacing:i,padding:s}){const l=[],u=o.reduce((f,m)=>f+m,0);for(let f=0;f({photo:y,index:x})).slice(e[f],e[f+1]),p=o.reduce((y,x,S)=>y+(r[f]-r[S])*x,0),g=(n-(e.length-2)*i-2*(e.length-1)*s-p)*o[f]/u;l.push(m.map(({photo:y,index:x},S)=>({photo:y,layout:{width:g,height:g/Gr(y),index:x,photoIndex:S,photosCount:m.length}})))}return l}function jH({photos:e,layoutOptions:t,targetColumnWidth:n}){const{columns:r,spacing:o,padding:i,containerWidth:s}=t,l=[],u=[];if(e.length<=r){const y=e.length>0?e.reduce((S,E)=>S+Gr(E),0)/e.length:1;for(let S=0;SMath.min(E,e.length)),photos:e,columnsRatios:u,columnsGaps:l,containerWidth:s,spacing:o,padding:i});return{columnsGaps:l,columnsRatios:u,columnsModel:x}}const f=(e.reduce((y,x)=>y+n/Gr(x),0)+o*(e.length-r)+2*i*e.length)/r,m=FH({photos:e,targetColumnWidth:n,targetColumnHeight:f,spacing:o,padding:i}),p=LH(m,r,0,e.length);for(let y=0;yS+1/Gr(E),0)}const g=h2({path:p,photos:e,columnsRatios:u,columnsGaps:l,containerWidth:s,spacing:o,padding:i});return{columnsGaps:l,columnsRatios:u,columnsModel:g}}function VT(e){const{photos:t,layoutOptions:n}=e,{columns:r,spacing:o,padding:i,containerWidth:s}=n,l=(s-o*(r-1)-2*i*r)/r,{columnsGaps:u,columnsRatios:f,columnsModel:m}=jH({photos:t,layoutOptions:n,targetColumnWidth:l});return m.findIndex(p=>p.findIndex(({layout:{width:g,height:y}})=>g<0||y<0)>=0)>=0?r>1?VT({photos:t,layoutOptions:{...n,columns:r-1}}):void 
0:{columnsModel:m,columnsGaps:u,columnsRatios:f}}function BH({photos:e,layoutOptions:t}){return VT({photos:e,layoutOptions:t})}function zH({columnContainerProps:e,children:t}){return d.createElement("div",{...e},t)}function UH(e){const{layoutOptions:t,columnIndex:n,columnsCount:r,columnsGaps:o,columnsRatios:i}=e,{layout:s,spacing:l,padding:u}=t;if(s==="masonry"||!o||!i)return`calc((100% - ${l*(r-1)}px) / ${r})`;const f=i.reduce((p,g)=>p+g,0),m=i.reduce((p,g,y)=>p+(o[n]-o[y])*g,0);return`calc((100% - ${Wp((r-1)*l+2*r*u+m,3)}px) * ${Wp(i[n]/f,5)} + ${2*u}px)`}function WT(e){const{layoutOptions:t,renderColumnContainer:n,children:r,columnContainerProps:{style:o,className:i,...s}={},...l}=e,u={className:ig("react-photo-album--column",i),style:{display:"flex",flexDirection:"column",flexWrap:"nowrap",alignItems:"flex-start",width:UH(e),justifyContent:t.layout==="columns"?"space-between":"flex-start",...o},...s};return d.createElement(d.Fragment,null,(n??zH)({layoutOptions:t,columnContainerProps:u,children:r,...l}))}function VH(e){const{photos:t,layoutOptions:n,renderPhoto:r,renderColumnContainer:o,componentsProps:{imageProps:i,columnContainerProps:s}}=e,l=BH({photos:t,layoutOptions:n});if(!l)return null;const{columnsModel:u,columnsRatios:f,columnsGaps:m}=l;return d.createElement(d.Fragment,null,u.map((p,g)=>d.createElement(WT,{key:`column-${g}`,layoutOptions:n,columnIndex:g,columnsCount:u.length,columnsGaps:m,columnsRatios:f,renderColumnContainer:o,columnContainerProps:s},p.map(({photo:y,layout:x})=>d.createElement(Bx,{key:y.key||y.src,photo:y,layout:x,layoutOptions:n,renderPhoto:r,imageProps:i})))))}function HT(e){const{photos:t,layoutOptions:n}=e,{columns:r,spacing:o,padding:i,containerWidth:s}=n,l=(s-o*(r-1)-2*i*r)/r;if(l<=0)return r>1?HT({...e,layoutOptions:{...n,columns:r-1}}):void 0;const u=[];for(let m=0;m{const y=u.reduce((x,S,E)=>S[])).map(m=>m.map(({photo:p,index:g},y)=>({photo:p,layout:{width:l,height:l/Gr(p),index:g,photoIndex:y,photosCount:m.length}})))}function WH(e){const{photos:t,layoutOptions:n,renderPhoto:r,renderColumnContainer:o,componentsProps:{imageProps:i,columnContainerProps:s}}=e,l=HT({photos:t,layoutOptions:n});return l?d.createElement(d.Fragment,null,l.map((u,f)=>d.createElement(WT,{key:`masonry-column-${f}`,layoutOptions:n,columnsCount:l.length,columnIndex:f,renderColumnContainer:o,columnContainerProps:s},u.map(({photo:m,layout:p})=>d.createElement(Bx,{key:m.key||m.src,photo:m,layout:p,layoutOptions:n,renderPhoto:r,imageProps:i}))))):null}function HH({containerProps:e,children:t,containerRef:n}){return d.createElement("div",{ref:n,...e},t)}function KH(e){const{layout:t,renderContainer:n,children:r,containerRef:o,containerProps:{style:i,className:s,...l}={}}=e,u={className:ig("react-photo-album",`react-photo-album--${t}`,s),style:{display:"flex",flexWrap:"nowrap",justifyContent:"space-between",flexDirection:t==="rows"?"column":"row",...i},...l};return d.createElement(d.Fragment,null,(n??HH)({containerProps:u,containerRef:o,layout:t,children:r}))}function GH(e){const t=d.useRef(e);return(!e||!t.current||e.join()!==t.current.join())&&(t.current=e),t.current}function YH(e,{newContainerWidth:t,newScrollbarWidth:n}){const{containerWidth:r,scrollbarWidth:o}=e;return r!==void 0&&o!==void 0&&t!==void 0&&n!==void 0&&t>r&&t-r<=20&&n0){const r=[...t.filter(i=>i>0)].sort((i,s)=>s-i);r.push(Math.floor(r[r.length-1]/2));const o=n;n=r.find((i,s)=>i<=o||s===r.length-1)}return n}function 
ZH(e,t){const[{containerWidth:n},r]=d.useReducer(YH,{containerWidth:t}),o=d.useRef(null),i=d.useRef();return{containerRef:d.useCallback(l=>{var u;(u=i.current)==null||u.disconnect(),i.current=void 0,o.current=l;const f=()=>r({newContainerWidth:XH(o.current,e),newScrollbarWidth:window.innerWidth-document.documentElement.clientWidth});f(),l&&typeof ResizeObserver<"u"&&(i.current=new ResizeObserver(f),i.current.observe(l))},[e]),containerWidth:n}}const qH=Object.freeze([1200,600,300,0]);function zx(e,t){return typeof e=="function"?e(t):e}function KT(e,t){return typeof e<"u"?zx(e,t):void 0}function QH(e,t){const n=qH.findIndex(r=>r<=t);return zx(e[n>=0?n:0],t)}function Sh(e,t,n,r=0){const o=KT(e,t);return Math.round(Math.max(o===void 0?QH(n,t):o,r))}function JH({layout:e,onClick:t,containerWidth:n,targetRowHeight:r,rowConstraints:o,columns:i,spacing:s,padding:l,sizes:u}){return{layout:e,onClick:t,containerWidth:n,columns:Sh(i,n,[5,4,3,2],1),spacing:Sh(s,n,[20,15,10,5]),padding:Sh(l,n,[0,0,0,0,0]),targetRowHeight:Sh(r,n,[f=>f/5,f=>f/4,f=>f/3,f=>f/2]),rowConstraints:KT(o,n),sizes:u}}function eK(e,t,n){const{photos:r,componentsProps:o}=e,i=zx(o,t)||{};if(n){const{layout:s,spacing:l,padding:u,rowConstraints:f}=n;if(s==="rows"){const{singleRowMaxHeight:m}=f||{};if(m){const p=Math.floor(r.reduce((g,{width:y,height:x})=>g+y/x*m-2*u,u*r.length*2+l*(r.length-1)));p>0&&(i.containerProps=i.containerProps||{},i.containerProps.style={maxWidth:p,...i.containerProps.style})}}}return i}function tK(e,t,n){const{photos:r,layout:o,renderPhoto:i,renderRowContainer:s,renderColumnContainer:l}=e,u={photos:r,renderPhoto:i,componentsProps:t};return o==="rows"?d.createElement(NH,{layoutOptions:n,renderRowContainer:s,...u}):o==="columns"?d.createElement(VH,{layoutOptions:n,renderColumnContainer:l,...u}):d.createElement(WH,{layoutOptions:n,renderColumnContainer:l,...u})}function nK(e){const{photos:t,layout:n,renderContainer:r,defaultContainerWidth:o,breakpoints:i}=e,{containerRef:s,containerWidth:l}=ZH(GH(i),o);if(!n||!["rows","columns","masonry"].includes(n)||!Array.isArray(t))return null;const u=l?JH({containerWidth:l,...e}):void 0,f=eK(e,l,u);return d.createElement(KH,{layout:n,containerRef:s,renderContainer:r,containerProps:f.containerProps},u&&tK(e,f,u))}function rK({title:e,titleId:t,...n},r){return d.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),e?d.createElement("title",{id:t},e):null,d.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3 4.5h14.25M3 9h9.75M3 13.5h9.75m4.5-4.5v12m0 0l-3.75-3.75M17.25 21L21 17.25"}))}const oK=d.forwardRef(rK),iK=oK;function sK({title:e,titleId:t,...n},r){return d.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),e?d.createElement("title",{id:t},e):null,d.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M3 4.5h14.25M3 9h9.75M3 13.5h5.25m5.25-.75L17.25 9m0 0L21 12.75M17.25 9v12"}))}const aK=d.forwardRef(sK),lK=aK;function cK({title:e,titleId:t,...n},r){return d.createElement("svg",Object.assign({xmlns:"http://www.w3.org/2000/svg",fill:"none",viewBox:"0 0 24 
24",strokeWidth:1.5,stroke:"currentColor","aria-hidden":"true",ref:r,"aria-labelledby":t},n),e?d.createElement("title",{id:t},e):null,d.createElement("path",{strokeLinecap:"round",strokeLinejoin:"round",d:"M15.042 21.672L13.684 16.6m0 0l-2.51 2.225.569-9.47 5.227 7.917-3.286-.672zM12 2.25V4.5m5.834.166l-1.591 1.591M20.25 10.5H18M7.757 14.743l-1.59 1.59M6 10.5H3.75m4.007-4.243l-1.59-1.59"}))}const uK=d.forwardRef(cK),dK=uK;function ki(e){return Array.isArray?Array.isArray(e):XT(e)==="[object Array]"}const fK=1/0;function hK(e){if(typeof e=="string")return e;let t=e+"";return t=="0"&&1/e==-fK?"-0":t}function pK(e){return e==null?"":hK(e)}function Ho(e){return typeof e=="string"}function GT(e){return typeof e=="number"}function mK(e){return e===!0||e===!1||gK(e)&&XT(e)=="[object Boolean]"}function YT(e){return typeof e=="object"}function gK(e){return YT(e)&&e!==null}function mr(e){return e!=null}function L0(e){return!e.trim().length}function XT(e){return e==null?e===void 0?"[object Undefined]":"[object Null]":Object.prototype.toString.call(e)}const vK="Incorrect 'index' type",yK=e=>`Invalid value for key ${e}`,wK=e=>`Pattern length exceeds max of ${e}.`,xK=e=>`Missing ${e} property in key`,bK=e=>`Property 'weight' in key '${e}' must be a positive integer`,p2=Object.prototype.hasOwnProperty;class SK{constructor(t){this._keys=[],this._keyMap={};let n=0;t.forEach(r=>{let o=ZT(r);this._keys.push(o),this._keyMap[o.id]=o,n+=o.weight}),this._keys.forEach(r=>{r.weight/=n})}get(t){return this._keyMap[t]}keys(){return this._keys}toJSON(){return JSON.stringify(this._keys)}}function ZT(e){let t=null,n=null,r=null,o=1,i=null;if(Ho(e)||ki(e))r=e,t=m2(e),n=x1(e);else{if(!p2.call(e,"name"))throw new Error(xK("name"));const s=e.name;if(r=s,p2.call(e,"weight")&&(o=e.weight,o<=0))throw new Error(bK(s));t=m2(s),n=x1(s),i=e.getFn}return{path:t,id:n,weight:o,src:r,getFn:i}}function m2(e){return ki(e)?e:e.split(".")}function x1(e){return ki(e)?e.join("."):e}function _K(e,t){let n=[],r=!1;const o=(i,s,l)=>{if(mr(i))if(!s[l])n.push(i);else{let u=s[l];const f=i[u];if(!mr(f))return;if(l===s.length-1&&(Ho(f)||GT(f)||mK(f)))n.push(pK(f));else if(ki(f)){r=!0;for(let m=0,p=f.length;me.score===t.score?e.idx{this._keysMap[n.id]=r})}create(){this.isCreated||!this.docs.length||(this.isCreated=!0,Ho(this.docs[0])?this.docs.forEach((t,n)=>{this._addString(t,n)}):this.docs.forEach((t,n)=>{this._addObject(t,n)}),this.norm.clear())}add(t){const n=this.size();Ho(t)?this._addString(t,n):this._addObject(t,n)}removeAt(t){this.records.splice(t,1);for(let n=t,r=this.size();n{let s=o.getFn?o.getFn(t):this.getFn(t,o.path);if(mr(s)){if(ki(s)){let l=[];const u=[{nestedArrIndex:-1,value:s}];for(;u.length;){const{nestedArrIndex:f,value:m}=u.pop();if(mr(m))if(Ho(m)&&!L0(m)){let p={v:m,i:f,n:this.norm.get(m)};l.push(p)}else ki(m)&&m.forEach((p,g)=>{u.push({nestedArrIndex:g,value:p})})}r.$[i]=l}else if(Ho(s)&&!L0(s)){let l={v:s,n:this.norm.get(s)};r.$[i]=l}}}),this.records.push(r)}toJSON(){return{keys:this.keys,records:this.records}}}function qT(e,t,{getFn:n=He.getFn,fieldNormWeight:r=He.fieldNormWeight}={}){const o=new Ux({getFn:n,fieldNormWeight:r});return o.setKeys(e.map(ZT)),o.setSources(t),o.create(),o}function kK(e,{getFn:t=He.getFn,fieldNormWeight:n=He.fieldNormWeight}={}){const{keys:r,records:o}=e,i=new Ux({getFn:t,fieldNormWeight:n});return i.setKeys(r),i.setIndexRecords(o),i}function _h(e,{errors:t=0,currentLocation:n=0,expectedLocation:r=0,distance:o=He.distance,ignoreLocation:i=He.ignoreLocation}={}){const s=t/e.length;if(i)return 
s;const l=Math.abs(r-n);return o?s+l/o:l?1:s}function AK(e=[],t=He.minMatchCharLength){let n=[],r=-1,o=-1,i=0;for(let s=e.length;i=t&&n.push([r,o]),r=-1)}return e[i-1]&&i-r>=t&&n.push([r,i-1]),n}const ua=32;function MK(e,t,n,{location:r=He.location,distance:o=He.distance,threshold:i=He.threshold,findAllMatches:s=He.findAllMatches,minMatchCharLength:l=He.minMatchCharLength,includeMatches:u=He.includeMatches,ignoreLocation:f=He.ignoreLocation}={}){if(t.length>ua)throw new Error(wK(ua));const m=t.length,p=e.length,g=Math.max(0,Math.min(r,p));let y=i,x=g;const S=l>1||u,E=S?Array(p):[];let _;for(;(_=e.indexOf(t,x))>-1;){let A=_h(t,{currentLocation:_,expectedLocation:g,distance:o,ignoreLocation:f});if(y=Math.min(A,y),x=_+m,S){let I=0;for(;I=H;U-=1){let re=U-1,V=n[e.charAt(re)];if(S&&(E[re]=+!!V),K[U]=(K[U+1]<<1|1)&V,A&&(K[U]|=(b[U+1]|b[U])<<1|1|b[U+1]),K[U]&k&&(C=_h(t,{errors:A,currentLocation:re,expectedLocation:g,distance:o,ignoreLocation:f}),C<=y)){if(y=C,x=re,x<=g)break;H=Math.max(1,2*g-x)}}if(_h(t,{errors:A+1,currentLocation:g,expectedLocation:g,distance:o,ignoreLocation:f})>y)break;b=K}const O={isMatch:x>=0,score:Math.max(.001,C)};if(S){const A=AK(E,l);A.length?u&&(O.indices=A):O.isMatch=!1}return O}function OK(e){let t={};for(let n=0,r=e.length;n{this.chunks.push({pattern:g,alphabet:OK(g),startIndex:y})},p=this.pattern.length;if(p>ua){let g=0;const y=p%ua,x=p-y;for(;g{const{isMatch:_,score:b,indices:C}=MK(t,x,S,{location:o+E,distance:i,threshold:s,findAllMatches:l,minMatchCharLength:u,includeMatches:r,ignoreLocation:f});_&&(g=!0),p+=b,_&&C&&(m=[...m,...C])});let y={isMatch:g,score:g?p/this.chunks.length:1};return g&&r&&(y.indices=m),y}}class Us{constructor(t){this.pattern=t}static isMultiMatch(t){return g2(t,this.multiRegex)}static isSingleMatch(t){return g2(t,this.singleRegex)}search(){}}function g2(e,t){const n=e.match(t);return n?n[1]:null}class NK extends Us{constructor(t){super(t)}static get type(){return"exact"}static get multiRegex(){return/^="(.*)"$/}static get singleRegex(){return/^=(.*)$/}search(t){const n=t===this.pattern;return{isMatch:n,score:n?0:1,indices:[0,this.pattern.length-1]}}}class DK extends Us{constructor(t){super(t)}static get type(){return"inverse-exact"}static get multiRegex(){return/^!"(.*)"$/}static get singleRegex(){return/^!(.*)$/}search(t){const r=t.indexOf(this.pattern)===-1;return{isMatch:r,score:r?0:1,indices:[0,t.length-1]}}}class IK extends Us{constructor(t){super(t)}static get type(){return"prefix-exact"}static get multiRegex(){return/^\^"(.*)"$/}static get singleRegex(){return/^\^(.*)$/}search(t){const n=t.startsWith(this.pattern);return{isMatch:n,score:n?0:1,indices:[0,this.pattern.length-1]}}}class LK extends Us{constructor(t){super(t)}static get type(){return"inverse-prefix-exact"}static get multiRegex(){return/^!\^"(.*)"$/}static get singleRegex(){return/^!\^(.*)$/}search(t){const n=!t.startsWith(this.pattern);return{isMatch:n,score:n?0:1,indices:[0,t.length-1]}}}class FK extends Us{constructor(t){super(t)}static get type(){return"suffix-exact"}static get multiRegex(){return/^"(.*)"\$$/}static get singleRegex(){return/^(.*)\$$/}search(t){const n=t.endsWith(this.pattern);return{isMatch:n,score:n?0:1,indices:[t.length-this.pattern.length,t.length-1]}}}class jK extends Us{constructor(t){super(t)}static get type(){return"inverse-suffix-exact"}static get multiRegex(){return/^!"(.*)"\$$/}static get singleRegex(){return/^!(.*)\$$/}search(t){const n=!t.endsWith(this.pattern);return{isMatch:n,score:n?0:1,indices:[0,t.length-1]}}}class JT extends 
Us{constructor(t,{location:n=He.location,threshold:r=He.threshold,distance:o=He.distance,includeMatches:i=He.includeMatches,findAllMatches:s=He.findAllMatches,minMatchCharLength:l=He.minMatchCharLength,isCaseSensitive:u=He.isCaseSensitive,ignoreLocation:f=He.ignoreLocation}={}){super(t),this._bitapSearch=new QT(t,{location:n,threshold:r,distance:o,includeMatches:i,findAllMatches:s,minMatchCharLength:l,isCaseSensitive:u,ignoreLocation:f})}static get type(){return"fuzzy"}static get multiRegex(){return/^"(.*)"$/}static get singleRegex(){return/^(.*)$/}search(t){return this._bitapSearch.searchIn(t)}}class ek extends Us{constructor(t){super(t)}static get type(){return"include"}static get multiRegex(){return/^'"(.*)"$/}static get singleRegex(){return/^'(.*)$/}search(t){let n=0,r;const o=[],i=this.pattern.length;for(;(r=t.indexOf(this.pattern,n))>-1;)n=r+i,o.push([r,n-1]);const s=!!o.length;return{isMatch:s,score:s?0:1,indices:o}}}const b1=[NK,ek,IK,LK,jK,FK,DK,JT],v2=b1.length,BK=/ +(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)/,zK="|";function UK(e,t={}){return e.split(zK).map(n=>{let r=n.trim().split(BK).filter(i=>i&&!!i.trim()),o=[];for(let i=0,s=r.length;i!!(e[Hp.AND]||e[Hp.OR]),KK=e=>!!e[E1.PATH],GK=e=>!ki(e)&&YT(e)&&!C1(e),y2=e=>({[Hp.AND]:Object.keys(e).map(t=>({[t]:e[t]}))});function tk(e,t,{auto:n=!0}={}){const r=o=>{let i=Object.keys(o);const s=KK(o);if(!s&&i.length>1&&!C1(o))return r(y2(o));if(GK(o)){const u=s?o[E1.PATH]:i[0],f=s?o[E1.PATTERN]:o[u];if(!Ho(f))throw new Error(yK(u));const m={keyId:x1(u),pattern:f};return n&&(m.searcher=_1(f,t)),m}let l={children:[],operator:i[0]};return i.forEach(u=>{const f=o[u];ki(f)&&f.forEach(m=>{l.children.push(r(m))})}),l};return C1(e)||(e=y2(e)),r(e)}function YK(e,{ignoreFieldNorm:t=He.ignoreFieldNorm}){e.forEach(n=>{let r=1;n.matches.forEach(({key:o,norm:i,score:s})=>{const l=o?o.weight:null;r*=Math.pow(s===0&&l?Number.EPSILON:s,(l||1)*(t?1:i))}),n.score=r})}function XK(e,t){const n=e.matches;t.matches=[],mr(n)&&n.forEach(r=>{if(!mr(r.indices)||!r.indices.length)return;const{indices:o,value:i}=r;let s={indices:o,value:i};r.key&&(s.key=r.key.src),r.idx>-1&&(s.refIndex=r.idx),t.matches.push(s)})}function ZK(e,t){t.score=e.score}function qK(e,t,{includeMatches:n=He.includeMatches,includeScore:r=He.includeScore}={}){const o=[];return n&&o.push(XK),r&&o.push(ZK),e.map(i=>{const{idx:s}=i,l={item:t[s],refIndex:s};return o.length&&o.forEach(u=>{u(i,l)}),l})}class kc{constructor(t,n={},r){this.options={...He,...n},this.options.useExtendedSearch,this._keyStore=new SK(this.options.keys),this.setCollection(t,r)}setCollection(t,n){if(this._docs=t,n&&!(n instanceof Ux))throw new Error(vK);this._myIndex=n||qT(this.options.keys,this._docs,{getFn:this.options.getFn,fieldNormWeight:this.options.fieldNormWeight})}add(t){mr(t)&&(this._docs.push(t),this._myIndex.add(t))}remove(t=()=>!1){const n=[];for(let r=0,o=this._docs.length;r-1&&(u=u.slice(0,n)),qK(u,this._docs,{includeMatches:r,includeScore:o})}_searchStringList(t){const n=_1(t,this.options),{records:r}=this._myIndex,o=[];return r.forEach(({v:i,i:s,n:l})=>{if(!mr(i))return;const{isMatch:u,score:f,indices:m}=n.searchIn(i);u&&o.push({item:i,idx:s,matches:[{score:f,value:i,norm:l,indices:m}]})}),o}_searchLogical(t){const n=tk(t,this.options),r=(l,u,f)=>{if(!l.children){const{keyId:p,searcher:g}=l,y=this._findMatches({key:this._keyStore.get(p),value:this._myIndex.getValueForItemAtKeyId(u,p),searcher:g});return y&&y.length?[{idx:f,item:u,matches:y}]:[]}const m=[];for(let p=0,g=l.children.length;p{if(mr(l)){let 
f=r(n,l,u);f.length&&(i[u]||(i[u]={idx:u,item:l,matches:[]},s.push(i[u])),f.forEach(({matches:m})=>{i[u].matches.push(...m)}))}}),s}_searchObjectList(t){const n=_1(t,this.options),{keys:r,records:o}=this._myIndex,i=[];return o.forEach(({$:s,i:l})=>{if(!mr(s))return;let u=[];r.forEach((f,m)=>{u.push(...this._findMatches({key:f,value:s[m],searcher:n}))}),u.length&&i.push({idx:l,item:s,matches:u})}),i}_findMatches({key:t,value:n,searcher:r}){if(!mr(n))return[];let o=[];if(ki(n))n.forEach(({v:i,i:s,n:l})=>{if(!mr(i))return;const{isMatch:u,score:f,indices:m}=r.searchIn(i);u&&o.push({score:f,key:t,value:i,idx:s,norm:l,indices:m})});else{const{v:i,n:s}=n,{isMatch:l,score:u,indices:f}=r.searchIn(i);l&&o.push({score:u,key:t,value:i,norm:s,indices:f})}return o}}kc.version="7.0.0";kc.createIndex=qT;kc.parseIndex=kK;kc.config=He;kc.parseQuery=tk;HK(WK);function Wd(e){const t=e+"CollectionProvider",[n,r]=Tn(t),[o,i]=n(t,{collectionRef:{current:null},itemMap:new Map}),s=y=>{const{scope:x,children:S}=y,E=Be.useRef(null),_=Be.useRef(new Map).current;return Be.createElement(o,{scope:x,itemMap:_,collectionRef:E},S)},l=e+"CollectionSlot",u=Be.forwardRef((y,x)=>{const{scope:S,children:E}=y,_=i(l,S),b=Ve(x,_.collectionRef);return Be.createElement(Qo,{ref:b},E)}),f=e+"CollectionItemSlot",m="data-radix-collection-item",p=Be.forwardRef((y,x)=>{const{scope:S,children:E,..._}=y,b=Be.useRef(null),C=Ve(x,b),R=i(f,S);return Be.useEffect(()=>(R.itemMap.set(b,{ref:b,..._}),()=>void R.itemMap.delete(b))),Be.createElement(Qo,{[m]:"",ref:C},E)});function g(y){const x=i(e+"CollectionConsumer",y);return Be.useCallback(()=>{const E=x.collectionRef.current;if(!E)return[];const _=Array.from(E.querySelectorAll(`[${m}]`));return Array.from(x.itemMap.values()).sort((R,k)=>_.indexOf(R.ref.current)-_.indexOf(k.ref.current))},[x.collectionRef,x.itemMap])}return[{Provider:s,Slot:u,ItemSlot:p},g,r]}const QK=d.createContext(void 0);function Ac(e){const t=d.useContext(QK);return e||t||"ltr"}const F0="rovingFocusGroup.onEntryFocus",JK={bubbles:!1,cancelable:!0},Vx="RovingFocusGroup",[$1,nk,eG]=Wd(Vx),[tG,sg]=Tn(Vx,[eG]),[nG,rG]=tG(Vx),oG=d.forwardRef((e,t)=>d.createElement($1.Provider,{scope:e.__scopeRovingFocusGroup},d.createElement($1.Slot,{scope:e.__scopeRovingFocusGroup},d.createElement(iG,Y({},e,{ref:t}))))),iG=d.forwardRef((e,t)=>{const{__scopeRovingFocusGroup:n,orientation:r,loop:o=!1,dir:i,currentTabStopId:s,defaultCurrentTabStopId:l,onCurrentTabStopIdChange:u,onEntryFocus:f,...m}=e,p=d.useRef(null),g=Ve(t,p),y=Ac(i),[x=null,S]=eo({prop:s,defaultProp:l,onChange:u}),[E,_]=d.useState(!1),b=Lt(f),C=nk(n),R=d.useRef(!1),[k,O]=d.useState(0);return d.useEffect(()=>{const A=p.current;if(A)return A.addEventListener(F0,b),()=>A.removeEventListener(F0,b)},[b]),d.createElement(nG,{scope:n,orientation:r,dir:y,loop:o,currentTabStopId:x,onItemFocus:d.useCallback(A=>S(A),[S]),onItemShiftTab:d.useCallback(()=>_(!0),[]),onFocusableItemAdd:d.useCallback(()=>O(A=>A+1),[]),onFocusableItemRemove:d.useCallback(()=>O(A=>A-1),[])},d.createElement(Ae.div,Y({tabIndex:E||k===0?-1:0,"data-orientation":r},m,{ref:g,style:{outline:"none",...e.style},onMouseDown:fe(e.onMouseDown,()=>{R.current=!0}),onFocus:fe(e.onFocus,A=>{const I=!R.current;if(A.target===A.currentTarget&&I&&!E){const z=new CustomEvent(F0,JK);if(A.currentTarget.dispatchEvent(z),!z.defaultPrevented){const 
H=C().filter(re=>re.focusable),ie=H.find(re=>re.active),K=H.find(re=>re.id===x),U=[ie,K,...H].filter(Boolean).map(re=>re.ref.current);rk(U)}}R.current=!1}),onBlur:fe(e.onBlur,()=>_(!1))})))}),sG="RovingFocusGroupItem",aG=d.forwardRef((e,t)=>{const{__scopeRovingFocusGroup:n,focusable:r=!0,active:o=!1,tabStopId:i,...s}=e,l=tr(),u=i||l,f=rG(sG,n),m=f.currentTabStopId===u,p=nk(n),{onFocusableItemAdd:g,onFocusableItemRemove:y}=f;return d.useEffect(()=>{if(r)return g(),()=>y()},[r,g,y]),d.createElement($1.ItemSlot,{scope:n,id:u,focusable:r,active:o},d.createElement(Ae.span,Y({tabIndex:m?0:-1,"data-orientation":f.orientation},s,{ref:t,onMouseDown:fe(e.onMouseDown,x=>{r?f.onItemFocus(u):x.preventDefault()}),onFocus:fe(e.onFocus,()=>f.onItemFocus(u)),onKeyDown:fe(e.onKeyDown,x=>{if(x.key==="Tab"&&x.shiftKey){f.onItemShiftTab();return}if(x.target!==x.currentTarget)return;const S=uG(x,f.orientation,f.dir);if(S!==void 0){x.preventDefault();let _=p().filter(b=>b.focusable).map(b=>b.ref.current);if(S==="last")_.reverse();else if(S==="prev"||S==="next"){S==="prev"&&_.reverse();const b=_.indexOf(x.currentTarget);_=f.loop?dG(_,b+1):_.slice(b+1)}setTimeout(()=>rk(_))}})})))}),lG={ArrowLeft:"prev",ArrowUp:"prev",ArrowRight:"next",ArrowDown:"next",PageUp:"first",Home:"first",PageDown:"last",End:"last"};function cG(e,t){return t!=="rtl"?e:e==="ArrowLeft"?"ArrowRight":e==="ArrowRight"?"ArrowLeft":e}function uG(e,t,n){const r=cG(e.key,n);if(!(t==="vertical"&&["ArrowLeft","ArrowRight"].includes(r))&&!(t==="horizontal"&&["ArrowUp","ArrowDown"].includes(r)))return lG[r]}function rk(e){const t=document.activeElement;for(const n of e)if(n===t||(n.focus(),document.activeElement!==t))return}function dG(e,t){return e.map((n,r)=>e[(t+r)%e.length])}const ok=oG,ik=aG,sk="Tabs",[fG,gte]=Tn(sk,[sg]),ak=sg(),[hG,Wx]=fG(sk),pG=d.forwardRef((e,t)=>{const{__scopeTabs:n,value:r,onValueChange:o,defaultValue:i,orientation:s="horizontal",dir:l,activationMode:u="automatic",...f}=e,m=Ac(l),[p,g]=eo({prop:r,onChange:o,defaultProp:i});return d.createElement(hG,{scope:n,baseId:tr(),value:p,onValueChange:g,orientation:s,dir:m,activationMode:u},d.createElement(Ae.div,Y({dir:m,"data-orientation":s},f,{ref:t})))}),mG="TabsList",gG=d.forwardRef((e,t)=>{const{__scopeTabs:n,loop:r=!0,...o}=e,i=Wx(mG,n),s=ak(n);return d.createElement(ok,Y({asChild:!0},s,{orientation:i.orientation,dir:i.dir,loop:r}),d.createElement(Ae.div,Y({role:"tablist","aria-orientation":i.orientation},o,{ref:t})))}),vG="TabsTrigger",yG=d.forwardRef((e,t)=>{const{__scopeTabs:n,value:r,disabled:o=!1,...i}=e,s=Wx(vG,n),l=ak(n),u=lk(s.baseId,r),f=ck(s.baseId,r),m=r===s.value;return d.createElement(ik,Y({asChild:!0},l,{focusable:!o,active:m}),d.createElement(Ae.button,Y({type:"button",role:"tab","aria-selected":m,"aria-controls":f,"data-state":m?"active":"inactive","data-disabled":o?"":void 0,disabled:o,id:u},i,{ref:t,onMouseDown:fe(e.onMouseDown,p=>{!o&&p.button===0&&p.ctrlKey===!1?s.onValueChange(r):p.preventDefault()}),onKeyDown:fe(e.onKeyDown,p=>{[" ","Enter"].includes(p.key)&&s.onValueChange(r)}),onFocus:fe(e.onFocus,()=>{const p=s.activationMode!=="manual";!m&&!o&&p&&s.onValueChange(r)})})))}),wG="TabsContent",xG=d.forwardRef((e,t)=>{const{__scopeTabs:n,value:r,forceMount:o,children:i,...s}=e,l=Wx(wG,n),u=lk(l.baseId,r),f=ck(l.baseId,r),m=r===l.value,p=d.useRef(m);return d.useEffect(()=>{const 
g=requestAnimationFrame(()=>p.current=!1);return()=>cancelAnimationFrame(g)},[]),d.createElement(xn,{present:o||m},({present:g})=>d.createElement(Ae.div,Y({"data-state":m?"active":"inactive","data-orientation":l.orientation,role:"tabpanel","aria-labelledby":u,hidden:!g,id:f,tabIndex:0},s,{ref:t,style:{...e.style,animationDuration:p.current?"0s":void 0}}),g&&i))});function lk(e,t){return`${e}-trigger-${t}`}function ck(e,t){return`${e}-content-${t}`}const bG=pG,uk=gG,dk=yG,fk=xG,hk=bG,Hx=d.forwardRef(({className:e,...t},n)=>v.jsx(uk,{ref:n,className:xe("inline-flex h-9 items-center justify-center rounded-lg bg-muted p-1 text-muted-foreground",e),tabIndex:-1,...t}));Hx.displayName=uk.displayName;const ys=d.forwardRef(({className:e,...t},n)=>v.jsx(dk,{ref:n,className:xe("inline-flex items-center justify-center whitespace-nowrap rounded-md px-3 py-1 text-sm font-medium ring-offset-background transition-all focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 disabled:pointer-events-none disabled:opacity-50 data-[state=active]:bg-background data-[state=active]:text-foreground data-[state=active]:shadow",e),tabIndex:-1,...t}));ys.displayName=dk.displayName;const Su=d.forwardRef(({className:e,...t},n)=>v.jsx(fk,{ref:n,className:xe("mt-2 ring-offset-background focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2",e),tabIndex:-1,...t}));Su.displayName=fk.displayName;function fd(e,[t,n]){return Math.min(n,Math.max(t,e))}function Kx(e){const t=d.useRef({value:e,previous:e});return d.useMemo(()=>(t.current.value!==e&&(t.current.previous=t.current.value,t.current.value=e),t.current.previous),[e])}const SG=[" ","Enter","ArrowUp","ArrowDown"],_G=[" ","Enter"],ag="Select",[lg,cg,EG]=Wd(ag),[Mc,vte]=Tn(ag,[EG,zs]),Gx=zs(),[CG,Ba]=Mc(ag),[$G,RG]=Mc(ag),PG=e=>{const{__scopeSelect:t,children:n,open:r,defaultOpen:o,onOpenChange:i,value:s,defaultValue:l,onValueChange:u,dir:f,name:m,autoComplete:p,disabled:g,required:y}=e,x=Gx(t),[S,E]=d.useState(null),[_,b]=d.useState(null),[C,R]=d.useState(!1),k=Ac(f),[O=!1,A]=eo({prop:r,defaultProp:o,onChange:i}),[I,z]=eo({prop:s,defaultProp:l,onChange:u}),H=d.useRef(null),ie=S?!!S.closest("form"):!0,[K,te]=d.useState(new Set),U=Array.from(K).map(re=>re.props.value).join(";");return d.createElement(Fd,x,d.createElement(CG,{required:y,scope:t,trigger:S,onTriggerChange:E,valueNode:_,onValueNodeChange:b,valueNodeHasChildren:C,onValueNodeHasChildrenChange:R,contentId:tr(),value:I,onValueChange:z,open:O,onOpenChange:A,dir:k,triggerPointerDownPosRef:H,disabled:g},d.createElement(lg.Provider,{scope:t},d.createElement($G,{scope:e.__scopeSelect,onNativeOptionAdd:d.useCallback(re=>{te(V=>new Set(V).add(re))},[]),onNativeOptionRemove:d.useCallback(re=>{te(V=>{const J=new Set(V);return J.delete(re),J})},[])},n)),ie?d.createElement(yk,{key:U,"aria-hidden":!0,required:y,tabIndex:-1,name:m,autoComplete:p,value:I,onChange:re=>z(re.target.value),disabled:g},I===void 0?d.createElement("option",{value:""}):null,Array.from(K)):null))},TG="SelectTrigger",kG=d.forwardRef((e,t)=>{const{__scopeSelect:n,disabled:r=!1,...o}=e,i=Gx(n),s=Ba(TG,n),l=s.disabled||r,u=Ve(t,s.onTriggerChange),f=cg(n),[m,p,g]=wk(x=>{const S=f().filter(b=>!b.disabled),E=S.find(b=>b.value===s.value),_=xk(S,x,E);_!==void 0&&s.onValueChange(_.value)}),y=()=>{l||(s.onOpenChange(!0),g())};return 
d.createElement(Gm,Y({asChild:!0},i),d.createElement(Ae.button,Y({type:"button",role:"combobox","aria-controls":s.contentId,"aria-expanded":s.open,"aria-required":s.required,"aria-autocomplete":"none",dir:s.dir,"data-state":s.open?"open":"closed",disabled:l,"data-disabled":l?"":void 0,"data-placeholder":vk(s.value)?"":void 0},o,{ref:u,onClick:fe(o.onClick,x=>{x.currentTarget.focus()}),onPointerDown:fe(o.onPointerDown,x=>{const S=x.target;S.hasPointerCapture(x.pointerId)&&S.releasePointerCapture(x.pointerId),x.button===0&&x.ctrlKey===!1&&(y(),s.triggerPointerDownPosRef.current={x:Math.round(x.pageX),y:Math.round(x.pageY)},x.preventDefault())}),onKeyDown:fe(o.onKeyDown,x=>{const S=m.current!=="";!(x.ctrlKey||x.altKey||x.metaKey)&&x.key.length===1&&p(x.key),!(S&&x.key===" ")&&SG.includes(x.key)&&(y(),x.preventDefault())})})))}),AG="SelectValue",MG=d.forwardRef((e,t)=>{const{__scopeSelect:n,className:r,style:o,children:i,placeholder:s="",...l}=e,u=Ba(AG,n),{onValueNodeHasChildrenChange:f}=u,m=i!==void 0,p=Ve(t,u.onValueNodeChange);return Pn(()=>{f(m)},[f,m]),d.createElement(Ae.span,Y({},l,{ref:p,style:{pointerEvents:"none"}}),vk(u.value)?d.createElement(d.Fragment,null,s):i)}),OG=d.forwardRef((e,t)=>{const{__scopeSelect:n,children:r,...o}=e;return d.createElement(Ae.span,Y({"aria-hidden":!0},o,{ref:t}),r||"▼")}),NG=e=>d.createElement(jd,Y({asChild:!0},e)),hc="SelectContent",DG=d.forwardRef((e,t)=>{const n=Ba(hc,e.__scopeSelect),[r,o]=d.useState();if(Pn(()=>{o(new DocumentFragment)},[]),!n.open){const i=r;return i?Bs.createPortal(d.createElement(pk,{scope:e.__scopeSelect},d.createElement(lg.Slot,{scope:e.__scopeSelect},d.createElement("div",null,e.children))),i):null}return d.createElement(IG,Y({},e,{ref:t}))}),gi=10,[pk,za]=Mc(hc),IG=d.forwardRef((e,t)=>{const{__scopeSelect:n,position:r="item-aligned",onCloseAutoFocus:o,onEscapeKeyDown:i,onPointerDownOutside:s,side:l,sideOffset:u,align:f,alignOffset:m,arrowPadding:p,collisionBoundary:g,collisionPadding:y,sticky:x,hideWhenDetached:S,avoidCollisions:E,..._}=e,b=Ba(hc,n),[C,R]=d.useState(null),[k,O]=d.useState(null),A=Ve(t,me=>R(me)),[I,z]=d.useState(null),[H,ie]=d.useState(null),K=cg(n),[te,U]=d.useState(!1),re=d.useRef(!1);d.useEffect(()=>{if(C)return ng(C)},[C]),Jm();const V=d.useCallback(me=>{const[be,...Ee]=K().map(ze=>ze.ref.current),[Oe]=Ee.slice(-1),Ie=document.activeElement;for(const ze of me)if(ze===Ie||(ze==null||ze.scrollIntoView({block:"nearest"}),ze===be&&k&&(k.scrollTop=0),ze===Oe&&k&&(k.scrollTop=k.scrollHeight),ze==null||ze.focus(),document.activeElement!==Ie))return},[K,k]),J=d.useCallback(()=>V([I,C]),[V,I,C]);d.useEffect(()=>{te&&J()},[te,J]);const{onOpenChange:G,triggerPointerDownPosRef:Z}=b;d.useEffect(()=>{if(C){let me={x:0,y:0};const be=Oe=>{var Ie,ze,ht,st;me={x:Math.abs(Math.round(Oe.pageX)-((Ie=(ze=Z.current)===null||ze===void 0?void 0:ze.x)!==null&&Ie!==void 0?Ie:0)),y:Math.abs(Math.round(Oe.pageY)-((ht=(st=Z.current)===null||st===void 0?void 0:st.y)!==null&&ht!==void 0?ht:0))}},Ee=Oe=>{me.x<=10&&me.y<=10?Oe.preventDefault():C.contains(Oe.target)||G(!1),document.removeEventListener("pointermove",be),Z.current=null};return Z.current!==null&&(document.addEventListener("pointermove",be),document.addEventListener("pointerup",Ee,{capture:!0,once:!0})),()=>{document.removeEventListener("pointermove",be),document.removeEventListener("pointerup",Ee,{capture:!0})}}},[C,G,Z]),d.useEffect(()=>{const me=()=>G(!1);return 
window.addEventListener("blur",me),window.addEventListener("resize",me),()=>{window.removeEventListener("blur",me),window.removeEventListener("resize",me)}},[G]);const[Q,le]=wk(me=>{const be=K().filter(Ie=>!Ie.disabled),Ee=be.find(Ie=>Ie.ref.current===document.activeElement),Oe=xk(be,me,Ee);Oe&&setTimeout(()=>Oe.ref.current.focus())}),L=d.useCallback((me,be,Ee)=>{const Oe=!re.current&&!Ee;(b.value!==void 0&&b.value===be||Oe)&&(z(me),Oe&&(re.current=!0))},[b.value]),ue=d.useCallback(()=>C==null?void 0:C.focus(),[C]),Ne=d.useCallback((me,be,Ee)=>{const Oe=!re.current&&!Ee;(b.value!==void 0&&b.value===be||Oe)&&ie(me)},[b.value]),Ke=r==="popper"?w2:LG,Me=Ke===w2?{side:l,sideOffset:u,align:f,alignOffset:m,arrowPadding:p,collisionBoundary:g,collisionPadding:y,sticky:x,hideWhenDetached:S,avoidCollisions:E}:{};return d.createElement(pk,{scope:n,content:C,viewport:k,onViewportChange:O,itemRefCallback:L,selectedItem:I,onItemLeave:ue,itemTextRefCallback:Ne,focusSelectedItem:J,selectedItemText:H,position:r,isPositioned:te,searchRef:Q},d.createElement(tg,{as:Qo,allowPinchZoom:!0},d.createElement(Qm,{asChild:!0,trapped:b.open,onMountAutoFocus:me=>{me.preventDefault()},onUnmountAutoFocus:fe(o,me=>{var be;(be=b.trigger)===null||be===void 0||be.focus({preventScroll:!0}),me.preventDefault()})},d.createElement($c,{asChild:!0,disableOutsidePointerEvents:!0,onEscapeKeyDown:i,onPointerDownOutside:s,onFocusOutside:me=>me.preventDefault(),onDismiss:()=>b.onOpenChange(!1)},d.createElement(Ke,Y({role:"listbox",id:b.contentId,"data-state":b.open?"open":"closed",dir:b.dir,onContextMenu:me=>me.preventDefault()},_,Me,{onPlaced:()=>U(!0),ref:A,style:{display:"flex",flexDirection:"column",outline:"none",..._.style},onKeyDown:fe(_.onKeyDown,me=>{const be=me.ctrlKey||me.altKey||me.metaKey;if(me.key==="Tab"&&me.preventDefault(),!be&&me.key.length===1&&le(me.key),["ArrowUp","ArrowDown","Home","End"].includes(me.key)){let Oe=K().filter(Ie=>!Ie.disabled).map(Ie=>Ie.ref.current);if(["ArrowUp","End"].includes(me.key)&&(Oe=Oe.slice().reverse()),["ArrowUp","ArrowDown"].includes(me.key)){const Ie=me.target,ze=Oe.indexOf(Ie);Oe=Oe.slice(ze+1)}setTimeout(()=>V(Oe)),me.preventDefault()}})}))))))}),LG=d.forwardRef((e,t)=>{const{__scopeSelect:n,onPlaced:r,...o}=e,i=Ba(hc,n),s=za(hc,n),[l,u]=d.useState(null),[f,m]=d.useState(null),p=Ve(t,A=>m(A)),g=cg(n),y=d.useRef(!1),x=d.useRef(!0),{viewport:S,selectedItem:E,selectedItemText:_,focusSelectedItem:b}=s,C=d.useCallback(()=>{if(i.trigger&&i.valueNode&&l&&f&&S&&E&&_){const A=i.trigger.getBoundingClientRect(),I=f.getBoundingClientRect(),z=i.valueNode.getBoundingClientRect(),H=_.getBoundingClientRect();if(i.dir!=="rtl"){const Ie=H.left-I.left,ze=z.left-Ie,ht=A.left-ze,st=A.width+ht,Yt=Math.max(st,I.width),rr=window.innerWidth-gi,Jt=fd(ze,[gi,rr-Yt]);l.style.minWidth=st+"px",l.style.left=Jt+"px"}else{const Ie=I.right-H.right,ze=window.innerWidth-z.right-Ie,ht=window.innerWidth-A.right-ze,st=A.width+ht,Yt=Math.max(st,I.width),rr=window.innerWidth-gi,Jt=fd(ze,[gi,rr-Yt]);l.style.minWidth=st+"px",l.style.right=Jt+"px"}const ie=g(),K=window.innerHeight-gi*2,te=S.scrollHeight,U=window.getComputedStyle(f),re=parseInt(U.borderTopWidth,10),V=parseInt(U.paddingTop,10),J=parseInt(U.borderBottomWidth,10),G=parseInt(U.paddingBottom,10),Z=re+V+te+G+J,Q=Math.min(E.offsetHeight*5,Z),le=window.getComputedStyle(S),L=parseInt(le.paddingTop,10),ue=parseInt(le.paddingBottom,10),Ne=A.top+A.height/2-gi,Ke=K-Ne,Me=E.offsetHeight/2,me=E.offsetTop+Me,be=re+V+me,Ee=Z-be;if(be<=Ne){const 
Ie=E===ie[ie.length-1].ref.current;l.style.bottom="0px";const ze=f.clientHeight-S.offsetTop-S.offsetHeight,ht=Math.max(Ke,Me+(Ie?ue:0)+ze+J),st=be+ht;l.style.height=st+"px"}else{const Ie=E===ie[0].ref.current;l.style.top="0px";const ht=Math.max(Ne,re+S.offsetTop+(Ie?L:0)+Me)+Ee;l.style.height=ht+"px",S.scrollTop=be-Ne+S.offsetTop}l.style.margin=`${gi}px 0`,l.style.minHeight=Q+"px",l.style.maxHeight=K+"px",r==null||r(),requestAnimationFrame(()=>y.current=!0)}},[g,i.trigger,i.valueNode,l,f,S,E,_,i.dir,r]);Pn(()=>C(),[C]);const[R,k]=d.useState();Pn(()=>{f&&k(window.getComputedStyle(f).zIndex)},[f]);const O=d.useCallback(A=>{A&&x.current===!0&&(C(),b==null||b(),x.current=!1)},[C,b]);return d.createElement(FG,{scope:n,contentWrapper:l,shouldExpandOnScrollRef:y,onScrollButtonChange:O},d.createElement("div",{ref:u,style:{display:"flex",flexDirection:"column",position:"fixed",zIndex:R}},d.createElement(Ae.div,Y({},o,{ref:p,style:{boxSizing:"border-box",maxHeight:"100%",...o.style}}))))}),w2=d.forwardRef((e,t)=>{const{__scopeSelect:n,align:r="start",collisionPadding:o=gi,...i}=e,s=Gx(n);return d.createElement(Ym,Y({},s,i,{ref:t,align:r,collisionPadding:o,style:{boxSizing:"border-box",...i.style,"--radix-select-content-transform-origin":"var(--radix-popper-transform-origin)","--radix-select-content-available-width":"var(--radix-popper-available-width)","--radix-select-content-available-height":"var(--radix-popper-available-height)","--radix-select-trigger-width":"var(--radix-popper-anchor-width)","--radix-select-trigger-height":"var(--radix-popper-anchor-height)"}}))}),[FG,Yx]=Mc(hc,{}),x2="SelectViewport",jG=d.forwardRef((e,t)=>{const{__scopeSelect:n,...r}=e,o=za(x2,n),i=Yx(x2,n),s=Ve(t,o.onViewportChange),l=d.useRef(0);return d.createElement(d.Fragment,null,d.createElement("style",{dangerouslySetInnerHTML:{__html:"[data-radix-select-viewport]{scrollbar-width:none;-ms-overflow-style:none;-webkit-overflow-scrolling:touch;}[data-radix-select-viewport]::-webkit-scrollbar{display:none}"}}),d.createElement(lg.Slot,{scope:n},d.createElement(Ae.div,Y({"data-radix-select-viewport":"",role:"presentation"},r,{ref:s,style:{position:"relative",flex:1,overflow:"auto",...r.style},onScroll:fe(r.onScroll,u=>{const f=u.currentTarget,{contentWrapper:m,shouldExpandOnScrollRef:p}=i;if(p!=null&&p.current&&m){const g=Math.abs(l.current-f.scrollTop);if(g>0){const y=window.innerHeight-gi*2,x=parseFloat(m.style.minHeight),S=parseFloat(m.style.height),E=Math.max(x,S);if(E0?C:0,m.style.justifyContent="flex-end")}}}l.current=f.scrollTop})}))))}),BG="SelectGroup",[zG,UG]=Mc(BG),VG=d.forwardRef((e,t)=>{const{__scopeSelect:n,...r}=e,o=tr();return d.createElement(zG,{scope:n,id:o},d.createElement(Ae.div,Y({role:"group","aria-labelledby":o},r,{ref:t})))}),WG="SelectLabel",HG=d.forwardRef((e,t)=>{const{__scopeSelect:n,...r}=e,o=UG(WG,n);return d.createElement(Ae.div,Y({id:o.id},r,{ref:t}))}),R1="SelectItem",[KG,mk]=Mc(R1),GG=d.forwardRef((e,t)=>{const{__scopeSelect:n,value:r,disabled:o=!1,textValue:i,...s}=e,l=Ba(R1,n),u=za(R1,n),f=l.value===r,[m,p]=d.useState(i??""),[g,y]=d.useState(!1),x=Ve(t,_=>{var b;return(b=u.itemRefCallback)===null||b===void 0?void 0:b.call(u,_,r,o)}),S=tr(),E=()=>{o||(l.onValueChange(r),l.onOpenChange(!1))};if(r==="")throw new Error("A must have a value prop that is not an empty string. 
This is because the Select value can be set to an empty string to clear the selection and show the placeholder.");return d.createElement(KG,{scope:n,value:r,disabled:o,textId:S,isSelected:f,onItemTextChange:d.useCallback(_=>{p(b=>{var C;return b||((C=_==null?void 0:_.textContent)!==null&&C!==void 0?C:"").trim()})},[])},d.createElement(lg.ItemSlot,{scope:n,value:r,disabled:o,textValue:m},d.createElement(Ae.div,Y({role:"option","aria-labelledby":S,"data-highlighted":g?"":void 0,"aria-selected":f&&g,"data-state":f?"checked":"unchecked","aria-disabled":o||void 0,"data-disabled":o?"":void 0,tabIndex:o?void 0:-1},s,{ref:x,onFocus:fe(s.onFocus,()=>y(!0)),onBlur:fe(s.onBlur,()=>y(!1)),onPointerUp:fe(s.onPointerUp,E),onPointerMove:fe(s.onPointerMove,_=>{if(o){var b;(b=u.onItemLeave)===null||b===void 0||b.call(u)}else _.currentTarget.focus({preventScroll:!0})}),onPointerLeave:fe(s.onPointerLeave,_=>{if(_.currentTarget===document.activeElement){var b;(b=u.onItemLeave)===null||b===void 0||b.call(u)}}),onKeyDown:fe(s.onKeyDown,_=>{var b;((b=u.searchRef)===null||b===void 0?void 0:b.current)!==""&&_.key===" "||(_G.includes(_.key)&&E(),_.key===" "&&_.preventDefault())})}))))}),Eh="SelectItemText",YG=d.forwardRef((e,t)=>{const{__scopeSelect:n,className:r,style:o,...i}=e,s=Ba(Eh,n),l=za(Eh,n),u=mk(Eh,n),f=RG(Eh,n),[m,p]=d.useState(null),g=Ve(t,_=>p(_),u.onItemTextChange,_=>{var b;return(b=l.itemTextRefCallback)===null||b===void 0?void 0:b.call(l,_,u.value,u.disabled)}),y=m==null?void 0:m.textContent,x=d.useMemo(()=>d.createElement("option",{key:u.value,value:u.value,disabled:u.disabled},y),[u.disabled,u.value,y]),{onNativeOptionAdd:S,onNativeOptionRemove:E}=f;return Pn(()=>(S(x),()=>E(x)),[S,E,x]),d.createElement(d.Fragment,null,d.createElement(Ae.span,Y({id:u.textId},i,{ref:g})),u.isSelected&&s.valueNode&&!s.valueNodeHasChildren?Bs.createPortal(i.children,s.valueNode):null)}),XG="SelectItemIndicator",ZG=d.forwardRef((e,t)=>{const{__scopeSelect:n,...r}=e;return mk(XG,n).isSelected?d.createElement(Ae.span,Y({"aria-hidden":!0},r,{ref:t})):null}),b2="SelectScrollUpButton",qG=d.forwardRef((e,t)=>{const n=za(b2,e.__scopeSelect),r=Yx(b2,e.__scopeSelect),[o,i]=d.useState(!1),s=Ve(t,r.onScrollButtonChange);return Pn(()=>{if(n.viewport&&n.isPositioned){let f=function(){const m=u.scrollTop>0;i(m)};var l=f;const u=n.viewport;return f(),u.addEventListener("scroll",f),()=>u.removeEventListener("scroll",f)}},[n.viewport,n.isPositioned]),o?d.createElement(gk,Y({},e,{ref:s,onAutoScroll:()=>{const{viewport:l,selectedItem:u}=n;l&&u&&(l.scrollTop=l.scrollTop-u.offsetHeight)}})):null}),S2="SelectScrollDownButton",QG=d.forwardRef((e,t)=>{const n=za(S2,e.__scopeSelect),r=Yx(S2,e.__scopeSelect),[o,i]=d.useState(!1),s=Ve(t,r.onScrollButtonChange);return Pn(()=>{if(n.viewport&&n.isPositioned){let f=function(){const m=u.scrollHeight-u.clientHeight,p=Math.ceil(u.scrollTop)u.removeEventListener("scroll",f)}},[n.viewport,n.isPositioned]),o?d.createElement(gk,Y({},e,{ref:s,onAutoScroll:()=>{const{viewport:l,selectedItem:u}=n;l&&u&&(l.scrollTop=l.scrollTop+u.offsetHeight)}})):null}),gk=d.forwardRef((e,t)=>{const{__scopeSelect:n,onAutoScroll:r,...o}=e,i=za("SelectScrollButton",n),s=d.useRef(null),l=cg(n),u=d.useCallback(()=>{s.current!==null&&(window.clearInterval(s.current),s.current=null)},[]);return d.useEffect(()=>()=>u(),[u]),Pn(()=>{var f;const m=l().find(p=>p.ref.current===document.activeElement);m==null||(f=m.ref.current)===null||f===void 
0||f.scrollIntoView({block:"nearest"})},[l]),d.createElement(Ae.div,Y({"aria-hidden":!0},o,{ref:t,style:{flexShrink:0,...o.style},onPointerDown:fe(o.onPointerDown,()=>{s.current===null&&(s.current=window.setInterval(r,50))}),onPointerMove:fe(o.onPointerMove,()=>{var f;(f=i.onItemLeave)===null||f===void 0||f.call(i),s.current===null&&(s.current=window.setInterval(r,50))}),onPointerLeave:fe(o.onPointerLeave,()=>{u()})}))}),JG=d.forwardRef((e,t)=>{const{__scopeSelect:n,...r}=e;return d.createElement(Ae.div,Y({"aria-hidden":!0},r,{ref:t}))});function vk(e){return e===""||e===void 0}const yk=d.forwardRef((e,t)=>{const{value:n,...r}=e,o=d.useRef(null),i=Ve(t,o),s=Kx(n);return d.useEffect(()=>{const l=o.current,u=window.HTMLSelectElement.prototype,m=Object.getOwnPropertyDescriptor(u,"value").set;if(s!==n&&m){const p=new Event("change",{bubbles:!0});m.call(l,n),l.dispatchEvent(p)}},[s,n]),d.createElement(Xm,{asChild:!0},d.createElement("select",Y({},r,{ref:i,defaultValue:n})))});yk.displayName="BubbleSelect";function wk(e){const t=Lt(e),n=d.useRef(""),r=d.useRef(0),o=d.useCallback(s=>{const l=n.current+s;t(l),function u(f){n.current=f,window.clearTimeout(r.current),f!==""&&(r.current=window.setTimeout(()=>u(""),1e3))}(l)},[t]),i=d.useCallback(()=>{n.current="",window.clearTimeout(r.current)},[]);return d.useEffect(()=>()=>window.clearTimeout(r.current),[]),[n,o,i]}function xk(e,t,n){const o=t.length>1&&Array.from(t).every(f=>f===t[0])?t[0]:t,i=n?e.indexOf(n):-1;let s=eY(e,Math.max(i,0));o.length===1&&(s=s.filter(f=>f!==n));const u=s.find(f=>f.textValue.toLowerCase().startsWith(o.toLowerCase()));return u!==n?u:void 0}function eY(e,t){return e.map((n,r)=>e[(t+r)%e.length])}const tY=PG,bk=kG,nY=MG,rY=OG,oY=NG,Sk=DG,iY=jG,sY=VG,_k=HG,Ek=GG,aY=YG,lY=ZG,Ck=qG,$k=QG,Rk=JG,yo=tY,zo=sY,wo=nY,Vr=d.forwardRef(({className:e,children:t,...n},r)=>v.jsxs(bk,{ref:r,className:xe("flex h-9 w-full items-center justify-between whitespace-nowrap rounded-md border border-input bg-transparent pl-2 pr-1 py-2 text-sm shadow-sm ring-offset-background placeholder:text-muted-foreground focus:outline-none focus:ring-1 focus:ring-ring disabled:cursor-not-allowed disabled:opacity-50 [&>span]:line-clamp-1",e),tabIndex:-1,...n,children:[t,v.jsx(rY,{asChild:!0,children:v.jsx(Jj,{className:"h-4 w-4 opacity-50"})})]}));Vr.displayName=bk.displayName;const Pk=d.forwardRef(({className:e,...t},n)=>v.jsx(Ck,{ref:n,className:xe("flex cursor-default items-center justify-center py-1",e),...t,children:v.jsx(sB,{})}));Pk.displayName=Ck.displayName;const Tk=d.forwardRef(({className:e,...t},n)=>v.jsx($k,{ref:n,className:xe("flex cursor-default items-center justify-center py-1",e),...t,children:v.jsx(nB,{})}));Tk.displayName=$k.displayName;const Wr=d.forwardRef(({className:e,children:t,position:n="popper",...r},o)=>v.jsx(oY,{children:v.jsxs(Sk,{ref:o,className:xe("relative z-50 max-h-96 min-w-[8rem] overflow-hidden rounded-md border bg-popover text-popover-foreground shadow-md data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2",n==="popper"&&"data-[side=bottom]:translate-y-1 data-[side=left]:-translate-x-1 data-[side=right]:translate-x-1 
data-[side=top]:-translate-y-1",e),position:n,onCloseAutoFocus:i=>i.preventDefault(),...r,children:[v.jsx(Pk,{}),v.jsx(iY,{className:xe("p-1",n==="popper"&&"h-[var(--radix-select-trigger-height)] w-full min-w-[var(--radix-select-trigger-width)]"),children:t}),v.jsx(Tk,{})]})}));Wr.displayName=Sk.displayName;const cY=d.forwardRef(({className:e,...t},n)=>v.jsx(_k,{ref:n,className:xe("px-2 py-1.5 text-sm font-semibold",e),...t}));cY.displayName=_k.displayName;const Hr=d.forwardRef(({className:e,children:t,...n},r)=>v.jsxs(Ek,{ref:r,className:xe("relative flex w-full cursor-default select-none items-center rounded-sm py-1.5 pl-2 pr-8 text-sm outline-none focus:bg-accent focus:text-accent-foreground data-[disabled]:pointer-events-none data-[disabled]:opacity-50",e),...n,children:[v.jsx("span",{className:"absolute right-2 flex h-3.5 w-3.5 items-center justify-center",children:v.jsx(lY,{children:v.jsx(EP,{className:"h-4 w-4"})})}),v.jsx(aY,{children:t})]}));Hr.displayName=Ek.displayName;const uY=d.forwardRef(({className:e,...t},n)=>v.jsx(Rk,{ref:n,className:xe("-mx-1 my-1 h-px bg-muted",e),...t}));uY.displayName=Rk.displayName;function dY(e,t){return d.useReducer((n,r)=>{const o=t[n][r];return o??n},e)}const kk="ScrollArea",[Ak,yte]=Tn(kk),[fY,to]=Ak(kk),hY=d.forwardRef((e,t)=>{const{__scopeScrollArea:n,type:r="hover",dir:o,scrollHideDelay:i=600,...s}=e,[l,u]=d.useState(null),[f,m]=d.useState(null),[p,g]=d.useState(null),[y,x]=d.useState(null),[S,E]=d.useState(null),[_,b]=d.useState(0),[C,R]=d.useState(0),[k,O]=d.useState(!1),[A,I]=d.useState(!1),z=Ve(t,ie=>u(ie)),H=Ac(o);return d.createElement(fY,{scope:n,type:r,dir:H,scrollHideDelay:i,scrollArea:l,viewport:f,onViewportChange:m,content:p,onContentChange:g,scrollbarX:y,onScrollbarXChange:x,scrollbarXEnabled:k,onScrollbarXEnabledChange:O,scrollbarY:S,onScrollbarYChange:E,scrollbarYEnabled:A,onScrollbarYEnabledChange:I,onCornerWidthChange:b,onCornerHeightChange:R},d.createElement(Ae.div,Y({dir:H},s,{ref:z,style:{position:"relative","--radix-scroll-area-corner-width":_+"px","--radix-scroll-area-corner-height":C+"px",...e.style}})))}),pY="ScrollAreaViewport",mY=d.forwardRef((e,t)=>{const{__scopeScrollArea:n,children:r,...o}=e,i=to(pY,n),s=d.useRef(null),l=Ve(t,s,i.onViewportChange);return d.createElement(d.Fragment,null,d.createElement("style",{dangerouslySetInnerHTML:{__html:"[data-radix-scroll-area-viewport]{scrollbar-width:none;-ms-overflow-style:none;-webkit-overflow-scrolling:touch;}[data-radix-scroll-area-viewport]::-webkit-scrollbar{display:none}"}}),d.createElement(Ae.div,Y({"data-radix-scroll-area-viewport":""},o,{ref:l,style:{overflowX:i.scrollbarXEnabled?"scroll":"hidden",overflowY:i.scrollbarYEnabled?"scroll":"hidden",...e.style}}),d.createElement("div",{ref:i.onContentChange,style:{minWidth:"100%",display:"table"}},r)))}),Ii="ScrollAreaScrollbar",Mk=d.forwardRef((e,t)=>{const{forceMount:n,...r}=e,o=to(Ii,e.__scopeScrollArea),{onScrollbarXEnabledChange:i,onScrollbarYEnabledChange:s}=o,l=e.orientation==="horizontal";return d.useEffect(()=>(l?i(!0):s(!0),()=>{l?i(!1):s(!1)}),[l,i,s]),o.type==="hover"?d.createElement(gY,Y({},r,{ref:t,forceMount:n})):o.type==="scroll"?d.createElement(vY,Y({},r,{ref:t,forceMount:n})):o.type==="auto"?d.createElement(Ok,Y({},r,{ref:t,forceMount:n})):o.type==="always"?d.createElement(Xx,Y({},r,{ref:t})):null}),gY=d.forwardRef((e,t)=>{const{forceMount:n,...r}=e,o=to(Ii,e.__scopeScrollArea),[i,s]=d.useState(!1);return d.useEffect(()=>{const l=o.scrollArea;let u=0;if(l){const 
f=()=>{window.clearTimeout(u),s(!0)},m=()=>{u=window.setTimeout(()=>s(!1),o.scrollHideDelay)};return l.addEventListener("pointerenter",f),l.addEventListener("pointerleave",m),()=>{window.clearTimeout(u),l.removeEventListener("pointerenter",f),l.removeEventListener("pointerleave",m)}}},[o.scrollArea,o.scrollHideDelay]),d.createElement(xn,{present:n||i},d.createElement(Ok,Y({"data-state":i?"visible":"hidden"},r,{ref:t})))}),vY=d.forwardRef((e,t)=>{const{forceMount:n,...r}=e,o=to(Ii,e.__scopeScrollArea),i=e.orientation==="horizontal",s=dg(()=>u("SCROLL_END"),100),[l,u]=dY("hidden",{hidden:{SCROLL:"scrolling"},scrolling:{SCROLL_END:"idle",POINTER_ENTER:"interacting"},interacting:{SCROLL:"interacting",POINTER_LEAVE:"idle"},idle:{HIDE:"hidden",SCROLL:"scrolling",POINTER_ENTER:"interacting"}});return d.useEffect(()=>{if(l==="idle"){const f=window.setTimeout(()=>u("HIDE"),o.scrollHideDelay);return()=>window.clearTimeout(f)}},[l,o.scrollHideDelay,u]),d.useEffect(()=>{const f=o.viewport,m=i?"scrollLeft":"scrollTop";if(f){let p=f[m];const g=()=>{const y=f[m];p!==y&&(u("SCROLL"),s()),p=y};return f.addEventListener("scroll",g),()=>f.removeEventListener("scroll",g)}},[o.viewport,i,u,s]),d.createElement(xn,{present:n||l!=="hidden"},d.createElement(Xx,Y({"data-state":l==="hidden"?"hidden":"visible"},r,{ref:t,onPointerEnter:fe(e.onPointerEnter,()=>u("POINTER_ENTER")),onPointerLeave:fe(e.onPointerLeave,()=>u("POINTER_LEAVE"))})))}),Ok=d.forwardRef((e,t)=>{const n=to(Ii,e.__scopeScrollArea),{forceMount:r,...o}=e,[i,s]=d.useState(!1),l=e.orientation==="horizontal",u=dg(()=>{if(n.viewport){const f=n.viewport.offsetWidth{const{orientation:n="vertical",...r}=e,o=to(Ii,e.__scopeScrollArea),i=d.useRef(null),s=d.useRef(0),[l,u]=d.useState({content:0,viewport:0,scrollbar:{size:0,paddingStart:0,paddingEnd:0}}),f=Lk(l.viewport,l.content),m={...r,sizes:l,onSizesChange:u,hasThumb:f>0&&f<1,onThumbChange:g=>i.current=g,onThumbPointerUp:()=>s.current=0,onThumbPointerDown:g=>s.current=g};function p(g,y){return CY(g,s.current,l,y)}return n==="horizontal"?d.createElement(yY,Y({},m,{ref:t,onThumbPositionChange:()=>{if(o.viewport&&i.current){const g=o.viewport.scrollLeft,y=_2(g,l,o.dir);i.current.style.transform=`translate3d(${y}px, 0, 0)`}},onWheelScroll:g=>{o.viewport&&(o.viewport.scrollLeft=g)},onDragScroll:g=>{o.viewport&&(o.viewport.scrollLeft=p(g,o.dir))}})):n==="vertical"?d.createElement(wY,Y({},m,{ref:t,onThumbPositionChange:()=>{if(o.viewport&&i.current){const g=o.viewport.scrollTop,y=_2(g,l);i.current.style.transform=`translate3d(0, ${y}px, 0)`}},onWheelScroll:g=>{o.viewport&&(o.viewport.scrollTop=g)},onDragScroll:g=>{o.viewport&&(o.viewport.scrollTop=p(g))}})):null}),yY=d.forwardRef((e,t)=>{const{sizes:n,onSizesChange:r,...o}=e,i=to(Ii,e.__scopeScrollArea),[s,l]=d.useState(),u=d.useRef(null),f=Ve(t,u,i.onScrollbarXChange);return d.useEffect(()=>{u.current&&l(getComputedStyle(u.current))},[u]),d.createElement(Dk,Y({"data-orientation":"horizontal"},o,{ref:f,sizes:n,style:{bottom:0,left:i.dir==="rtl"?"var(--radix-scroll-area-corner-width)":0,right:i.dir==="ltr"?"var(--radix-scroll-area-corner-width)":0,"--radix-scroll-area-thumb-width":ug(n)+"px",...e.style},onThumbPointerDown:m=>e.onThumbPointerDown(m.x),onDragScroll:m=>e.onDragScroll(m.x),onWheelScroll:(m,p)=>{if(i.viewport){const 
g=i.viewport.scrollLeft+m.deltaX;e.onWheelScroll(g),jk(g,p)&&m.preventDefault()}},onResize:()=>{u.current&&i.viewport&&s&&r({content:i.viewport.scrollWidth,viewport:i.viewport.offsetWidth,scrollbar:{size:u.current.clientWidth,paddingStart:Kp(s.paddingLeft),paddingEnd:Kp(s.paddingRight)}})}}))}),wY=d.forwardRef((e,t)=>{const{sizes:n,onSizesChange:r,...o}=e,i=to(Ii,e.__scopeScrollArea),[s,l]=d.useState(),u=d.useRef(null),f=Ve(t,u,i.onScrollbarYChange);return d.useEffect(()=>{u.current&&l(getComputedStyle(u.current))},[u]),d.createElement(Dk,Y({"data-orientation":"vertical"},o,{ref:f,sizes:n,style:{top:0,right:i.dir==="ltr"?0:void 0,left:i.dir==="rtl"?0:void 0,bottom:"var(--radix-scroll-area-corner-height)","--radix-scroll-area-thumb-height":ug(n)+"px",...e.style},onThumbPointerDown:m=>e.onThumbPointerDown(m.y),onDragScroll:m=>e.onDragScroll(m.y),onWheelScroll:(m,p)=>{if(i.viewport){const g=i.viewport.scrollTop+m.deltaY;e.onWheelScroll(g),jk(g,p)&&m.preventDefault()}},onResize:()=>{u.current&&i.viewport&&s&&r({content:i.viewport.scrollHeight,viewport:i.viewport.offsetHeight,scrollbar:{size:u.current.clientHeight,paddingStart:Kp(s.paddingTop),paddingEnd:Kp(s.paddingBottom)}})}}))}),[xY,Nk]=Ak(Ii),Dk=d.forwardRef((e,t)=>{const{__scopeScrollArea:n,sizes:r,hasThumb:o,onThumbChange:i,onThumbPointerUp:s,onThumbPointerDown:l,onThumbPositionChange:u,onDragScroll:f,onWheelScroll:m,onResize:p,...g}=e,y=to(Ii,n),[x,S]=d.useState(null),E=Ve(t,z=>S(z)),_=d.useRef(null),b=d.useRef(""),C=y.viewport,R=r.content-r.viewport,k=Lt(m),O=Lt(u),A=dg(p,10);function I(z){if(_.current){const H=z.clientX-_.current.left,ie=z.clientY-_.current.top;f({x:H,y:ie})}}return d.useEffect(()=>{const z=H=>{const ie=H.target;(x==null?void 0:x.contains(ie))&&k(H,R)};return document.addEventListener("wheel",z,{passive:!1}),()=>document.removeEventListener("wheel",z,{passive:!1})},[C,x,R,k]),d.useEffect(O,[r,O]),pc(x,A),pc(y.content,A),d.createElement(xY,{scope:n,scrollbar:x,hasThumb:o,onThumbChange:Lt(i),onThumbPointerUp:Lt(s),onThumbPositionChange:O,onThumbPointerDown:Lt(l)},d.createElement(Ae.div,Y({},g,{ref:E,style:{position:"absolute",...g.style},onPointerDown:fe(e.onPointerDown,z=>{z.button===0&&(z.target.setPointerCapture(z.pointerId),_.current=x.getBoundingClientRect(),b.current=document.body.style.webkitUserSelect,document.body.style.webkitUserSelect="none",y.viewport&&(y.viewport.style.scrollBehavior="auto"),I(z))}),onPointerMove:fe(e.onPointerMove,I),onPointerUp:fe(e.onPointerUp,z=>{const H=z.target;H.hasPointerCapture(z.pointerId)&&H.releasePointerCapture(z.pointerId),document.body.style.webkitUserSelect=b.current,y.viewport&&(y.viewport.style.scrollBehavior=""),_.current=null})})))}),P1="ScrollAreaThumb",bY=d.forwardRef((e,t)=>{const{forceMount:n,...r}=e,o=Nk(P1,e.__scopeScrollArea);return d.createElement(xn,{present:n||o.hasThumb},d.createElement(SY,Y({ref:t},r)))}),SY=d.forwardRef((e,t)=>{const{__scopeScrollArea:n,style:r,...o}=e,i=to(P1,n),s=Nk(P1,n),{onThumbPositionChange:l}=s,u=Ve(t,p=>s.onThumbChange(p)),f=d.useRef(),m=dg(()=>{f.current&&(f.current(),f.current=void 0)},100);return d.useEffect(()=>{const p=i.viewport;if(p){const g=()=>{if(m(),!f.current){const y=$Y(p,l);f.current=y,l()}};return 
l(),p.addEventListener("scroll",g),()=>p.removeEventListener("scroll",g)}},[i.viewport,m,l]),d.createElement(Ae.div,Y({"data-state":s.hasThumb?"visible":"hidden"},o,{ref:u,style:{width:"var(--radix-scroll-area-thumb-width)",height:"var(--radix-scroll-area-thumb-height)",...r},onPointerDownCapture:fe(e.onPointerDownCapture,p=>{const y=p.target.getBoundingClientRect(),x=p.clientX-y.left,S=p.clientY-y.top;s.onThumbPointerDown({x,y:S})}),onPointerUp:fe(e.onPointerUp,s.onThumbPointerUp)}))}),Ik="ScrollAreaCorner",_Y=d.forwardRef((e,t)=>{const n=to(Ik,e.__scopeScrollArea),r=!!(n.scrollbarX&&n.scrollbarY);return n.type!=="scroll"&&r?d.createElement(EY,Y({},e,{ref:t})):null}),EY=d.forwardRef((e,t)=>{const{__scopeScrollArea:n,...r}=e,o=to(Ik,n),[i,s]=d.useState(0),[l,u]=d.useState(0),f=!!(i&&l);return pc(o.scrollbarX,()=>{var m;const p=((m=o.scrollbarX)===null||m===void 0?void 0:m.offsetHeight)||0;o.onCornerHeightChange(p),u(p)}),pc(o.scrollbarY,()=>{var m;const p=((m=o.scrollbarY)===null||m===void 0?void 0:m.offsetWidth)||0;o.onCornerWidthChange(p),s(p)}),f?d.createElement(Ae.div,Y({},r,{ref:t,style:{width:i,height:l,position:"absolute",right:o.dir==="ltr"?0:void 0,left:o.dir==="rtl"?0:void 0,bottom:0,...e.style}})):null});function Kp(e){return e?parseInt(e,10):0}function Lk(e,t){const n=e/t;return isNaN(n)?0:n}function ug(e){const t=Lk(e.viewport,e.content),n=e.scrollbar.paddingStart+e.scrollbar.paddingEnd,r=(e.scrollbar.size-n)*t;return Math.max(r,18)}function CY(e,t,n,r="ltr"){const o=ug(n),i=o/2,s=t||i,l=o-s,u=n.scrollbar.paddingStart+s,f=n.scrollbar.size-n.scrollbar.paddingEnd-l,m=n.content-n.viewport,p=r==="ltr"?[0,m]:[m*-1,0];return Fk([u,f],p)(e)}function _2(e,t,n="ltr"){const r=ug(t),o=t.scrollbar.paddingStart+t.scrollbar.paddingEnd,i=t.scrollbar.size-o,s=t.content-t.viewport,l=i-r,u=n==="ltr"?[0,s]:[s*-1,0],f=fd(e,u);return Fk([0,s],[0,l])(f)}function Fk(e,t){return n=>{if(e[0]===e[1]||t[0]===t[1])return t[0];const r=(t[1]-t[0])/(e[1]-e[0]);return t[0]+r*(n-e[0])}}function jk(e,t){return e>0&&e{})=>{let n={left:e.scrollLeft,top:e.scrollTop},r=0;return function o(){const i={left:e.scrollLeft,top:e.scrollTop},s=n.left!==i.left,l=n.top!==i.top;(s||l)&&t(),n=i,r=window.requestAnimationFrame(o)}(),()=>window.cancelAnimationFrame(r)};function dg(e,t){const n=Lt(e),r=d.useRef(0);return d.useEffect(()=>()=>window.clearTimeout(r.current),[]),d.useCallback(()=>{window.clearTimeout(r.current),r.current=window.setTimeout(n,t)},[n,t])}function pc(e,t){const n=Lt(t);Pn(()=>{let r=0;if(e){const o=new ResizeObserver(()=>{cancelAnimationFrame(r),r=window.requestAnimationFrame(n)});return o.observe(e),()=>{window.cancelAnimationFrame(r),o.unobserve(e)}}},[e,n])}const Bk=hY,RY=mY,PY=_Y,fg=d.forwardRef(({className:e,children:t,...n},r)=>v.jsxs(Bk,{ref:r,className:xe("relative overflow-hidden",e),scrollHideDelay:400,...n,children:[v.jsx(RY,{className:"h-full w-full rounded-[inherit]",children:t}),v.jsx(zk,{}),v.jsx(PY,{})]}));fg.displayName=Bk.displayName;const zk=d.forwardRef(({className:e,orientation:t="vertical",...n},r)=>v.jsx(Mk,{ref:r,orientation:t,className:xe("flex touch-none select-none transition-colors",t==="vertical"&&"h-full w-2.5 border-l border-l-transparent p-[1px]",t==="horizontal"&&"h-2.5 flex-col border-t border-t-transparent p-[1px]",e),...n,children:v.jsx(bY,{className:"relative flex-1 rounded-full bg-border"})}));zk.displayName=Mk.displayName;const Uk="Name",Vk="Created time",Wk="Modified time",E2="input",TY="output",Hk="mask",C2={[Ps.NAME]:Uk,[Ps.CTIME]:Vk,[Ps.MTIME]:Wk};function 
kY(e){const{onPhotoClick:t,photoWidth:n}=e,[r,o]=Fx(!1),[i,s]=xt(H=>[H.fileManagerState,H.updateFileManagerState]),{toast:l}=Id(),[u,f]=d.useState(0),[m,p]=d.useState(0),g=d.useRef(null),y=GU(i.searchText,300),[x,S]=d.useState(E2),[E,_]=d.useState([]),[b,C]=d.useState([]),[R,k]=d.useState(0);Yn("f",()=>{o()}),Yn("left",()=>{let H=R;R>0&&(H=R-1),k(H),t(x,b[H].name)},[R,b]),Yn("right",()=>{let H=R;R{r||p(u)},[r,u]);const O=d.useCallback(H=>{H!==null&&r&&setTimeout(()=>{H.scrollTo({top:m,left:0})},100)},[r,m]);d.useEffect(()=>{(async()=>{try{const ie=await Gj(x);_(ie)}catch(ie){l({variant:"destructive",title:"Uh oh! Something went wrong.",description:ie.message?ie.message:ie.toString()})}})()},[x]),d.useEffect(()=>{if(!r)return;(async()=>{try{let ie=E;y&&(ie=new kc(ie,{keys:["name"]}).search(y).map(re=>ie[re.refIndex])),ie=xH.orderBy(ie,i.sortBy,i.sortOrder);const K=ie.map(te=>{const U=n,re=te.height*(U/te.width);return{src:`${Ni}/media_thumbnail_file?tab=${x}&filename=${encodeURIComponent(te.name)}&width=${Math.ceil(U)}&height=${Math.ceil(re)}`,height:re,width:U,name:te.name}});C(K)}catch(ie){l({variant:"destructive",title:"Uh oh! Something went wrong.",description:ie.message?ie.message:ie.toString()})}})()},[E,y,i,n,r]);const A=H=>{f(H.currentTarget.scrollTop)},I=({index:H})=>{o(),k(H),t(x,b[H].name)},z=()=>v.jsxs("div",{className:"flex justify-start items-center gap-[12px]",children:[v.jsx("div",{children:`Images (${b.length})`}),v.jsxs("div",{className:"flex",children:[v.jsx(Zn,{tooltip:"Rows layout",onClick:()=>{s({layout:"rows"})},children:v.jsx(vB,{className:i.layout!=="rows"?"opacity-50":""})}),v.jsx(Zn,{tooltip:"Grid layout",onClick:()=>{s({layout:"masonry"})},children:v.jsx(mB,{className:i.layout!=="masonry"?"opacity-50":""})})]})]});return v.jsxs(Mx,{open:r,onOpenChange:o,children:[v.jsx(vT,{asChild:!0,children:v.jsx(Zn,{tooltip:"File Manager",children:v.jsx(pH,{})})}),v.jsxs(rg,{className:"h-4/5 max-w-6xl",children:[v.jsx(og,{children:z()}),v.jsxs("div",{className:"flex justify-between gap-8 items-center",children:[v.jsxs("div",{className:"flex relative justify-start items-center",children:[v.jsx(dB,{className:"absolute left-[8px]"}),v.jsx(Wm,{ref:g,value:i.searchText,className:"w-[250px] pl-[30px]",tabIndex:-1,onInput:H=>{H.preventDefault(),H.stopPropagation();const ie=H.target;s({searchText:ie.value})},placeholder:"Search by file name"})]}),v.jsx(hk,{defaultValue:x,onValueChange:H=>S(H),children:v.jsxs(Hx,{"aria-label":"Manage your account",children:[v.jsx(ys,{value:E2,children:"Image Directory"}),v.jsx(ys,{value:TY,children:"Output Directory"}),v.jsx(ys,{value:Hk,children:"Mask Directory"})]})}),v.jsx("div",{className:"flex gap-2",children:v.jsxs("div",{className:"flex gap-1",children:[v.jsxs(yo,{value:C2[i.sortBy],onValueChange:H=>{switch(H){case Uk:s({sortBy:Ps.NAME});break;case Vk:s({sortBy:Ps.CTIME});break;case Wk:s({sortBy:Ps.MTIME});break}},children:[v.jsx(Vr,{className:"w-[140px]",children:v.jsx(wo,{})}),v.jsx(Wr,{children:Object.values(C2).map(H=>v.jsx(Hr,{value:H,children:H},H))})]}),i.sortOrder===Ou.DESCENDING?v.jsx(Zn,{tooltip:"Descending Order",onClick:()=>{s({sortOrder:Ou.ASCENDING})},children:v.jsx(iK,{})}):v.jsx(Zn,{tooltip:"Ascending Order",onClick:()=>{s({sortOrder:Ou.DESCENDING})},children:v.jsx(lK,{})})]})})]}),v.jsx(fg,{className:"w-full h-full rounded-md",onScroll:A,ref:O,children:v.jsx(nK,{layout:i.layout,photos:b,spacing:12,padding:0,onClick:I})})]})]})}var Hd=e=>e.type==="checkbox",Tl=e=>e instanceof Date,Fn=e=>e==null;const Kk=e=>typeof e=="object";var 
sn=e=>!Fn(e)&&!Array.isArray(e)&&Kk(e)&&!Tl(e),Gk=e=>sn(e)&&e.target?Hd(e.target)?e.target.checked:e.target.value:e,AY=e=>e.substring(0,e.search(/\.\d+(\.|$)/))||e,Yk=(e,t)=>e.has(AY(t)),MY=e=>{const t=e.constructor&&e.constructor.prototype;return sn(t)&&t.hasOwnProperty("isPrototypeOf")},Zx=typeof window<"u"&&typeof window.HTMLElement<"u"&&typeof document<"u";function uo(e){let t;const n=Array.isArray(e);if(e instanceof Date)t=new Date(e);else if(e instanceof Set)t=new Set(e);else if(!(Zx&&(e instanceof Blob||e instanceof FileList))&&(n||sn(e)))if(t=n?[]:{},!n&&!MY(e))t=e;else for(const r in e)e.hasOwnProperty(r)&&(t[r]=uo(e[r]));else return e;return t}var Kd=e=>Array.isArray(e)?e.filter(Boolean):[],Wt=e=>e===void 0,ye=(e,t,n)=>{if(!t||!sn(e))return n;const r=Kd(t.split(/[,[\].]+?/)).reduce((o,i)=>Fn(o)?o:o[i],e);return Wt(r)||r===e?Wt(e[t])?n:e[t]:r},Uo=e=>typeof e=="boolean";const Gp={BLUR:"blur",FOCUS_OUT:"focusout",CHANGE:"change"},xo={onBlur:"onBlur",onChange:"onChange",onSubmit:"onSubmit",onTouched:"onTouched",all:"all"},hi={max:"max",min:"min",maxLength:"maxLength",minLength:"minLength",pattern:"pattern",required:"required",validate:"validate"},Xk=Be.createContext(null),hg=()=>Be.useContext(Xk),OY=e=>{const{children:t,...n}=e;return Be.createElement(Xk.Provider,{value:n},t)};var Zk=(e,t,n,r=!0)=>{const o={defaultValues:t._defaultValues};for(const i in e)Object.defineProperty(o,i,{get:()=>{const s=i;return t._proxyFormState[s]!==xo.all&&(t._proxyFormState[s]=!r||xo.all),n&&(n[s]=!0),e[s]}});return o},Fr=e=>sn(e)&&!Object.keys(e).length,qk=(e,t,n,r)=>{n(e);const{name:o,...i}=e;return Fr(i)||Object.keys(i).length>=Object.keys(t).length||Object.keys(i).find(s=>t[s]===(!r||xo.all))},rp=e=>Array.isArray(e)?e:[e],Qk=(e,t,n)=>!e||!t||e===t||rp(e).some(r=>r&&(n?r===t:r.startsWith(t)||t.startsWith(r)));function qx(e){const t=Be.useRef(e);t.current=e,Be.useEffect(()=>{const n=!e.disabled&&t.current.subject&&t.current.subject.subscribe({next:t.current.next});return()=>{n&&n.unsubscribe()}},[e.disabled])}function NY(e){const t=hg(),{control:n=t.control,disabled:r,name:o,exact:i}=e||{},[s,l]=Be.useState(n._formState),u=Be.useRef(!0),f=Be.useRef({isDirty:!1,isLoading:!1,dirtyFields:!1,touchedFields:!1,isValidating:!1,isValid:!1,errors:!1}),m=Be.useRef(o);return m.current=o,qx({disabled:r,next:p=>u.current&&Qk(m.current,p.name,i)&&qk(p,f.current,n._updateFormState)&&l({...n._formState,...p}),subject:n._subjects.state}),Be.useEffect(()=>(u.current=!0,f.current.isValid&&n._updateValid(!0),()=>{u.current=!1}),[n]),Zk(s,n,f.current,!1)}var Ko=e=>typeof e=="string",Jk=(e,t,n,r,o)=>Ko(e)?(r&&t.watch.add(e),ye(n,e,o)):Array.isArray(e)?e.map(i=>(r&&t.watch.add(i),ye(n,i))):(r&&(t.watchAll=!0),n);function DY(e){const t=hg(),{control:n=t.control,name:r,defaultValue:o,disabled:i,exact:s}=e||{},l=Be.useRef(r);l.current=r,qx({disabled:i,subject:n._subjects.values,next:m=>{Qk(l.current,m.name,s)&&f(uo(Jk(l.current,n._names,m.values||n._formValues,!1,o)))}});const[u,f]=Be.useState(n._getWatch(r,o));return Be.useEffect(()=>n._removeUnmounted()),u}var Qx=e=>/^\w*$/.test(e),e3=e=>Kd(e.replace(/["|']|\]/g,"").split(/\.|\[/));function wt(e,t,n){let r=-1;const o=Qx(t)?[t]:e3(t),i=o.length,s=i-1;for(;++r{const m=o._options.shouldUnregister||i,p=(g,y)=>{const x=ye(o._fields,g);x&&(x._f.mount=y)};if(p(n,!0),m){const 
g=uo(ye(o._options.defaultValues,n));wt(o._defaultValues,n,g),Wt(ye(o._formValues,n))&&wt(o._formValues,n,g)}return()=>{(s?m&&!o._state.action:m)?o.unregister(n):p(n,!1)}},[n,o,s,i]),Be.useEffect(()=>{ye(o._fields,n)&&o._updateDisabledField({disabled:r,fields:o._fields,name:n,value:ye(o._fields,n)._f.value})},[r,n,o]),{field:{name:n,value:l,...Uo(r)||Uo(u.disabled)?{disabled:u.disabled||r}:{},onChange:Be.useCallback(m=>f.current.onChange({target:{value:Gk(m),name:n},type:Gp.CHANGE}),[n]),onBlur:Be.useCallback(()=>f.current.onBlur({target:{value:ye(o._formValues,n),name:n},type:Gp.BLUR}),[n,o]),ref:m=>{const p=ye(o._fields,n);p&&m&&(p._f.ref={focus:()=>m.focus(),select:()=>m.select(),setCustomValidity:g=>m.setCustomValidity(g),reportValidity:()=>m.reportValidity()})}},formState:u,fieldState:Object.defineProperties({},{invalid:{enumerable:!0,get:()=>!!ye(u.errors,n)},isDirty:{enumerable:!0,get:()=>!!ye(u.dirtyFields,n)},isTouched:{enumerable:!0,get:()=>!!ye(u.touchedFields,n)},error:{enumerable:!0,get:()=>ye(u.errors,n)}})}}const LY=e=>e.render(IY(e));var t3=(e,t,n,r,o)=>t?{...n[e],types:{...n[e]&&n[e].types?n[e].types:{},[r]:o||!0}}:{},$2=e=>({isOnSubmit:!e||e===xo.onSubmit,isOnBlur:e===xo.onBlur,isOnChange:e===xo.onChange,isOnAll:e===xo.all,isOnTouch:e===xo.onTouched}),R2=(e,t,n)=>!n&&(t.watchAll||t.watch.has(e)||[...t.watch].some(r=>e.startsWith(r)&&/^\.\w+/.test(e.slice(r.length))));const op=(e,t,n,r)=>{for(const o of n||Object.keys(e)){const i=ye(e,o);if(i){const{_f:s,...l}=i;if(s){if(s.refs&&s.refs[0]&&t(s.refs[0],o)&&!r)break;if(s.ref&&t(s.ref,s.name)&&!r)break}else sn(l)&&op(l,t)}}};var FY=(e,t,n)=>{const r=Kd(ye(e,n));return wt(r,"root",t[n]),wt(e,n,r),e},Jx=e=>e.type==="file",ws=e=>typeof e=="function",Yp=e=>{if(!Zx)return!1;const t=e?e.ownerDocument:0;return e instanceof(t&&t.defaultView?t.defaultView.HTMLElement:HTMLElement)},ip=e=>Ko(e),eb=e=>e.type==="radio",Xp=e=>e instanceof RegExp;const P2={value:!1,isValid:!1},T2={value:!0,isValid:!0};var n3=e=>{if(Array.isArray(e)){if(e.length>1){const t=e.filter(n=>n&&n.checked&&!n.disabled).map(n=>n.value);return{value:t,isValid:!!t.length}}return e[0].checked&&!e[0].disabled?e[0].attributes&&!Wt(e[0].attributes.value)?Wt(e[0].value)||e[0].value===""?T2:{value:e[0].value,isValid:!0}:T2:P2}return P2};const k2={isValid:!1,value:null};var r3=e=>Array.isArray(e)?e.reduce((t,n)=>n&&n.checked&&!n.disabled?{isValid:!0,value:n.value}:t,k2):k2;function A2(e,t,n="validate"){if(ip(e)||Array.isArray(e)&&e.every(ip)||Uo(e)&&!e)return{type:n,message:ip(e)?e:"",ref:t}}var cl=e=>sn(e)&&!Xp(e)?e:{value:e,message:""},M2=async(e,t,n,r,o)=>{const{ref:i,refs:s,required:l,maxLength:u,minLength:f,min:m,max:p,pattern:g,validate:y,name:x,valueAsNumber:S,mount:E,disabled:_}=e._f,b=ye(t,x);if(!E||_)return{};const C=s?s[0]:i,R=K=>{r&&C.reportValidity&&(C.setCustomValidity(Uo(K)?"":K||""),C.reportValidity())},k={},O=eb(i),A=Hd(i),I=O||A,z=(S||Jx(i))&&Wt(i.value)&&Wt(b)||Yp(i)&&i.value===""||b===""||Array.isArray(b)&&!b.length,H=t3.bind(null,x,n,k),ie=(K,te,U,re=hi.maxLength,V=hi.minLength)=>{const J=K?te:U;k[x]={type:K?re:V,message:J,ref:i,...H(K?re:V,J)}};if(o?!Array.isArray(b)||!b.length:l&&(!I&&(z||Fn(b))||Uo(b)&&!b||A&&!n3(s).isValid||O&&!r3(s).isValid)){const{value:K,message:te}=ip(l)?{value:!!l,message:l}:cl(l);if(K&&(k[x]={type:hi.required,message:te,ref:C,...H(hi.required,te)},!n))return R(te),k}if(!z&&(!Fn(m)||!Fn(p))){let K,te;const U=cl(p),re=cl(m);if(!Fn(b)&&!isNaN(b)){const V=i.valueAsNumber||b&&+b;Fn(U.value)||(K=V>U.value),Fn(re.value)||(te=Vnew 
Date(new Date().toDateString()+" "+Q),G=i.type=="time",Z=i.type=="week";Ko(U.value)&&b&&(K=G?J(b)>J(U.value):Z?b>U.value:V>new Date(U.value)),Ko(re.value)&&b&&(te=G?J(b)+K.value,re=!Fn(te.value)&&b.length<+te.value;if((U||re)&&(ie(U,K.message,te.message),!n))return R(k[x].message),k}if(g&&!z&&Ko(b)){const{value:K,message:te}=cl(g);if(Xp(K)&&!b.match(K)&&(k[x]={type:hi.pattern,message:te,ref:i,...H(hi.pattern,te)},!n))return R(te),k}if(y){if(ws(y)){const K=await y(b,t),te=A2(K,C);if(te&&(k[x]={...te,...H(hi.validate,te.message)},!n))return R(te.message),k}else if(sn(y)){let K={};for(const te in y){if(!Fr(K)&&!n)break;const U=A2(await y[te](b,t),C,te);U&&(K={...U,...H(te,U.message)},R(U.message),n&&(k[x]=K))}if(!Fr(K)&&(k[x]={ref:C,...K},!n))return k}}return R(!0),k};function jY(e,t){const n=t.slice(0,-1).length;let r=0;for(;r{for(const i of e)i.next&&i.next(o)},subscribe:o=>(e.push(o),{unsubscribe:()=>{e=e.filter(i=>i!==o)}}),unsubscribe:()=>{e=[]}}}var Zp=e=>Fn(e)||!Kk(e);function va(e,t){if(Zp(e)||Zp(t))return e===t;if(Tl(e)&&Tl(t))return e.getTime()===t.getTime();const n=Object.keys(e),r=Object.keys(t);if(n.length!==r.length)return!1;for(const o of n){const i=e[o];if(!r.includes(o))return!1;if(o!=="ref"){const s=t[o];if(Tl(i)&&Tl(s)||sn(i)&&sn(s)||Array.isArray(i)&&Array.isArray(s)?!va(i,s):i!==s)return!1}}return!0}var o3=e=>e.type==="select-multiple",zY=e=>eb(e)||Hd(e),B0=e=>Yp(e)&&e.isConnected,i3=e=>{for(const t in e)if(ws(e[t]))return!0;return!1};function qp(e,t={}){const n=Array.isArray(e);if(sn(e)||n)for(const r in e)Array.isArray(e[r])||sn(e[r])&&!i3(e[r])?(t[r]=Array.isArray(e[r])?[]:{},qp(e[r],t[r])):Fn(e[r])||(t[r]=!0);return t}function s3(e,t,n){const r=Array.isArray(e);if(sn(e)||r)for(const o in e)Array.isArray(e[o])||sn(e[o])&&!i3(e[o])?Wt(t)||Zp(n[o])?n[o]=Array.isArray(e[o])?qp(e[o],[]):{...qp(e[o])}:s3(e[o],Fn(t)?{}:t[o],n[o]):n[o]=!va(e[o],t[o]);return n}var z0=(e,t)=>s3(e,t,qp(t)),a3=(e,{valueAsNumber:t,valueAsDate:n,setValueAs:r})=>Wt(e)?e:t?e===""?NaN:e&&+e:n&&Ko(e)?new Date(e):r?r(e):e;function U0(e){const t=e.ref;if(!(e.refs?e.refs.every(n=>n.disabled):t.disabled))return Jx(t)?t.files:eb(t)?r3(e.refs).value:o3(t)?[...t.selectedOptions].map(({value:n})=>n):Hd(t)?n3(e.refs).value:a3(Wt(t.value)?e.ref.value:t.value,e)}var UY=(e,t,n,r)=>{const o={};for(const i of e){const s=ye(t,i);s&&wt(o,i,s._f)}return{criteriaMode:n,names:[...e],fields:o,shouldUseNativeValidation:r}},uu=e=>Wt(e)?e:Xp(e)?e.source:sn(e)?Xp(e.value)?e.value.source:e.value:e,VY=e=>e.mount&&(e.required||e.min||e.max||e.maxLength||e.minLength||e.pattern||e.validate);function O2(e,t,n){const r=ye(e,n);if(r||Qx(n))return{error:r,name:n};const o=n.split(".");for(;o.length;){const i=o.join("."),s=ye(t,i),l=ye(e,i);if(s&&!Array.isArray(s)&&n!==i)return{name:n};if(l&&l.type)return{name:i,error:l};o.pop()}return{name:n}}var WY=(e,t,n,r,o)=>o.isOnAll?!1:!n&&o.isOnTouch?!(t||e):(n?r.isOnBlur:o.isOnBlur)?!e:(n?r.isOnChange:o.isOnChange)?e:!0,HY=(e,t)=>!Kd(ye(e,t)).length&&pn(e,t);const KY={mode:xo.onSubmit,reValidateMode:xo.onChange,shouldFocusError:!0};function GY(e={},t){let n={...KY,...e},r={submitCount:0,isDirty:!1,isLoading:ws(n.defaultValues),isValidating:!1,isSubmitted:!1,isSubmitting:!1,isSubmitSuccessful:!1,isValid:!1,touchedFields:{},dirtyFields:{},errors:{},disabled:!1},o={},i=sn(n.defaultValues)||sn(n.values)?uo(n.defaultValues||n.values)||{}:{},s=n.shouldUnregister?{}:uo(i),l={action:!1,mount:!1,watch:!1},u={mount:new Set,unMount:new Set,array:new Set,watch:new Set},f,m=0;const 
p={isDirty:!1,dirtyFields:!1,touchedFields:!1,isValidating:!1,isValid:!1,errors:!1},g={values:j0(),array:j0(),state:j0()},y=e.resetOptions&&e.resetOptions.keepDirtyValues,x=$2(n.mode),S=$2(n.reValidateMode),E=n.criteriaMode===xo.all,_=N=>X=>{clearTimeout(m),m=setTimeout(N,X)},b=async N=>{if(p.isValid||N){const X=n.resolver?Fr((await z()).errors):await ie(o,!0);X!==r.isValid&&g.state.next({isValid:X})}},C=N=>p.isValidating&&g.state.next({isValidating:N}),R=(N,X=[],ee,Se,pe=!0,he=!0)=>{if(Se&&ee){if(l.action=!0,he&&Array.isArray(ye(o,N))){const Te=ee(ye(o,N),Se.argA,Se.argB);pe&&wt(o,N,Te)}if(he&&Array.isArray(ye(r.errors,N))){const Te=ee(ye(r.errors,N),Se.argA,Se.argB);pe&&wt(r.errors,N,Te),HY(r.errors,N)}if(p.touchedFields&&he&&Array.isArray(ye(r.touchedFields,N))){const Te=ee(ye(r.touchedFields,N),Se.argA,Se.argB);pe&&wt(r.touchedFields,N,Te)}p.dirtyFields&&(r.dirtyFields=z0(i,s)),g.state.next({name:N,isDirty:te(N,X),dirtyFields:r.dirtyFields,errors:r.errors,isValid:r.isValid})}else wt(s,N,X)},k=(N,X)=>{wt(r.errors,N,X),g.state.next({errors:r.errors})},O=(N,X,ee,Se)=>{const pe=ye(o,N);if(pe){const he=ye(s,N,Wt(ee)?ye(i,N):ee);Wt(he)||Se&&Se.defaultChecked||X?wt(s,N,X?he:U0(pe._f)):V(N,he),l.mount&&b()}},A=(N,X,ee,Se,pe)=>{let he=!1,Te=!1;const ut={name:N};if(!ee||Se){p.isDirty&&(Te=r.isDirty,r.isDirty=ut.isDirty=te(),he=Te!==ut.isDirty);const gt=va(ye(i,N),X);Te=ye(r.dirtyFields,N),gt?pn(r.dirtyFields,N):wt(r.dirtyFields,N,!0),ut.dirtyFields=r.dirtyFields,he=he||p.dirtyFields&&Te!==!gt}if(ee){const gt=ye(r.touchedFields,N);gt||(wt(r.touchedFields,N,ee),ut.touchedFields=r.touchedFields,he=he||p.touchedFields&>!==ee)}return he&&pe&&g.state.next(ut),he?ut:{}},I=(N,X,ee,Se)=>{const pe=ye(r.errors,N),he=p.isValid&&Uo(X)&&r.isValid!==X;if(e.delayError&&ee?(f=_(()=>k(N,ee)),f(e.delayError)):(clearTimeout(m),f=null,ee?wt(r.errors,N,ee):pn(r.errors,N)),(ee?!va(pe,ee):pe)||!Fr(Se)||he){const Te={...Se,...he&&Uo(X)?{isValid:X}:{},errors:r.errors,name:N};r={...r,...Te},g.state.next(Te)}C(!1)},z=async N=>n.resolver(s,n.context,UY(N||u.mount,o,n.criteriaMode,n.shouldUseNativeValidation)),H=async N=>{const{errors:X}=await z(N);if(N)for(const ee of N){const Se=ye(X,ee);Se?wt(r.errors,ee,Se):pn(r.errors,ee)}else r.errors=X;return X},ie=async(N,X,ee={valid:!0})=>{for(const Se in N){const pe=N[Se];if(pe){const{_f:he,...Te}=pe;if(he){const ut=u.array.has(he.name),gt=await M2(pe,s,E,n.shouldUseNativeValidation&&!X,ut);if(gt[he.name]&&(ee.valid=!1,X))break;!X&&(ye(gt,he.name)?ut?FY(r.errors,gt,he.name):wt(r.errors,he.name,gt[he.name]):pn(r.errors,he.name))}Te&&await ie(Te,X,ee)}}return ee.valid},K=()=>{for(const N of u.unMount){const X=ye(o,N);X&&(X._f.refs?X._f.refs.every(ee=>!B0(ee)):!B0(X._f.ref))&&me(N)}u.unMount=new Set},te=(N,X)=>(N&&X&&wt(s,N,X),!va(L(),i)),U=(N,X,ee)=>Jk(N,u,{...l.mount?s:Wt(X)?i:Ko(N)?{[N]:X}:X},ee,X),re=N=>Kd(ye(l.mount?s:i,N,e.shouldUnregister?ye(i,N,[]):[])),V=(N,X,ee={})=>{const Se=ye(o,N);let pe=X;if(Se){const 
he=Se._f;he&&(!he.disabled&&wt(s,N,a3(X,he)),pe=Yp(he.ref)&&Fn(X)?"":X,o3(he.ref)?[...he.ref.options].forEach(Te=>Te.selected=pe.includes(Te.value)):he.refs?Hd(he.ref)?he.refs.length>1?he.refs.forEach(Te=>(!Te.defaultChecked||!Te.disabled)&&(Te.checked=Array.isArray(pe)?!!pe.find(ut=>ut===Te.value):pe===Te.value)):he.refs[0]&&(he.refs[0].checked=!!pe):he.refs.forEach(Te=>Te.checked=Te.value===pe):Jx(he.ref)?he.ref.value="":(he.ref.value=pe,he.ref.type||g.values.next({name:N,values:{...s}})))}(ee.shouldDirty||ee.shouldTouch)&&A(N,pe,ee.shouldTouch,ee.shouldDirty,!0),ee.shouldValidate&&le(N)},J=(N,X,ee)=>{for(const Se in X){const pe=X[Se],he=`${N}.${Se}`,Te=ye(o,he);(u.array.has(N)||!Zp(pe)||Te&&!Te._f)&&!Tl(pe)?J(he,pe,ee):V(he,pe,ee)}},G=(N,X,ee={})=>{const Se=ye(o,N),pe=u.array.has(N),he=uo(X);wt(s,N,he),pe?(g.array.next({name:N,values:{...s}}),(p.isDirty||p.dirtyFields)&&ee.shouldDirty&&g.state.next({name:N,dirtyFields:z0(i,s),isDirty:te(N,he)})):Se&&!Se._f&&!Fn(he)?J(N,he,ee):V(N,he,ee),R2(N,u)&&g.state.next({...r}),g.values.next({name:N,values:{...s}}),!l.mount&&t()},Z=async N=>{const X=N.target;let ee=X.name,Se=!0;const pe=ye(o,ee),he=()=>X.type?U0(pe._f):Gk(N),Te=ut=>{Se=Number.isNaN(ut)||ut===ye(s,ee,ut)};if(pe){let ut,gt;const An=he(),Mn=N.type===Gp.BLUR||N.type===Gp.FOCUS_OUT,Fi=!VY(pe._f)&&!n.resolver&&!ye(r.errors,ee)&&!pe._f.deps||WY(Mn,ye(r.touchedFields,ee),r.isSubmitted,S,x),Ro=R2(ee,u,Mn);wt(s,ee,An),Mn?(pe._f.onBlur&&pe._f.onBlur(N),f&&f(0)):pe._f.onChange&&pe._f.onChange(N);const Cr=A(ee,An,Mn,!1),Ks=!Fr(Cr)||Ro;if(!Mn&&g.values.next({name:ee,type:N.type,values:{...s}}),Fi)return p.isValid&&b(),Ks&&g.state.next({name:ee,...Ro?{}:Cr});if(!Mn&&Ro&&g.state.next({...r}),C(!0),n.resolver){const{errors:ri}=await z([ee]);if(Te(An),Se){const ji=O2(r.errors,o,ee),or=O2(ri,o,ji.name||ee);ut=or.error,ee=or.name,gt=Fr(ri)}}else ut=(await M2(pe,s,E,n.shouldUseNativeValidation))[ee],Te(An),Se&&(ut?gt=!1:p.isValid&&(gt=await ie(o,!0)));Se&&(pe._f.deps&&le(pe._f.deps),I(ee,gt,ut,Cr))}},Q=(N,X)=>{if(ye(r.errors,X)&&N.focus)return N.focus(),1},le=async(N,X={})=>{let ee,Se;const pe=rp(N);if(C(!0),n.resolver){const he=await H(Wt(N)?N:pe);ee=Fr(he),Se=N?!pe.some(Te=>ye(he,Te)):ee}else N?(Se=(await Promise.all(pe.map(async he=>{const Te=ye(o,he);return await ie(Te&&Te._f?{[he]:Te}:Te)}))).every(Boolean),!(!Se&&!r.isValid)&&b()):Se=ee=await ie(o);return g.state.next({...!Ko(N)||p.isValid&&ee!==r.isValid?{}:{name:N},...n.resolver||!N?{isValid:ee}:{},errors:r.errors,isValidating:!1}),X.shouldFocus&&!Se&&op(o,Q,N?pe:u.mount),Se},L=N=>{const X={...i,...l.mount?s:{}};return Wt(N)?X:Ko(N)?ye(X,N):N.map(ee=>ye(X,ee))},ue=(N,X)=>({invalid:!!ye((X||r).errors,N),isDirty:!!ye((X||r).dirtyFields,N),isTouched:!!ye((X||r).touchedFields,N),error:ye((X||r).errors,N)}),Ne=N=>{N&&rp(N).forEach(X=>pn(r.errors,X)),g.state.next({errors:N?r.errors:{}})},Ke=(N,X,ee)=>{const Se=(ye(o,N,{_f:{}})._f||{}).ref;wt(r.errors,N,{...X,ref:Se}),g.state.next({name:N,errors:r.errors,isValid:!1}),ee&&ee.shouldFocus&&Se&&Se.focus&&Se.focus()},Me=(N,X)=>ws(N)?g.values.subscribe({next:ee=>N(U(void 0,X),ee)}):U(N,X,!0),me=(N,X={})=>{for(const ee of 
N?rp(N):u.mount)u.mount.delete(ee),u.array.delete(ee),X.keepValue||(pn(o,ee),pn(s,ee)),!X.keepError&&pn(r.errors,ee),!X.keepDirty&&pn(r.dirtyFields,ee),!X.keepTouched&&pn(r.touchedFields,ee),!n.shouldUnregister&&!X.keepDefaultValue&&pn(i,ee);g.values.next({values:{...s}}),g.state.next({...r,...X.keepDirty?{isDirty:te()}:{}}),!X.keepIsValid&&b()},be=({disabled:N,name:X,field:ee,fields:Se,value:pe})=>{if(Uo(N)){const he=N?void 0:Wt(pe)?U0(ee?ee._f:ye(Se,X)._f):pe;wt(s,X,he),A(X,he,!1,!1,!0)}},Ee=(N,X={})=>{let ee=ye(o,N);const Se=Uo(X.disabled);return wt(o,N,{...ee||{},_f:{...ee&&ee._f?ee._f:{ref:{name:N}},name:N,mount:!0,...X}}),u.mount.add(N),ee?be({field:ee,disabled:X.disabled,name:N}):O(N,!0,X.value),{...Se?{disabled:X.disabled}:{},...n.progressive?{required:!!X.required,min:uu(X.min),max:uu(X.max),minLength:uu(X.minLength),maxLength:uu(X.maxLength),pattern:uu(X.pattern)}:{},name:N,onChange:Z,onBlur:Z,ref:pe=>{if(pe){Ee(N,X),ee=ye(o,N);const he=Wt(pe.value)&&pe.querySelectorAll&&pe.querySelectorAll("input,select,textarea")[0]||pe,Te=zY(he),ut=ee._f.refs||[];if(Te?ut.find(gt=>gt===he):he===ee._f.ref)return;wt(o,N,{_f:{...ee._f,...Te?{refs:[...ut.filter(B0),he,...Array.isArray(ye(i,N))?[{}]:[]],ref:{type:he.type,name:N}}:{ref:he}}}),O(N,!1,void 0,he)}else ee=ye(o,N,{}),ee._f&&(ee._f.mount=!1),(n.shouldUnregister||X.shouldUnregister)&&!(Yk(u.array,N)&&l.action)&&u.unMount.add(N)}}},Oe=()=>n.shouldFocusError&&op(o,Q,u.mount),Ie=N=>{Uo(N)&&(g.state.next({disabled:N}),op(o,X=>{X.disabled=N},0,!1))},ze=(N,X)=>async ee=>{ee&&(ee.preventDefault&&ee.preventDefault(),ee.persist&&ee.persist());let Se=uo(s);if(g.state.next({isSubmitting:!0}),n.resolver){const{errors:pe,values:he}=await z();r.errors=pe,Se=he}else await ie(o);pn(r.errors,"root"),Fr(r.errors)?(g.state.next({errors:{}}),await N(Se,ee)):(X&&await X({...r.errors},ee),Oe(),setTimeout(Oe)),g.state.next({isSubmitted:!0,isSubmitting:!1,isSubmitSuccessful:Fr(r.errors),submitCount:r.submitCount+1,errors:r.errors})},ht=(N,X={})=>{ye(o,N)&&(Wt(X.defaultValue)?G(N,ye(i,N)):(G(N,X.defaultValue),wt(i,N,X.defaultValue)),X.keepTouched||pn(r.touchedFields,N),X.keepDirty||(pn(r.dirtyFields,N),r.isDirty=X.defaultValue?te(N,ye(i,N)):te()),X.keepError||(pn(r.errors,N),p.isValid&&b()),g.state.next({...r}))},st=(N,X={})=>{const ee=N?uo(N):i,Se=uo(ee),pe=N&&!Fr(N)?Se:i;if(X.keepDefaultValues||(i=ee),!X.keepValues){if(X.keepDirtyValues||y)for(const he of u.mount)ye(r.dirtyFields,he)?wt(pe,he,ye(s,he)):G(he,ye(pe,he));else{if(Zx&&Wt(N))for(const he of u.mount){const Te=ye(o,he);if(Te&&Te._f){const ut=Array.isArray(Te._f.refs)?Te._f.refs[0]:Te._f.ref;if(Yp(ut)){const gt=ut.closest("form");if(gt){gt.reset();break}}}}o={}}s=e.shouldUnregister?X.keepDefaultValues?uo(i):{}:uo(pe),g.array.next({values:{...pe}}),g.values.next({values:{...pe}})}u={mount:new Set,unMount:new Set,array:new Set,watch:new 
Set,watchAll:!1,focus:""},!l.mount&&t(),l.mount=!p.isValid||!!X.keepIsValid,l.watch=!!e.shouldUnregister,g.state.next({submitCount:X.keepSubmitCount?r.submitCount:0,isDirty:X.keepDirty?r.isDirty:!!(X.keepDefaultValues&&!va(N,i)),isSubmitted:X.keepIsSubmitted?r.isSubmitted:!1,dirtyFields:X.keepDirtyValues?r.dirtyFields:X.keepDefaultValues&&N?z0(i,N):{},touchedFields:X.keepTouched?r.touchedFields:{},errors:X.keepErrors?r.errors:{},isSubmitSuccessful:X.keepIsSubmitSuccessful?r.isSubmitSuccessful:!1,isSubmitting:!1})},Yt=(N,X)=>st(ws(N)?N(s):N,X);return{control:{register:Ee,unregister:me,getFieldState:ue,handleSubmit:ze,setError:Ke,_executeSchema:z,_getWatch:U,_getDirty:te,_updateValid:b,_removeUnmounted:K,_updateFieldArray:R,_updateDisabledField:be,_getFieldArray:re,_reset:st,_resetDefaultValues:()=>ws(n.defaultValues)&&n.defaultValues().then(N=>{Yt(N,n.resetOptions),g.state.next({isLoading:!1})}),_updateFormState:N=>{r={...r,...N}},_disableForm:Ie,_subjects:g,_proxyFormState:p,get _fields(){return o},get _formValues(){return s},get _state(){return l},set _state(N){l=N},get _defaultValues(){return i},get _names(){return u},set _names(N){u=N},get _formState(){return r},set _formState(N){r=N},get _options(){return n},set _options(N){n={...n,...N}}},trigger:le,register:Ee,handleSubmit:ze,watch:Me,setValue:G,getValues:L,reset:Yt,resetField:ht,clearErrors:Ne,unregister:me,setError:Ke,setFocus:(N,X={})=>{const ee=ye(o,N),Se=ee&&ee._f;if(Se){const pe=Se.refs?Se.refs[0]:Se.ref;pe.focus&&(pe.focus(),X.shouldSelect&&pe.select())}},getFieldState:ue}}function YY(e={}){const t=Be.useRef(),n=Be.useRef(),[r,o]=Be.useState({isDirty:!1,isValidating:!1,isLoading:ws(e.defaultValues),isSubmitted:!1,isSubmitting:!1,isSubmitSuccessful:!1,isValid:!1,submitCount:0,dirtyFields:{},touchedFields:{},errors:{},disabled:!1,defaultValues:ws(e.defaultValues)?void 0:e.defaultValues});t.current||(t.current={...GY(e,()=>o(s=>({...s}))),formState:r});const i=t.current.control;return i._options=e,qx({subject:i._subjects.state,next:s=>{qk(s,i._proxyFormState,i._updateFormState,!0)&&o({...i._formState})}}),Be.useEffect(()=>i._disableForm(e.disabled),[i,e.disabled]),Be.useEffect(()=>{if(i._proxyFormState.isDirty){const s=i._getDirty();s!==r.isDirty&&i._subjects.state.next({isDirty:s})}},[i,r.isDirty]),Be.useEffect(()=>{e.values&&!va(e.values,n.current)?(i._reset(e.values,i._options.resetOptions),n.current=e.values):i._resetDefaultValues()},[e.values,i]),Be.useEffect(()=>{i._state.mount||(i._updateValid(),i._state.mount=!0),i._state.watch&&(i._state.watch=!1,i._subjects.state.next({...i._formState})),i._removeUnmounted()}),t.current.formState=Zk(r,i),t.current}var N2=function(e,t,n){if(e&&"reportValidity"in e){var r=ye(n,t);e.setCustomValidity(r&&r.message||""),e.reportValidity()}},l3=function(e,t){var n=function(o){var i=t.fields[o];i&&i.ref&&"reportValidity"in i.ref?N2(i.ref,o,e):i.refs&&i.refs.forEach(function(s){return N2(s,o,e)})};for(var r in t.fields)n(r)},XY=function(e,t){t.shouldUseNativeValidation&&l3(e,t);var n={};for(var r in e){var o=ye(t.fields,r),i=Object.assign(e[r]||{},{ref:o&&o.ref});if(qY(t.names||Object.keys(e),r)){var s=Object.assign({},ZY(ye(n,r)));wt(s,"root",i),wt(n,r,s)}else wt(n,r,i)}return n},ZY=function(e){return Array.isArray(e)?e.filter(Boolean):[]},qY=function(e,t){return e.some(function(n){return n.startsWith(t+".")})},QY=function(e,t){for(var n={};e.length;){var r=e[0],o=r.code,i=r.message,s=r.path.join(".");if(!n[s])if("unionErrors"in r){var 
l=r.unionErrors[0].errors[0];n[s]={message:l.message,type:l.code}}else n[s]={message:i,type:o};if("unionErrors"in r&&r.unionErrors.forEach(function(m){return m.errors.forEach(function(p){return e.push(p)})}),t){var u=n[s].types,f=u&&u[r.code];n[s]=t3(s,t,n,o,f?[].concat(f,r.message):r.message)}e.shift()}return n},JY=function(e,t,n){return n===void 0&&(n={}),function(r,o,i){try{return Promise.resolve(function(s,l){try{var u=Promise.resolve(e[n.mode==="sync"?"parse":"parseAsync"](r,t)).then(function(f){return i.shouldUseNativeValidation&&l3({},i),{errors:{},values:n.raw?r:f}})}catch(f){return l(f)}return u&&u.then?u.then(void 0,l):u}(0,function(s){if(function(l){return l.errors!=null}(s))return{values:{},errors:XY(QY(s.errors,!i.shouldUseNativeValidation&&i.criteriaMode==="all"),i)};throw s}))}catch(s){return Promise.reject(s)}}},mt;(function(e){e.assertEqual=o=>o;function t(o){}e.assertIs=t;function n(o){throw new Error}e.assertNever=n,e.arrayToEnum=o=>{const i={};for(const s of o)i[s]=s;return i},e.getValidEnumValues=o=>{const i=e.objectKeys(o).filter(l=>typeof o[o[l]]!="number"),s={};for(const l of i)s[l]=o[l];return e.objectValues(s)},e.objectValues=o=>e.objectKeys(o).map(function(i){return o[i]}),e.objectKeys=typeof Object.keys=="function"?o=>Object.keys(o):o=>{const i=[];for(const s in o)Object.prototype.hasOwnProperty.call(o,s)&&i.push(s);return i},e.find=(o,i)=>{for(const s of o)if(i(s))return s},e.isInteger=typeof Number.isInteger=="function"?o=>Number.isInteger(o):o=>typeof o=="number"&&isFinite(o)&&Math.floor(o)===o;function r(o,i=" | "){return o.map(s=>typeof s=="string"?`'${s}'`:s).join(i)}e.joinValues=r,e.jsonStringifyReplacer=(o,i)=>typeof i=="bigint"?i.toString():i})(mt||(mt={}));var D2;(function(e){e.mergeShapes=(t,n)=>({...t,...n})})(D2||(D2={}));const _e=mt.arrayToEnum(["string","nan","number","integer","float","boolean","date","bigint","symbol","function","undefined","null","array","object","unknown","promise","void","never","map","set"]),da=e=>{switch(typeof e){case"undefined":return _e.undefined;case"string":return _e.string;case"number":return isNaN(e)?_e.nan:_e.number;case"boolean":return _e.boolean;case"function":return _e.function;case"bigint":return _e.bigint;case"symbol":return _e.symbol;case"object":return Array.isArray(e)?_e.array:e===null?_e.null:e.then&&typeof e.then=="function"&&e.catch&&typeof e.catch=="function"?_e.promise:typeof Map<"u"&&e instanceof Map?_e.map:typeof Set<"u"&&e instanceof Set?_e.set:typeof Date<"u"&&e instanceof Date?_e.date:_e.object;default:return _e.unknown}},ge=mt.arrayToEnum(["invalid_type","invalid_literal","custom","invalid_union","invalid_union_discriminator","invalid_enum_value","unrecognized_keys","invalid_arguments","invalid_return_type","invalid_date","invalid_string","too_small","too_big","invalid_intersection_types","not_multiple_of","not_finite"]);class Xo extends Error{constructor(t){super(),this.issues=[],this.addIssue=r=>{this.issues=[...this.issues,r]},this.addIssues=(r=[])=>{this.issues=[...this.issues,...r]};const n=new.target.prototype;Object.setPrototypeOf?Object.setPrototypeOf(this,n):this.__proto__=n,this.name="ZodError",this.issues=t}get errors(){return this.issues}format(t){const n=t||function(i){return i.message},r={_errors:[]},o=i=>{for(const s of i.issues)if(s.code==="invalid_union")s.unionErrors.map(o);else if(s.code==="invalid_return_type")o(s.returnTypeError);else if(s.code==="invalid_arguments")o(s.argumentsError);else if(s.path.length===0)r._errors.push(n(s));else{let l=r,u=0;for(;un.message){const 
n={},r=[];for(const o of this.issues)o.path.length>0?(n[o.path[0]]=n[o.path[0]]||[],n[o.path[0]].push(t(o))):r.push(t(o));return{formErrors:r,fieldErrors:n}}get formErrors(){return this.flatten()}}Xo.create=e=>new Xo(e);const Qp=(e,t)=>{let n;switch(e.code){case ge.invalid_type:e.received===_e.undefined?n="Required":n=`Expected ${e.expected}, received ${e.received}`;break;case ge.invalid_literal:n=`Invalid literal value, expected ${JSON.stringify(e.expected,mt.jsonStringifyReplacer)}`;break;case ge.unrecognized_keys:n=`Unrecognized key(s) in object: ${mt.joinValues(e.keys,", ")}`;break;case ge.invalid_union:n="Invalid input";break;case ge.invalid_union_discriminator:n=`Invalid discriminator value. Expected ${mt.joinValues(e.options)}`;break;case ge.invalid_enum_value:n=`Invalid enum value. Expected ${mt.joinValues(e.options)}, received '${e.received}'`;break;case ge.invalid_arguments:n="Invalid function arguments";break;case ge.invalid_return_type:n="Invalid function return type";break;case ge.invalid_date:n="Invalid date";break;case ge.invalid_string:typeof e.validation=="object"?"includes"in e.validation?(n=`Invalid input: must include "${e.validation.includes}"`,typeof e.validation.position=="number"&&(n=`${n} at one or more positions greater than or equal to ${e.validation.position}`)):"startsWith"in e.validation?n=`Invalid input: must start with "${e.validation.startsWith}"`:"endsWith"in e.validation?n=`Invalid input: must end with "${e.validation.endsWith}"`:mt.assertNever(e.validation):e.validation!=="regex"?n=`Invalid ${e.validation}`:n="Invalid";break;case ge.too_small:e.type==="array"?n=`Array must contain ${e.exact?"exactly":e.inclusive?"at least":"more than"} ${e.minimum} element(s)`:e.type==="string"?n=`String must contain ${e.exact?"exactly":e.inclusive?"at least":"over"} ${e.minimum} character(s)`:e.type==="number"?n=`Number must be ${e.exact?"exactly equal to ":e.inclusive?"greater than or equal to ":"greater than "}${e.minimum}`:e.type==="date"?n=`Date must be ${e.exact?"exactly equal to ":e.inclusive?"greater than or equal to ":"greater than "}${new Date(Number(e.minimum))}`:n="Invalid input";break;case ge.too_big:e.type==="array"?n=`Array must contain ${e.exact?"exactly":e.inclusive?"at most":"less than"} ${e.maximum} element(s)`:e.type==="string"?n=`String must contain ${e.exact?"exactly":e.inclusive?"at most":"under"} ${e.maximum} character(s)`:e.type==="number"?n=`Number must be ${e.exact?"exactly":e.inclusive?"less than or equal to":"less than"} ${e.maximum}`:e.type==="bigint"?n=`BigInt must be ${e.exact?"exactly":e.inclusive?"less than or equal to":"less than"} ${e.maximum}`:e.type==="date"?n=`Date must be ${e.exact?"exactly":e.inclusive?"smaller than or equal to":"smaller than"} ${new Date(Number(e.maximum))}`:n="Invalid input";break;case ge.custom:n="Invalid input";break;case ge.invalid_intersection_types:n="Intersection results could not be merged";break;case ge.not_multiple_of:n=`Number must be a multiple of ${e.multipleOf}`;break;case ge.not_finite:n="Number must be finite";break;default:n=t.defaultError,mt.assertNever(e)}return{message:n}};let eX=Qp;function T1(){return eX}const k1=e=>{const{data:t,path:n,errorMaps:r,issueData:o}=e,i=[...n,...o.path||[]],s={...o,path:i};let l="";const u=r.filter(f=>!!f).slice().reverse();for(const f of u)l=f(s,{data:t,defaultError:l}).message;return{...o,path:i,message:o.message||l}};function Re(e,t){const 
n=k1({issueData:t,data:e.data,path:e.path,errorMaps:[e.common.contextualErrorMap,e.schemaErrorMap,T1(),Qp].filter(r=>!!r)});e.common.issues.push(n)}class Bn{constructor(){this.value="valid"}dirty(){this.value==="valid"&&(this.value="dirty")}abort(){this.value!=="aborted"&&(this.value="aborted")}static mergeArray(t,n){const r=[];for(const o of n){if(o.status==="aborted")return Ze;o.status==="dirty"&&t.dirty(),r.push(o.value)}return{status:t.value,value:r}}static async mergeObjectAsync(t,n){const r=[];for(const o of n)r.push({key:await o.key,value:await o.value});return Bn.mergeObjectSync(t,r)}static mergeObjectSync(t,n){const r={};for(const o of n){const{key:i,value:s}=o;if(i.status==="aborted"||s.status==="aborted")return Ze;i.status==="dirty"&&t.dirty(),s.status==="dirty"&&t.dirty(),i.value!=="__proto__"&&(typeof s.value<"u"||o.alwaysSet)&&(r[i.value]=s.value)}return{status:t.value,value:r}}}const Ze=Object.freeze({status:"aborted"}),tX=e=>({status:"dirty",value:e}),nr=e=>({status:"valid",value:e}),I2=e=>e.status==="aborted",L2=e=>e.status==="dirty",Jp=e=>e.status==="valid",A1=e=>typeof Promise<"u"&&e instanceof Promise;var De;(function(e){e.errToObj=t=>typeof t=="string"?{message:t}:t||{},e.toString=t=>typeof t=="string"?t:t==null?void 0:t.message})(De||(De={}));class ei{constructor(t,n,r,o){this._cachedPath=[],this.parent=t,this.data=n,this._path=r,this._key=o}get path(){return this._cachedPath.length||(this._key instanceof Array?this._cachedPath.push(...this._path,...this._key):this._cachedPath.push(...this._path,this._key)),this._cachedPath}}const F2=(e,t)=>{if(Jp(t))return{success:!0,data:t.value};if(!e.common.issues.length)throw new Error("Validation failed but no issues detected.");return{success:!1,get error(){if(this._error)return this._error;const n=new Xo(e.common.issues);return this._error=n,this._error}}};function Xe(e){if(!e)return{};const{errorMap:t,invalid_type_error:n,required_error:r,description:o}=e;if(t&&(n||r))throw new Error(`Can't use "invalid_type_error" or "required_error" in conjunction with custom error map.`);return t?{errorMap:t,description:o}:{errorMap:(s,l)=>s.code!=="invalid_type"?{message:l.defaultError}:typeof l.data>"u"?{message:r??l.defaultError}:{message:n??l.defaultError},description:o}}class rt{constructor(t){this.spa=this.safeParseAsync,this._def=t,this.parse=this.parse.bind(this),this.safeParse=this.safeParse.bind(this),this.parseAsync=this.parseAsync.bind(this),this.safeParseAsync=this.safeParseAsync.bind(this),this.spa=this.spa.bind(this),this.refine=this.refine.bind(this),this.refinement=this.refinement.bind(this),this.superRefine=this.superRefine.bind(this),this.optional=this.optional.bind(this),this.nullable=this.nullable.bind(this),this.nullish=this.nullish.bind(this),this.array=this.array.bind(this),this.promise=this.promise.bind(this),this.or=this.or.bind(this),this.and=this.and.bind(this),this.transform=this.transform.bind(this),this.brand=this.brand.bind(this),this.default=this.default.bind(this),this.catch=this.catch.bind(this),this.describe=this.describe.bind(this),this.pipe=this.pipe.bind(this),this.readonly=this.readonly.bind(this),this.isNullable=this.isNullable.bind(this),this.isOptional=this.isOptional.bind(this)}get description(){return this._def.description}_getType(t){return da(t.data)}_getOrReturnCtx(t,n){return n||{common:t.parent.common,data:t.data,parsedType:da(t.data),schemaErrorMap:this._def.errorMap,path:t.path,parent:t.parent}}_processInputParams(t){return{status:new 
Bn,ctx:{common:t.parent.common,data:t.data,parsedType:da(t.data),schemaErrorMap:this._def.errorMap,path:t.path,parent:t.parent}}}_parseSync(t){const n=this._parse(t);if(A1(n))throw new Error("Synchronous parse encountered promise.");return n}_parseAsync(t){const n=this._parse(t);return Promise.resolve(n)}parse(t,n){const r=this.safeParse(t,n);if(r.success)return r.data;throw r.error}safeParse(t,n){var r;const o={common:{issues:[],async:(r=n==null?void 0:n.async)!==null&&r!==void 0?r:!1,contextualErrorMap:n==null?void 0:n.errorMap},path:(n==null?void 0:n.path)||[],schemaErrorMap:this._def.errorMap,parent:null,data:t,parsedType:da(t)},i=this._parseSync({data:t,path:o.path,parent:o});return F2(o,i)}async parseAsync(t,n){const r=await this.safeParseAsync(t,n);if(r.success)return r.data;throw r.error}async safeParseAsync(t,n){const r={common:{issues:[],contextualErrorMap:n==null?void 0:n.errorMap,async:!0},path:(n==null?void 0:n.path)||[],schemaErrorMap:this._def.errorMap,parent:null,data:t,parsedType:da(t)},o=this._parse({data:t,path:r.path,parent:r}),i=await(A1(o)?o:Promise.resolve(o));return F2(r,i)}refine(t,n){const r=o=>typeof n=="string"||typeof n>"u"?{message:n}:typeof n=="function"?n(o):n;return this._refinement((o,i)=>{const s=t(o),l=()=>i.addIssue({code:ge.custom,...r(o)});return typeof Promise<"u"&&s instanceof Promise?s.then(u=>u?!0:(l(),!1)):s?!0:(l(),!1)})}refinement(t,n){return this._refinement((r,o)=>t(r)?!0:(o.addIssue(typeof n=="function"?n(r,o):n),!1))}_refinement(t){return new Mi({schema:this,typeName:Ue.ZodEffects,effect:{type:"refinement",refinement:t}})}superRefine(t){return this._refinement(t)}optional(){return Ts.create(this,this._def)}nullable(){return yc.create(this,this._def)}nullish(){return this.nullable().optional()}array(){return Zo.create(this,this._def)}promise(){return pd.create(this,this._def)}or(t){return nm.create([this,t],this._def)}and(t){return rm.create(this,t,this._def)}transform(t){return new Mi({...Xe(this._def),schema:this,typeName:Ue.ZodEffects,effect:{type:"transform",transform:t}})}default(t){const n=typeof t=="function"?t:()=>t;return new lm({...Xe(this._def),innerType:this,defaultValue:n,typeName:Ue.ZodDefault})}brand(){return new hX({typeName:Ue.ZodBranded,type:this,...Xe(this._def)})}catch(t){const n=typeof t=="function"?t:()=>t;return new F1({...Xe(this._def),innerType:this,catchValue:n,typeName:Ue.ZodCatch})}describe(t){const n=this.constructor;return new n({...this._def,description:t})}pipe(t){return pg.create(this,t)}readonly(){return B1.create(this)}isOptional(){return this.safeParse(void 0).success}isNullable(){return this.safeParse(null).success}}const nX=/^c[^\s-]{8,}$/i,rX=/^[a-z][a-z0-9]*$/,oX=/^[0-9A-HJKMNP-TV-Z]{26}$/,iX=/^[0-9a-fA-F]{8}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{4}\b-[0-9a-fA-F]{12}$/i,sX=/^(?!\.)(?!.*\.\.)([A-Z0-9_+-\.]*)[A-Z0-9_+-]@([A-Z0-9][A-Z0-9\-]*\.)+[A-Z]{2,}$/i,aX="^(\\p{Extended_Pictographic}|\\p{Emoji_Component})+$";let V0;const lX=/^(((25[0-5])|(2[0-4][0-9])|(1[0-9]{2})|([0-9]{1,2}))\.){3}((25[0-5])|(2[0-4][0-9])|(1[0-9]{2})|([0-9]{1,2}))$/,cX=/^(([a-f0-9]{1,4}:){7}|::([a-f0-9]{1,4}:){0,6}|([a-f0-9]{1,4}:){1}:([a-f0-9]{1,4}:){0,5}|([a-f0-9]{1,4}:){2}:([a-f0-9]{1,4}:){0,4}|([a-f0-9]{1,4}:){3}:([a-f0-9]{1,4}:){0,3}|([a-f0-9]{1,4}:){4}:([a-f0-9]{1,4}:){0,2}|([a-f0-9]{1,4}:){5}:([a-f0-9]{1,4}:){0,1})([a-f0-9]{1,4}|(((25[0-5])|(2[0-4][0-9])|(1[0-9]{2})|([0-9]{1,2}))\.){3}((25[0-5])|(2[0-4][0-9])|(1[0-9]{2})|([0-9]{1,2})))$/,uX=e=>e.precision?e.offset?new 
RegExp(`^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{${e.precision}}(([+-]\\d{2}(:?\\d{2})?)|Z)$`):new RegExp(`^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}\\.\\d{${e.precision}}Z$`):e.precision===0?e.offset?new RegExp("^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(([+-]\\d{2}(:?\\d{2})?)|Z)$"):new RegExp("^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}Z$"):e.offset?new RegExp("^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(\\.\\d+)?(([+-]\\d{2}(:?\\d{2})?)|Z)$"):new RegExp("^\\d{4}-\\d{2}-\\d{2}T\\d{2}:\\d{2}:\\d{2}(\\.\\d+)?Z$");function dX(e,t){return!!((t==="v4"||!t)&&lX.test(e)||(t==="v6"||!t)&&cX.test(e))}class xi extends rt{_parse(t){if(this._def.coerce&&(t.data=String(t.data)),this._getType(t)!==_e.string){const i=this._getOrReturnCtx(t);return Re(i,{code:ge.invalid_type,expected:_e.string,received:i.parsedType}),Ze}const r=new Bn;let o;for(const i of this._def.checks)if(i.kind==="min")t.data.lengthi.value&&(o=this._getOrReturnCtx(t,o),Re(o,{code:ge.too_big,maximum:i.value,type:"string",inclusive:!0,exact:!1,message:i.message}),r.dirty());else if(i.kind==="length"){const s=t.data.length>i.value,l=t.data.lengtht.test(o),{validation:n,code:ge.invalid_string,...De.errToObj(r)})}_addCheck(t){return new xi({...this._def,checks:[...this._def.checks,t]})}email(t){return this._addCheck({kind:"email",...De.errToObj(t)})}url(t){return this._addCheck({kind:"url",...De.errToObj(t)})}emoji(t){return this._addCheck({kind:"emoji",...De.errToObj(t)})}uuid(t){return this._addCheck({kind:"uuid",...De.errToObj(t)})}cuid(t){return this._addCheck({kind:"cuid",...De.errToObj(t)})}cuid2(t){return this._addCheck({kind:"cuid2",...De.errToObj(t)})}ulid(t){return this._addCheck({kind:"ulid",...De.errToObj(t)})}ip(t){return this._addCheck({kind:"ip",...De.errToObj(t)})}datetime(t){var n;return typeof t=="string"?this._addCheck({kind:"datetime",precision:null,offset:!1,message:t}):this._addCheck({kind:"datetime",precision:typeof(t==null?void 0:t.precision)>"u"?null:t==null?void 0:t.precision,offset:(n=t==null?void 0:t.offset)!==null&&n!==void 0?n:!1,...De.errToObj(t==null?void 0:t.message)})}regex(t,n){return this._addCheck({kind:"regex",regex:t,...De.errToObj(n)})}includes(t,n){return this._addCheck({kind:"includes",value:t,position:n==null?void 0:n.position,...De.errToObj(n==null?void 0:n.message)})}startsWith(t,n){return this._addCheck({kind:"startsWith",value:t,...De.errToObj(n)})}endsWith(t,n){return this._addCheck({kind:"endsWith",value:t,...De.errToObj(n)})}min(t,n){return this._addCheck({kind:"min",value:t,...De.errToObj(n)})}max(t,n){return this._addCheck({kind:"max",value:t,...De.errToObj(n)})}length(t,n){return this._addCheck({kind:"length",value:t,...De.errToObj(n)})}nonempty(t){return this.min(1,De.errToObj(t))}trim(){return new xi({...this._def,checks:[...this._def.checks,{kind:"trim"}]})}toLowerCase(){return new xi({...this._def,checks:[...this._def.checks,{kind:"toLowerCase"}]})}toUpperCase(){return new xi({...this._def,checks:[...this._def.checks,{kind:"toUpperCase"}]})}get isDatetime(){return!!this._def.checks.find(t=>t.kind==="datetime")}get isEmail(){return!!this._def.checks.find(t=>t.kind==="email")}get isURL(){return!!this._def.checks.find(t=>t.kind==="url")}get isEmoji(){return!!this._def.checks.find(t=>t.kind==="emoji")}get isUUID(){return!!this._def.checks.find(t=>t.kind==="uuid")}get isCUID(){return!!this._def.checks.find(t=>t.kind==="cuid")}get isCUID2(){return!!this._def.checks.find(t=>t.kind==="cuid2")}get isULID(){return!!this._def.checks.find(t=>t.kind==="ulid")}get 
isIP(){return!!this._def.checks.find(t=>t.kind==="ip")}get minLength(){let t=null;for(const n of this._def.checks)n.kind==="min"&&(t===null||n.value>t)&&(t=n.value);return t}get maxLength(){let t=null;for(const n of this._def.checks)n.kind==="max"&&(t===null||n.value{var t;return new xi({checks:[],typeName:Ue.ZodString,coerce:(t=e==null?void 0:e.coerce)!==null&&t!==void 0?t:!1,...Xe(e)})};function fX(e,t){const n=(e.toString().split(".")[1]||"").length,r=(t.toString().split(".")[1]||"").length,o=n>r?n:r,i=parseInt(e.toFixed(o).replace(".","")),s=parseInt(t.toFixed(o).replace(".",""));return i%s/Math.pow(10,o)}class mc extends rt{constructor(){super(...arguments),this.min=this.gte,this.max=this.lte,this.step=this.multipleOf}_parse(t){if(this._def.coerce&&(t.data=Number(t.data)),this._getType(t)!==_e.number){const i=this._getOrReturnCtx(t);return Re(i,{code:ge.invalid_type,expected:_e.number,received:i.parsedType}),Ze}let r;const o=new Bn;for(const i of this._def.checks)i.kind==="int"?mt.isInteger(t.data)||(r=this._getOrReturnCtx(t,r),Re(r,{code:ge.invalid_type,expected:"integer",received:"float",message:i.message}),o.dirty()):i.kind==="min"?(i.inclusive?t.datai.value:t.data>=i.value)&&(r=this._getOrReturnCtx(t,r),Re(r,{code:ge.too_big,maximum:i.value,type:"number",inclusive:i.inclusive,exact:!1,message:i.message}),o.dirty()):i.kind==="multipleOf"?fX(t.data,i.value)!==0&&(r=this._getOrReturnCtx(t,r),Re(r,{code:ge.not_multiple_of,multipleOf:i.value,message:i.message}),o.dirty()):i.kind==="finite"?Number.isFinite(t.data)||(r=this._getOrReturnCtx(t,r),Re(r,{code:ge.not_finite,message:i.message}),o.dirty()):mt.assertNever(i);return{status:o.value,value:t.data}}gte(t,n){return this.setLimit("min",t,!0,De.toString(n))}gt(t,n){return this.setLimit("min",t,!1,De.toString(n))}lte(t,n){return this.setLimit("max",t,!0,De.toString(n))}lt(t,n){return this.setLimit("max",t,!1,De.toString(n))}setLimit(t,n,r,o){return new mc({...this._def,checks:[...this._def.checks,{kind:t,value:n,inclusive:r,message:De.toString(o)}]})}_addCheck(t){return new mc({...this._def,checks:[...this._def.checks,t]})}int(t){return this._addCheck({kind:"int",message:De.toString(t)})}positive(t){return this._addCheck({kind:"min",value:0,inclusive:!1,message:De.toString(t)})}negative(t){return this._addCheck({kind:"max",value:0,inclusive:!1,message:De.toString(t)})}nonpositive(t){return this._addCheck({kind:"max",value:0,inclusive:!0,message:De.toString(t)})}nonnegative(t){return this._addCheck({kind:"min",value:0,inclusive:!0,message:De.toString(t)})}multipleOf(t,n){return this._addCheck({kind:"multipleOf",value:t,message:De.toString(n)})}finite(t){return this._addCheck({kind:"finite",message:De.toString(t)})}safe(t){return this._addCheck({kind:"min",inclusive:!0,value:Number.MIN_SAFE_INTEGER,message:De.toString(t)})._addCheck({kind:"max",inclusive:!0,value:Number.MAX_SAFE_INTEGER,message:De.toString(t)})}get minValue(){let t=null;for(const n of this._def.checks)n.kind==="min"&&(t===null||n.value>t)&&(t=n.value);return t}get maxValue(){let t=null;for(const n of this._def.checks)n.kind==="max"&&(t===null||n.valuet.kind==="int"||t.kind==="multipleOf"&&mt.isInteger(t.value))}get isFinite(){let t=null,n=null;for(const r of this._def.checks){if(r.kind==="finite"||r.kind==="int"||r.kind==="multipleOf")return!0;r.kind==="min"?(n===null||r.value>n)&&(n=r.value):r.kind==="max"&&(t===null||r.valuenew mc({checks:[],typeName:Ue.ZodNumber,coerce:(e==null?void 0:e.coerce)||!1,...Xe(e)});class gc extends 
rt{constructor(){super(...arguments),this.min=this.gte,this.max=this.lte}_parse(t){if(this._def.coerce&&(t.data=BigInt(t.data)),this._getType(t)!==_e.bigint){const i=this._getOrReturnCtx(t);return Re(i,{code:ge.invalid_type,expected:_e.bigint,received:i.parsedType}),Ze}let r;const o=new Bn;for(const i of this._def.checks)i.kind==="min"?(i.inclusive?t.datai.value:t.data>=i.value)&&(r=this._getOrReturnCtx(t,r),Re(r,{code:ge.too_big,type:"bigint",maximum:i.value,inclusive:i.inclusive,message:i.message}),o.dirty()):i.kind==="multipleOf"?t.data%i.value!==BigInt(0)&&(r=this._getOrReturnCtx(t,r),Re(r,{code:ge.not_multiple_of,multipleOf:i.value,message:i.message}),o.dirty()):mt.assertNever(i);return{status:o.value,value:t.data}}gte(t,n){return this.setLimit("min",t,!0,De.toString(n))}gt(t,n){return this.setLimit("min",t,!1,De.toString(n))}lte(t,n){return this.setLimit("max",t,!0,De.toString(n))}lt(t,n){return this.setLimit("max",t,!1,De.toString(n))}setLimit(t,n,r,o){return new gc({...this._def,checks:[...this._def.checks,{kind:t,value:n,inclusive:r,message:De.toString(o)}]})}_addCheck(t){return new gc({...this._def,checks:[...this._def.checks,t]})}positive(t){return this._addCheck({kind:"min",value:BigInt(0),inclusive:!1,message:De.toString(t)})}negative(t){return this._addCheck({kind:"max",value:BigInt(0),inclusive:!1,message:De.toString(t)})}nonpositive(t){return this._addCheck({kind:"max",value:BigInt(0),inclusive:!0,message:De.toString(t)})}nonnegative(t){return this._addCheck({kind:"min",value:BigInt(0),inclusive:!0,message:De.toString(t)})}multipleOf(t,n){return this._addCheck({kind:"multipleOf",value:t,message:De.toString(n)})}get minValue(){let t=null;for(const n of this._def.checks)n.kind==="min"&&(t===null||n.value>t)&&(t=n.value);return t}get maxValue(){let t=null;for(const n of this._def.checks)n.kind==="max"&&(t===null||n.value{var t;return new gc({checks:[],typeName:Ue.ZodBigInt,coerce:(t=e==null?void 0:e.coerce)!==null&&t!==void 0?t:!1,...Xe(e)})};class M1 extends rt{_parse(t){if(this._def.coerce&&(t.data=!!t.data),this._getType(t)!==_e.boolean){const r=this._getOrReturnCtx(t);return Re(r,{code:ge.invalid_type,expected:_e.boolean,received:r.parsedType}),Ze}return nr(t.data)}}M1.create=e=>new M1({typeName:Ue.ZodBoolean,coerce:(e==null?void 0:e.coerce)||!1,...Xe(e)});class hd extends rt{_parse(t){if(this._def.coerce&&(t.data=new Date(t.data)),this._getType(t)!==_e.date){const i=this._getOrReturnCtx(t);return Re(i,{code:ge.invalid_type,expected:_e.date,received:i.parsedType}),Ze}if(isNaN(t.data.getTime())){const i=this._getOrReturnCtx(t);return Re(i,{code:ge.invalid_date}),Ze}const r=new Bn;let o;for(const i of this._def.checks)i.kind==="min"?t.data.getTime()i.value&&(o=this._getOrReturnCtx(t,o),Re(o,{code:ge.too_big,message:i.message,inclusive:!0,exact:!1,maximum:i.value,type:"date"}),r.dirty()):mt.assertNever(i);return{status:r.value,value:new Date(t.data.getTime())}}_addCheck(t){return new hd({...this._def,checks:[...this._def.checks,t]})}min(t,n){return this._addCheck({kind:"min",value:t.getTime(),message:De.toString(n)})}max(t,n){return this._addCheck({kind:"max",value:t.getTime(),message:De.toString(n)})}get minDate(){let t=null;for(const n of this._def.checks)n.kind==="min"&&(t===null||n.value>t)&&(t=n.value);return t!=null?new Date(t):null}get maxDate(){let t=null;for(const n of this._def.checks)n.kind==="max"&&(t===null||n.valuenew hd({checks:[],coerce:(e==null?void 0:e.coerce)||!1,typeName:Ue.ZodDate,...Xe(e)});class O1 extends 
rt{_parse(t){if(this._getType(t)!==_e.symbol){const r=this._getOrReturnCtx(t);return Re(r,{code:ge.invalid_type,expected:_e.symbol,received:r.parsedType}),Ze}return nr(t.data)}}O1.create=e=>new O1({typeName:Ue.ZodSymbol,...Xe(e)});class em extends rt{_parse(t){if(this._getType(t)!==_e.undefined){const r=this._getOrReturnCtx(t);return Re(r,{code:ge.invalid_type,expected:_e.undefined,received:r.parsedType}),Ze}return nr(t.data)}}em.create=e=>new em({typeName:Ue.ZodUndefined,...Xe(e)});class tm extends rt{_parse(t){if(this._getType(t)!==_e.null){const r=this._getOrReturnCtx(t);return Re(r,{code:ge.invalid_type,expected:_e.null,received:r.parsedType}),Ze}return nr(t.data)}}tm.create=e=>new tm({typeName:Ue.ZodNull,...Xe(e)});class N1 extends rt{constructor(){super(...arguments),this._any=!0}_parse(t){return nr(t.data)}}N1.create=e=>new N1({typeName:Ue.ZodAny,...Xe(e)});class Ll extends rt{constructor(){super(...arguments),this._unknown=!0}_parse(t){return nr(t.data)}}Ll.create=e=>new Ll({typeName:Ue.ZodUnknown,...Xe(e)});class Is extends rt{_parse(t){const n=this._getOrReturnCtx(t);return Re(n,{code:ge.invalid_type,expected:_e.never,received:n.parsedType}),Ze}}Is.create=e=>new Is({typeName:Ue.ZodNever,...Xe(e)});class D1 extends rt{_parse(t){if(this._getType(t)!==_e.undefined){const r=this._getOrReturnCtx(t);return Re(r,{code:ge.invalid_type,expected:_e.void,received:r.parsedType}),Ze}return nr(t.data)}}D1.create=e=>new D1({typeName:Ue.ZodVoid,...Xe(e)});class Zo extends rt{_parse(t){const{ctx:n,status:r}=this._processInputParams(t),o=this._def;if(n.parsedType!==_e.array)return Re(n,{code:ge.invalid_type,expected:_e.array,received:n.parsedType}),Ze;if(o.exactLength!==null){const s=n.data.length>o.exactLength.value,l=n.data.lengtho.maxLength.value&&(Re(n,{code:ge.too_big,maximum:o.maxLength.value,type:"array",inclusive:!0,exact:!1,message:o.maxLength.message}),r.dirty()),n.common.async)return Promise.all([...n.data].map((s,l)=>o.type._parseAsync(new ei(n,s,n.path,l)))).then(s=>Bn.mergeArray(r,s));const i=[...n.data].map((s,l)=>o.type._parseSync(new ei(n,s,n.path,l)));return Bn.mergeArray(r,i)}get element(){return this._def.type}min(t,n){return new Zo({...this._def,minLength:{value:t,message:De.toString(n)}})}max(t,n){return new Zo({...this._def,maxLength:{value:t,message:De.toString(n)}})}length(t,n){return new Zo({...this._def,exactLength:{value:t,message:De.toString(n)}})}nonempty(t){return this.min(1,t)}}Zo.create=(e,t)=>new Zo({type:e,minLength:null,maxLength:null,exactLength:null,typeName:Ue.ZodArray,...Xe(t)});function pl(e){if(e instanceof Ut){const t={};for(const n in e.shape){const r=e.shape[n];t[n]=Ts.create(pl(r))}return new Ut({...e._def,shape:()=>t})}else return e instanceof Zo?new Zo({...e._def,type:pl(e.element)}):e instanceof Ts?Ts.create(pl(e.unwrap())):e instanceof yc?yc.create(pl(e.unwrap())):e instanceof Ai?Ai.create(e.items.map(t=>pl(t))):e}class Ut extends rt{constructor(){super(...arguments),this._cached=null,this.nonstrict=this.passthrough,this.augment=this.extend}_getCached(){if(this._cached!==null)return this._cached;const t=this._def.shape(),n=mt.objectKeys(t);return this._cached={shape:t,keys:n}}_parse(t){if(this._getType(t)!==_e.object){const f=this._getOrReturnCtx(t);return Re(f,{code:ge.invalid_type,expected:_e.object,received:f.parsedType}),Ze}const{status:r,ctx:o}=this._processInputParams(t),{shape:i,keys:s}=this._getCached(),l=[];if(!(this._def.catchall instanceof Is&&this._def.unknownKeys==="strip"))for(const f in o.data)s.includes(f)||l.push(f);const 
u=[];for(const f of s){const m=i[f],p=o.data[f];u.push({key:{status:"valid",value:f},value:m._parse(new ei(o,p,o.path,f)),alwaysSet:f in o.data})}if(this._def.catchall instanceof Is){const f=this._def.unknownKeys;if(f==="passthrough")for(const m of l)u.push({key:{status:"valid",value:m},value:{status:"valid",value:o.data[m]}});else if(f==="strict")l.length>0&&(Re(o,{code:ge.unrecognized_keys,keys:l}),r.dirty());else if(f!=="strip")throw new Error("Internal ZodObject error: invalid unknownKeys value.")}else{const f=this._def.catchall;for(const m of l){const p=o.data[m];u.push({key:{status:"valid",value:m},value:f._parse(new ei(o,p,o.path,m)),alwaysSet:m in o.data})}}return o.common.async?Promise.resolve().then(async()=>{const f=[];for(const m of u){const p=await m.key;f.push({key:p,value:await m.value,alwaysSet:m.alwaysSet})}return f}).then(f=>Bn.mergeObjectSync(r,f)):Bn.mergeObjectSync(r,u)}get shape(){return this._def.shape()}strict(t){return De.errToObj,new Ut({...this._def,unknownKeys:"strict",...t!==void 0?{errorMap:(n,r)=>{var o,i,s,l;const u=(s=(i=(o=this._def).errorMap)===null||i===void 0?void 0:i.call(o,n,r).message)!==null&&s!==void 0?s:r.defaultError;return n.code==="unrecognized_keys"?{message:(l=De.errToObj(t).message)!==null&&l!==void 0?l:u}:{message:u}}}:{}})}strip(){return new Ut({...this._def,unknownKeys:"strip"})}passthrough(){return new Ut({...this._def,unknownKeys:"passthrough"})}extend(t){return new Ut({...this._def,shape:()=>({...this._def.shape(),...t})})}merge(t){return new Ut({unknownKeys:t._def.unknownKeys,catchall:t._def.catchall,shape:()=>({...this._def.shape(),...t._def.shape()}),typeName:Ue.ZodObject})}setKey(t,n){return this.augment({[t]:n})}catchall(t){return new Ut({...this._def,catchall:t})}pick(t){const n={};return mt.objectKeys(t).forEach(r=>{t[r]&&this.shape[r]&&(n[r]=this.shape[r])}),new Ut({...this._def,shape:()=>n})}omit(t){const n={};return mt.objectKeys(this.shape).forEach(r=>{t[r]||(n[r]=this.shape[r])}),new Ut({...this._def,shape:()=>n})}deepPartial(){return pl(this)}partial(t){const n={};return mt.objectKeys(this.shape).forEach(r=>{const o=this.shape[r];t&&!t[r]?n[r]=o:n[r]=o.optional()}),new Ut({...this._def,shape:()=>n})}required(t){const n={};return mt.objectKeys(this.shape).forEach(r=>{if(t&&!t[r])n[r]=this.shape[r];else{let i=this.shape[r];for(;i instanceof Ts;)i=i._def.innerType;n[r]=i}}),new Ut({...this._def,shape:()=>n})}keyof(){return c3(mt.objectKeys(this.shape))}}Ut.create=(e,t)=>new Ut({shape:()=>e,unknownKeys:"strip",catchall:Is.create(),typeName:Ue.ZodObject,...Xe(t)});Ut.strictCreate=(e,t)=>new Ut({shape:()=>e,unknownKeys:"strict",catchall:Is.create(),typeName:Ue.ZodObject,...Xe(t)});Ut.lazycreate=(e,t)=>new Ut({shape:e,unknownKeys:"strip",catchall:Is.create(),typeName:Ue.ZodObject,...Xe(t)});class nm extends rt{_parse(t){const{ctx:n}=this._processInputParams(t),r=this._def.options;function o(i){for(const l of i)if(l.result.status==="valid")return l.result;for(const l of i)if(l.result.status==="dirty")return n.common.issues.push(...l.ctx.common.issues),l.result;const s=i.map(l=>new Xo(l.ctx.common.issues));return Re(n,{code:ge.invalid_union,unionErrors:s}),Ze}if(n.common.async)return Promise.all(r.map(async i=>{const s={...n,common:{...n.common,issues:[]},parent:null};return{result:await i._parseAsync({data:n.data,path:n.path,parent:s}),ctx:s}})).then(o);{let i;const s=[];for(const u of r){const f={...n,common:{...n.common,issues:[]},parent:null},m=u._parseSync({data:n.data,path:n.path,parent:f});if(m.status==="valid")return 
m;m.status==="dirty"&&!i&&(i={result:m,ctx:f}),f.common.issues.length&&s.push(f.common.issues)}if(i)return n.common.issues.push(...i.ctx.common.issues),i.result;const l=s.map(u=>new Xo(u));return Re(n,{code:ge.invalid_union,unionErrors:l}),Ze}}get options(){return this._def.options}}nm.create=(e,t)=>new nm({options:e,typeName:Ue.ZodUnion,...Xe(t)});const sp=e=>e instanceof im?sp(e.schema):e instanceof Mi?sp(e.innerType()):e instanceof sm?[e.value]:e instanceof La?e.options:e instanceof am?Object.keys(e.enum):e instanceof lm?sp(e._def.innerType):e instanceof em?[void 0]:e instanceof tm?[null]:null;class tb extends rt{_parse(t){const{ctx:n}=this._processInputParams(t);if(n.parsedType!==_e.object)return Re(n,{code:ge.invalid_type,expected:_e.object,received:n.parsedType}),Ze;const r=this.discriminator,o=n.data[r],i=this.optionsMap.get(o);return i?n.common.async?i._parseAsync({data:n.data,path:n.path,parent:n}):i._parseSync({data:n.data,path:n.path,parent:n}):(Re(n,{code:ge.invalid_union_discriminator,options:Array.from(this.optionsMap.keys()),path:[r]}),Ze)}get discriminator(){return this._def.discriminator}get options(){return this._def.options}get optionsMap(){return this._def.optionsMap}static create(t,n,r){const o=new Map;for(const i of n){const s=sp(i.shape[t]);if(!s)throw new Error(`A discriminator value for key \`${t}\` could not be extracted from all schema options`);for(const l of s){if(o.has(l))throw new Error(`Discriminator property ${String(t)} has duplicate value ${String(l)}`);o.set(l,i)}}return new tb({typeName:Ue.ZodDiscriminatedUnion,discriminator:t,options:n,optionsMap:o,...Xe(r)})}}function I1(e,t){const n=da(e),r=da(t);if(e===t)return{valid:!0,data:e};if(n===_e.object&&r===_e.object){const o=mt.objectKeys(t),i=mt.objectKeys(e).filter(l=>o.indexOf(l)!==-1),s={...e,...t};for(const l of i){const u=I1(e[l],t[l]);if(!u.valid)return{valid:!1};s[l]=u.data}return{valid:!0,data:s}}else if(n===_e.array&&r===_e.array){if(e.length!==t.length)return{valid:!1};const o=[];for(let i=0;i{if(I2(i)||I2(s))return Ze;const l=I1(i.value,s.value);return l.valid?((L2(i)||L2(s))&&n.dirty(),{status:n.value,value:l.data}):(Re(r,{code:ge.invalid_intersection_types}),Ze)};return r.common.async?Promise.all([this._def.left._parseAsync({data:r.data,path:r.path,parent:r}),this._def.right._parseAsync({data:r.data,path:r.path,parent:r})]).then(([i,s])=>o(i,s)):o(this._def.left._parseSync({data:r.data,path:r.path,parent:r}),this._def.right._parseSync({data:r.data,path:r.path,parent:r}))}}rm.create=(e,t,n)=>new rm({left:e,right:t,typeName:Ue.ZodIntersection,...Xe(n)});class Ai extends rt{_parse(t){const{status:n,ctx:r}=this._processInputParams(t);if(r.parsedType!==_e.array)return Re(r,{code:ge.invalid_type,expected:_e.array,received:r.parsedType}),Ze;if(r.data.lengththis._def.items.length&&(Re(r,{code:ge.too_big,maximum:this._def.items.length,inclusive:!0,exact:!1,type:"array"}),n.dirty());const i=[...r.data].map((s,l)=>{const u=this._def.items[l]||this._def.rest;return u?u._parse(new ei(r,s,r.path,l)):null}).filter(s=>!!s);return r.common.async?Promise.all(i).then(s=>Bn.mergeArray(n,s)):Bn.mergeArray(n,i)}get items(){return this._def.items}rest(t){return new Ai({...this._def,rest:t})}}Ai.create=(e,t)=>{if(!Array.isArray(e))throw new Error("You must pass an array of schemas to z.tuple([ ... 
])");return new Ai({items:e,typeName:Ue.ZodTuple,rest:null,...Xe(t)})};class om extends rt{get keySchema(){return this._def.keyType}get valueSchema(){return this._def.valueType}_parse(t){const{status:n,ctx:r}=this._processInputParams(t);if(r.parsedType!==_e.object)return Re(r,{code:ge.invalid_type,expected:_e.object,received:r.parsedType}),Ze;const o=[],i=this._def.keyType,s=this._def.valueType;for(const l in r.data)o.push({key:i._parse(new ei(r,l,r.path,l)),value:s._parse(new ei(r,r.data[l],r.path,l))});return r.common.async?Bn.mergeObjectAsync(n,o):Bn.mergeObjectSync(n,o)}get element(){return this._def.valueType}static create(t,n,r){return n instanceof rt?new om({keyType:t,valueType:n,typeName:Ue.ZodRecord,...Xe(r)}):new om({keyType:xi.create(),valueType:t,typeName:Ue.ZodRecord,...Xe(n)})}}class L1 extends rt{get keySchema(){return this._def.keyType}get valueSchema(){return this._def.valueType}_parse(t){const{status:n,ctx:r}=this._processInputParams(t);if(r.parsedType!==_e.map)return Re(r,{code:ge.invalid_type,expected:_e.map,received:r.parsedType}),Ze;const o=this._def.keyType,i=this._def.valueType,s=[...r.data.entries()].map(([l,u],f)=>({key:o._parse(new ei(r,l,r.path,[f,"key"])),value:i._parse(new ei(r,u,r.path,[f,"value"]))}));if(r.common.async){const l=new Map;return Promise.resolve().then(async()=>{for(const u of s){const f=await u.key,m=await u.value;if(f.status==="aborted"||m.status==="aborted")return Ze;(f.status==="dirty"||m.status==="dirty")&&n.dirty(),l.set(f.value,m.value)}return{status:n.value,value:l}})}else{const l=new Map;for(const u of s){const f=u.key,m=u.value;if(f.status==="aborted"||m.status==="aborted")return Ze;(f.status==="dirty"||m.status==="dirty")&&n.dirty(),l.set(f.value,m.value)}return{status:n.value,value:l}}}}L1.create=(e,t,n)=>new L1({valueType:t,keyType:e,typeName:Ue.ZodMap,...Xe(n)});class vc extends rt{_parse(t){const{status:n,ctx:r}=this._processInputParams(t);if(r.parsedType!==_e.set)return Re(r,{code:ge.invalid_type,expected:_e.set,received:r.parsedType}),Ze;const o=this._def;o.minSize!==null&&r.data.sizeo.maxSize.value&&(Re(r,{code:ge.too_big,maximum:o.maxSize.value,type:"set",inclusive:!0,exact:!1,message:o.maxSize.message}),n.dirty());const i=this._def.valueType;function s(u){const f=new Set;for(const m of u){if(m.status==="aborted")return Ze;m.status==="dirty"&&n.dirty(),f.add(m.value)}return{status:n.value,value:f}}const l=[...r.data.values()].map((u,f)=>i._parse(new ei(r,u,r.path,f)));return r.common.async?Promise.all(l).then(u=>s(u)):s(l)}min(t,n){return new vc({...this._def,minSize:{value:t,message:De.toString(n)}})}max(t,n){return new vc({...this._def,maxSize:{value:t,message:De.toString(n)}})}size(t,n){return this.min(t,n).max(t,n)}nonempty(t){return this.min(1,t)}}vc.create=(e,t)=>new vc({valueType:e,minSize:null,maxSize:null,typeName:Ue.ZodSet,...Xe(t)});class Lu extends rt{constructor(){super(...arguments),this.validate=this.implement}_parse(t){const{ctx:n}=this._processInputParams(t);if(n.parsedType!==_e.function)return Re(n,{code:ge.invalid_type,expected:_e.function,received:n.parsedType}),Ze;function r(l,u){return k1({data:l,path:n.path,errorMaps:[n.common.contextualErrorMap,n.schemaErrorMap,T1(),Qp].filter(f=>!!f),issueData:{code:ge.invalid_arguments,argumentsError:u}})}function o(l,u){return k1({data:l,path:n.path,errorMaps:[n.common.contextualErrorMap,n.schemaErrorMap,T1(),Qp].filter(f=>!!f),issueData:{code:ge.invalid_return_type,returnTypeError:u}})}const i={errorMap:n.common.contextualErrorMap},s=n.data;if(this._def.returns 
instanceof pd){const l=this;return nr(async function(...u){const f=new Xo([]),m=await l._def.args.parseAsync(u,i).catch(y=>{throw f.addIssue(r(u,y)),f}),p=await Reflect.apply(s,this,m);return await l._def.returns._def.type.parseAsync(p,i).catch(y=>{throw f.addIssue(o(p,y)),f})})}else{const l=this;return nr(function(...u){const f=l._def.args.safeParse(u,i);if(!f.success)throw new Xo([r(u,f.error)]);const m=Reflect.apply(s,this,f.data),p=l._def.returns.safeParse(m,i);if(!p.success)throw new Xo([o(m,p.error)]);return p.data})}}parameters(){return this._def.args}returnType(){return this._def.returns}args(...t){return new Lu({...this._def,args:Ai.create(t).rest(Ll.create())})}returns(t){return new Lu({...this._def,returns:t})}implement(t){return this.parse(t)}strictImplement(t){return this.parse(t)}static create(t,n,r){return new Lu({args:t||Ai.create([]).rest(Ll.create()),returns:n||Ll.create(),typeName:Ue.ZodFunction,...Xe(r)})}}class im extends rt{get schema(){return this._def.getter()}_parse(t){const{ctx:n}=this._processInputParams(t);return this._def.getter()._parse({data:n.data,path:n.path,parent:n})}}im.create=(e,t)=>new im({getter:e,typeName:Ue.ZodLazy,...Xe(t)});class sm extends rt{_parse(t){if(t.data!==this._def.value){const n=this._getOrReturnCtx(t);return Re(n,{received:n.data,code:ge.invalid_literal,expected:this._def.value}),Ze}return{status:"valid",value:t.data}}get value(){return this._def.value}}sm.create=(e,t)=>new sm({value:e,typeName:Ue.ZodLiteral,...Xe(t)});function c3(e,t){return new La({values:e,typeName:Ue.ZodEnum,...Xe(t)})}class La extends rt{_parse(t){if(typeof t.data!="string"){const n=this._getOrReturnCtx(t),r=this._def.values;return Re(n,{expected:mt.joinValues(r),received:n.parsedType,code:ge.invalid_type}),Ze}if(this._def.values.indexOf(t.data)===-1){const n=this._getOrReturnCtx(t),r=this._def.values;return Re(n,{received:n.data,code:ge.invalid_enum_value,options:r}),Ze}return nr(t.data)}get options(){return this._def.values}get enum(){const t={};for(const n of this._def.values)t[n]=n;return t}get Values(){const t={};for(const n of this._def.values)t[n]=n;return t}get Enum(){const t={};for(const n of this._def.values)t[n]=n;return t}extract(t){return La.create(t)}exclude(t){return La.create(this.options.filter(n=>!t.includes(n)))}}La.create=c3;class am extends rt{_parse(t){const n=mt.getValidEnumValues(this._def.values),r=this._getOrReturnCtx(t);if(r.parsedType!==_e.string&&r.parsedType!==_e.number){const o=mt.objectValues(n);return Re(r,{expected:mt.joinValues(o),received:r.parsedType,code:ge.invalid_type}),Ze}if(n.indexOf(t.data)===-1){const o=mt.objectValues(n);return Re(r,{received:r.data,code:ge.invalid_enum_value,options:o}),Ze}return nr(t.data)}get enum(){return this._def.values}}am.create=(e,t)=>new am({values:e,typeName:Ue.ZodNativeEnum,...Xe(t)});class pd extends rt{unwrap(){return this._def.type}_parse(t){const{ctx:n}=this._processInputParams(t);if(n.parsedType!==_e.promise&&n.common.async===!1)return Re(n,{code:ge.invalid_type,expected:_e.promise,received:n.parsedType}),Ze;const r=n.parsedType===_e.promise?n.data:Promise.resolve(n.data);return nr(r.then(o=>this._def.type.parseAsync(o,{path:n.path,errorMap:n.common.contextualErrorMap})))}}pd.create=(e,t)=>new pd({type:e,typeName:Ue.ZodPromise,...Xe(t)});class Mi extends rt{innerType(){return this._def.schema}sourceType(){return 
this._def.schema._def.typeName===Ue.ZodEffects?this._def.schema.sourceType():this._def.schema}_parse(t){const{status:n,ctx:r}=this._processInputParams(t),o=this._def.effect||null,i={addIssue:s=>{Re(r,s),s.fatal?n.abort():n.dirty()},get path(){return r.path}};if(i.addIssue=i.addIssue.bind(i),o.type==="preprocess"){const s=o.transform(r.data,i);return r.common.issues.length?{status:"dirty",value:r.data}:r.common.async?Promise.resolve(s).then(l=>this._def.schema._parseAsync({data:l,path:r.path,parent:r})):this._def.schema._parseSync({data:s,path:r.path,parent:r})}if(o.type==="refinement"){const s=l=>{const u=o.refinement(l,i);if(r.common.async)return Promise.resolve(u);if(u instanceof Promise)throw new Error("Async refinement encountered during synchronous parse operation. Use .parseAsync instead.");return l};if(r.common.async===!1){const l=this._def.schema._parseSync({data:r.data,path:r.path,parent:r});return l.status==="aborted"?Ze:(l.status==="dirty"&&n.dirty(),s(l.value),{status:n.value,value:l.value})}else return this._def.schema._parseAsync({data:r.data,path:r.path,parent:r}).then(l=>l.status==="aborted"?Ze:(l.status==="dirty"&&n.dirty(),s(l.value).then(()=>({status:n.value,value:l.value}))))}if(o.type==="transform")if(r.common.async===!1){const s=this._def.schema._parseSync({data:r.data,path:r.path,parent:r});if(!Jp(s))return s;const l=o.transform(s.value,i);if(l instanceof Promise)throw new Error("Asynchronous transform encountered during synchronous parse operation. Use .parseAsync instead.");return{status:n.value,value:l}}else return this._def.schema._parseAsync({data:r.data,path:r.path,parent:r}).then(s=>Jp(s)?Promise.resolve(o.transform(s.value,i)).then(l=>({status:n.value,value:l})):s);mt.assertNever(o)}}Mi.create=(e,t,n)=>new Mi({schema:e,typeName:Ue.ZodEffects,effect:t,...Xe(n)});Mi.createWithPreprocess=(e,t,n)=>new Mi({schema:t,effect:{type:"preprocess",transform:e},typeName:Ue.ZodEffects,...Xe(n)});class Ts extends rt{_parse(t){return this._getType(t)===_e.undefined?nr(void 0):this._def.innerType._parse(t)}unwrap(){return this._def.innerType}}Ts.create=(e,t)=>new Ts({innerType:e,typeName:Ue.ZodOptional,...Xe(t)});class yc extends rt{_parse(t){return this._getType(t)===_e.null?nr(null):this._def.innerType._parse(t)}unwrap(){return this._def.innerType}}yc.create=(e,t)=>new yc({innerType:e,typeName:Ue.ZodNullable,...Xe(t)});class lm extends rt{_parse(t){const{ctx:n}=this._processInputParams(t);let r=n.data;return n.parsedType===_e.undefined&&(r=this._def.defaultValue()),this._def.innerType._parse({data:r,path:n.path,parent:n})}removeDefault(){return this._def.innerType}}lm.create=(e,t)=>new lm({innerType:e,typeName:Ue.ZodDefault,defaultValue:typeof t.default=="function"?t.default:()=>t.default,...Xe(t)});class F1 extends rt{_parse(t){const{ctx:n}=this._processInputParams(t),r={...n,common:{...n.common,issues:[]}},o=this._def.innerType._parse({data:r.data,path:r.path,parent:{...r}});return A1(o)?o.then(i=>({status:"valid",value:i.status==="valid"?i.value:this._def.catchValue({get error(){return new Xo(r.common.issues)},input:r.data})})):{status:"valid",value:o.status==="valid"?o.value:this._def.catchValue({get error(){return new Xo(r.common.issues)},input:r.data})}}removeCatch(){return this._def.innerType}}F1.create=(e,t)=>new F1({innerType:e,typeName:Ue.ZodCatch,catchValue:typeof t.catch=="function"?t.catch:()=>t.catch,...Xe(t)});class j1 extends rt{_parse(t){if(this._getType(t)!==_e.nan){const r=this._getOrReturnCtx(t);return 
Re(r,{code:ge.invalid_type,expected:_e.nan,received:r.parsedType}),Ze}return{status:"valid",value:t.data}}}j1.create=e=>new j1({typeName:Ue.ZodNaN,...Xe(e)});class hX extends rt{_parse(t){const{ctx:n}=this._processInputParams(t),r=n.data;return this._def.type._parse({data:r,path:n.path,parent:n})}unwrap(){return this._def.type}}class pg extends rt{_parse(t){const{status:n,ctx:r}=this._processInputParams(t);if(r.common.async)return(async()=>{const i=await this._def.in._parseAsync({data:r.data,path:r.path,parent:r});return i.status==="aborted"?Ze:i.status==="dirty"?(n.dirty(),tX(i.value)):this._def.out._parseAsync({data:i.value,path:r.path,parent:r})})();{const o=this._def.in._parseSync({data:r.data,path:r.path,parent:r});return o.status==="aborted"?Ze:o.status==="dirty"?(n.dirty(),{status:"dirty",value:o.value}):this._def.out._parseSync({data:o.value,path:r.path,parent:r})}}static create(t,n){return new pg({in:t,out:n,typeName:Ue.ZodPipeline})}}class B1 extends rt{_parse(t){const n=this._def.innerType._parse(t);return Jp(n)&&(n.value=Object.freeze(n.value)),n}}B1.create=(e,t)=>new B1({innerType:e,typeName:Ue.ZodReadonly,...Xe(t)});Ut.lazycreate;var Ue;(function(e){e.ZodString="ZodString",e.ZodNumber="ZodNumber",e.ZodNaN="ZodNaN",e.ZodBigInt="ZodBigInt",e.ZodBoolean="ZodBoolean",e.ZodDate="ZodDate",e.ZodSymbol="ZodSymbol",e.ZodUndefined="ZodUndefined",e.ZodNull="ZodNull",e.ZodAny="ZodAny",e.ZodUnknown="ZodUnknown",e.ZodNever="ZodNever",e.ZodVoid="ZodVoid",e.ZodArray="ZodArray",e.ZodObject="ZodObject",e.ZodUnion="ZodUnion",e.ZodDiscriminatedUnion="ZodDiscriminatedUnion",e.ZodIntersection="ZodIntersection",e.ZodTuple="ZodTuple",e.ZodRecord="ZodRecord",e.ZodMap="ZodMap",e.ZodSet="ZodSet",e.ZodFunction="ZodFunction",e.ZodLazy="ZodLazy",e.ZodLiteral="ZodLiteral",e.ZodEnum="ZodEnum",e.ZodEffects="ZodEffects",e.ZodNativeEnum="ZodNativeEnum",e.ZodOptional="ZodOptional",e.ZodNullable="ZodNullable",e.ZodDefault="ZodDefault",e.ZodCatch="ZodCatch",e.ZodPromise="ZodPromise",e.ZodBranded="ZodBranded",e.ZodPipeline="ZodPipeline",e.ZodReadonly="ZodReadonly"})(Ue||(Ue={}));const du=xi.create;mc.create;j1.create;gc.create;const fu=M1.create;hd.create;O1.create;em.create;tm.create;N1.create;Ll.create;Is.create;D1.create;Zo.create;const pX=Ut.create;Ut.strictCreate;nm.create;tb.create;rm.create;Ai.create;om.create;L1.create;vc.create;Lu.create;im.create;sm.create;La.create;am.create;pd.create;Mi.create;Ts.create;yc.create;Mi.createWithPreprocess;pg.create;const z1="horizontal",mX=["horizontal","vertical"],u3=d.forwardRef((e,t)=>{const{decorative:n,orientation:r=z1,...o}=e,i=d3(r)?r:z1,l=n?{role:"none"}:{"aria-orientation":i==="vertical"?i:void 0,role:"separator"};return d.createElement(Ae.div,Y({"data-orientation":i},l,o,{ref:t}))});u3.propTypes={orientation(e,t,n){const r=e[t],o=String(r);return r&&!d3(r)?new Error(gX(o,n)):null}};function gX(e,t){return`Invalid prop \`orientation\` of value \`${e}\` supplied to \`${t}\`, expected one of: + - horizontal + - vertical + +Defaulting to \`${z1}\`.`}function d3(e){return mX.includes(e)}const f3=u3,Xt=d.forwardRef(({className:e,orientation:t="horizontal",decorative:n=!0,...r},o)=>v.jsx(f3,{ref:o,decorative:n,orientation:t,className:xe("shrink-0 bg-border",t==="horizontal"?"h-[1px] w-full":"h-full w-[1px]",e),...r}));Xt.displayName=f3.displayName;const vX=d.forwardRef((e,t)=>d.createElement(Ae.label,Y({},e,{ref:t,onMouseDown:n=>{var r;(r=e.onMouseDown)===null||r===void 0||r.call(e,n),!n.defaultPrevented&&n.detail>1&&n.preventDefault()}}))),h3=vX,yX=Fm("text-sm 
font-medium leading-none peer-disabled:cursor-not-allowed peer-disabled:opacity-70"),nb=d.forwardRef(({className:e,disabled:t,...n},r)=>v.jsx(h3,{ref:r,className:xe(yX(),e,t?"opacity-50":"","select-none"),...n}));nb.displayName=h3.displayName;const wX=OY,p3=d.createContext({}),ul=({...e})=>v.jsx(p3.Provider,{value:{name:e.name},children:v.jsx(LY,{...e})}),mg=()=>{const e=d.useContext(p3),t=d.useContext(m3),{getFieldState:n,formState:r}=hg(),o=n(e.name,r);if(!e)throw new Error("useFormField should be used within ");const{id:i}=t;return{id:i,name:e.name,formItemId:`${i}-form-item`,formDescriptionId:`${i}-form-item-description`,formMessageId:`${i}-form-item-message`,...o}},m3=d.createContext({}),sa=d.forwardRef(({className:e,...t},n)=>{const r=d.useId();return v.jsx(m3.Provider,{value:{id:r},children:v.jsx("div",{ref:n,className:xe("space-y-2",e),...t})})});sa.displayName="FormItem";const aa=d.forwardRef(({className:e,...t},n)=>{const{error:r,formItemId:o}=mg();return v.jsx(nb,{ref:n,className:xe(r&&"text-destructive","text-sm",e),htmlFor:o,...t})});aa.displayName="FormLabel";const la=d.forwardRef(({...e},t)=>{const{error:n,formItemId:r,formDescriptionId:o,formMessageId:i}=mg();return v.jsx(Qo,{ref:t,id:r,"aria-describedby":n?`${o} ${i}`:`${o}`,"aria-invalid":!!n,...e})});la.displayName="FormControl";const ca=d.forwardRef(({className:e,...t},n)=>{const{formDescriptionId:r}=mg();return v.jsx("p",{ref:n,id:r,className:xe("text-[0.8rem] text-muted-foreground",e),...t})});ca.displayName="FormDescription";const xX=d.forwardRef(({className:e,children:t,...n},r)=>{const{error:o,formMessageId:i}=mg(),s=o?String(o==null?void 0:o.message):t;return s?v.jsx("p",{ref:r,id:i,className:xe("text-[0.8rem] font-medium text-destructive",e),...n,children:s}):null});xX.displayName="FormMessage";const g3="Switch",[bX,wte]=Tn(g3),[SX,_X]=bX(g3),EX=d.forwardRef((e,t)=>{const{__scopeSwitch:n,name:r,checked:o,defaultChecked:i,required:s,disabled:l,value:u="on",onCheckedChange:f,...m}=e,[p,g]=d.useState(null),y=Ve(t,b=>g(b)),x=d.useRef(!1),S=p?!!p.closest("form"):!0,[E=!1,_]=eo({prop:o,defaultProp:i,onChange:f});return d.createElement(SX,{scope:n,checked:E,disabled:l},d.createElement(Ae.button,Y({type:"button",role:"switch","aria-checked":E,"aria-required":s,"data-state":v3(E),"data-disabled":l?"":void 0,disabled:l,value:u},m,{ref:y,onClick:fe(e.onClick,b=>{_(C=>!C),S&&(x.current=b.isPropagationStopped(),x.current||b.stopPropagation())})})),S&&d.createElement(RX,{control:p,bubbles:!x.current,name:r,value:u,checked:E,required:s,disabled:l,style:{transform:"translateX(-100%)"}}))}),CX="SwitchThumb",$X=d.forwardRef((e,t)=>{const{__scopeSwitch:n,...r}=e,o=_X(CX,n);return d.createElement(Ae.span,Y({"data-state":v3(o.checked),"data-disabled":o.disabled?"":void 0},r,{ref:t}))}),RX=e=>{const{control:t,checked:n,bubbles:r=!0,...o}=e,i=d.useRef(null),s=Kx(n),l=Sx(t);return d.useEffect(()=>{const u=i.current,f=window.HTMLInputElement.prototype,p=Object.getOwnPropertyDescriptor(f,"checked").set;if(s!==n&&p){const g=new Event("click",{bubbles:r});p.call(u,n),u.dispatchEvent(g)}},[s,n,r]),d.createElement("input",Y({type:"checkbox","aria-hidden":!0,defaultChecked:n},o,{tabIndex:-1,ref:i,style:{...e.style,...l,position:"absolute",pointerEvents:"none",opacity:0,margin:0}}))};function v3(e){return e?"checked":"unchecked"}const y3=EX,PX=$X,jr=d.forwardRef(({className:e,...t},n)=>v.jsx(y3,{className:xe("peer inline-flex h-5 w-9 shrink-0 cursor-pointer items-center rounded-full border-2 border-transparent shadow-sm transition-colors 
focus-visible:outline-none focus-visible:ring-2 focus-visible:ring-ring focus-visible:ring-offset-2 focus-visible:ring-offset-background disabled:cursor-not-allowed disabled:opacity-50 data-[state=checked]:bg-primary data-[state=unchecked]:bg-input",e),tabIndex:-1,...t,ref:n,children:v.jsx(PX,{className:xe("pointer-events-none block h-4 w-4 rounded-full bg-background shadow-lg ring-0 transition-transform data-[state=checked]:translate-x-4 data-[state=unchecked]:translate-x-0")})}));jr.displayName=y3.displayName;const TX="AlertDialog",[kX,xte]=Tn(TX,[gT]),Vs=gT(),AX=e=>{const{__scopeAlertDialog:t,...n}=e,r=Vs(t);return d.createElement(Tx,Y({},r,n,{modal:!0}))},MX=e=>{const{__scopeAlertDialog:t,...n}=e,r=Vs(t);return d.createElement(kx,Y({},r,n))},OX=d.forwardRef((e,t)=>{const{__scopeAlertDialog:n,...r}=e,o=Vs(n);return d.createElement(Bd,Y({},o,r,{ref:t}))}),w3="AlertDialogContent",[NX,DX]=kX(w3),IX=d.forwardRef((e,t)=>{const{__scopeAlertDialog:n,children:r,...o}=e,i=Vs(n),s=d.useRef(null),l=Ve(t,s),u=d.useRef(null);return d.createElement(JV,{contentName:w3,titleName:LX,docsSlug:"alert-dialog"},d.createElement(NX,{scope:n,cancelRef:u},d.createElement(zd,Y({role:"alertdialog"},i,o,{ref:l,onOpenAutoFocus:fe(o.onOpenAutoFocus,f=>{var m;f.preventDefault(),(m=u.current)===null||m===void 0||m.focus({preventScroll:!0})}),onPointerDownOutside:f=>f.preventDefault(),onInteractOutside:f=>f.preventDefault()}),d.createElement(fx,null,r),!1)))}),LX="AlertDialogTitle",FX=d.forwardRef((e,t)=>{const{__scopeAlertDialog:n,...r}=e,o=Vs(n);return d.createElement(Ud,Y({},o,r,{ref:t}))}),jX=d.forwardRef((e,t)=>{const{__scopeAlertDialog:n,...r}=e,o=Vs(n);return d.createElement(Vd,Y({},o,r,{ref:t}))}),BX=d.forwardRef((e,t)=>{const{__scopeAlertDialog:n,...r}=e,o=Vs(n);return d.createElement(Ax,Y({},o,r,{ref:t}))}),zX="AlertDialogCancel",UX=d.forwardRef((e,t)=>{const{__scopeAlertDialog:n,...r}=e,{cancelRef:o}=DX(zX,n),i=Vs(n),s=Ve(t,o);return d.createElement(Ax,Y({},i,r,{ref:s}))}),VX=AX,WX=MX,x3=OX,b3=IX,S3=BX,_3=UX,E3=FX,C3=jX,HX=VX,KX=WX,$3=d.forwardRef(({className:e,...t},n)=>v.jsx(x3,{className:xe("fixed inset-0 z-50 bg-background/80 backdrop-blur-sm data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0",e),...t,ref:n}));$3.displayName=x3.displayName;const R3=d.forwardRef(({className:e,...t},n)=>v.jsxs(KX,{children:[v.jsx($3,{}),v.jsx(b3,{ref:n,className:xe("fixed left-[50%] top-[50%] z-50 grid w-full max-w-lg translate-x-[-50%] translate-y-[-50%] gap-4 border bg-background p-6 shadow-lg duration-200 data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[state=closed]:slide-out-to-left-1/2 data-[state=closed]:slide-out-to-top-[48%] data-[state=open]:slide-in-from-left-1/2 data-[state=open]:slide-in-from-top-[48%] sm:rounded-lg",e),...t})]}));R3.displayName=b3.displayName;const P3=({className:e,...t})=>v.jsx("div",{className:xe("flex flex-col space-y-2 text-center sm:text-left",e),...t});P3.displayName="AlertDialogHeader";const GX=d.forwardRef(({className:e,...t},n)=>v.jsx(E3,{ref:n,className:xe("text-lg font-semibold",e),...t}));GX.displayName=E3.displayName;const YX=d.forwardRef(({className:e,...t},n)=>v.jsx(C3,{ref:n,className:xe("text-sm text-muted-foreground",e),...t}));YX.displayName=C3.displayName;const 
XX=d.forwardRef(({className:e,...t},n)=>v.jsx(S3,{ref:n,className:xe(Rx(),e),...t}));XX.displayName=S3.displayName;const ZX=d.forwardRef(({className:e,...t},n)=>v.jsx(_3,{ref:n,className:xe(Rx({variant:"outline"}),"mt-2 sm:mt-0",e),...t}));ZX.displayName=_3.displayName;const qX=pX({enableFileManager:fu(),inputDirectory:du(),outputDirectory:du(),enableDownloadMask:fu(),enableManualInpainting:fu(),enableUploadMask:fu(),enableAutoExtractPrompt:fu(),removeBGModel:du(),realesrganModel:du(),interactiveSegModel:du()}),T3="General",U1="Model",k3="Plugins",QX=[U1,T3,k3];function JX(){const[e,t]=rT(!1),[n,r]=d.useState(U1),[o,i,s,l,u,f]=xt(V=>[V.updateAppState,V.settings,V.updateSettings,V.fileManagerState,V.setModel,V.setServerConfig]),{toast:m}=Id(),[p,g]=d.useState(i.model),[y,x]=d.useState([]),S=y.length>0;d.useEffect(()=>{g(i.model)},[i.model]);const{data:E,status:_,refetch:b}=OL({queryKey:["serverConfig"],queryFn:SP}),C=YY({resolver:JY(qX),defaultValues:{enableDownloadMask:i.enableDownloadMask,enableManualInpainting:i.enableManualInpainting,enableUploadMask:i.enableUploadMask,enableAutoExtractPrompt:i.enableAutoExtractPrompt,inputDirectory:l.inputDirectory,outputDirectory:l.outputDirectory,removeBGModel:E==null?void 0:E.removeBGModel,realesrganModel:E==null?void 0:E.realesrganModel,interactiveSegModel:E==null?void 0:E.interactiveSegModel}});d.useEffect(()=>{E&&(f(E),C.setValue("removeBGModel",E.removeBGModel),C.setValue("realesrganModel",E.realesrganModel),C.setValue("interactiveSegModel",E.interactiveSegModel))},[C,E]);async function R(V){s({enableDownloadMask:V.enableDownloadMask,enableManualInpainting:V.enableManualInpainting,enableUploadMask:V.enableUploadMask,enableAutoExtractPrompt:V.enableAutoExtractPrompt});const J=p.name!==i.model.name,G=(E==null?void 0:E.removeBGModel)!==V.removeBGModel&&A,Z=(E==null?void 0:E.realesrganModel)!==V.realesrganModel&&I,Q=(E==null?void 0:E.interactiveSegModel)!==V.interactiveSegModel&&z;if(J||G||Z||Q){const L=[];if(J&&L.push(`Switching model from ${i.model.name} to ${p.name}`),G&&L.push(`Switching RemoveBG model from ${E==null?void 0:E.removeBGModel} to ${V.removeBGModel}`),Z&&L.push(`Switching RealESRGAN model from ${E==null?void 0:E.realesrganModel} to ${V.realesrganModel}`),Q&&L.push(`Switching ${Oo.InteractiveSeg} model from ${E==null?void 0:E.interactiveSegModel} to ${V.interactiveSegModel}`),x(L),o({disableShortCuts:!0}),J)try{const ue=await Vj(p.name);m({title:`Switch to ${ue.name} success`}),u(p)}catch(ue){m({variant:"destructive",title:`Switch to ${p.name} failed: ${ue}`}),g(i.model)}if(G)try{const ue=await S0(Oo.RemoveBG,V.removeBGModel);if(ue.status!==200)throw new Error(ue.statusText)}catch(ue){m({variant:"destructive",title:`Switch RemoveBG model to ${V.removeBGModel} failed: ${ue}`})}if(Z)try{const ue=await S0(Oo.RealESRGAN,V.realesrganModel);if(ue.status!==200)throw new Error(ue.statusText)}catch(ue){m({variant:"destructive",title:`Switch RealESRGAN model to ${V.realesrganModel} failed: ${ue}`})}if(Q)try{const ue=await S0(Oo.InteractiveSeg,V.interactiveSegModel);if(ue.status!==200)throw new Error(ue.statusText)}catch(ue){m({variant:"destructive",title:`Switch ${Oo.InteractiveSeg} model to ${V.interactiveSegModel} failed: ${ue}`})}x([]),o({disableShortCuts:!1}),b()}}if(Yn("s",()=>{t(),e&&R(C.getValues())},[e,C,p,E]),_!=="success")return v.jsx(v.Fragment,{});const k=E.modelInfos,O=E.plugins,A=O.some(V=>V.name===Oo.RemoveBG),I=O.some(V=>V.name===Oo.RealESRGAN),z=O.some(V=>V.name===Oo.InteractiveSeg);function 
H(V){t(),V||R(C.getValues())}function ie(V){g(V)}function K(V){return k?k.filter(J=>V.includes(J.model_type)).map(J=>v.jsxs("div",{onClick:()=>ie(J),className:"px-2",children:[v.jsx("div",{className:xe([J.name===p.name?"bg-muted":"hover:bg-muted","rounded-md px-2 py-2","cursor-default"]),children:v.jsx("div",{className:"text-base",children:J.name})}),v.jsx(Xt,{className:"my-1"})]},J.name)):v.jsx("div",{children:"Please download model first"})}function te(){let V=ms;for(let J of k)if(p.name===J.name){V=J.model_type,V===uC&&(V=hh),V===dC&&(V=ph);break}return v.jsxs("div",{className:"flex flex-col gap-4 w-[510px]",children:[v.jsxs("div",{className:"flex flex-col gap-4 rounded-md",children:[v.jsx("div",{className:"font-medium",children:"Current Model"}),v.jsx("div",{children:p.name})]}),v.jsx(Xt,{}),v.jsxs("div",{className:"space-y-4 rounded-md",children:[v.jsx("div",{className:"flex gap-1 items-center justify-start",children:v.jsx("div",{className:"font-medium",children:"Available models"})}),v.jsxs(hk,{defaultValue:V,children:[v.jsxs(Hx,{children:[v.jsx(ys,{value:ms,children:"Inpaint"}),v.jsx(ys,{value:hh,children:"Stable Diffusion"}),v.jsx(ys,{value:ph,children:"Stable Diffusion Inpaint"}),v.jsx(ys,{value:g0,children:"Other Diffusion"})]}),v.jsxs(fg,{className:"h-[240px] w-full mt-2 outline-none border rounded-lg",children:[v.jsx(Su,{value:ms,children:K([ms])}),v.jsx(Su,{value:hh,children:K([hh,uC])}),v.jsx(Su,{value:ph,children:K([ph,dC])}),v.jsx(Su,{value:g0,children:K([g0])})]})]})]})]})}function U(){return v.jsxs("div",{className:"space-y-4 w-[510px]",children:[v.jsx(ul,{control:C.control,name:"enableManualInpainting",render:({field:V})=>v.jsxs(sa,{className:"flex items-center justify-between",children:[v.jsxs("div",{className:"space-y-0.5",children:[v.jsx(aa,{children:"Enable manual inpainting"}),v.jsx(ca,{children:"For erase model, click a button to trigger inpainting after draw mask."})]}),v.jsx(la,{children:v.jsx(jr,{checked:V.value,onCheckedChange:V.onChange})})]})}),v.jsx(Xt,{}),v.jsx(ul,{control:C.control,name:"enableDownloadMask",render:({field:V})=>v.jsxs(sa,{className:"flex items-center justify-between",children:[v.jsxs("div",{className:"space-y-0.5",children:[v.jsx(aa,{children:"Enable download mask"}),v.jsx(ca,{children:"Also download the mask after save the inpainting result."})]}),v.jsx(la,{children:v.jsx(jr,{checked:V.value,onCheckedChange:V.onChange})})]})}),v.jsx(Xt,{}),v.jsx(ul,{control:C.control,name:"enableAutoExtractPrompt",render:({field:V})=>v.jsxs(sa,{className:"flex items-center justify-between",children:[v.jsxs("div",{className:"space-y-0.5",children:[v.jsx(aa,{children:"Enable auto extract prompt"}),v.jsx(ca,{children:"Automatically extract prompt/negativate prompt from the image meta."})]}),v.jsx(la,{children:v.jsx(jr,{checked:V.value,onCheckedChange:V.onChange})})]})})]})}function re(){return v.jsxs("div",{className:"space-y-4 w-[510px]",children:[v.jsx(ul,{control:C.control,name:"removeBGModel",render:({field:V})=>v.jsxs(sa,{className:"flex items-center justify-between",children:[v.jsxs("div",{className:"space-y-0.5",children:[v.jsx(aa,{children:"Remove Background"}),v.jsx(ca,{children:"Remove background model"})]}),v.jsxs(yo,{onValueChange:V.onChange,defaultValue:V.value,disabled:!A,children:[v.jsx(la,{children:v.jsx(Vr,{className:"w-auto",children:v.jsx(wo,{placeholder:"Select removebg model"})})}),v.jsx(Wr,{align:"end",children:v.jsx(zo,{children:E==null?void 
0:E.removeBGModels.map(J=>v.jsx(Hr,{value:J,children:J},J))})})]})]})}),v.jsx(Xt,{}),v.jsx(ul,{control:C.control,name:"realesrganModel",render:({field:V})=>v.jsxs(sa,{className:"flex items-center justify-between",children:[v.jsxs("div",{className:"space-y-0.5",children:[v.jsx(aa,{children:"RealESRGAN"}),v.jsx(ca,{children:"RealESRGAN Model"})]}),v.jsxs(yo,{onValueChange:V.onChange,defaultValue:V.value,disabled:!I,children:[v.jsx(la,{children:v.jsx(Vr,{className:"w-auto",children:v.jsx(wo,{placeholder:"Select RealESRGAN model"})})}),v.jsx(Wr,{align:"end",children:v.jsx(zo,{children:E==null?void 0:E.realesrganModels.map(J=>v.jsx(Hr,{value:J,children:J},J))})})]})]})}),v.jsx(Xt,{}),v.jsx(ul,{control:C.control,name:"interactiveSegModel",render:({field:V})=>v.jsxs(sa,{className:"flex items-center justify-between",children:[v.jsxs("div",{className:"space-y-0.5",children:[v.jsx(aa,{children:"Interactive Segmentation"}),v.jsx(ca,{children:"Interactive Segmentation Model"})]}),v.jsxs(yo,{onValueChange:V.onChange,defaultValue:V.value,disabled:!z,children:[v.jsx(la,{children:v.jsx(Vr,{className:"w-auto",children:v.jsx(wo,{placeholder:"Select interactive segmentation model"})})}),v.jsx(Wr,{align:"end",children:v.jsx(zo,{children:E==null?void 0:E.interactiveSegModels.map(J=>v.jsx(Hr,{value:J,children:J},J))})})]})]})})]})}return v.jsxs(v.Fragment,{children:[v.jsx(HX,{open:S,children:v.jsx(R3,{children:v.jsx(P3,{children:v.jsxs("div",{className:"flex flex-col justify-center items-center gap-4",children:[v.jsxs("div",{role:"status",children:[v.jsxs("svg",{"aria-hidden":"true",className:"w-8 h-8 text-gray-200 animate-spin dark:text-gray-600 fill-primary",viewBox:"0 0 100 101",fill:"none",xmlns:"http://www.w3.org/2000/svg",children:[v.jsx("path",{d:"M100 50.5908C100 78.2051 77.6142 100.591 50 100.591C22.3858 100.591 0 78.2051 0 50.5908C0 22.9766 22.3858 0.59082 50 0.59082C77.6142 0.59082 100 22.9766 100 50.5908ZM9.08144 50.5908C9.08144 73.1895 27.4013 91.5094 50 91.5094C72.5987 91.5094 90.9186 73.1895 90.9186 50.5908C90.9186 27.9921 72.5987 9.67226 50 9.67226C27.4013 9.67226 9.08144 27.9921 9.08144 50.5908Z",fill:"currentColor"}),v.jsx("path",{d:"M93.9676 39.0409C96.393 38.4038 97.8624 35.9116 97.0079 33.5539C95.2932 28.8227 92.871 24.3692 89.8167 20.348C85.8452 15.1192 80.8826 10.7238 75.2124 7.41289C69.5422 4.10194 63.2754 1.94025 56.7698 1.05124C51.7666 0.367541 46.6976 0.446843 41.7345 1.27873C39.2613 1.69328 37.813 4.19778 38.4501 6.62326C39.0873 9.04874 41.5694 10.4717 44.0505 10.1071C47.8511 9.54855 51.7191 9.52689 55.5402 10.0491C60.8642 10.7766 65.9928 12.5457 70.6331 15.2552C75.2735 17.9648 79.3347 21.5619 82.5849 25.841C84.9175 28.9121 86.7997 32.2913 88.1811 35.8758C89.083 38.2158 91.5421 39.6781 93.9676 39.0409Z",fill:"currentFill"})]}),v.jsx("span",{className:"sr-only",children:"Loading..."})]}),y?v.jsx("div",{className:"flex flex-col",children:y.map((V,J)=>v.jsx("div",{children:V},J))}):v.jsx(v.Fragment,{})]})})})}),v.jsxs(Mx,{open:e,onOpenChange:H,children:[v.jsx(_T,{asChild:!0,children:v.jsx(Zn,{tooltip:"Settings",children:v.jsx(vH,{})})}),v.jsxs(rg,{className:"max-w-3xl h-[600px]",onOpenAutoFocus:V=>V.preventDefault(),children:[v.jsx(og,{children:"Settings"}),v.jsx(Xt,{}),v.jsxs("div",{className:"flex flex-row space-x-8 h-full",children:[v.jsx("div",{className:"flex flex-col space-y-1",children:QX.map(V=>v.jsx(vn,{variant:"ghost",onClick:()=>r(V),className:xe(n===V?"bg-muted 
":"hover:bg-muted","justify-start"),children:V},V))}),v.jsx(Xt,{orientation:"vertical"}),v.jsx(wX,{...C,children:v.jsx("div",{className:"flex w-full justify-center",children:v.jsxs("form",{onSubmit:C.handleSubmit(R),children:[n===U1?te():v.jsx(v.Fragment,{}),n===T3?U():v.jsx(v.Fragment,{}),n===k3?re():v.jsx(v.Fragment,{}),v.jsx("div",{className:"absolute right-10 bottom-6",children:v.jsx(vn,{onClick:()=>H(!1),children:"Ok"})})]})})})]})]})]})]})}const eZ=()=>{const[e,t,n,r,o,i,s,l,u,f,m,p,g,y,x]=xt(k=>[k.file,k.customMask,k.isInpainting,k.serverConfig,k.runMannually(),k.settings.enableUploadMask,k.settings.model,k.setFile,k.setCustomFile,k.runInpainting,k.showPrevMask,k.hidePrevMask,k.imageHeight,k.imageWidth,k.handleFileManagerMaskSelect]),{toast:S}=Id(),[E,_]=Nx(t),[b,C]=d.useState(!1),R=async(k,O)=>{try{if(k===Hk){const A=await Kj(k,O);x(A)}else{const A=await Hj(k,O);l(A)}}catch(A){S({variant:"destructive",description:A.message?A.message:A.toString()});return}};return v.jsxs("header",{className:"h-[60px] px-6 py-4 absolute top-[0] flex justify-between items-center w-full z-20 border-b backdrop-filter backdrop-blur-md bg-background/70",children:[v.jsxs("div",{className:"flex items-center gap-1",children:[r.enableFileManager?v.jsx(kY,{photoWidth:512,onPhotoClick:R}):v.jsx(v.Fragment,{}),v.jsxs("div",{className:xe(["flex items-center gap-1",e&&i?"visible":"hidden"]),children:[v.jsx(nT,{disabled:n,tooltip:"Upload custom mask",onFileUpload:async k=>{let O=null;try{O=await yF(k)}catch(A){S({variant:"destructive",description:A.message?A.message:A.toString()});return}if(O.naturalHeight!==g||O.naturalWidth!==y){S({variant:"destructive",description:`The size of the mask must same as image: ${y}x${g}`});return}u(k),o||f()},children:v.jsx(BT,{})}),t?v.jsxs(MW,{open:b,children:[v.jsx(OW,{className:"btn-primary side-panel-trigger",onMouseEnter:()=>C(!0),onMouseLeave:()=>C(!1),style:{visibility:t?"visible":"hidden",outline:"none"},onClick:()=>{},children:v.jsx(Zn,{tooltip:"Run custom mask",children:v.jsx(hB,{})})}),v.jsx(LT,{children:_?v.jsx("img",{src:E.src,alt:"Custom mask"}):v.jsx(v.Fragment,{})})]}):v.jsx(v.Fragment,{})]})]}),s.need_prompt?v.jsx(oH,{}):v.jsx(v.Fragment,{}),v.jsxs("div",{className:"flex gap-1",children:[v.jsx(yW,{}),r.disableModelSwitch?v.jsx(v.Fragment,{}):v.jsx(JX,{})]})]})};var Yr=function(e,t){return Number(e.toFixed(t))},tZ=function(e,t){return typeof e=="number"?e:t},Ot=function(e,t,n){n&&typeof n=="function"&&n(e,t)},nZ=function(e){return-Math.cos(e*Math.PI)/2+.5},rZ=function(e){return e},oZ=function(e){return e*e},iZ=function(e){return e*(2-e)},sZ=function(e){return e<.5?2*e*e:-1+(4-2*e)*e},aZ=function(e){return e*e*e},lZ=function(e){return--e*e*e+1},cZ=function(e){return e<.5?4*e*e*e:(e-1)*(2*e-2)*(2*e-2)+1},uZ=function(e){return e*e*e*e},dZ=function(e){return 1- --e*e*e*e},fZ=function(e){return e<.5?8*e*e*e*e:1-8*--e*e*e*e},hZ=function(e){return e*e*e*e*e},pZ=function(e){return 1+--e*e*e*e*e},mZ=function(e){return e<.5?16*e*e*e*e*e:1+16*--e*e*e*e*e},A3={easeOut:nZ,linear:rZ,easeInQuad:oZ,easeOutQuad:iZ,easeInOutQuad:sZ,easeInCubic:aZ,easeOutCubic:lZ,easeInOutCubic:cZ,easeInQuart:uZ,easeOutQuart:dZ,easeInOutQuart:fZ,easeInQuint:hZ,easeOutQuint:pZ,easeInOutQuint:mZ},M3=function(e){typeof e=="number"&&cancelAnimationFrame(e)},Vo=function(e){e.mounted&&(M3(e.animation),e.animate=!1,e.animation=null,e.velocity=null)};function O3(e,t,n,r){if(e.mounted){var o=new Date().getTime(),i=1;Vo(e),e.animation=function(){if(!e.mounted)return M3(e.animation);var s=new 
Date().getTime()-o,l=s/n,u=A3[t],f=u(l);s>=n?(r(i),e.animation=null):e.animation&&(r(f),requestAnimationFrame(e.animation))},requestAnimationFrame(e.animation)}}function gZ(e){var t=e.scale,n=e.positionX,r=e.positionY;return!(Number.isNaN(t)||Number.isNaN(n)||Number.isNaN(r))}function Ws(e,t,n,r){var o=gZ(t);if(!(!e.mounted||!o)){var i=e.setTransformState,s=e.transformState,l=s.scale,u=s.positionX,f=s.positionY,m=t.scale-l,p=t.positionX-u,g=t.positionY-f;n===0?i(t.scale,t.positionX,t.positionY):O3(e,r,n,function(y){var x=l+m*y,S=u+p*y,E=f+g*y;i(x,S,E)})}}function vZ(e,t,n){var r=e.offsetWidth,o=e.offsetHeight,i=t.offsetWidth,s=t.offsetHeight,l=i*n,u=s*n,f=r-l,m=o-u;return{wrapperWidth:r,wrapperHeight:o,newContentWidth:l,newDiffWidth:f,newContentHeight:u,newDiffHeight:m}}var yZ=function(e,t,n,r,o,i,s){var l=e>t?n*(s?1:.5):0,u=r>o?i*(s?1:.5):0,f=e-t-l,m=l,p=r-o-u,g=u;return{minPositionX:f,maxPositionX:m,minPositionY:p,maxPositionY:g}},rb=function(e,t){var n=e.wrapperComponent,r=e.contentComponent,o=e.setup.centerZoomedOut;if(!n||!r)throw new Error("Components are not mounted");var i=vZ(n,r,t),s=i.wrapperWidth,l=i.wrapperHeight,u=i.newContentWidth,f=i.newDiffWidth,m=i.newContentHeight,p=i.newDiffHeight,g=yZ(s,u,f,l,m,p,!!o);return g},V1=function(e,t,n,r){return r?en?Yr(n,2):Yr(e,2):Yr(e,2)},wc=function(e,t){var n=rb(e,t);return e.bounds=n,n};function gg(e,t,n,r,o,i,s){var l=n.minPositionX,u=n.minPositionY,f=n.maxPositionX,m=n.maxPositionY,p=0,g=0;s&&(p=o,g=i);var y=V1(e,l-p,f+p,r),x=V1(t,u-g,m+g,r);return{x:y,y:x}}function vg(e,t,n,r,o,i){var s=e.transformState,l=s.scale,u=s.positionX,f=s.positionY,m=r-l;if(typeof t!="number"||typeof n!="number")return console.error("Mouse X and Y position were not provided!"),{x:u,y:f};var p=u-t*m,g=f-n*m,y=gg(p,g,o,i,0,0,null);return y}function Gd(e,t,n,r,o){var i=o?r:0,s=t-i;return!Number.isNaN(n)&&e>=n?n:!Number.isNaN(t)&&e<=s?s:e}var j2=function(e,t){var n=e.setup.panning.excluded,r=e.isInitialized,o=e.wrapperComponent,i=t.target,s=o==null?void 0:o.contains(i),l=r&&i&&s;if(!l)return!1;var u=yg(i,n);return!u},B2=function(e){var t=e.isInitialized,n=e.isPanning,r=e.setup,o=r.panning.disabled,i=t&&n&&!o;return!!i},wZ=function(e,t){var n=e.transformState,r=n.positionX,o=n.positionY;e.isPanning=!0;var i=t.clientX,s=t.clientY;e.startCoords={x:i-r,y:s-o}},xZ=function(e,t){var n=t.touches,r=e.transformState,o=r.positionX,i=r.positionY;e.isPanning=!0;var s=n.length===1;if(s){var l=n[0].clientX,u=n[0].clientY;e.startCoords={x:l-o,y:u-i}}};function bZ(e){var t=e.transformState,n=t.positionX,r=t.positionY,o=t.scale,i=e.setup,s=i.disabled,l=i.limitToBounds,u=i.centerZoomedOut,f=e.wrapperComponent;if(!(s||!f||!e.bounds)){var m=e.bounds,p=m.maxPositionX,g=m.minPositionX,y=m.maxPositionY,x=m.minPositionY,S=n>p||ny||rp?f.offsetWidth:e.setup.minPositionX||0,b=r>y?f.offsetHeight:e.setup.minPositionY||0,C=vg(e,_,b,o,e.bounds,l||u),R=C.x,k=C.y;return{scale:o,positionX:S?R:n,positionY:E?k:r}}}function SZ(e,t,n,r,o){var i=e.setup.limitToBounds,s=e.wrapperComponent,l=e.bounds,u=e.transformState,f=u.scale,m=u.positionX,p=u.positionY;if(!(s===null||l===null||t===m&&n===p)){var g=gg(t,n,l,i,r,o,s),y=g.x,x=g.y;e.setTransformState(f,y,x)}}var _Z=function(e,t,n){var r=e.startCoords,o=e.transformState,i=e.setup.panning,s=i.lockAxisX,l=i.lockAxisY,u=o.positionX,f=o.positionY;if(!r)return{x:u,y:f};var m=t-r.x,p=n-r.y,g=s?u:m,y=l?f:p;return{x:g,y}},cm=function(e,t){var n=e.setup,r=e.transformState,o=r.scale,i=n.minScale,s=n.disablePadding;return t>0&&o>=i&&!s?t:0},EZ=function(e){var 
t=e.mounted,n=e.setup,r=n.disabled,o=n.velocityAnimation,i=e.transformState.scale,s=o.disabled,l=!s||i>1||!r||t;return!!l},CZ=function(e){var t=e.mounted,n=e.velocity,r=e.bounds,o=e.setup,i=o.disabled,s=o.velocityAnimation,l=e.transformState.scale,u=s.disabled,f=!u||l>1||!i||t;return!(!f||!n||!r)};function $Z(e,t){var n=e.setup.velocityAnimation,r=n.equalToMove,o=n.animationTime,i=n.sensitivity;return r?o*t*i:o}function z2(e,t,n,r,o,i,s,l,u,f){if(o){if(t>s&&n>s){var m=s+(e-s)*f;return m>u?u:mi?i:m}}return r?t:V1(e,i,s,o)}function RZ(e,t){var n=1;return t?Math.min(n,e.offsetWidth/window.innerWidth):n}function PZ(e,t){var n=EZ(e);if(n){var r=e.lastMousePosition,o=e.velocityTime,i=e.setup,s=e.wrapperComponent,l=i.velocityAnimation.equalToMove,u=Date.now();if(r&&o&&s){var f=RZ(s,l),m=t.x-r.x,p=t.y-r.y,g=m/f,y=p/f,x=u-o,S=m*m+p*p,E=Math.sqrt(S)/x;e.velocity={velocityX:g,velocityY:y,total:E}}e.lastMousePosition=t,e.velocityTime=u}}function TZ(e){var t=e.velocity,n=e.bounds,r=e.setup,o=e.wrapperComponent,i=CZ(e);if(!(!i||!t||!n||!o)){var s=t.velocityX,l=t.velocityY,u=t.total,f=n.maxPositionX,m=n.minPositionX,p=n.maxPositionY,g=n.minPositionY,y=r.limitToBounds,x=r.alignmentAnimation,S=r.zoomAnimation,E=r.panning,_=E.lockAxisY,b=E.lockAxisX,C=S.animationType,R=x.sizeX,k=x.sizeY,O=x.velocityAlignmentTime,A=O,I=$Z(e,u),z=Math.max(I,A),H=cm(e,R),ie=cm(e,k),K=H*o.offsetWidth/100,te=ie*o.offsetHeight/100,U=f+K,re=m-K,V=p+te,J=g-te,G=e.transformState,Z=new Date().getTime();O3(e,C,z,function(Q){var le=e.transformState,L=le.scale,ue=le.positionX,Ne=le.positionY,Ke=new Date().getTime()-Z,Me=Ke/A,me=A3[x.animationType],be=1-me(Math.min(1,Me)),Ee=1-Q,Oe=ue+s*Ee,Ie=Ne+l*Ee,ze=z2(Oe,G.positionX,ue,b,y,m,f,re,U,be),ht=z2(Ie,G.positionY,Ne,_,y,g,p,J,V,be);(ue!==Oe||Ne!==Ie)&&e.setTransformState(L,ze,ht)})}}function U2(e,t){var n=e.transformState.scale;Vo(e),wc(e,n),window.TouchEvent!==void 0&&t instanceof TouchEvent?xZ(e,t):wZ(e,t)}function N3(e){var t=e.transformState.scale,n=e.setup,r=n.minScale,o=n.alignmentAnimation,i=o.disabled,s=o.sizeX,l=o.sizeY,u=o.animationTime,f=o.animationType,m=i||t.1&&p;g?TZ(e):N3(e)}}function ob(e,t,n,r){var o=e.setup,i=o.minScale,s=o.maxScale,l=o.limitToBounds,u=Gd(Yr(t,2),i,s,0,!1),f=wc(e,u),m=vg(e,n,r,u,f,l),p=m.x,g=m.y;return{scale:u,positionX:p,positionY:g}}function D3(e,t,n){var r=e.transformState.scale,o=e.wrapperComponent,i=e.setup,s=i.minScale,l=i.limitToBounds,u=i.zoomAnimation,f=u.disabled,m=u.animationTime,p=u.animationType,g=f||r>=s;if((r>=1||l)&&N3(e),!(g||!o||!e.mounted)){var y=t||o.offsetWidth/2,x=n||o.offsetHeight/2,S=ob(e,s,y,x);S&&Ws(e,S,m,p)}}var ks=function(){return ks=Object.assign||function(t){for(var n,r=1,o=arguments.length;rs||Math.sign(n.deltaY)!==Math.sign(t.deltaY)||n.deltaY>0&&n.deltaYt.deltaY||Math.sign(n.deltaY)!==Math.sign(t.deltaY):!1},GZ=function(e,t){var n=e.setup.pinch,r=n.disabled,o=n.excluded,i=e.isInitialized,s=t.target,l=i&&!r&&s;if(!l)return!1;var u=yg(s,o);return!u},YZ=function(e){var t=e.setup.pinch.disabled,n=e.isInitialized,r=e.pinchStartDistance,o=n&&!t&&r;return!!o},XZ=function(e,t,n){var r=n.getBoundingClientRect(),o=e.touches,i=Yr(o[0].clientX-r.left,5),s=Yr(o[0].clientY-r.top,5),l=Yr(o[1].clientX-r.left,5),u=Yr(o[1].clientY-r.top,5);return{x:(i+l)/2/t,y:(s+u)/2/t}},U3=function(e){return Math.sqrt(Math.pow(e.touches[0].pageX-e.touches[1].pageX,2)+Math.pow(e.touches[0].pageY-e.touches[1].pageY,2))},ZZ=function(e,t){var 
n=e.pinchStartScale,r=e.pinchStartDistance,o=e.setup,i=o.maxScale,s=o.minScale,l=o.zoomAnimation,u=o.disablePadding,f=l.size,m=l.disabled;if(!n||r===null||!t)throw new Error("Pinch touches distance was not provided");if(t<0)return e.transformState.scale;var p=t/r,g=p*n;return Gd(Yr(g,2),s,i,f,!m&&!u)},qZ=160,QZ=100,JZ=function(e,t){var n=e.props,r=n.onWheelStart,o=n.onZoomStart;e.wheelStopEventTimer||(Vo(e),Ot(Et(e),t,r),Ot(Et(e),t,o))},eq=function(e,t){var n=e.props,r=n.onWheel,o=n.onZoom,i=e.contentComponent,s=e.setup,l=e.transformState,u=l.scale,f=s.limitToBounds,m=s.centerZoomedOut,p=s.zoomAnimation,g=s.wheel,y=s.disablePadding,x=s.smooth,S=p.size,E=p.disabled,_=g.step,b=g.smoothStep;if(!i)throw new Error("Component not mounted");t.preventDefault(),t.stopPropagation();var C=WZ(t,null),R=x?b*Math.abs(t.deltaY):_,k=HZ(e,C,R,!t.ctrlKey);if(u!==k){var O=wc(e,k),A=z3(t,i,u),I=E||S===0||m||y,z=f&&I,H=vg(e,A.x,A.y,k,O,z),ie=H.x,K=H.y;e.previousWheelEvent=t,e.setTransformState(k,ie,K),Ot(Et(e),t,r),Ot(Et(e),t,o)}},tq=function(e,t){var n=e.props,r=n.onWheelStop,o=n.onZoomStop;H1(e.wheelAnimationTimer),e.wheelAnimationTimer=setTimeout(function(){e.mounted&&(D3(e,t.x,t.y),e.wheelAnimationTimer=null)},QZ);var i=KZ(e,t);i&&(H1(e.wheelStopEventTimer),e.wheelStopEventTimer=setTimeout(function(){e.mounted&&(e.wheelStopEventTimer=null,Ot(Et(e),t,r),Ot(Et(e),t,o))},qZ))},nq=function(e,t){var n=U3(t);e.pinchStartDistance=n,e.lastDistance=n,e.pinchStartScale=e.transformState.scale,e.isPanning=!1,Vo(e)},rq=function(e,t){var n=e.contentComponent,r=e.pinchStartDistance,o=e.transformState.scale,i=e.setup,s=i.limitToBounds,l=i.centerZoomedOut,u=i.zoomAnimation,f=u.disabled,m=u.size;if(!(r===null||!n)){var p=XZ(t,o,n);if(!(!Number.isFinite(p.x)||!Number.isFinite(p.y))){var g=U3(t),y=ZZ(e,g);if(y!==o){var x=wc(e,y),S=f||m===0||l,E=s&&S,_=vg(e,p.x,p.y,y,x,E),b=_.x,C=_.y;e.pinchMidpoint=p,e.lastDistance=g,e.setTransformState(y,b,C)}}}},oq=function(e){var t=e.pinchMidpoint;e.velocity=null,e.lastDistance=null,e.pinchMidpoint=null,e.pinchStartScale=null,e.pinchStartDistance=null,D3(e,t==null?void 0:t.x,t==null?void 0:t.y)},V3=function(e,t){var n=e.props.onZoomStop,r=e.setup.doubleClick.animationTime;H1(e.doubleClickStopEventTimer),e.doubleClickStopEventTimer=setTimeout(function(){e.doubleClickStopEventTimer=null,Ot(Et(e),t,n)},r)},iq=function(e,t){var n=e.props,r=n.onZoomStart,o=n.onZoom,i=e.setup.doubleClick,s=i.animationTime,l=i.animationType;Ot(Et(e),t,r),j3(e,s,l,function(){return Ot(Et(e),t,o)}),V3(e,t)};function sq(e,t){var n=e.setup,r=e.doubleClickStopEventTimer,o=e.transformState,i=e.contentComponent,s=o.scale,l=e.props,u=l.onZoomStart,f=l.onZoom,m=n.doubleClick,p=m.disabled,g=m.mode,y=m.step,x=m.animationTime,S=m.animationType;if(!p&&!r){if(g==="reset")return iq(e,t);if(!i)return console.error("No ContentComponent found");var E=g==="zoomOut"?-1:1,_=L3(e,E,y);if(s!==_){Ot(Et(e),t,u);var b=z3(t,i,s),C=ob(e,_,b.x,b.y);if(!C)return console.error("Error during zoom event. 
New transformation state was not calculated.");Ot(Et(e),t,f),Ws(e,C,x,S),V3(e,t)}}}var aq=function(e,t){var n=e.isInitialized,r=e.setup,o=e.wrapperComponent,i=r.doubleClick,s=i.disabled,l=i.excluded,u=t.target,f=o==null?void 0:o.contains(u),m=n&&u&&f&&!s;if(!m)return!1;var p=yg(u,l);return!p},lq=function(){function e(t){var n=this;this.mounted=!0,this.onChangeCallbacks=new Set,this.onInitCallbacks=new Set,this.wrapperComponent=null,this.contentComponent=null,this.isInitialized=!1,this.bounds=null,this.previousWheelEvent=null,this.wheelStopEventTimer=null,this.wheelAnimationTimer=null,this.isPanning=!1,this.startCoords=null,this.lastTouch=null,this.distance=null,this.lastDistance=null,this.pinchStartDistance=null,this.pinchStartScale=null,this.pinchMidpoint=null,this.doubleClickStopEventTimer=null,this.velocity=null,this.velocityTime=null,this.lastMousePosition=null,this.animate=!1,this.animation=null,this.maxBounds=null,this.pressedKeys={},this.mount=function(){n.initializeWindowEvents()},this.unmount=function(){n.cleanupWindowEvents()},this.update=function(r){wc(n,n.transformState.scale),n.setup=H2(r)},this.initializeWindowEvents=function(){var r,o=H0(),i=(r=n.wrapperComponent)===null||r===void 0?void 0:r.ownerDocument,s=i==null?void 0:i.defaultView;s==null||s.addEventListener("mousedown",n.onPanningStart,o),s==null||s.addEventListener("mousemove",n.onPanning,o),s==null||s.addEventListener("mouseup",n.onPanningStop,o),i==null||i.addEventListener("mouseleave",n.clearPanning,o),s==null||s.addEventListener("keyup",n.setKeyUnPressed,o),s==null||s.addEventListener("keydown",n.setKeyPressed,o)},this.cleanupWindowEvents=function(){var r,o,i=H0(),s=(r=n.wrapperComponent)===null||r===void 0?void 0:r.ownerDocument,l=s==null?void 0:s.defaultView;l==null||l.removeEventListener("mousedown",n.onPanningStart,i),l==null||l.removeEventListener("mousemove",n.onPanning,i),l==null||l.removeEventListener("mouseup",n.onPanningStop,i),s==null||s.removeEventListener("mouseleave",n.clearPanning,i),l==null||l.removeEventListener("keyup",n.setKeyUnPressed,i),l==null||l.removeEventListener("keydown",n.setKeyPressed,i),document.removeEventListener("mouseleave",n.clearPanning,i),Vo(n),(o=n.observer)===null||o===void 0||o.disconnect()},this.handleInitializeWrapperEvents=function(r){var o=H0();r.addEventListener("wheel",n.onWheelZoom,o),r.addEventListener("dblclick",n.onDoubleClick,o),r.addEventListener("touchstart",n.onTouchPanningStart,o),r.addEventListener("touchmove",n.onTouchPanning,o),r.addEventListener("touchend",n.onTouchPanningStop,o)},this.handleInitialize=function(r){var o=n.setup.centerOnInit;n.applyTransformation(),n.onInitCallbacks.forEach(function(i){return i(Et(n))}),o&&(n.setCenter(),n.observer=new ResizeObserver(function(){var i;n.onInitCallbacks.forEach(function(s){return s(Et(n))}),n.setCenter(),(i=n.observer)===null||i===void 0||i.disconnect()}),n.observer.observe(r))},this.onWheelZoom=function(r){var o=n.setup.disabled;if(!o){var i=UZ(n,r);if(i){var s=n.isPressingKeys(n.setup.wheel.activationKeys);s&&(JZ(n,r),eq(n,r),tq(n,r))}}},this.onPanningStart=function(r){var o=n.setup.disabled,i=n.props.onPanningStart;if(!o){var s=j2(n,r);if(s){var l=n.isPressingKeys(n.setup.panning.activationKeys);l&&(r.button===0&&!n.setup.panning.allowLeftClickPan||r.button===1&&!n.setup.panning.allowMiddleClickPan||r.button===2&&!n.setup.panning.allowRightClickPan||(r.preventDefault(),r.stopPropagation(),Vo(n),U2(n,r),Ot(Et(n),r,i)))}}},this.onPanning=function(r){var o=n.setup.disabled,i=n.props.onPanning;if(!o){var 
s=B2(n);if(s){var l=n.isPressingKeys(n.setup.panning.activationKeys);l&&(r.preventDefault(),r.stopPropagation(),V2(n,r.clientX,r.clientY),Ot(Et(n),r,i))}}},this.onPanningStop=function(r){var o=n.props.onPanningStop;n.isPanning&&(kZ(n),Ot(Et(n),r,o))},this.onPinchStart=function(r){var o=n.setup.disabled,i=n.props,s=i.onPinchingStart,l=i.onZoomStart;if(!o){var u=GZ(n,r);u&&(nq(n,r),Vo(n),Ot(Et(n),r,s),Ot(Et(n),r,l))}},this.onPinch=function(r){var o=n.setup.disabled,i=n.props,s=i.onPinching,l=i.onZoom;if(!o){var u=YZ(n);u&&(r.preventDefault(),r.stopPropagation(),rq(n,r),Ot(Et(n),r,s),Ot(Et(n),r,l))}},this.onPinchStop=function(r){var o=n.props,i=o.onPinchingStop,s=o.onZoomStop;n.pinchStartScale&&(oq(n),Ot(Et(n),r,i),Ot(Et(n),r,s))},this.onTouchPanningStart=function(r){var o=n.setup.disabled,i=n.props.onPanningStart;if(!o){var s=j2(n,r);if(s){var l=n.lastTouch&&+new Date-n.lastTouch<200;if(l&&r.touches.length===1)n.onDoubleClick(r);else{n.lastTouch=+new Date,Vo(n);var u=r.touches,f=u.length===1,m=u.length===2;f&&(Vo(n),U2(n,r),Ot(Et(n),r,i)),m&&n.onPinchStart(r)}}}},this.onTouchPanning=function(r){var o=n.setup.disabled,i=n.props.onPanning;if(n.isPanning&&r.touches.length===1){if(o)return;var s=B2(n);if(!s)return;r.preventDefault(),r.stopPropagation();var l=r.touches[0];V2(n,l.clientX,l.clientY),Ot(Et(n),r,i)}else r.touches.length>1&&n.onPinch(r)},this.onTouchPanningStop=function(r){n.onPanningStop(r),n.onPinchStop(r)},this.onDoubleClick=function(r){var o=n.setup.disabled;if(!o){var i=aq(n,r);i&&sq(n,r)}},this.clearPanning=function(r){n.isPanning&&n.onPanningStop(r)},this.setKeyPressed=function(r){n.pressedKeys[r.key]=!0},this.setKeyUnPressed=function(r){n.pressedKeys[r.key]=!1},this.isPressingKeys=function(r){return r.length?!!r.find(function(o){return n.pressedKeys[o]}):!0},this.setTransformState=function(r,o,i){var s=n.props.onTransformed;if(!Number.isNaN(r)&&!Number.isNaN(o)&&!Number.isNaN(i)){r!==n.transformState.scale&&(n.transformState.previousScale=n.transformState.scale,n.transformState.scale=r),n.transformState.positionX=o,n.transformState.positionY=i,n.applyTransformation();var l=Et(n);n.onChangeCallbacks.forEach(function(u){return u(l)}),Ot(l,{scale:r,positionX:o,positionY:i},s)}else console.error("Detected NaN set state values")},this.setCenter=function(){if(n.wrapperComponent&&n.contentComponent){var r=B3(n.transformState.scale,n.wrapperComponent,n.contentComponent);n.setTransformState(r.scale,r.positionX,r.positionY)}},this.handleTransformStyles=function(r,o,i){return n.props.customTransform?n.props.customTransform(r,o,i):BZ(r,o,i)},this.applyTransformation=function(){if(!(!n.mounted||!n.contentComponent)){var r=n.transformState,o=r.scale,i=r.positionX,s=r.positionY,l=n.handleTransformStyles(i,s,o);n.contentComponent.style.transform=l}},this.getContext=function(){return Et(n)},this.onChange=function(r){return n.onChangeCallbacks.has(r)||n.onChangeCallbacks.add(r),function(){n.onChangeCallbacks.delete(r)}},this.onInit=function(r){return n.onInitCallbacks.has(r)||n.onInitCallbacks.add(r),function(){n.onInitCallbacks.delete(r)}},this.init=function(r,o){n.cleanupWindowEvents(),n.wrapperComponent=r,n.contentComponent=o,wc(n,n.transformState.scale),n.handleInitializeWrapperEvents(r),n.handleInitialize(o),n.initializeWindowEvents(),n.isInitialized=!0;var i=Et(n);Ot(i,void 0,n.props.onInit)},this.props=t,this.setup=H2(this.props),this.transformState=I3(this.props)}return e}(),ib=Be.createContext(null),cq=function(e,t){return typeof e=="function"?e(t):e},uq=Be.forwardRef(function(e,t){var 
n=d.useRef(new lq(e)).current,r=cq(e.children,W1(n));return d.useImperativeHandle(t,function(){return W1(n)},[n]),d.useEffect(function(){n.update(e)},[n,e]),Be.createElement(ib.Provider,{value:n},r)});Be.forwardRef(function(e,t){var n=d.useRef(null),r=d.useContext(ib);return d.useEffect(function(){return r.onChange(function(o){if(n.current){var i=0,s=0;n.current.style.transform=r.handleTransformStyles(i,s,1/o.instance.transformState.scale)}})},[r]),Be.createElement("div",ks({},e,{ref:zZ([n,t])}))});function dq(e,t){t===void 0&&(t={});var n=t.insertAt;if(!(!e||typeof document>"u")){var r=document.head||document.getElementsByTagName("head")[0],o=document.createElement("style");o.type="text/css",n==="top"&&r.firstChild?r.insertBefore(o,r.firstChild):r.appendChild(o),o.styleSheet?o.styleSheet.cssText=e:o.appendChild(document.createTextNode(e))}}var fq=`.transform-component-module_wrapper__SPB86 { + position: relative; + width: -moz-fit-content; + width: fit-content; + height: -moz-fit-content; + height: fit-content; + overflow: hidden; + -webkit-touch-callout: none; /* iOS Safari */ + -webkit-user-select: none; /* Safari */ + -khtml-user-select: none; /* Konqueror HTML */ + -moz-user-select: none; /* Firefox */ + -ms-user-select: none; /* Internet Explorer/Edge */ + user-select: none; + margin: 0; + padding: 0; +} +.transform-component-module_content__FBWxo { + display: flex; + flex-wrap: wrap; + width: -moz-fit-content; + width: fit-content; + height: -moz-fit-content; + height: fit-content; + margin: 0; + padding: 0; + transform-origin: 0% 0%; +} +.transform-component-module_content__FBWxo img { + pointer-events: none; +} +`,K2={wrapper:"transform-component-module_wrapper__SPB86",content:"transform-component-module_content__FBWxo"};dq(fq);var hq=function(e){var t=e.children,n=e.wrapperClass,r=n===void 0?"":n,o=e.contentClass,i=o===void 0?"":o,s=e.wrapperStyle,l=e.contentStyle,u=e.wrapperProps,f=u===void 0?{}:u,m=e.contentProps,p=m===void 0?{}:m,g=d.useContext(ib),y=g.init,x=g.cleanupWindowEvents,S=d.useRef(null),E=d.useRef(null);return d.useEffect(function(){var _=S.current,b=E.current;return _!==null&&b!==null&&y&&(y==null||y(_,b)),function(){x==null||x()}},[]),Be.createElement("div",ks({},f,{ref:S,className:"react-transform-wrapper ".concat(K2.wrapper," ").concat(r),style:s}),Be.createElement("div",ks({},p,{ref:E,className:"react-transform-component ".concat(K2.content," ").concat(i),style:l}),t))};const W3=["PageUp","PageDown"],H3=["ArrowUp","ArrowDown","ArrowLeft","ArrowRight"],K3={"from-left":["Home","PageDown","ArrowDown","ArrowLeft"],"from-right":["Home","PageDown","ArrowDown","ArrowRight"],"from-bottom":["Home","PageDown","ArrowDown","ArrowLeft"],"from-top":["Home","PageDown","ArrowUp","ArrowLeft"]},Yd="Slider",[K1,pq,mq]=Wd(Yd),[G3,bte]=Tn(Yd,[mq]),[gq,wg]=G3(Yd),vq=d.forwardRef((e,t)=>{const{name:n,min:r=0,max:o=100,step:i=1,orientation:s="horizontal",disabled:l=!1,minStepsBetweenThumbs:u=0,defaultValue:f=[r],value:m,onValueChange:p=()=>{},onValueCommit:g=()=>{},inverted:y=!1,...x}=e,[S,E]=d.useState(null),_=Ve(t,U=>E(U)),b=d.useRef(new Set),C=d.useRef(0),R=s==="horizontal",k=S?!!S.closest("form"):!0,O=R?yq:wq,[A=[],I]=eo({prop:m,defaultProp:f,onChange:U=>{var re;(re=[...b.current][C.current])===null||re===void 0||re.focus(),p(U)}}),z=d.useRef(A);function H(U){const re=Pq(A,U);te(U,re)}function ie(U){te(U,C.current)}function K(){const U=z.current[C.current];A[C.current]!==U&&g(A)}function te(U,re,{commit:V}={commit:!1}){const 
J=Mq(i),G=Oq(Math.round((U-r)/i)*i+r,J),Z=fd(G,[r,o]);I((Q=[])=>{const le=$q(Q,Z,re);if(Aq(le,u*i)){C.current=le.indexOf(Z);const L=String(le)!==String(Q);return L&&V&&g(le),L?le:Q}else return Q})}return d.createElement(gq,{scope:e.__scopeSlider,disabled:l,min:r,max:o,valueIndexToChangeRef:C,thumbs:b.current,values:A,orientation:s},d.createElement(K1.Provider,{scope:e.__scopeSlider},d.createElement(K1.Slot,{scope:e.__scopeSlider},d.createElement(O,Y({"aria-disabled":l,"data-disabled":l?"":void 0},x,{ref:_,onPointerDown:fe(x.onPointerDown,()=>{l||(z.current=A)}),min:r,max:o,inverted:y,onSlideStart:l?void 0:H,onSlideMove:l?void 0:ie,onSlideEnd:l?void 0:K,onHomeKeyDown:()=>!l&&te(r,0,{commit:!0}),onEndKeyDown:()=>!l&&te(o,A.length-1,{commit:!0}),onStepKeyDown:({event:U,direction:re})=>{if(!l){const G=W3.includes(U.key)||U.shiftKey&&H3.includes(U.key)?10:1,Z=C.current,Q=A[Z],le=i*G*re;te(Q+le,Z,{commit:!0})}}})))),k&&A.map((U,re)=>d.createElement(Cq,{key:re,name:n?n+(A.length>1?"[]":""):void 0,value:U})))}),[Y3,X3]=G3(Yd,{startEdge:"left",endEdge:"right",size:"width",direction:1}),yq=d.forwardRef((e,t)=>{const{min:n,max:r,dir:o,inverted:i,onSlideStart:s,onSlideMove:l,onSlideEnd:u,onStepKeyDown:f,...m}=e,[p,g]=d.useState(null),y=Ve(t,C=>g(C)),x=d.useRef(),S=Ac(o),E=S==="ltr",_=E&&!i||!E&&i;function b(C){const R=x.current||p.getBoundingClientRect(),k=[0,R.width],A=sb(k,_?[n,r]:[r,n]);return x.current=R,A(C-R.left)}return d.createElement(Y3,{scope:e.__scopeSlider,startEdge:_?"left":"right",endEdge:_?"right":"left",direction:_?1:-1,size:"width"},d.createElement(Z3,Y({dir:S,"data-orientation":"horizontal"},m,{ref:y,style:{...m.style,"--radix-slider-thumb-transform":"translateX(-50%)"},onSlideStart:C=>{const R=b(C.clientX);s==null||s(R)},onSlideMove:C=>{const R=b(C.clientX);l==null||l(R)},onSlideEnd:()=>{x.current=void 0,u==null||u()},onStepKeyDown:C=>{const k=K3[_?"from-left":"from-right"].includes(C.key);f==null||f({event:C,direction:k?-1:1})}})))}),wq=d.forwardRef((e,t)=>{const{min:n,max:r,inverted:o,onSlideStart:i,onSlideMove:s,onSlideEnd:l,onStepKeyDown:u,...f}=e,m=d.useRef(null),p=Ve(t,m),g=d.useRef(),y=!o;function x(S){const E=g.current||m.current.getBoundingClientRect(),_=[0,E.height],C=sb(_,y?[r,n]:[n,r]);return g.current=E,C(S-E.top)}return d.createElement(Y3,{scope:e.__scopeSlider,startEdge:y?"bottom":"top",endEdge:y?"top":"bottom",size:"height",direction:y?1:-1},d.createElement(Z3,Y({"data-orientation":"vertical"},f,{ref:p,style:{...f.style,"--radix-slider-thumb-transform":"translateY(50%)"},onSlideStart:S=>{const E=x(S.clientY);i==null||i(E)},onSlideMove:S=>{const E=x(S.clientY);s==null||s(E)},onSlideEnd:()=>{g.current=void 0,l==null||l()},onStepKeyDown:S=>{const _=K3[y?"from-bottom":"from-top"].includes(S.key);u==null||u({event:S,direction:_?-1:1})}})))}),Z3=d.forwardRef((e,t)=>{const{__scopeSlider:n,onSlideStart:r,onSlideMove:o,onSlideEnd:i,onHomeKeyDown:s,onEndKeyDown:l,onStepKeyDown:u,...f}=e,m=wg(Yd,n);return d.createElement(Ae.span,Y({},f,{ref:t,onKeyDown:fe(e.onKeyDown,p=>{p.key==="Home"?(s(p),p.preventDefault()):p.key==="End"?(l(p),p.preventDefault()):W3.concat(H3).includes(p.key)&&(u(p),p.preventDefault())}),onPointerDown:fe(e.onPointerDown,p=>{const g=p.target;g.setPointerCapture(p.pointerId),p.preventDefault(),m.thumbs.has(g)?g.focus():r(p)}),onPointerMove:fe(e.onPointerMove,p=>{p.target.hasPointerCapture(p.pointerId)&&o(p)}),onPointerUp:fe(e.onPointerUp,p=>{const 
g=p.target;g.hasPointerCapture(p.pointerId)&&(g.releasePointerCapture(p.pointerId),i(p))})}))}),xq="SliderTrack",bq=d.forwardRef((e,t)=>{const{__scopeSlider:n,...r}=e,o=wg(xq,n);return d.createElement(Ae.span,Y({"data-disabled":o.disabled?"":void 0,"data-orientation":o.orientation},r,{ref:t}))}),G2="SliderRange",Sq=d.forwardRef((e,t)=>{const{__scopeSlider:n,...r}=e,o=wg(G2,n),i=X3(G2,n),s=d.useRef(null),l=Ve(t,s),u=o.values.length,f=o.values.map(g=>q3(g,o.min,o.max)),m=u>1?Math.min(...f):0,p=100-Math.max(...f);return d.createElement(Ae.span,Y({"data-orientation":o.orientation,"data-disabled":o.disabled?"":void 0},r,{ref:l,style:{...e.style,[i.startEdge]:m+"%",[i.endEdge]:p+"%"}}))}),Y2="SliderThumb",_q=d.forwardRef((e,t)=>{const n=pq(e.__scopeSlider),[r,o]=d.useState(null),i=Ve(t,l=>o(l)),s=d.useMemo(()=>r?n().findIndex(l=>l.ref.current===r):-1,[n,r]);return d.createElement(Eq,Y({},e,{ref:i,index:s}))}),Eq=d.forwardRef((e,t)=>{const{__scopeSlider:n,index:r,...o}=e,i=wg(Y2,n),s=X3(Y2,n),[l,u]=d.useState(null),f=Ve(t,E=>u(E)),m=Sx(l),p=i.values[r],g=p===void 0?0:q3(p,i.min,i.max),y=Rq(r,i.values.length),x=m==null?void 0:m[s.size],S=x?Tq(x,g,s.direction):0;return d.useEffect(()=>{if(l)return i.thumbs.add(l),()=>{i.thumbs.delete(l)}},[l,i.thumbs]),d.createElement("span",{style:{transform:"var(--radix-slider-thumb-transform)",position:"absolute",[s.startEdge]:`calc(${g}% + ${S}px)`}},d.createElement(K1.ItemSlot,{scope:e.__scopeSlider},d.createElement(Ae.span,Y({role:"slider","aria-label":e["aria-label"]||y,"aria-valuemin":i.min,"aria-valuenow":p,"aria-valuemax":i.max,"aria-orientation":i.orientation,"data-orientation":i.orientation,"data-disabled":i.disabled?"":void 0,tabIndex:i.disabled?void 0:0},o,{ref:f,style:p===void 0?{display:"none"}:e.style,onFocus:fe(e.onFocus,()=>{i.valueIndexToChangeRef.current=r})}))))}),Cq=e=>{const{value:t,...n}=e,r=d.useRef(null),o=Kx(t);return d.useEffect(()=>{const i=r.current,s=window.HTMLInputElement.prototype,u=Object.getOwnPropertyDescriptor(s,"value").set;if(o!==t&&u){const f=new Event("input",{bubbles:!0});u.call(i,t),i.dispatchEvent(f)}},[o,t]),d.createElement("input",Y({style:{display:"none"}},n,{ref:r,defaultValue:t}))};function $q(e=[],t,n){const r=[...e];return r[n]=t,r.sort((o,i)=>o-i)}function q3(e,t,n){const i=100/(n-t)*(e-t);return fd(i,[0,100])}function Rq(e,t){return t>2?`Value ${e+1} of ${t}`:t===2?["Minimum","Maximum"][e]:void 0}function Pq(e,t){if(e.length===1)return 0;const n=e.map(o=>Math.abs(o-t)),r=Math.min(...n);return n.indexOf(r)}function Tq(e,t,n){const r=e/2,i=sb([0,50],[0,r]);return(r-i(t)*n)*n}function kq(e){return e.slice(0,-1).map((t,n)=>e[n+1]-t)}function Aq(e,t){if(t>0){const n=kq(e);return Math.min(...n)>=t}return!0}function sb(e,t){return n=>{if(e[0]===e[1]||t[0]===t[1])return t[0];const r=(t[1]-t[0])/(e[1]-e[0]);return t[0]+r*(n-e[0])}}function Mq(e){return(String(e).split(".")[1]||"").length}function Oq(e,t){const n=Math.pow(10,t);return Math.round(e*n)/n}const Q3=vq,Nq=bq,Dq=Sq,Iq=_q,po=d.forwardRef(({className:e,...t},n)=>v.jsxs(Q3,{ref:n,className:xe("relative flex w-full touch-none select-none items-center",e),tabIndex:-1,...t,children:[v.jsx(Nq,{className:"relative h-1.5 w-full grow overflow-hidden rounded-full bg-primary/20 data-[disabled]:cursor-not-allowed data-[disabled]:opacity-50",children:v.jsx(Dq,{className:"absolute h-full bg-primary data-[disabled]:cursor-not-allowed "})}),v.jsx(Iq,{tabIndex:-1,className:"block h-4 w-4 rounded-full border border-primary/60 bg-background shadow transition-colors 
focus-visible:outline-none focus-visible:ring-1 focus-visible:ring-ring data-[disabled]:cursor-not-allowed"})]}));po.displayName=Q3.displayName;const dl={capture:!0,passive:!1},Lq=2,X2=(e,t,n,r,o,i)=>{if(e!==n&&t===r){if(e<0)return[0,r];if(e+t>i)return[i-r,r]}else{if(ti)return[e,i-e]}return[e,t]},Fq=e=>{const{minHeight:t,minWidth:n,maxHeight:r,maxWidth:o,scale:i,show:s}=e,[l,u,f,m,{x:p,y:g,width:y,height:x},S,E,_,b,C,R]=xt(Q=>[Q.imageWidth,Q.imageHeight,Q.isInpainting,Q.isSD(),Q.cropperState,Q.setCropperX,Q.setCropperY,Q.setCropperWidth,Q.setCropperHeight,Q.isCropperExtenderResizing,Q.setIsCropperExtenderResizing]),[k,O]=d.useState(!1);d.useEffect(()=>{S(Math.round((o-512)/2)),E(Math.round((r-512)/2))},[r,o,l,u]);const[A,I]=d.useState({initX:0,initY:0,initHeight:0,initWidth:0,startResizeX:0,startResizeY:0,ord:"top"}),z=()=>{},H=(Q,le)=>X2(Q,le,p,y,n,o),ie=(Q,le)=>X2(Q,le,g,x,t,r),K=Q=>{if(f)return;const le=Q.clientX,L=Q.clientY,ue=Math.round((L-A.startResizeY)/i),Ne=Math.round((le-A.startResizeX)/i),Ke=()=>{const Ee=A.initHeight-ue,Oe=A.initY+ue,[Ie,ze]=ie(Oe,Ee);b(ze),E(Ie)},Me=()=>{const Ee=A.initHeight+ue,[Oe,Ie]=ie(A.initY,Ee);b(Ie),E(Oe)},me=()=>{const Ee=A.initWidth-Ne,Oe=A.initX+Ne,[Ie,ze]=H(Oe,Ee);_(ze),S(Ie)},be=()=>{const Ee=A.initWidth+Ne,[Oe,Ie]=H(A.initX,Ee);_(Ie),S(Oe)};if(C)switch(A.ord){case"topleft":{Ke(),me();break}case"topright":{Ke(),be();break}case"bottomleft":{Me(),me();break}case"bottomright":{Me(),be();break}case"top":{Ke();break}case"right":{be();break}case"bottom":{Me();break}case"left":{me();break}}if(k){const Ee=A.initX+Ne,Oe=A.initY+ue,[Ie,ze]=H(Ee,A.initWidth),[ht,st]=ie(Oe,A.initHeight);_(ze),b(st),S(Ie),E(ht)}},te=()=>{C&&R(!1),k&&O(!1)};d.useEffect(()=>{if(C||k)return document.addEventListener("pointermove",K,dl),document.addEventListener("pointerup",te,dl),document.addEventListener("pointercancel",te,dl),()=>{document.removeEventListener("pointermove",K,dl),document.removeEventListener("pointerup",te,dl),document.removeEventListener("pointercancel",te,dl)}},[C,k,y,x,A]);const U=Q=>{const{ord:le}=Q.target.dataset;le&&(R(!0),I({initX:p,initY:g,initHeight:x,initWidth:y,startResizeX:Q.clientX,startResizeY:Q.clientY,ord:le}))},re=(Q,le,L)=>{const Ke="w-[12px] h-[12px] z-[4] absolute content-[''] block border-2 border-primary borde pointer-events-auto hover:bg-primary";let Me="0",me="0",be=L,Ee="-6px";return L===""&&(Ee="50%",le==="left"||le==="right"?(be="top",me="-50%"):(be="left",Me="-50%")),v.jsx("div",{className:xe(Ke,Q),style:{[le]:-6,[be]:Ee,transform:`translate(${Me}, ${me}) scale(${1/i})`},"data-ord":le+L,"aria-label":le+L,tabIndex:-1,role:"button"})},V=()=>v.jsxs("div",{onFocus:z,onPointerDown:U,className:"absolute top-0 h-full w-full",children:[v.jsx("div",{className:"absolute pointer-events-auto top-0 left-0 w-full cursor-ns-resize h-[12px] mt-[-6px]","data-ord":"top"}),v.jsx("div",{className:"absolute pointer-events-auto top-0 right-0 h-full cursor-ew-resize w-[12px] mr-[-6px]","data-ord":"right"}),v.jsx("div",{className:"absolute pointer-events-auto bottom-0 left-0 w-full cursor-ns-resize h-[12px] mb-[-6px]","data-ord":"bottom"}),v.jsx("div",{className:"absolute pointer-events-auto top-0 left-0 h-full cursor-ew-resize w-[12px] 
ml-[-6px]","data-ord":"left"}),re("cursor-nw-resize","top","left"),re("cursor-ne-resize","top","right"),re("cursor-sw-resize","bottom","left"),re("cursor-se-resize","bottom","right"),re("cursor-ns-resize","top",""),re("cursor-ns-resize","bottom",""),re("cursor-ew-resize","left",""),re("cursor-ew-resize","right","")]}),J=Q=>{O(!0),I({initX:p,initY:g,initHeight:x,initWidth:y,startResizeX:Q.clientX,startResizeY:Q.clientY,ord:""})},G=()=>v.jsxs("div",{className:ix("border absolute pointer-events-auto px-2 py-1 rounded-full hover:cursor-move bg-background","origin-top-left top-0 left-0"),style:{transform:`scale(${1/i*.8})`},onPointerDown:J,children:[y," x ",x]}),Z=()=>v.jsx("div",{className:"outline-dashed outline-primary",style:{height:x,width:y,outlineWidth:`${Lq/i*1.3}px`}});return s===!1||!m?null:v.jsx("div",{className:"absolute h-full w-full overflow-hidden pointer-events-none z-[2]",children:v.jsxs("div",{className:"relative pointer-events-none z-[2] [box-shadow:0_0_0_9999px_rgba(0,_0,_0,_0.5)]",style:{height:x,width:y,left:p,top:g},children:[Z(),G(),V()]})})},jq=()=>{const[e,t,n]=xt(r=>[r.interactiveSegState,r.resetInteractiveSegState,r.handleInteractiveSegAccept]);return e.isInteractiveSeg?v.jsxs("div",{className:"z-10 absolute top-[68px] rounded-xl border-solid border p-[8px] left-1/2 translate-x-[-50%] flex justify-center items-center gap-[8px] bg-background",children:[v.jsx(vn,{onClick:()=>{t()},size:"sm",variant:"secondary",children:"Cancel"}),v.jsx(vn,{size:"sm",onClick:()=>{n()},children:"Accept"})]}):null},Bq=e=>{const{x:t,y:n,positive:r}=e,o=r?"bg-[rgba(21,_215,_121,_0.936)] outline-[rgba(98,255,179,0.31)]":"bg-[rgba(237,_49,_55,_0.942)] outline-[rgba(255,89,95,0.31)]";return v.jsx("div",{className:`absolute h-[10px] w-[10px] rounded-[50%] ${o} outline-8 outline`,style:{left:t,top:n,transform:"translate(-50%, -50%)"}})},zq=()=>{const e=xt(t=>t.interactiveSegState.clicks);return v.jsx("div",{className:"absolute h-full w-full overflow-hidden pointer-events-none",children:e.map(t=>v.jsx(Bq,{x:t[0],y:t[1],positive:t[2]===1},t[3]))})},Uq=()=>v.jsx("div",{children:v.jsx(jq,{})}),fl={capture:!0,passive:!1},Vq=2,Z2=(e,t,n,r)=>t{const{minHeight:t,minWidth:n,scale:r,show:o}=e,[i,s,l,u,{x:f,y:m,width:p,height:g},y,x,S,E,_,b,C]=xt(J=>[J.isInpainting,J.imageHeight,J.imageWidth,J.isSD(),J.extenderState,J.setExtenderX,J.setExtenderY,J.setExtenderWidth,J.setExtenderHeight,J.settings.extenderDirection,J.isCropperExtenderResizing,J.setIsCropperExtenderResizing]),[R,k]=d.useState({initX:0,initY:0,initHeight:0,initWidth:0,startResizeX:0,startResizeY:0,ord:"top"}),O=()=>{},A=(J,G)=>Z2(J,G,f,n),I=(J,G)=>Z2(J,G,m,t),z=J=>{if(i)return;const G=J.clientX,Z=J.clientY,Q=Math.round((Z-R.startResizeY)/r),le=Math.round((G-R.startResizeX)/r),L=()=>{const Me=R.initHeight-Q,me=R.initY+Q;let be=me,Ee=Me;if(_===Ln.xy)be>0&&(be=0,Ee=R.initHeight-Math.abs(R.initY));else{const Oe=I(me,Me);be=Oe[0],Ee=Oe[1]}E(Ee),x(be)},ue=()=>{const Me=R.initHeight+Q;let[me,be]=I(R.initY,Me);_===Ln.xy&&be{const Me=R.initWidth-le,me=R.initX+le;let be=me,Ee=Me;if(_===Ln.xy)be>0&&(be=0,Ee=R.initWidth-Math.abs(R.initX));else{const Oe=A(me,Me);be=Oe[0],Ee=Oe[1]}S(Ee),y(be)},Ke=()=>{const Me=R.initWidth+le;let[me,be]=A(R.initX,Me);_===Ln.xy&&be{b&&C(!1)};d.useEffect(()=>{if(b)return 
document.addEventListener("pointermove",z,fl),document.addEventListener("pointerup",H,fl),document.addEventListener("pointercancel",H,fl),()=>{document.removeEventListener("pointermove",z,fl),document.removeEventListener("pointerup",H,fl),document.removeEventListener("pointercancel",H,fl)}},[b,p,g,R]);const ie=J=>{const{ord:G}=J.target.dataset;G&&(C(!0),k({initX:f,initY:m,initHeight:g,initWidth:p,startResizeX:J.clientX,startResizeY:J.clientY,ord:G}))},K=(J,G,Z)=>{const L="w-[12px] h-[12px] z-[4] absolute content-[''] block border-2 border-primary borde pointer-events-auto hover:bg-primary";let ue="0",Ne="0",Ke=Z,Me="-6px";return Z===""&&(Me="50%",G==="left"||G==="right"?(Ke="top",Ne="-50%"):(Ke="left",ue="-50%")),v.jsx("div",{className:xe(L,J),style:{[G]:-6,[Ke]:Me,transform:`translate(${ue}, ${Ne}) scale(${1/r})`},"data-ord":G+Z,"aria-label":G+Z,tabIndex:-1,role:"button"})},te=()=>v.jsxs("div",{onFocus:O,onPointerDown:ie,className:"absolute top-0 h-full w-full",children:[[Ln.y,Ln.xy].includes(_)?v.jsxs(v.Fragment,{children:[v.jsx("div",{className:"absolute pointer-events-auto top-0 left-0 w-full cursor-ns-resize h-[12px] mt-[-6px]","data-ord":"top"}),v.jsx("div",{className:"absolute pointer-events-auto bottom-0 left-0 w-full cursor-ns-resize h-[12px] mb-[-6px]","data-ord":"bottom"}),K("cursor-ns-resize","top",""),K("cursor-ns-resize","bottom","")]}):v.jsx(v.Fragment,{}),[Ln.x,Ln.xy].includes(_)?v.jsxs(v.Fragment,{children:[v.jsx("div",{className:"absolute pointer-events-auto top-0 right-0 h-full cursor-ew-resize w-[12px] mr-[-6px]","data-ord":"right"}),v.jsx("div",{className:"absolute pointer-events-auto top-0 left-0 h-full cursor-ew-resize w-[12px] ml-[-6px]","data-ord":"left"}),K("cursor-ew-resize","left",""),K("cursor-ew-resize","right","")]}):v.jsx(v.Fragment,{}),_===Ln.xy?v.jsxs(v.Fragment,{children:[K("cursor-nw-resize","top","left"),K("cursor-ne-resize","top","right"),K("cursor-sw-resize","bottom","left"),K("cursor-se-resize","bottom","right")]}):v.jsx(v.Fragment,{})]}),U=J=>{k({initX:f,initY:m,initHeight:g,initWidth:p,startResizeX:J.clientX,startResizeY:J.clientY,ord:""})},re=()=>v.jsxs("div",{className:ix("border absolute pointer-events-auto px-2 py-1 rounded-full bg-background","origin-top-left top-0 left-0"),style:{transform:`scale(${1/r*.8})`},onPointerDown:U,children:[p," x ",g]}),V=()=>v.jsx("div",{className:xe("outline-dashed outline-primary"),style:{height:g,width:p,outlineWidth:`${Vq/r*1.3}px`}});return o===!1||!u?null:v.jsx("div",{className:"absolute h-full w-full pointer-events-none z-[2]",children:v.jsxs("div",{className:"relative pointer-events-none z-[2] [box-shadow:0_0_0_9999px_rgba(0,_0,_0,_0.5)]",style:{height:g,width:p,left:f,top:m},children:[V(),re(),te()]})})},Hq=200,pu=300;function 
Kq(e){const{file:t}=e,{toast:n}=Id(),[r,o,i,s,l,u,f,m,p,g,y,x,S,E,_,b,C,R,k,O,A,I,z,H]=xt(W=>[W.disableShortCuts,W.windowSize,W.isInpainting,W.imageWidth,W.imageHeight,W.settings,W.serverConfig.enableAutoSaving,W.setImageSize,W.setBaseBrushSize,W.interactiveSegState,W.updateInteractiveSegState,W.handleCanvasMouseDown,W.handleCanvasMouseMove,W.undo,W.redo,W.undoDisabled(),W.redoDisabled(),W.getIsProcessing(),W.updateAppState,W.runMannually(),W.runInpainting,W.isCropperExtenderResizing,W.decreaseBaseBrushSize,W.increaseBaseBrushSize]),ie=xt(W=>W.editorState.baseBrushSize),K=xt(W=>W.getBrushSize()),te=xt(W=>W.editorState.renders),U=xt(W=>W.editorState.extraMasks),re=xt(W=>W.editorState.temporaryMasks),V=xt(W=>W.editorState.lineGroups),J=xt(W=>W.editorState.curLineGroup),[G,Z]=d.useState(!1),[Q,le]=Nx(t),[L,ue]=d.useState(),[Ne,Ke]=d.useState(),[{x:Me,y:me},be]=d.useState({x:-1,y:-1}),[Ee,Oe]=d.useState(!1),[Ie,ze]=d.useState(!1),[ht,st]=d.useState(!1),[Yt,rr]=d.useState(1),[Jt,Li]=d.useState(!1),[N,X]=d.useState(1),ee=o.width/2,Se=o.height/2,pe=d.useRef(null),[he,Te]=d.useState(!1),[ut,gt]=d.useState(!1),[An,Mn]=d.useState(0),[Fi,Ro]=d.useState(!1),Cr=d.useCallback(()=>J.length!==0,[J]);d.useEffect(()=>{if(!Ne||!le||s===0||l===0)return;const W=te.length===0?Q:te[te.length-1];Ne.canvas.width=s,Ne.canvas.height=l,Ne.clearRect(0,0,Ne.canvas.width,Ne.canvas.height),Ne.drawImage(W,0,0,s,l)},[te,Q,le,Ne,l,s]),d.useEffect(()=>{!L||!le||s===0||l===0||(L.canvas.width=s,L.canvas.height=l,L.clearRect(0,0,L.canvas.width,L.canvas.height),re.forEach(W=>{L.drawImage(W,0,0,s,l)}),U.forEach(W=>{L.drawImage(W,0,0,s,l)}),g.isInteractiveSeg&&g.tmpInteractiveSegMask&&L.drawImage(g.tmpInteractiveSegMask,0,0,s,l),eP(L,J))},[re,U,le,g,L,J,l,s]);const Ks=d.useCallback(async()=>{let W=t;if(te.length>0){const We=te[te.length-1];W=await Nu(We.currentSrc,t.name,t.type)}return W},[t,te]),ri=()=>te.length!==0,ji=d.useCallback(()=>{let W=512,We=512;return le?(te.length===0?(W=Q.naturalWidth,We=Q.naturalHeight):te.length!==0&&(W=te[te.length-1].width,We=te[te.length-1].height),[W,We]):[W,We]},[Q,le,te]);d.useEffect(()=>{var of;if(!le)return;const[W,We]=ji();(W!==s||We!==l)&&m(W,We);const dt=o.width/W,no=(o.height-Hq)/We;let an=1;(dt<1||no<1)&&(an=Math.min(dt,no)),X(an),rr(an),console.log(`[on file load] image size: ${W}x${We}, scale: ${an}, initialCentered: ${he}`),L!=null&&L.canvas&&(console.log("[on file load] set canvas size"),W!=L.canvas.width&&(L.canvas.width=W),We!=L.canvas.height&&(L.canvas.height=We)),he||((of=pe.current)==null||of.centerView(an,1),console.log("[on file load] centerView"),Te(!0))},[pe,l,s,Q,le,o,he,ji]),d.useEffect(()=>{var W;console.log("[useEffect] centerView"),(W=pe==null?void 0:pe.current)==null||W.centerView(N,1)},[l,s,pe,N]);const or=d.useCallback(()=>{if(!N||!o)return;const W=pe.current;if(!W)return;const We=(o.width-s*N)/2,dt=(o.height-l*N)/2;W.setTransform(We,dt,N,200,"easeOutQuad"),W.instance.transformState.scale&&(W.instance.transformState.scale=N),rr(N),Li(!1)},[pe,o,l,s,o.height,N]);d.useEffect(()=>(window.addEventListener("resize",()=>{or()}),()=>{window.removeEventListener("resize",()=>{or()})}),[o,or]),Yn("Escape",()=>{R||(ut?gt(!1):or())},[ut,i,or]);const $g=W=>{const We=W.nativeEvent;be({x:We.pageX,y:We.pageY})},qd=W=>{R||g.isInteractiveSeg||ht||ut&&J.length!==0&&S(v0(W))},Rg=async W=>{k({isPluginRunning:!0});const We=await Ks();try{const dt=await _P(!0,Oo.InteractiveSeg,We,void 0,W),{blob:no}=dt,an=new 
Image;an.onload=()=>{y({tmpInteractiveSegMask:an})},an.src=no}catch(dt){n({variant:"destructive",description:dt.message?dt.message:dt.toString()})}k({isPluginRunning:!1})},Qd=W=>{if(vC(W)){st(!1);return}!Cr()||g.isInteractiveSeg||ht||!Q.src||!(L!=null&&L.canvas)||i||ut&&(O?gt(!1):A())},Nc=W=>{if(g.isInteractiveSeg){const We=v0(W),dt=[...g.clicks];gC(W)?dt.push([We.x,We.y,0,dt.length]):dt.push([We.x,We.y,1,dt.length]),Rg(dt),y({clicks:dt})}},Jd=W=>{if(!(R||g.isInteractiveSeg||ht||!le||!(L!=null&&L.canvas))&&!gC(W)){if(vC(W)){st(!0);return}gt(!0),x(v0(W))}},ef=W=>{W.preventDefault(),E()};Yn("meta+z,ctrl+z",ef);const tf=W=>{W.preventDefault(),_()};Yn("shift+ctrl+z,shift+meta+z",tf),I0("Tab",W=>{W==null||W.preventDefault(),W==null||W.stopPropagation(),ri()&&Z(()=>(window.setTimeout(()=>{Mn(100)},10),!0))},W=>{W==null||W.preventDefault(),W==null||W.stopPropagation(),ri()&&(window.setTimeout(()=>{Mn(0)},10),window.setTimeout(()=>{Z(!1)},pu))});const nf=d.useCallback(async()=>{if(t===void 0)return;if(f&&te.length>0){try{await Yj(te[te.length-1],t.name,t.type),n({description:"Save image success"})}catch(dt){n({variant:"destructive",title:"Uh oh! Something went wrong.",description:dt.message?dt.message:dt.toString()})}return}const W=t.name.replace(/(\.[\w\d_-]+)$/i,"_cleanup$1"),We=te[te.length-1];if(_F(We.currentSrc,W),u.enableDownloadMask){let dt=t.name.replace(/(\.[\w\d_-]+)$/i,"_mask$1");dt=dt.replace(/\.[^/.]+$/,".jpg");const no=Gh(s,l,V),an=document.createElement("a");an.download=dt,an.href=no.toDataURL("image/jpeg"),an.click()}},[t,f,te,u,l,s,V]);Yn("meta+s,ctrl+s",nf);const Va=W=>{W!==Ee&&!ht&&!I&&Oe(W)},Pg=d.useCallback(()=>{if(R)return"default";if(ht)return"grab";if(Ee)return"none"},[Ee,ht,R]);Yn("[",()=>{z()},[z]),Yn("]",()=>{H()},[H]),Yn("shift+r",()=>{O&&Cr()&&A()},[O,A,Cr]),Yn("ctrl+c,meta+c",async()=>{await wF()&&te.length>0&&L!=null&&L.canvas&&(await SF(L==null?void 0:L.canvas),n({title:"Copy inpainting result to clipboard"}))},[te,L]),I0(" ",W=>{r||(W==null||W.preventDefault(),W==null||W.stopPropagation(),Oe(!1),st(!0))},W=>{r||(W==null||W.preventDefault(),W==null||W.stopPropagation(),Oe(!0),st(!1))}),I0("Alt",W=>{r||(W==null||W.preventDefault(),W==null||W.stopPropagation(),Ro(!0))},W=>{r||(W==null||W.preventDefault(),W==null||W.stopPropagation(),Ro(!1))});const Dc=()=>{var We,dt,no,an;let W=N;return((dt=(We=pe.current)==null?void 0:We.instance)==null?void 0:dt.transformState.scale)!==void 0&&(W=(an=(no=pe.current)==null?void 0:no.instance)==null?void 0:an.transformState.scale),W},rf=(W,We)=>{const dt=Dc();return{width:`${K*dt}px`,height:`${K*dt}px`,left:`${W}px`,top:`${We}px`,transform:"translate(-50%, -50%)"}},Wa=W=>v.jsx("div",{className:"absolute rounded-[50%] border-[1px] border-[solid] border-[#ffcc00] pointer-events-none bg-[#ffcc00bb]",style:W}),Tg=W=>{p(W),Ie||(ze(!0),window.setTimeout(()=>{ze(!1)},1e4))},Ic=()=>v.jsx("div",{className:"absolute h-[20px] w-[20px] pointer-events-none rounded-[50%] bg-[rgba(21,_215,_121,_0.936)] [box-shadow:0_0_0_0_rgba(21,_215,_121,_0.936)] animate-pulse",style:{left:`${Me}px`,top:`${me}px`,transform:"translate(-50%, 
-50%)"},children:v.jsx(dK,{})}),kg=()=>v.jsx(uq,{ref:W=>{W&&(pe.current=W)},panning:{disabled:!ht,velocityDisabled:!0},wheel:{step:.05,wheelDisabled:Fi},centerZoomedOut:!0,alignmentAnimation:{disabled:!0},centerOnInit:!0,limitToBounds:!1,doubleClick:{disabled:!0},initialScale:N,minScale:N*.3,onPanning:()=>{Jt||Li(!0)},onZoom:W=>{rr(W.state.scale)},children:v.jsxs(hq,{contentStyle:{visibility:he?"visible":"hidden"},children:[v.jsxs("div",{className:"grid [grid-template-areas:'editor-content'] gap-y-4",children:[v.jsx("canvas",{className:"[grid-area:editor-content]",style:{clipPath:`inset(0 ${An}% 0 0)`,transition:`clip-path ${pu}ms`},ref:W=>{if(W&&!Ne){const We=W.getContext("2d");We&&Ke(We)}}}),v.jsx("canvas",{className:xe("[grid-area:editor-content]",R?"pointer-events-none animate-pulse duration-600":""),style:{cursor:Pg(),clipPath:`inset(0 ${An}% 0 0)`,transition:`clip-path ${pu}ms`},onContextMenu:W=>{W.preventDefault()},onMouseOver:()=>{Va(!0),ze(!1)},onFocus:()=>Va(!0),onMouseLeave:()=>Va(!1),onMouseDown:Jd,onMouseUp:Nc,onMouseMove:qd,onTouchStart:Jd,onTouchEnd:Nc,onTouchMove:qd,ref:W=>{if(W&&!L){const We=W.getContext("2d");We&&ue(We)}}}),v.jsx("div",{className:"[grid-area:editor-content] pointer-events-none grid [grid-template-areas:'original-image-content']",style:{width:`${s}px`,height:`${l}px`},children:G&&v.jsxs(v.Fragment,{children:[v.jsx("div",{className:"[grid-area:original-image-content] z-10 bg-primary h-full w-[6px] justify-self-end",style:{marginRight:`${An}%`,transition:`margin-right ${pu}ms`}}),v.jsx("img",{className:"[grid-area:original-image-content]",src:Q.src,alt:"original",style:{width:`${s}px`,height:`${l}px`}})]})})]}),v.jsx(Fq,{maxHeight:l,maxWidth:s,minHeight:Math.min(512,l),minWidth:Math.min(512,s),scale:Dc(),show:u.showCropper}),v.jsx(Wq,{minHeight:Math.min(512,l),minWidth:Math.min(512,s),scale:Dc(),show:u.showExtender}),g.isInteractiveSeg?v.jsx(zq,{}):v.jsx(v.Fragment,{})]})}),Ag=W=>{if(!Fi)return;const{deltaY:We}=W;We>0?H():We<0&&z()};return v.jsxs("div",{className:"flex w-screen h-screen justify-center items-center","aria-hidden":"true",onMouseMove:$g,onMouseUp:Qd,onWheel:Ag,children:[kg(),Ee&&!i&&!ht&&(g.isInteractiveSeg?Ic():Wa(rf(Me,me))),Ie&&Wa(rf(ee,Se)),v.jsxs("div",{className:"fixed flex bottom-5 border px-4 py-2 rounded-[3rem] gap-8 items-center justify-center backdrop-filter backdrop-blur-md bg-background/70",children:[v.jsx(po,{className:"w-48",defaultValue:[50],min:dF,max:QR,step:1,tabIndex:-1,value:[ie],onValueChange:W=>Tg(W[0]),onClick:()=>ze(!1)}),v.jsxs("div",{className:"flex gap-2",children:[v.jsx(Zn,{tooltip:"Reset zoom & pan",disabled:Yt===N&&Jt===!1,onClick:or,children:v.jsx(fH,{})}),v.jsx(Zn,{tooltip:"Undo",onClick:ef,disabled:b,children:v.jsx(yH,{})}),v.jsx(Zn,{tooltip:"Redo",onClick:tf,disabled:C,children:v.jsx(gH,{})}),v.jsx(Zn,{tooltip:"Show original image",onPointerDown:W=>{W.preventDefault(),Z(()=>(window.setTimeout(()=>{Mn(100)},10),!0))},onPointerUp:()=>{window.setTimeout(()=>{Mn(0)},10),window.setTimeout(()=>{Z(!1)},pu)},disabled:te.length===0,children:v.jsx(hH,{})}),v.jsx(Zn,{tooltip:"Save Image",disabled:!te.length,onClick:nf,children:v.jsx(uH,{})}),u.enableManualInpainting&&u.model.model_type==="inpaint"?v.jsx(Zn,{tooltip:"Run Inpainting",disabled:R||!Cr()&&U.length===0,onClick:()=>{A()},children:v.jsx(dH,{})}):v.jsx(v.Fragment,{})]})]})]})}const Gq=()=>{const[e,t]=xt(n=>[n.imageWidth,n.imageHeight]);return!e||!t?null:v.jsxs("div",{className:"border rounded-lg px-2 py-[6px] z-10 
bg-background",children:[e,"x",t]})},G1=["Enter"," "],Yq=["ArrowDown","PageUp","Home"],J3=["ArrowUp","PageDown","End"],Xq=[...Yq,...J3],Zq={ltr:[...G1,"ArrowRight"],rtl:[...G1,"ArrowLeft"]},qq={ltr:["ArrowLeft"],rtl:["ArrowRight"]},xg="Menu",[md,Qq,Jq]=Wd(xg),[Ua,eA]=Tn(xg,[Jq,zs,sg]),bg=zs(),tA=sg(),[nA,Hs]=Ua(xg),[eQ,Xd]=Ua(xg),tQ=e=>{const{__scopeMenu:t,open:n=!1,children:r,dir:o,onOpenChange:i,modal:s=!0}=e,l=bg(t),[u,f]=d.useState(null),m=d.useRef(!1),p=Lt(i),g=Ac(o);return d.useEffect(()=>{const y=()=>{m.current=!0,document.addEventListener("pointerdown",x,{capture:!0,once:!0}),document.addEventListener("pointermove",x,{capture:!0,once:!0})},x=()=>m.current=!1;return document.addEventListener("keydown",y,{capture:!0}),()=>{document.removeEventListener("keydown",y,{capture:!0}),document.removeEventListener("pointerdown",x,{capture:!0}),document.removeEventListener("pointermove",x,{capture:!0})}},[]),d.createElement(Fd,l,d.createElement(nA,{scope:t,open:n,onOpenChange:p,content:u,onContentChange:f},d.createElement(eQ,{scope:t,onClose:d.useCallback(()=>p(!1),[p]),isUsingKeyboardRef:m,dir:g,modal:s},r)))},rA=d.forwardRef((e,t)=>{const{__scopeMenu:n,...r}=e,o=bg(n);return d.createElement(Gm,Y({},o,r,{ref:t}))}),oA="MenuPortal",[nQ,iA]=Ua(oA,{forceMount:void 0}),rQ=e=>{const{__scopeMenu:t,forceMount:n,children:r,container:o}=e,i=Hs(oA,t);return d.createElement(nQ,{scope:t,forceMount:n},d.createElement(xn,{present:n||i.open},d.createElement(jd,{asChild:!0,container:o},r)))},_o="MenuContent",[oQ,ab]=Ua(_o),iQ=d.forwardRef((e,t)=>{const n=iA(_o,e.__scopeMenu),{forceMount:r=n.forceMount,...o}=e,i=Hs(_o,e.__scopeMenu),s=Xd(_o,e.__scopeMenu);return d.createElement(md.Provider,{scope:e.__scopeMenu},d.createElement(xn,{present:r||i.open},d.createElement(md.Slot,{scope:e.__scopeMenu},s.modal?d.createElement(sQ,Y({},o,{ref:t})):d.createElement(aQ,Y({},o,{ref:t})))))}),sQ=d.forwardRef((e,t)=>{const n=Hs(_o,e.__scopeMenu),r=d.useRef(null),o=Ve(t,r);return d.useEffect(()=>{const i=r.current;if(i)return ng(i)},[]),d.createElement(lb,Y({},e,{ref:o,trapFocus:n.open,disableOutsidePointerEvents:n.open,disableOutsideScroll:!0,onFocusOutside:fe(e.onFocusOutside,i=>i.preventDefault(),{checkForDefaultPrevented:!1}),onDismiss:()=>n.onOpenChange(!1)}))}),aQ=d.forwardRef((e,t)=>{const n=Hs(_o,e.__scopeMenu);return d.createElement(lb,Y({},e,{ref:t,trapFocus:!1,disableOutsidePointerEvents:!1,disableOutsideScroll:!1,onDismiss:()=>n.onOpenChange(!1)}))}),lb=d.forwardRef((e,t)=>{const{__scopeMenu:n,loop:r=!1,trapFocus:o,onOpenAutoFocus:i,onCloseAutoFocus:s,disableOutsidePointerEvents:l,onEntryFocus:u,onEscapeKeyDown:f,onPointerDownOutside:m,onFocusOutside:p,onInteractOutside:g,onDismiss:y,disableOutsideScroll:x,...S}=e,E=Hs(_o,n),_=Xd(_o,n),b=bg(n),C=tA(n),R=Qq(n),[k,O]=d.useState(null),A=d.useRef(null),I=Ve(t,A,E.onContentChange),z=d.useRef(0),H=d.useRef(""),ie=d.useRef(0),K=d.useRef(null),te=d.useRef("right"),U=d.useRef(0),re=x?tg:d.Fragment,V=x?{as:Qo,allowPinchZoom:!0}:void 0,J=Z=>{var Q,le;const L=H.current+Z,ue=R().filter(Ee=>!Ee.disabled),Ne=document.activeElement,Ke=(Q=ue.find(Ee=>Ee.ref.current===Ne))===null||Q===void 0?void 0:Q.textValue,Me=ue.map(Ee=>Ee.textValue),me=EQ(Me,L,Ke),be=(le=ue.find(Ee=>Ee.textValue===me))===null||le===void 0?void 0:le.ref.current;(function Ee(Oe){H.current=Oe,window.clearTimeout(z.current),Oe!==""&&(z.current=window.setTimeout(()=>Ee(""),1e3))})(L),be&&setTimeout(()=>be.focus())};d.useEffect(()=>()=>window.clearTimeout(z.current),[]),Jm();const G=d.useCallback(Z=>{var Q,le;return 
te.current===((Q=K.current)===null||Q===void 0?void 0:Q.side)&&$Q(Z,(le=K.current)===null||le===void 0?void 0:le.area)},[]);return d.createElement(oQ,{scope:n,searchRef:H,onItemEnter:d.useCallback(Z=>{G(Z)&&Z.preventDefault()},[G]),onItemLeave:d.useCallback(Z=>{var Q;G(Z)||((Q=A.current)===null||Q===void 0||Q.focus(),O(null))},[G]),onTriggerLeave:d.useCallback(Z=>{G(Z)&&Z.preventDefault()},[G]),pointerGraceTimerRef:ie,onPointerGraceIntentChange:d.useCallback(Z=>{K.current=Z},[])},d.createElement(re,V,d.createElement(Qm,{asChild:!0,trapped:o,onMountAutoFocus:fe(i,Z=>{var Q;Z.preventDefault(),(Q=A.current)===null||Q===void 0||Q.focus()}),onUnmountAutoFocus:s},d.createElement($c,{asChild:!0,disableOutsidePointerEvents:l,onEscapeKeyDown:f,onPointerDownOutside:m,onFocusOutside:p,onInteractOutside:g,onDismiss:y},d.createElement(ok,Y({asChild:!0},C,{dir:_.dir,orientation:"vertical",loop:r,currentTabStopId:k,onCurrentTabStopIdChange:O,onEntryFocus:fe(u,Z=>{_.isUsingKeyboardRef.current||Z.preventDefault()})}),d.createElement(Ym,Y({role:"menu","aria-orientation":"vertical","data-state":dA(E.open),"data-radix-menu-content":"",dir:_.dir},b,S,{ref:I,style:{outline:"none",...S.style},onKeyDown:fe(S.onKeyDown,Z=>{const le=Z.target.closest("[data-radix-menu-content]")===Z.currentTarget,L=Z.ctrlKey||Z.altKey||Z.metaKey,ue=Z.key.length===1;le&&(Z.key==="Tab"&&Z.preventDefault(),!L&&ue&&J(Z.key));const Ne=A.current;if(Z.target!==Ne||!Xq.includes(Z.key))return;Z.preventDefault();const Me=R().filter(me=>!me.disabled).map(me=>me.ref.current);J3.includes(Z.key)&&Me.reverse(),SQ(Me)}),onBlur:fe(e.onBlur,Z=>{Z.currentTarget.contains(Z.target)||(window.clearTimeout(z.current),H.current="")}),onPointerMove:fe(e.onPointerMove,gd(Z=>{const Q=Z.target,le=U.current!==Z.clientX;if(Z.currentTarget.contains(Q)&&le){const L=Z.clientX>U.current?"right":"left";te.current=L,U.current=Z.clientX}}))})))))))}),lQ=d.forwardRef((e,t)=>{const{__scopeMenu:n,...r}=e;return d.createElement(Ae.div,Y({},r,{ref:t}))}),Y1="MenuItem",q2="menu.itemSelect",cb=d.forwardRef((e,t)=>{const{disabled:n=!1,onSelect:r,...o}=e,i=d.useRef(null),s=Xd(Y1,e.__scopeMenu),l=ab(Y1,e.__scopeMenu),u=Ve(t,i),f=d.useRef(!1),m=()=>{const p=i.current;if(!n&&p){const g=new CustomEvent(q2,{bubbles:!0,cancelable:!0});p.addEventListener(q2,y=>r==null?void 0:r(y),{once:!0}),mx(p,g),g.defaultPrevented?f.current=!1:s.onClose()}};return d.createElement(sA,Y({},o,{ref:u,disabled:n,onClick:fe(e.onClick,m),onPointerDown:p=>{var g;(g=e.onPointerDown)===null||g===void 0||g.call(e,p),f.current=!0},onPointerUp:fe(e.onPointerUp,p=>{var g;f.current||(g=p.currentTarget)===null||g===void 0||g.click()}),onKeyDown:fe(e.onKeyDown,p=>{const g=l.searchRef.current!=="";n||g&&p.key===" "||G1.includes(p.key)&&(p.currentTarget.click(),p.preventDefault())})}))}),sA=d.forwardRef((e,t)=>{const{__scopeMenu:n,disabled:r=!1,textValue:o,...i}=e,s=ab(Y1,n),l=tA(n),u=d.useRef(null),f=Ve(t,u),[m,p]=d.useState(!1),[g,y]=d.useState("");return d.useEffect(()=>{const x=u.current;if(x){var S;y(((S=x.textContent)!==null&&S!==void 0?S:"").trim())}},[i.children]),d.createElement(md.ItemSlot,{scope:n,disabled:r,textValue:o??g},d.createElement(ik,Y({asChild:!0},l,{focusable:!r}),d.createElement(Ae.div,Y({role:"menuitem","data-highlighted":m?"":void 0,"aria-disabled":r||void 0,"data-disabled":r?"":void 
0},i,{ref:f,onPointerMove:fe(e.onPointerMove,gd(x=>{r?s.onItemLeave(x):(s.onItemEnter(x),x.defaultPrevented||x.currentTarget.focus())})),onPointerLeave:fe(e.onPointerLeave,gd(x=>s.onItemLeave(x))),onFocus:fe(e.onFocus,()=>p(!0)),onBlur:fe(e.onBlur,()=>p(!1))}))))}),cQ=d.forwardRef((e,t)=>{const{checked:n=!1,onCheckedChange:r,...o}=e;return d.createElement(lA,{scope:e.__scopeMenu,checked:n},d.createElement(cb,Y({role:"menuitemcheckbox","aria-checked":um(n)?"mixed":n},o,{ref:t,"data-state":ub(n),onSelect:fe(o.onSelect,()=>r==null?void 0:r(um(n)?!0:!n),{checkForDefaultPrevented:!1})})))}),uQ="MenuRadioGroup",[Ste,dQ]=Ua(uQ,{value:void 0,onValueChange:()=>{}}),fQ="MenuRadioItem",hQ=d.forwardRef((e,t)=>{const{value:n,...r}=e,o=dQ(fQ,e.__scopeMenu),i=n===o.value;return d.createElement(lA,{scope:e.__scopeMenu,checked:i},d.createElement(cb,Y({role:"menuitemradio","aria-checked":i},r,{ref:t,"data-state":ub(i),onSelect:fe(r.onSelect,()=>{var s;return(s=o.onValueChange)===null||s===void 0?void 0:s.call(o,n)},{checkForDefaultPrevented:!1})})))}),aA="MenuItemIndicator",[lA,pQ]=Ua(aA,{checked:!1}),mQ=d.forwardRef((e,t)=>{const{__scopeMenu:n,forceMount:r,...o}=e,i=pQ(aA,n);return d.createElement(xn,{present:r||um(i.checked)||i.checked===!0},d.createElement(Ae.span,Y({},o,{ref:t,"data-state":ub(i.checked)})))}),gQ=d.forwardRef((e,t)=>{const{__scopeMenu:n,...r}=e;return d.createElement(Ae.div,Y({role:"separator","aria-orientation":"horizontal"},r,{ref:t}))}),cA="MenuSub",[vQ,uA]=Ua(cA),yQ=e=>{const{__scopeMenu:t,children:n,open:r=!1,onOpenChange:o}=e,i=Hs(cA,t),s=bg(t),[l,u]=d.useState(null),[f,m]=d.useState(null),p=Lt(o);return d.useEffect(()=>(i.open===!1&&p(!1),()=>p(!1)),[i.open,p]),d.createElement(Fd,s,d.createElement(nA,{scope:t,open:r,onOpenChange:p,content:f,onContentChange:m},d.createElement(vQ,{scope:t,contentId:tr(),triggerId:tr(),trigger:l,onTriggerChange:u},n)))},$h="MenuSubTrigger",wQ=d.forwardRef((e,t)=>{const n=Hs($h,e.__scopeMenu),r=Xd($h,e.__scopeMenu),o=uA($h,e.__scopeMenu),i=ab($h,e.__scopeMenu),s=d.useRef(null),{pointerGraceTimerRef:l,onPointerGraceIntentChange:u}=i,f={__scopeMenu:e.__scopeMenu},m=d.useCallback(()=>{s.current&&window.clearTimeout(s.current),s.current=null},[]);return d.useEffect(()=>m,[m]),d.useEffect(()=>{const p=l.current;return()=>{window.clearTimeout(p),u(null)}},[l,u]),d.createElement(rA,Y({asChild:!0},f),d.createElement(sA,Y({id:o.triggerId,"aria-haspopup":"menu","aria-expanded":n.open,"aria-controls":o.contentId,"data-state":dA(n.open)},e,{ref:Lm(t,o.onTriggerChange),onClick:p=>{var g;(g=e.onClick)===null||g===void 0||g.call(e,p),!(e.disabled||p.defaultPrevented)&&(p.currentTarget.focus(),n.open||n.onOpenChange(!0))},onPointerMove:fe(e.onPointerMove,gd(p=>{i.onItemEnter(p),!p.defaultPrevented&&!e.disabled&&!n.open&&!s.current&&(i.onPointerGraceIntentChange(null),s.current=window.setTimeout(()=>{n.onOpenChange(!0),m()},100))})),onPointerLeave:fe(e.onPointerLeave,gd(p=>{var g;m();const y=(g=n.content)===null||g===void 0?void 0:g.getBoundingClientRect();if(y){var x;const S=(x=n.content)===null||x===void 0?void 0:x.dataset.side,E=S==="right",_=E?-5:5,b=y[E?"left":"right"],C=y[E?"right":"left"];i.onPointerGraceIntentChange({area:[{x:p.clientX+_,y:p.clientY},{x:b,y:y.top},{x:C,y:y.top},{x:C,y:y.bottom},{x:b,y:y.bottom}],side:S}),window.clearTimeout(l.current),l.current=window.setTimeout(()=>i.onPointerGraceIntentChange(null),300)}else{if(i.onTriggerLeave(p),p.defaultPrevented)return;i.onPointerGraceIntentChange(null)}})),onKeyDown:fe(e.onKeyDown,p=>{const 
g=i.searchRef.current!=="";if(!(e.disabled||g&&p.key===" ")&&Zq[r.dir].includes(p.key)){var y;n.onOpenChange(!0),(y=n.content)===null||y===void 0||y.focus(),p.preventDefault()}})})))}),xQ="MenuSubContent",bQ=d.forwardRef((e,t)=>{const n=iA(_o,e.__scopeMenu),{forceMount:r=n.forceMount,...o}=e,i=Hs(_o,e.__scopeMenu),s=Xd(_o,e.__scopeMenu),l=uA(xQ,e.__scopeMenu),u=d.useRef(null),f=Ve(t,u);return d.createElement(md.Provider,{scope:e.__scopeMenu},d.createElement(xn,{present:r||i.open},d.createElement(md.Slot,{scope:e.__scopeMenu},d.createElement(lb,Y({id:l.contentId,"aria-labelledby":l.triggerId},o,{ref:f,align:"start",side:s.dir==="rtl"?"left":"right",disableOutsidePointerEvents:!1,disableOutsideScroll:!1,trapFocus:!1,onOpenAutoFocus:m=>{var p;s.isUsingKeyboardRef.current&&((p=u.current)===null||p===void 0||p.focus()),m.preventDefault()},onCloseAutoFocus:m=>m.preventDefault(),onFocusOutside:fe(e.onFocusOutside,m=>{m.target!==l.trigger&&i.onOpenChange(!1)}),onEscapeKeyDown:fe(e.onEscapeKeyDown,m=>{s.onClose(),m.preventDefault()}),onKeyDown:fe(e.onKeyDown,m=>{const p=m.currentTarget.contains(m.target),g=qq[s.dir].includes(m.key);if(p&&g){var y;i.onOpenChange(!1),(y=l.trigger)===null||y===void 0||y.focus(),m.preventDefault()}})})))))});function dA(e){return e?"open":"closed"}function um(e){return e==="indeterminate"}function ub(e){return um(e)?"indeterminate":e?"checked":"unchecked"}function SQ(e){const t=document.activeElement;for(const n of e)if(n===t||(n.focus(),document.activeElement!==t))return}function _Q(e,t){return e.map((n,r)=>e[(t+r)%e.length])}function EQ(e,t,n){const o=t.length>1&&Array.from(t).every(f=>f===t[0])?t[0]:t,i=n?e.indexOf(n):-1;let s=_Q(e,Math.max(i,0));o.length===1&&(s=s.filter(f=>f!==n));const u=s.find(f=>f.toLowerCase().startsWith(o.toLowerCase()));return u!==n?u:void 0}function CQ(e,t){const{x:n,y:r}=e;let o=!1;for(let i=0,s=t.length-1;ir!=m>r&&n<(f-l)*(r-u)/(m-u)+l&&(o=!o)}return o}function $Q(e,t){if(!t)return!1;const n={x:e.clientX,y:e.clientY};return CQ(n,t)}function gd(e){return t=>t.pointerType==="mouse"?e(t):void 0}const RQ=tQ,PQ=rA,TQ=rQ,kQ=iQ,AQ=lQ,MQ=cb,OQ=cQ,NQ=hQ,DQ=mQ,IQ=gQ,LQ=yQ,FQ=wQ,jQ=bQ,fA="DropdownMenu",[BQ,_te]=Tn(fA,[eA]),Er=eA(),[zQ,hA]=BQ(fA),UQ=e=>{const{__scopeDropdownMenu:t,children:n,dir:r,open:o,defaultOpen:i,onOpenChange:s,modal:l=!0}=e,u=Er(t),f=d.useRef(null),[m=!1,p]=eo({prop:o,defaultProp:i,onChange:s});return d.createElement(zQ,{scope:t,triggerId:tr(),triggerRef:f,contentId:tr(),open:m,onOpenChange:p,onOpenToggle:d.useCallback(()=>p(g=>!g),[p]),modal:l},d.createElement(RQ,Y({},u,{open:m,onOpenChange:p,dir:r,modal:l}),n))},VQ="DropdownMenuTrigger",WQ=d.forwardRef((e,t)=>{const{__scopeDropdownMenu:n,disabled:r=!1,...o}=e,i=hA(VQ,n),s=Er(n);return d.createElement(PQ,Y({asChild:!0},s),d.createElement(Ae.button,Y({type:"button",id:i.triggerId,"aria-haspopup":"menu","aria-expanded":i.open,"aria-controls":i.open?i.contentId:void 0,"data-state":i.open?"open":"closed","data-disabled":r?"":void 0,disabled:r},o,{ref:Lm(t,i.triggerRef),onPointerDown:fe(e.onPointerDown,l=>{!r&&l.button===0&&l.ctrlKey===!1&&(i.onOpenToggle(),i.open||l.preventDefault())}),onKeyDown:fe(e.onKeyDown,l=>{r||(["Enter"," "].includes(l.key)&&i.onOpenToggle(),l.key==="ArrowDown"&&i.onOpenChange(!0),["Enter"," ","ArrowDown"].includes(l.key)&&l.preventDefault())})})))}),HQ=e=>{const{__scopeDropdownMenu:t,...n}=e,r=Er(t);return 
d.createElement(TQ,Y({},r,n))},KQ="DropdownMenuContent",GQ=d.forwardRef((e,t)=>{const{__scopeDropdownMenu:n,...r}=e,o=hA(KQ,n),i=Er(n),s=d.useRef(!1);return d.createElement(kQ,Y({id:o.contentId,"aria-labelledby":o.triggerId},i,r,{ref:t,onCloseAutoFocus:fe(e.onCloseAutoFocus,l=>{var u;s.current||(u=o.triggerRef.current)===null||u===void 0||u.focus(),s.current=!1,l.preventDefault()}),onInteractOutside:fe(e.onInteractOutside,l=>{const u=l.detail.originalEvent,f=u.button===0&&u.ctrlKey===!0,m=u.button===2||f;(!o.modal||m)&&(s.current=!0)}),style:{...e.style,"--radix-dropdown-menu-content-transform-origin":"var(--radix-popper-transform-origin)","--radix-dropdown-menu-content-available-width":"var(--radix-popper-available-width)","--radix-dropdown-menu-content-available-height":"var(--radix-popper-available-height)","--radix-dropdown-menu-trigger-width":"var(--radix-popper-anchor-width)","--radix-dropdown-menu-trigger-height":"var(--radix-popper-anchor-height)"}}))}),YQ=d.forwardRef((e,t)=>{const{__scopeDropdownMenu:n,...r}=e,o=Er(n);return d.createElement(AQ,Y({},o,r,{ref:t}))}),XQ=d.forwardRef((e,t)=>{const{__scopeDropdownMenu:n,...r}=e,o=Er(n);return d.createElement(MQ,Y({},o,r,{ref:t}))}),ZQ=d.forwardRef((e,t)=>{const{__scopeDropdownMenu:n,...r}=e,o=Er(n);return d.createElement(OQ,Y({},o,r,{ref:t}))}),qQ=d.forwardRef((e,t)=>{const{__scopeDropdownMenu:n,...r}=e,o=Er(n);return d.createElement(NQ,Y({},o,r,{ref:t}))}),QQ=d.forwardRef((e,t)=>{const{__scopeDropdownMenu:n,...r}=e,o=Er(n);return d.createElement(DQ,Y({},o,r,{ref:t}))}),JQ=d.forwardRef((e,t)=>{const{__scopeDropdownMenu:n,...r}=e,o=Er(n);return d.createElement(IQ,Y({},o,r,{ref:t}))}),eJ=e=>{const{__scopeDropdownMenu:t,children:n,open:r,onOpenChange:o,defaultOpen:i}=e,s=Er(t),[l=!1,u]=eo({prop:r,defaultProp:i,onChange:o});return d.createElement(LQ,Y({},s,{open:l,onOpenChange:u}),n)},tJ=d.forwardRef((e,t)=>{const{__scopeDropdownMenu:n,...r}=e,o=Er(n);return d.createElement(FQ,Y({},o,r,{ref:t}))}),nJ=d.forwardRef((e,t)=>{const{__scopeDropdownMenu:n,...r}=e,o=Er(n);return d.createElement(jQ,Y({},o,r,{ref:t,style:{...e.style,"--radix-dropdown-menu-content-transform-origin":"var(--radix-popper-transform-origin)","--radix-dropdown-menu-content-available-width":"var(--radix-popper-available-width)","--radix-dropdown-menu-content-available-height":"var(--radix-popper-available-height)","--radix-dropdown-menu-trigger-width":"var(--radix-popper-anchor-width)","--radix-dropdown-menu-trigger-height":"var(--radix-popper-anchor-height)"}}))}),rJ=UQ,oJ=WQ,iJ=HQ,pA=GQ,mA=YQ,gA=XQ,vA=ZQ,yA=qQ,wA=QQ,xA=JQ,sJ=eJ,bA=tJ,SA=nJ,aJ=rJ,lJ=oJ,Q2=sJ,X1=d.forwardRef(({className:e,inset:t,children:n,...r},o)=>v.jsxs(bA,{ref:o,className:xe("flex cursor-default select-none items-center rounded-sm px-2 py-1.5 text-sm outline-none focus:bg-accent data-[state=open]:bg-accent data-[disabled]:pointer-events-none data-[disabled]:opacity-50",t&&"pl-8",e),...r,children:[n,v.jsx(oB,{className:"ml-auto h-4 w-4"})]}));X1.displayName=bA.displayName;const Z1=d.forwardRef(({className:e,...t},n)=>v.jsx(SA,{ref:n,className:xe("z-50 min-w-[8rem] overflow-hidden rounded-md border bg-popover p-1 text-popover-foreground shadow-lg data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 
data-[side=top]:slide-in-from-bottom-2",e),...t}));Z1.displayName=SA.displayName;const _A=d.forwardRef(({className:e,sideOffset:t=4,...n},r)=>v.jsx(iJ,{children:v.jsx(pA,{ref:r,sideOffset:t,className:xe("z-50 min-w-[8rem] overflow-hidden rounded-md border bg-popover p-1 text-popover-foreground shadow-md","data-[state=open]:animate-in data-[state=closed]:animate-out data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0 data-[state=closed]:zoom-out-95 data-[state=open]:zoom-in-95 data-[side=bottom]:slide-in-from-top-2 data-[side=left]:slide-in-from-right-2 data-[side=right]:slide-in-from-left-2 data-[side=top]:slide-in-from-bottom-2",e),onCloseAutoFocus:o=>o.preventDefault(),...n})}));_A.displayName=pA.displayName;const ml=d.forwardRef(({className:e,inset:t,...n},r)=>v.jsx(gA,{ref:r,className:xe("relative flex cursor-default select-none items-center rounded-sm px-2 py-1.5 text-sm outline-none transition-colors focus:bg-accent focus:text-accent-foreground data-[disabled]:pointer-events-none data-[disabled]:opacity-50",t&&"pl-8",e),...n}));ml.displayName=gA.displayName;const cJ=d.forwardRef(({className:e,children:t,checked:n,...r},o)=>v.jsxs(vA,{ref:o,className:xe("relative flex cursor-default select-none items-center rounded-sm py-1.5 pl-8 pr-2 text-sm outline-none transition-colors focus:bg-accent focus:text-accent-foreground data-[disabled]:pointer-events-none data-[disabled]:opacity-50",e),checked:n,...r,children:[v.jsx("span",{className:"absolute left-2 flex h-3.5 w-3.5 items-center justify-center",children:v.jsx(wA,{children:v.jsx(EP,{className:"h-4 w-4"})})}),t]}));cJ.displayName=vA.displayName;const uJ=d.forwardRef(({className:e,children:t,...n},r)=>v.jsxs(yA,{ref:r,className:xe("relative flex cursor-default select-none items-center rounded-sm py-1.5 pl-8 pr-2 text-sm outline-none transition-colors focus:bg-accent focus:text-accent-foreground data-[disabled]:pointer-events-none data-[disabled]:opacity-50",e),...n,children:[v.jsx("span",{className:"absolute left-2 flex h-3.5 w-3.5 items-center justify-center",children:v.jsx(wA,{children:v.jsx(cB,{className:"h-4 w-4 fill-current"})})}),t]}));uJ.displayName=yA.displayName;const dJ=d.forwardRef(({className:e,inset:t,...n},r)=>v.jsx(mA,{ref:r,className:xe("px-2 py-1.5 text-sm font-semibold",t&&"pl-8",e),...n}));dJ.displayName=mA.displayName;const fJ=d.forwardRef(({className:e,...t},n)=>v.jsx(xA,{ref:n,className:xe("-mx-1 my-1 h-px bg-muted",e),...t}));fJ.displayName=xA.displayName;const J2={RemoveBG:{IconClass:u2,showName:"RemoveBG"},AnimeSeg:{IconClass:u2,showName:"Anime Segmentation"},RealESRGAN:{IconClass:jT,showName:"RealESRGAN"},GFPGAN:{IconClass:d2,showName:"GFPGAN"},RestoreFormer:{IconClass:d2,showName:"RestoreFormer"},InteractiveSeg:{IconClass:mH,showName:"Interactive Segmentation"}},hJ=()=>{const[e,t,n,r,o]=xt(m=>[m.file,m.serverConfig.plugins,m.isPluginRunning,m.updateInteractiveSegState,m.runRenderablePlugin]),i=!e;if(t.length===0)return null;const s=(m,p)=>{p==="InteractiveSeg"?r({isInteractiveSeg:!0}):o(m,p)},l=()=>v.jsxs(Q2,{children:[v.jsx(X1,{disabled:i,children:v.jsxs("div",{className:"flex gap-2 items-center",children:[v.jsx(jT,{}),"RealESRGAN"]})}),v.jsxs(Z1,{children:[v.jsx(ml,{onClick:()=>o(!1,"RealESRGAN",{upscale:2}),children:"upscale 2x"}),v.jsx(ml,{onClick:()=>o(!1,"RealESRGAN",{upscale:4}),children:"upscale 4x"})]})]},"RealESRGAN"),u=m=>{const{IconClass:p,showName:g}=J2[m.name];return v.jsxs(Q2,{children:[v.jsx(X1,{disabled:i,children:v.jsxs("div",{className:"flex gap-2 
items-center",children:[v.jsx(p,{className:"p-1"}),g]})}),v.jsxs(Z1,{children:[v.jsx(ml,{onClick:()=>s(!1,m.name),children:"Remove Background"}),v.jsx(ml,{onClick:()=>s(!0,m.name),children:"Generate Mask"})]})]},m.name)},f=()=>t.map(m=>{const{IconClass:p,showName:g}=J2[m.name];return m.name==="RealESRGAN"?l():m.name==="RemoveBG"||m.name==="AnimeSeg"?u(m):v.jsx(ml,{onClick:()=>s(!1,m.name),disabled:i,children:v.jsxs("div",{className:"flex gap-2 items-center",children:[v.jsx(p,{className:"p-1"}),g]})},m.name)});return v.jsxs(aJ,{modal:!1,children:[v.jsx(lJ,{className:"border rounded-lg z-10 bg-background outline-none",tabIndex:-1,children:v.jsx(vn,{variant:"ghost",size:"icon",asChild:!0,className:"p-1.5",children:n?v.jsx("div",{role:"status",children:v.jsxs("svg",{"aria-hidden":"true",className:"w-5 h-5 animate-spin fill-primary",viewBox:"0 0 100 101",fill:"none",xmlns:"http://www.w3.org/2000/svg",children:[v.jsx("path",{d:"M100 50.5908C100 78.2051 77.6142 100.591 50 100.591C22.3858 100.591 0 78.2051 0 50.5908C0 22.9766 22.3858 0.59082 50 0.59082C77.6142 0.59082 100 22.9766 100 50.5908ZM9.08144 50.5908C9.08144 73.1895 27.4013 91.5094 50 91.5094C72.5987 91.5094 90.9186 73.1895 90.9186 50.5908C90.9186 27.9921 72.5987 9.67226 50 9.67226C27.4013 9.67226 9.08144 27.9921 9.08144 50.5908Z",fill:"currentColor"}),v.jsx("path",{d:"M93.9676 39.0409C96.393 38.4038 97.8624 35.9116 97.0079 33.5539C95.2932 28.8227 92.871 24.3692 89.8167 20.348C85.8452 15.1192 80.8826 10.7238 75.2124 7.41289C69.5422 4.10194 63.2754 1.94025 56.7698 1.05124C51.7666 0.367541 46.6976 0.446843 41.7345 1.27873C39.2613 1.69328 37.813 4.19778 38.4501 6.62326C39.0873 9.04874 41.5694 10.4717 44.0505 10.1071C47.8511 9.54855 51.7191 9.52689 55.5402 10.0491C60.8642 10.7766 65.9928 12.5457 70.6331 15.2552C75.2735 17.9648 79.3347 21.5619 82.5849 25.841C84.9175 28.9121 86.7997 32.2913 88.1811 35.8758C89.083 38.2158 91.5421 39.6781 93.9676 39.0409Z",fill:"currentFill"})]})}):v.jsx(aH,{strokeWidth:1})})}),v.jsx(_A,{side:"bottom",align:"start",children:f()})]})},pJ=Tx,mJ=ST,gJ=kx,EA=d.forwardRef(({className:e,...t},n)=>v.jsx(Bd,{className:xe("fixed inset-0 bg-background/80 backdrop-blur-sm data-[state=closed]:fade-out-0 data-[state=open]:fade-in-0",e),...t,ref:n}));EA.displayName=Bd.displayName;const vJ=Fm("fixed gap-4 bg-background p-6 shadow-lg transition ease-in-out data-[state=closed]:duration-200 data-[state=open]:duration-300",{variants:{side:{top:"inset-x-0 top-0 border-b data-[state=closed]:slide-out-to-top data-[state=open]:slide-in-from-top",bottom:"inset-x-0 bottom-0 border-t data-[state=closed]:slide-out-to-bottom data-[state=open]:slide-in-from-bottom",left:"inset-y-0 left-0 h-full w-3/4 border-r data-[state=closed]:slide-out-to-left data-[state=open]:slide-in-from-left sm:max-w-sm",right:"inset-y-0 right-0 h-full w-3/4 border-l data-[state=closed]:slide-out-to-right data-[state=open]:slide-in-from-right sm:max-w-sm"}},defaultVariants:{side:"right"}}),CA=d.forwardRef(({side:e="right",className:t,children:n,...r},o)=>v.jsxs(gJ,{children:[v.jsx(EA,{}),v.jsx(zd,{ref:o,className:xe(vJ({side:e}),t),...r,children:n})]}));CA.displayName=zd.displayName;const $A=({className:e,...t})=>v.jsx("div",{className:xe("flex flex-col space-y-2 text-center sm:text-left",e),...t});$A.displayName="SheetHeader";const yJ=d.forwardRef(({className:e,...t},n)=>v.jsx(Ud,{ref:n,className:xe("text-lg font-semibold text-foreground",e),...t}));yJ.displayName=Ud.displayName;const wJ=d.forwardRef(({className:e,...t},n)=>v.jsx(Vd,{ref:n,className:xe("text-sm 
text-muted-foreground",e),...t}));wJ.displayName=Vd.displayName;const _t=({children:e})=>v.jsx("div",{className:"flex justify-between items-center pr-4",children:e}),zt=({text:e,toolTip:t="",url:n,htmlFor:r,disabled:o=!1,className:i=""})=>v.jsxs(eT,{children:[v.jsx(tT,{asChild:!0,children:v.jsx(nb,{htmlFor:r||e.toLowerCase().replace(" ","-"),className:xe("font-medium min-w-[65px]",i),disabled:o,children:e})}),t||n?v.jsxs($x,{className:"flex flex-col max-w-xs text-sm",side:"left",children:[v.jsx("p",{children:t}),n?v.jsx(vn,{variant:"link",className:"justify-end",children:v.jsx("a",{href:n,target:"_blank",children:"More info"})}):v.jsx(v.Fragment,{})]}):v.jsx(v.Fragment,{})]}),xJ=()=>{const[e,t]=xt(n=>[n.settings,n.updateSettings]);return v.jsxs("div",{className:"flex flex-col gap-4 mt-4",children:[v.jsxs("div",{className:"flex flex-col gap-1",children:[v.jsx(zt,{htmlFor:"steps",text:"Steps",toolTip:"The number of denoising steps. More denoising steps usually lead to a higher quality image at the expense of slower inference."}),v.jsxs(_t,{children:[v.jsx(po,{className:"w-[180px]",defaultValue:[30],min:1,max:100,step:1,value:[Math.floor(e.ldmSteps)],onValueChange:n=>t({ldmSteps:n[0]})}),v.jsx(No,{id:"steps",className:"w-[50px] rounded-full",numberValue:e.ldmSteps,allowFloat:!1,onNumberValueChange:n=>{t({ldmSteps:n})}})]})]}),v.jsxs(_t,{children:[v.jsx(zt,{text:"Sampler"}),v.jsxs(yo,{value:e.ldmSampler,onValueChange:n=>{t({ldmSampler:n})},children:[v.jsx(Vr,{className:"w-[100px]",children:v.jsx(wo,{placeholder:"Select sampler"})}),v.jsx(Wr,{align:"end",children:v.jsx(zo,{children:Object.values(nx).map(n=>v.jsx(Hr,{value:n,children:n},n))})})]})]})]})},Rh=({text:e,onClick:t})=>{const[n]=xt(r=>[r.settings.showExtender]);return v.jsx(vn,{variant:"outline",className:"p-1 h-8",disabled:!n,onClick:t,children:v.jsx("div",{className:"flex items-center gap-1",children:e})})},bJ=()=>{const[e,t,n,r,o,i,s,l,u,f,m,p,g,y,x]=xt(L=>[L.serverConfig.samplers,L.settings,L.paintByExampleFile,L.getIsProcessing(),L.updateSettings,L.runInpainting,L.updateAppState,L.updateExtenderByBuiltIn,L.updateExtenderDirection,L.adjustMask,L.clearMask,L.updateEnablePowerPaintV2,L.updateEnableBrushNet,L.updateEnableControlnet,L.updateLCMLora]),[S,E]=Nx(n),_=d.useRef(null);FT(_,()=>{_!=null&&_.current&&_.current.blur()});const b=L=>{L.key==="Enter"&&L.ctrlKey&&t.prompt.length!==0&&i()},C=()=>v.jsxs(_t,{children:[v.jsx(zt,{text:"Cropper",toolTip:"Inpainting on part of image, improve inference speed and reduce memory usage."}),v.jsx(jr,{id:"cropper",checked:t.showCropper,onCheckedChange:L=>{o({showCropper:L}),L&&o({showExtender:!1})}})]}),R=()=>{if(!t.model.support_brushnet)return null;let L="BrushNet is a plug-and-play image inpainting model works on any SD1.5 base models.";return v.jsxs("div",{className:"flex flex-col gap-4",children:[v.jsxs("div",{className:"flex flex-col gap-4",children:[v.jsxs(_t,{children:[v.jsx(zt,{text:"BrushNet",url:"https://github.com/TencentARC/BrushNet",toolTip:L}),v.jsx(jr,{id:"brushnet",checked:t.enableBrushNet,onCheckedChange:ue=>{g(ue)}})]}),v.jsx(_t,{children:v.jsxs(yo,{defaultValue:t.brushnetMethod,value:t.brushnetMethod,onValueChange:ue=>{o({brushnetMethod:ue})},disabled:!t.enableBrushNet,children:[v.jsx(Vr,{children:v.jsx(wo,{placeholder:"Select brushnet model"})}),v.jsx(Wr,{align:"end",children:v.jsx(zo,{children:Object.values(t.model.brushnets).map(ue=>v.jsx(Hr,{value:ue,children:ue.split("/")[1]},ue))})})]})})]}),v.jsx(Xt,{})]})},k=()=>{if(!t.model.support_controlnet)return null;let L="Using 
an additional conditioning image to control how an image is generated";return v.jsxs("div",{className:"flex flex-col gap-4",children:[v.jsxs("div",{className:"flex flex-col gap-4",children:[v.jsxs(_t,{children:[v.jsx(zt,{text:"ControlNet",url:"https://huggingface.co/docs/diffusers/main/en/using-diffusers/inpaint#controlnet",toolTip:L}),v.jsx(jr,{id:"controlnet",checked:t.enableControlnet,onCheckedChange:ue=>{y(ue)}})]}),v.jsx("div",{className:"flex flex-col gap-1",children:v.jsxs(_t,{children:[v.jsx(po,{className:"w-[180px]",defaultValue:[100],min:1,max:100,step:1,disabled:!t.enableControlnet,value:[Math.floor(t.controlnetConditioningScale*100)],onValueChange:ue=>o({controlnetConditioningScale:ue[0]/100})}),v.jsx(No,{id:"controlnet-weight",className:"w-[50px] rounded-full",disabled:!t.enableControlnet,numberValue:t.controlnetConditioningScale,allowFloat:!1,onNumberValueChange:ue=>{o({controlnetConditioningScale:ue})}})]})}),v.jsx(_t,{children:v.jsxs(yo,{defaultValue:t.controlnetMethod,value:t.controlnetMethod,onValueChange:ue=>{o({controlnetMethod:ue})},disabled:!t.enableControlnet,children:[v.jsx(Vr,{children:v.jsx(wo,{placeholder:"Select control method"})}),v.jsx(Wr,{align:"end",children:v.jsx(zo,{children:Object.values(t.model.controlnets).map(ue=>v.jsx(Hr,{value:ue,children:ue.split("/")[1]},ue))})})]})})]}),v.jsx(Xt,{})]})},O=()=>{if(!t.model.support_lcm_lora)return null;let L="Enable quality image generation in typically 2-8 steps. Suggest disabling guidance_scale by setting it to 0. You can also try values between 1.0 and 2.0. When LCM Lora is enabled, LCMSampler will be used automatically.";return v.jsxs(v.Fragment,{children:[v.jsxs(_t,{children:[v.jsx(zt,{text:"LCM LoRA",url:"https://huggingface.co/docs/diffusers/main/en/using-diffusers/inference_with_lcm_lora",toolTip:L}),v.jsx(jr,{id:"lcm-lora",checked:t.enableLCMLora,onCheckedChange:ue=>{x(ue)}})]}),v.jsx(Xt,{})]})},A=()=>t.model.need_prompt?v.jsxs("div",{className:"flex flex-col gap-4",children:[v.jsx(zt,{text:"Negative prompt",url:"https://huggingface.co/docs/diffusers/main/en/using-diffusers/inpaint#negative-prompt",toolTip:"Negative prompt guides the model away from generating certain things in an image"}),v.jsx("div",{className:"pl-2 pr-4",children:v.jsx(jx,{ref:_,rows:4,onKeyUp:b,className:"max-h-[8rem] overflow-y-auto mb-2",placeholder:"",id:"negative-prompt",value:t.negativePrompt,onInput:L=>{L.preventDefault(),L.stopPropagation();const ue=L.target;o({negativePrompt:ue.value})}})})]}):null,I=()=>t.model.name!==JR?null:v.jsxs("div",{children:[v.jsxs(_t,{children:[v.jsx(zt,{text:"Example Image",toolTip:"An example image to guide image generation."}),v.jsx(nT,{tooltip:"Upload example image",onFileUpload:L=>{s({paintByExampleFile:L})},children:v.jsx(BT,{})})]}),E?v.jsx("div",{className:"flex justify-center items-center",children:v.jsx("img",{src:S.src,alt:"example",className:"max-w-[200px] max-h-[200px] m-3"})}):v.jsx(v.Fragment,{}),v.jsx(vn,{variant:"default",className:"w-full",disabled:r||!E,onClick:()=>{i()},children:"Paint"})]}),z=()=>t.model.name!==fF?null:v.jsxs("div",{className:"flex flex-col gap-1",children:[v.jsx(zt,{text:"Image guidance scale",toolTip:"Push the generated image towards the inital image. 
Higher image guidance scale encourages generated images that are closely linked to the source image, usually at the expense of lower image quality.",url:"https://huggingface.co/docs/diffusers/main/en/api/pipelines/pix2pix"}),v.jsxs(_t,{children:[v.jsx(po,{className:"w-[180px]",defaultValue:[150],min:100,max:1e3,step:1,value:[Math.floor(t.p2pImageGuidanceScale*100)],onValueChange:L=>o({p2pImageGuidanceScale:L[0]/100})}),v.jsx(No,{id:"image-guidance-scale",className:"w-[50px] rounded-full",numberValue:t.p2pImageGuidanceScale,allowFloat:!0,onNumberValueChange:L=>{o({p2pImageGuidanceScale:L})}})]})]}),H=()=>{if(!t.model.support_strength)return null;let L="Strength is a measure of how much noise is added to the base image, which influences how similar the output is to the base image. Higher value means more noise and more different from the base image";return v.jsxs(_t,{children:[v.jsx(zt,{text:"Strength",url:"https://huggingface.co/docs/diffusers/main/en/using-diffusers/inpaint#strength",toolTip:L}),v.jsx(po,{className:"w-[110px]",defaultValue:[100],min:10,max:100,step:1,value:[Math.floor(t.sdStrength*100)],onValueChange:ue=>o({sdStrength:ue[0]/100})}),v.jsx(No,{id:"strength",className:"w-[50px] rounded-full",numberValue:t.sdStrength,allowFloat:!0,onNumberValueChange:ue=>{o({sdStrength:ue})}})]})},ie=()=>t.model.support_outpainting?v.jsxs(v.Fragment,{children:[v.jsxs("div",{className:"flex flex-col gap-2",children:[v.jsxs(_t,{children:[v.jsx(zt,{text:"Extender",toolTip:"Perform outpainting on images to expand it's content."}),v.jsx(jr,{id:"extender",checked:t.showExtender,onCheckedChange:L=>{o({showExtender:L}),L&&o({showCropper:!1})}})]}),v.jsxs(_t,{children:[v.jsxs(yo,{defaultValue:t.extenderDirection,value:t.extenderDirection,onValueChange:L=>{u(L)},children:[v.jsx(Vr,{className:"w-[65px] h-7",disabled:!t.showExtender,children:v.jsx(wo,{placeholder:"Select axis"})}),v.jsx(Wr,{align:"end",children:v.jsx(zo,{children:Object.values(Ln).map(L=>v.jsx(Hr,{value:L,children:L},L))})})]}),v.jsxs("div",{className:"flex gap-1 justify-center mt-0",children:[v.jsx(Rh,{text:"1.25x",onClick:()=>l(t.extenderDirection,1.25)}),v.jsx(Rh,{text:"1.5x",onClick:()=>l(t.extenderDirection,1.5)}),v.jsx(Rh,{text:"1.75x",onClick:()=>l(t.extenderDirection,1.75)}),v.jsx(Rh,{text:"2.0x",onClick:()=>l(t.extenderDirection,2)})]})]})]}),v.jsx(Xt,{})]}):null,K=()=>v.jsxs(_t,{children:[v.jsx(zt,{text:"Task",toolTip:"PowerPaint task. When using extender, image-outpainting task will be auto used. For object-removal and image-outpainting, it is recommended to set the guidance_scale at 10 or above."}),v.jsxs(yo,{defaultValue:t.powerpaintTask,value:t.powerpaintTask,onValueChange:L=>{o({powerpaintTask:L})},disabled:t.showExtender,children:[v.jsx(Vr,{className:"w-[130px]",children:v.jsx(wo,{placeholder:"Select task"})}),v.jsx(Wr,{align:"end",children:v.jsx(zo,{children:[ma.text_guided,ma.object_remove,ma.context_aware,ma.shape_guided].map(L=>v.jsx(Hr,{value:L,children:L},L))})})]})]}),te=()=>t.model.name!==hF?null:v.jsxs(v.Fragment,{children:[K(),v.jsx(Xt,{})]}),U=()=>t.model.support_powerpaint_v2===!1?null:v.jsxs(v.Fragment,{children:[v.jsxs(_t,{children:[v.jsx(zt,{text:"PowerPaint V2",toolTip:"PowerPaint is a plug-and-play image inpainting model works on any SD1.5 base models."}),v.jsx(jr,{id:"powerpaint-v2",checked:t.enablePowerPaintV2,onCheckedChange:L=>{p(L)}})]}),K(),v.jsx(Xt,{})]}),re=()=>v.jsxs(_t,{children:[v.jsx(zt,{htmlFor:"steps",text:"Steps",toolTip:"The number of denoising steps. 
More denoising steps usually lead to a higher quality image at the expense of slower inference."}),v.jsx(po,{className:"w-[110px]",defaultValue:[30],min:1,max:100,step:1,value:[Math.floor(t.sdSteps)],onValueChange:L=>o({sdSteps:L[0]})}),v.jsx(No,{id:"steps",className:"w-[50px] rounded-full",numberValue:t.sdSteps,allowFloat:!1,onNumberValueChange:L=>{o({sdSteps:L})}})]}),V=()=>v.jsxs(_t,{children:[v.jsx(zt,{text:"Guidance",url:"https://huggingface.co/docs/diffusers/main/en/using-diffusers/inpaint#guidance-scale",toolTip:"Guidance scale affects how aligned the text prompt and generated image are. Higher value means the prompt and generated image are closely aligned, so the output is a stricter interpretation of the prompt"}),v.jsx(po,{className:"w-[110px]",defaultValue:[750],min:0,max:1500,step:1,value:[Math.floor(t.sdGuidanceScale*100)],onValueChange:L=>o({sdGuidanceScale:L[0]/100})}),v.jsx(No,{id:"guid",className:"w-[50px] rounded-full",numberValue:t.sdGuidanceScale,allowFloat:!0,onNumberValueChange:L=>{o({sdGuidanceScale:L})}})]}),J=()=>t.model.name===pF?null:v.jsxs(_t,{children:[v.jsx(zt,{text:"Sampler"}),v.jsxs(yo,{defaultValue:t.sdSampler,value:t.sdSampler,onValueChange:L=>{o({sdSampler:L})},children:[v.jsx(Vr,{className:"w-[175px] text-xs",children:v.jsx(wo,{placeholder:"Select sampler"})}),v.jsx(Wr,{align:"end",children:v.jsx(zo,{children:e.map(L=>v.jsx(Hr,{value:L,className:"text-xs",children:L},L))})})]})]}),G=()=>v.jsxs(_t,{children:[v.jsx(zt,{text:"Seed",toolTip:"Using same parameters and a fixed seed can generate same result image."}),v.jsxs("div",{className:"flex gap-2 justify-center items-center",children:[v.jsx(jr,{id:"seed",checked:t.seedFixed,onCheckedChange:L=>{o({seedFixed:L})}}),v.jsx(No,{id:"seed",className:"w-[110px]",disabled:!t.seedFixed,numberValue:t.seed,allowFloat:!1,onNumberValueChange:L=>{o({seed:L})}})]})]}),Z=()=>v.jsxs(v.Fragment,{children:[v.jsxs(_t,{children:[v.jsx(zt,{text:"Mask blur",toolTip:"How much to blur the mask before processing, in pixels. Make the generated inpainting boundaries appear more natural."}),v.jsx(po,{className:"w-[110px]",defaultValue:[t.sdMaskBlur],min:0,max:96,step:1,value:[Math.floor(t.sdMaskBlur)],onValueChange:L=>o({sdMaskBlur:L[0]})}),v.jsx(No,{id:"mask-blur",className:"w-[50px] rounded-full",numberValue:t.sdMaskBlur,allowFloat:!1,onNumberValueChange:L=>{o({sdMaskBlur:L})}})]}),v.jsx(Xt,{})]}),Q=()=>v.jsxs(v.Fragment,{children:[v.jsxs(_t,{children:[v.jsx(zt,{text:"Match histograms",toolTip:"Match the inpainting result histogram to the source image histogram",url:"https://github.com/Sanster/lama-cleaner/pull/143#issuecomment-1325859307"}),v.jsx(jr,{id:"match-histograms",checked:t.sdMatchHistograms,onCheckedChange:L=>{o({sdMatchHistograms:L})}})]}),v.jsx(Xt,{})]}),le=()=>v.jsxs(v.Fragment,{children:[v.jsxs("div",{className:"flex flex-col gap-2",children:[v.jsxs(_t,{children:[v.jsx(zt,{htmlFor:"adjustMaskKernelSize",text:"Mask OP",toolTip:"Expand or shrink mask. 
Using the slider to adjust the kernel size for dilation or erosion."}),v.jsx(po,{className:"w-[110px]",defaultValue:[12],min:1,max:100,step:1,value:[Math.floor(t.adjustMaskKernelSize)],onValueChange:L=>o({adjustMaskKernelSize:L[0]})}),v.jsx(No,{id:"adjustMaskKernelSize",className:"w-[50px] rounded-full",numberValue:t.adjustMaskKernelSize,allowFloat:!1,onNumberValueChange:L=>{o({adjustMaskKernelSize:L})}})]}),v.jsxs(_t,{children:[v.jsx(vn,{variant:"outline",className:"p-1 h-8",onClick:()=>f("expand"),disabled:r,children:v.jsx("div",{className:"flex items-center gap-1 select-none",children:"Expand"})}),v.jsx(vn,{variant:"outline",className:"p-1 h-8",onClick:()=>f("shrink"),disabled:r,children:v.jsx("div",{className:"flex items-center gap-1 select-none",children:"Shrink"})}),v.jsx(vn,{variant:"outline",className:"p-1 h-8",onClick:()=>f("reverse"),disabled:r,children:v.jsx("div",{className:"flex items-center gap-1 select-none",children:"Reverse"})}),v.jsx(vn,{variant:"outline",className:"p-1 h-8 justify-self-end",onClick:m,disabled:r,children:v.jsx("div",{className:"flex items-center gap-1 select-none",children:"Clear"})})]})]}),v.jsx(Xt,{})]});return v.jsxs("div",{className:"flex flex-col gap-[14px] mt-4",children:[C(),ie(),Z(),le(),Q(),te(),re(),V(),z(),H(),J(),G(),A(),v.jsx(Xt,{}),R(),U(),k(),O(),I()]})},SJ=()=>{const[e,t]=xt(n=>[n.settings,n.updateSettings]);return v.jsxs("div",{className:"flex flex-col gap-4 mt-4",children:[v.jsxs(_t,{children:[v.jsx(zt,{text:"CV2 Flag",url:"https://docs.opencv.org/4.8.0/d7/d8b/group__photo__inpaint.html#gga8002a65f5a3328fbf15df81b842d3c3ca892824c38e258feb5e72f308a358d52e"}),v.jsxs(yo,{value:e.cv2Flag,onValueChange:n=>{t({cv2Flag:n})},children:[v.jsx(Vr,{className:"w-[160px]",children:v.jsx(wo,{placeholder:"Select flag"})}),v.jsx(Wr,{align:"end",children:v.jsx(zo,{children:Object.values(rx).map(n=>v.jsx(Hr,{value:n,children:n},n))})})]})]}),v.jsx(zt,{text:"CV2 Radius",url:"https://docs.opencv.org/4.8.0/d7/d8b/group__photo__inpaint.html#gga8002a65f5a3328fbf15df81b842d3c3ca892824c38e258feb5e72f308a358d52e"}),v.jsxs(_t,{children:[v.jsx(po,{className:"w-[180px]",defaultValue:[5],min:1,max:100,step:1,value:[Math.floor(e.cv2Radius)],onValueChange:n=>t({cv2Radius:n[0]})}),v.jsx(No,{id:"cv2-radius",className:"w-[50px] rounded-full",numberValue:e.cv2Radius,allowFloat:!1,onNumberValueChange:n=>{t({cv2Radius:n})}})]})]})},_J=()=>{const[e,t]=xt(i=>[i.settings,i.windowSize]),[n,r]=Fx(!0);if(Yn("c",()=>{r()}),e.model.name!==fC&&e.model.name!==hC&&e.model.model_type===ms)return null;const o=()=>e.model.name===fC?v.jsx(xJ,{}):e.model.name===hC?v.jsx(SJ,{}):v.jsx(bJ,{});return v.jsxs(pJ,{open:n,modal:!1,children:[v.jsx(mJ,{tabIndex:-1,className:"z-10 outline-none absolute top-[68px] right-6 rounded-lg border bg-background",hidden:n,children:v.jsx(vn,{variant:"ghost",size:"icon",asChild:!0,className:"p-1.5",onClick:r,children:v.jsx(lH,{strokeWidth:1})})}),v.jsxs(CA,{side:"right",className:"w-[286px] mt-[60px] outline-none pl-3 pr-1",onOpenAutoFocus:i=>i.preventDefault(),onPointerDownOutside:i=>i.preventDefault(),children:[v.jsxs($A,{children:[v.jsxs(_t,{children:[v.jsx("div",{className:"overflow-hidden mr-8",children:e.model.name.split("/")[e.model.name.split("/").length-1]}),v.jsx(vn,{variant:"ghost",size:"icon",className:"border h-6 
w-6",onClick:r,children:v.jsx(cH,{strokeWidth:1})})]}),v.jsx(Xt,{})]}),v.jsx(fg,{style:{height:t.height-160},children:o()})]})]})},ti=Object.create(null);ti.open="0";ti.close="1";ti.ping="2";ti.pong="3";ti.message="4";ti.upgrade="5";ti.noop="6";const ap=Object.create(null);Object.keys(ti).forEach(e=>{ap[ti[e]]=e});const q1={type:"error",data:"parser error"},RA=typeof Blob=="function"||typeof Blob<"u"&&Object.prototype.toString.call(Blob)==="[object BlobConstructor]",PA=typeof ArrayBuffer=="function",TA=e=>typeof ArrayBuffer.isView=="function"?ArrayBuffer.isView(e):e&&e.buffer instanceof ArrayBuffer,db=({type:e,data:t},n,r)=>RA&&t instanceof Blob?n?r(t):e$(t,r):PA&&(t instanceof ArrayBuffer||TA(t))?n?r(t):e$(new Blob([t]),r):r(ti[e]+(t||"")),e$=(e,t)=>{const n=new FileReader;return n.onload=function(){const r=n.result.split(",")[1];t("b"+(r||""))},n.readAsDataURL(e)};function t$(e){return e instanceof Uint8Array?e:e instanceof ArrayBuffer?new Uint8Array(e):new Uint8Array(e.buffer,e.byteOffset,e.byteLength)}let K0;function EJ(e,t){if(RA&&e.data instanceof Blob)return e.data.arrayBuffer().then(t$).then(t);if(PA&&(e.data instanceof ArrayBuffer||TA(e.data)))return t(t$(e.data));db(e,!1,n=>{K0||(K0=new TextEncoder),t(K0.encode(n))})}const n$="ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/",_u=typeof Uint8Array>"u"?[]:new Uint8Array(256);for(let e=0;e{let t=e.length*.75,n=e.length,r,o=0,i,s,l,u;e[e.length-1]==="="&&(t--,e[e.length-2]==="="&&t--);const f=new ArrayBuffer(t),m=new Uint8Array(f);for(r=0;r>4,m[o++]=(s&15)<<4|l>>2,m[o++]=(l&3)<<6|u&63;return f},$J=typeof ArrayBuffer=="function",fb=(e,t)=>{if(typeof e!="string")return{type:"message",data:kA(e,t)};const n=e.charAt(0);return n==="b"?{type:"message",data:RJ(e.substring(1),t)}:ap[n]?e.length>1?{type:ap[n],data:e.substring(1)}:{type:ap[n]}:q1},RJ=(e,t)=>{if($J){const n=CJ(e);return kA(n,t)}else return{base64:!0,data:e}},kA=(e,t)=>{switch(t){case"blob":return e instanceof Blob?e:new Blob([e]);case"arraybuffer":default:return e instanceof ArrayBuffer?e:e.buffer}},AA="",PJ=(e,t)=>{const n=e.length,r=new Array(n);let o=0;e.forEach((i,s)=>{db(i,!1,l=>{r[s]=l,++o===n&&t(r.join(AA))})})},TJ=(e,t)=>{const n=e.split(AA),r=[];for(let o=0;o{const r=n.length;let o;if(r<126)o=new Uint8Array(1),new DataView(o.buffer).setUint8(0,r);else if(r<65536){o=new Uint8Array(3);const i=new DataView(o.buffer);i.setUint8(0,126),i.setUint16(1,r)}else{o=new Uint8Array(9);const i=new DataView(o.buffer);i.setUint8(0,127),i.setBigUint64(1,BigInt(r))}e.data&&typeof e.data!="string"&&(o[0]|=128),t.enqueue(o),t.enqueue(n)})}})}let G0;function Ph(e){return e.reduce((t,n)=>t+n.length,0)}function Th(e,t){if(e[0].length===t)return e.shift();const n=new Uint8Array(t);let r=0;for(let o=0;oMath.pow(2,21)-1){l.enqueue(q1);break}o=m*Math.pow(2,32)+f.getUint32(4),r=3}else{if(Ph(n)e){l.enqueue(q1);break}}}})}const MA=4;function qt(e){if(e)return MJ(e)}function MJ(e){for(var t in qt.prototype)e[t]=qt.prototype[t];return e}qt.prototype.on=qt.prototype.addEventListener=function(e,t){return this._callbacks=this._callbacks||{},(this._callbacks["$"+e]=this._callbacks["$"+e]||[]).push(t),this};qt.prototype.once=function(e,t){function n(){this.off(e,n),t.apply(this,arguments)}return n.fn=t,this.on(e,n),this};qt.prototype.off=qt.prototype.removeListener=qt.prototype.removeAllListeners=qt.prototype.removeEventListener=function(e,t){if(this._callbacks=this._callbacks||{},arguments.length==0)return this._callbacks={},this;var n=this._callbacks["$"+e];if(!n)return 
this;if(arguments.length==1)return delete this._callbacks["$"+e],this;for(var r,o=0;o(e.hasOwnProperty(r)&&(n[r]=e[r]),n),{})}const OJ=Ur.setTimeout,NJ=Ur.clearTimeout;function Sg(e,t){t.useNativeTimers?(e.setTimeoutFn=OJ.bind(Ur),e.clearTimeoutFn=NJ.bind(Ur)):(e.setTimeoutFn=Ur.setTimeout.bind(Ur),e.clearTimeoutFn=Ur.clearTimeout.bind(Ur))}const DJ=1.33;function IJ(e){return typeof e=="string"?LJ(e):Math.ceil((e.byteLength||e.size)*DJ)}function LJ(e){let t=0,n=0;for(let r=0,o=e.length;r=57344?n+=3:(r++,n+=4);return n}function FJ(e){let t="";for(let n in e)e.hasOwnProperty(n)&&(t.length&&(t+="&"),t+=encodeURIComponent(n)+"="+encodeURIComponent(e[n]));return t}function jJ(e){let t={},n=e.split("&");for(let r=0,o=n.length;r0);return t}function DA(){const e=i$(+new Date);return e!==o$?(r$=0,o$=e):e+"."+i$(r$++)}for(;kh{this.readyState="paused",t()};if(this.polling||!this.writable){let r=0;this.polling&&(r++,this.once("pollComplete",function(){--r||n()})),this.writable||(r++,this.once("drain",function(){--r||n()}))}else n()}poll(){this.polling=!0,this.doPoll(),this.emitReserved("poll")}onData(t){const n=r=>{if(this.readyState==="opening"&&r.type==="open"&&this.onOpen(),r.type==="close")return this.onClose({description:"transport closed by the server"}),!1;this.onPacket(r)};TJ(t,this.socket.binaryType).forEach(n),this.readyState!=="closed"&&(this.polling=!1,this.emitReserved("pollComplete"),this.readyState==="open"&&this.poll())}doClose(){const t=()=>{this.write([{type:"close"}])};this.readyState==="open"?t():this.once("open",t)}write(t){this.writable=!1,PJ(t,n=>{this.doWrite(n,()=>{this.writable=!0,this.emitReserved("drain")})})}uri(){const t=this.opts.secure?"https":"http",n=this.query||{};return this.opts.timestampRequests!==!1&&(n[this.opts.timestampParam]=DA()),!this.supportsBinary&&!n.sid&&(n.b64=1),this.createUri(t,n)}request(t={}){return Object.assign(t,{xd:this.xd,cookieJar:this.cookieJar},this.opts),new qo(this.uri(),t)}doWrite(t,n){const r=this.request({method:"POST",data:t});r.on("success",n),r.on("error",(o,i)=>{this.onError("xhr post error",o,i)})}doPoll(){const t=this.request();t.on("data",this.onData.bind(this)),t.on("error",(n,r)=>{this.onError("xhr poll error",n,r)}),this.pollXhr=t}}class qo extends qt{constructor(t,n){super(),Sg(this,n),this.opts=n,this.method=n.method||"GET",this.uri=t,this.data=n.data!==void 0?n.data:null,this.create()}create(){var t;const n=OA(this.opts,"agent","pfx","key","passphrase","cert","ca","ciphers","rejectUnauthorized","autoUnref");n.xdomain=!!this.opts.xd;const r=this.xhr=new LA(n);try{r.open(this.method,this.uri,!0);try{if(this.opts.extraHeaders){r.setDisableHeaderCheck&&r.setDisableHeaderCheck(!0);for(let o in this.opts.extraHeaders)this.opts.extraHeaders.hasOwnProperty(o)&&r.setRequestHeader(o,this.opts.extraHeaders[o])}}catch{}if(this.method==="POST")try{r.setRequestHeader("Content-type","text/plain;charset=UTF-8")}catch{}try{r.setRequestHeader("Accept","*/*")}catch{}(t=this.opts.cookieJar)===null||t===void 0||t.addCookies(r),"withCredentials"in r&&(r.withCredentials=this.opts.withCredentials),this.opts.requestTimeout&&(r.timeout=this.opts.requestTimeout),r.onreadystatechange=()=>{var o;r.readyState===3&&((o=this.opts.cookieJar)===null||o===void 0||o.parseCookies(r)),r.readyState===4&&(r.status===200||r.status===1223?this.onLoad():this.setTimeoutFn(()=>{this.onError(typeof r.status=="number"?r.status:0)},0))},r.send(this.data)}catch(o){this.setTimeoutFn(()=>{this.onError(o)},0);return}typeof 
document<"u"&&(this.index=qo.requestsCount++,qo.requests[this.index]=this)}onError(t){this.emitReserved("error",t,this.xhr),this.cleanup(!0)}cleanup(t){if(!(typeof this.xhr>"u"||this.xhr===null)){if(this.xhr.onreadystatechange=VJ,t)try{this.xhr.abort()}catch{}typeof document<"u"&&delete qo.requests[this.index],this.xhr=null}}onLoad(){const t=this.xhr.responseText;t!==null&&(this.emitReserved("data",t),this.emitReserved("success"),this.cleanup())}abort(){this.cleanup()}}qo.requestsCount=0;qo.requests={};if(typeof document<"u"){if(typeof attachEvent=="function")attachEvent("onunload",s$);else if(typeof addEventListener=="function"){const e="onpagehide"in Ur?"pagehide":"unload";addEventListener(e,s$,!1)}}function s$(){for(let e in qo.requests)qo.requests.hasOwnProperty(e)&&qo.requests[e].abort()}const pb=typeof Promise=="function"&&typeof Promise.resolve=="function"?t=>Promise.resolve().then(t):(t,n)=>n(t,0),Ah=Ur.WebSocket||Ur.MozWebSocket,a$=!0,KJ="arraybuffer",l$=typeof navigator<"u"&&typeof navigator.product=="string"&&navigator.product.toLowerCase()==="reactnative";class GJ extends hb{constructor(t){super(t),this.supportsBinary=!t.forceBase64}get name(){return"websocket"}doOpen(){if(!this.check())return;const t=this.uri(),n=this.opts.protocols,r=l$?{}:OA(this.opts,"agent","perMessageDeflate","pfx","key","passphrase","cert","ca","ciphers","rejectUnauthorized","localAddress","protocolVersion","origin","maxPayload","family","checkServerIdentity");this.opts.extraHeaders&&(r.headers=this.opts.extraHeaders);try{this.ws=a$&&!l$?n?new Ah(t,n):new Ah(t):new Ah(t,n,r)}catch(o){return this.emitReserved("error",o)}this.ws.binaryType=this.socket.binaryType,this.addEventListeners()}addEventListeners(){this.ws.onopen=()=>{this.opts.autoUnref&&this.ws._socket.unref(),this.onOpen()},this.ws.onclose=t=>this.onClose({description:"websocket connection closed",context:t}),this.ws.onmessage=t=>this.onData(t.data),this.ws.onerror=t=>this.onError("websocket error",t)}write(t){this.writable=!1;for(let n=0;n{const s={};try{a$&&this.ws.send(i)}catch{}o&&pb(()=>{this.writable=!0,this.emitReserved("drain")},this.setTimeoutFn)})}}doClose(){typeof this.ws<"u"&&(this.ws.close(),this.ws=null)}uri(){const t=this.opts.secure?"wss":"ws",n=this.query||{};return this.opts.timestampRequests&&(n[this.opts.timestampParam]=DA()),this.supportsBinary||(n.b64=1),this.createUri(t,n)}check(){return!!Ah}}class YJ extends hb{get name(){return"webtransport"}doOpen(){typeof WebTransport=="function"&&(this.transport=new WebTransport(this.createUri("https"),this.opts.transportOptions[this.name]),this.transport.closed.then(()=>{this.onClose()}).catch(t=>{this.onError("webtransport error",t)}),this.transport.ready.then(()=>{this.transport.createBidirectionalStream().then(t=>{const n=AJ(Number.MAX_SAFE_INTEGER,this.socket.binaryType),r=t.readable.pipeThrough(n).getReader(),o=kJ();o.readable.pipeTo(t.writable),this.writer=o.writable.getWriter();const i=()=>{r.read().then(({done:l,value:u})=>{l||(this.onPacket(u),i())}).catch(l=>{})};i();const s={type:"open"};this.query.sid&&(s.data=`{"sid":"${this.query.sid}"}`),this.writer.write(s).then(()=>this.onOpen())})}))}write(t){this.writable=!1;for(let n=0;n{o&&pb(()=>{this.writable=!0,this.emitReserved("drain")},this.setTimeoutFn)})}}doClose(){var t;(t=this.transport)===null||t===void 0||t.close()}}const 
XJ={websocket:GJ,webtransport:YJ,polling:HJ},ZJ=/^(?:(?![^:@\/?#]+:[^:@\/]*@)(http|https|ws|wss):\/\/)?((?:(([^:@\/?#]*)(?::([^:@\/?#]*))?)?@)?((?:[a-f0-9]{0,4}:){2,7}[a-f0-9]{0,4}|[^:\/?#]*)(?::(\d*))?)(((\/(?:[^?#](?![^?#\/]*\.[^?#\/.]+(?:[?#]|$)))*\/?)?([^?#\/]*))(?:\?([^#]*))?(?:#(.*))?)/,qJ=["source","protocol","authority","userInfo","user","password","host","port","relative","path","directory","file","query","anchor"];function J1(e){if(e.length>2e3)throw"URI too long";const t=e,n=e.indexOf("["),r=e.indexOf("]");n!=-1&&r!=-1&&(e=e.substring(0,n)+e.substring(n,r).replace(/:/g,";")+e.substring(r,e.length));let o=ZJ.exec(e||""),i={},s=14;for(;s--;)i[qJ[s]]=o[s]||"";return n!=-1&&r!=-1&&(i.source=t,i.host=i.host.substring(1,i.host.length-1).replace(/;/g,":"),i.authority=i.authority.replace("[","").replace("]","").replace(/;/g,":"),i.ipv6uri=!0),i.pathNames=QJ(i,i.path),i.queryKey=JJ(i,i.query),i}function QJ(e,t){const n=/\/{2,9}/g,r=t.replace(n,"/").split("/");return(t.slice(0,1)=="/"||t.length===0)&&r.splice(0,1),t.slice(-1)=="/"&&r.splice(r.length-1,1),r}function JJ(e,t){const n={};return t.replace(/(?:^|&)([^&=]*)=?([^&]*)/g,function(r,o,i){o&&(n[o]=i)}),n}let FA=class gl extends qt{constructor(t,n={}){super(),this.binaryType=KJ,this.writeBuffer=[],t&&typeof t=="object"&&(n=t,t=null),t?(t=J1(t),n.hostname=t.host,n.secure=t.protocol==="https"||t.protocol==="wss",n.port=t.port,t.query&&(n.query=t.query)):n.host&&(n.hostname=J1(n.host).host),Sg(this,n),this.secure=n.secure!=null?n.secure:typeof location<"u"&&location.protocol==="https:",n.hostname&&!n.port&&(n.port=this.secure?"443":"80"),this.hostname=n.hostname||(typeof location<"u"?location.hostname:"localhost"),this.port=n.port||(typeof location<"u"&&location.port?location.port:this.secure?"443":"80"),this.transports=n.transports||["polling","websocket","webtransport"],this.writeBuffer=[],this.prevBufferLen=0,this.opts=Object.assign({path:"/engine.io",agent:!1,withCredentials:!1,upgrade:!0,timestampParam:"t",rememberUpgrade:!1,addTrailingSlash:!0,rejectUnauthorized:!0,perMessageDeflate:{threshold:1024},transportOptions:{},closeOnBeforeunload:!1},n),this.opts.path=this.opts.path.replace(/\/$/,"")+(this.opts.addTrailingSlash?"/":""),typeof this.opts.query=="string"&&(this.opts.query=jJ(this.opts.query)),this.id=null,this.upgrades=null,this.pingInterval=null,this.pingTimeout=null,this.pingTimeoutTimer=null,typeof addEventListener=="function"&&(this.opts.closeOnBeforeunload&&(this.beforeunloadEventListener=()=>{this.transport&&(this.transport.removeAllListeners(),this.transport.close())},addEventListener("beforeunload",this.beforeunloadEventListener,!1)),this.hostname!=="localhost"&&(this.offlineEventListener=()=>{this.onClose("transport close",{description:"network connection lost"})},addEventListener("offline",this.offlineEventListener,!1))),this.open()}createTransport(t){const n=Object.assign({},this.opts.query);n.EIO=MA,n.transport=t,this.id&&(n.sid=this.id);const r=Object.assign({},this.opts,{query:n,socket:this,hostname:this.hostname,secure:this.secure,port:this.port},this.opts.transportOptions[t]);return new XJ[t](r)}open(){let t;if(this.opts.rememberUpgrade&&gl.priorWebsocketSuccess&&this.transports.indexOf("websocket")!==-1)t="websocket";else if(this.transports.length===0){this.setTimeoutFn(()=>{this.emitReserved("error","No transports available")},0);return}else 
t=this.transports[0];this.readyState="opening";try{t=this.createTransport(t)}catch{this.transports.shift(),this.open();return}t.open(),this.setTransport(t)}setTransport(t){this.transport&&this.transport.removeAllListeners(),this.transport=t,t.on("drain",this.onDrain.bind(this)).on("packet",this.onPacket.bind(this)).on("error",this.onError.bind(this)).on("close",n=>this.onClose("transport close",n))}probe(t){let n=this.createTransport(t),r=!1;gl.priorWebsocketSuccess=!1;const o=()=>{r||(n.send([{type:"ping",data:"probe"}]),n.once("packet",p=>{if(!r)if(p.type==="pong"&&p.data==="probe"){if(this.upgrading=!0,this.emitReserved("upgrading",n),!n)return;gl.priorWebsocketSuccess=n.name==="websocket",this.transport.pause(()=>{r||this.readyState!=="closed"&&(m(),this.setTransport(n),n.send([{type:"upgrade"}]),this.emitReserved("upgrade",n),n=null,this.upgrading=!1,this.flush())})}else{const g=new Error("probe error");g.transport=n.name,this.emitReserved("upgradeError",g)}}))};function i(){r||(r=!0,m(),n.close(),n=null)}const s=p=>{const g=new Error("probe error: "+p);g.transport=n.name,i(),this.emitReserved("upgradeError",g)};function l(){s("transport closed")}function u(){s("socket closed")}function f(p){n&&p.name!==n.name&&i()}const m=()=>{n.removeListener("open",o),n.removeListener("error",s),n.removeListener("close",l),this.off("close",u),this.off("upgrading",f)};n.once("open",o),n.once("error",s),n.once("close",l),this.once("close",u),this.once("upgrading",f),this.upgrades.indexOf("webtransport")!==-1&&t!=="webtransport"?this.setTimeoutFn(()=>{r||n.open()},200):n.open()}onOpen(){if(this.readyState="open",gl.priorWebsocketSuccess=this.transport.name==="websocket",this.emitReserved("open"),this.flush(),this.readyState==="open"&&this.opts.upgrade){let t=0;const n=this.upgrades.length;for(;t{this.onClose("ping timeout")},this.pingInterval+this.pingTimeout),this.opts.autoUnref&&this.pingTimeoutTimer.unref()}onDrain(){this.writeBuffer.splice(0,this.prevBufferLen),this.prevBufferLen=0,this.writeBuffer.length===0?this.emitReserved("drain"):this.flush()}flush(){if(this.readyState!=="closed"&&this.transport.writable&&!this.upgrading&&this.writeBuffer.length){const t=this.getWritablePackets();this.transport.send(t),this.prevBufferLen=t.length,this.emitReserved("flush")}}getWritablePackets(){if(!(this.maxPayload&&this.transport.name==="polling"&&this.writeBuffer.length>1))return this.writeBuffer;let n=1;for(let r=0;r0&&n>this.maxPayload)return this.writeBuffer.slice(0,r);n+=2}return this.writeBuffer}write(t,n,r){return this.sendPacket("message",t,n,r),this}send(t,n,r){return this.sendPacket("message",t,n,r),this}sendPacket(t,n,r,o){if(typeof n=="function"&&(o=n,n=void 0),typeof r=="function"&&(o=r,r=null),this.readyState==="closing"||this.readyState==="closed")return;r=r||{},r.compress=r.compress!==!1;const i={type:t,data:n,options:r};this.emitReserved("packetCreate",i),this.writeBuffer.push(i),o&&this.once("flush",o),this.flush()}close(){const t=()=>{this.onClose("forced close"),this.transport.close()},n=()=>{this.off("upgrade",n),this.off("upgradeError",n),t()},r=()=>{this.once("upgrade",n),this.once("upgradeError",n)};return(this.readyState==="opening"||this.readyState==="open")&&(this.readyState="closing",this.writeBuffer.length?this.once("drain",()=>{this.upgrading?r():t()}):this.upgrading?r():t()),this}onError(t){gl.priorWebsocketSuccess=!1,this.emitReserved("error",t),this.onClose("transport 
error",t)}onClose(t,n){(this.readyState==="opening"||this.readyState==="open"||this.readyState==="closing")&&(this.clearTimeoutFn(this.pingTimeoutTimer),this.transport.removeAllListeners("close"),this.transport.close(),this.transport.removeAllListeners(),typeof removeEventListener=="function"&&(removeEventListener("beforeunload",this.beforeunloadEventListener,!1),removeEventListener("offline",this.offlineEventListener,!1)),this.readyState="closed",this.id=null,this.emitReserved("close",t,n),this.writeBuffer=[],this.prevBufferLen=0)}filterUpgrades(t){const n=[];let r=0;const o=t.length;for(;rtypeof ArrayBuffer.isView=="function"?ArrayBuffer.isView(e):e.buffer instanceof ArrayBuffer,jA=Object.prototype.toString,ree=typeof Blob=="function"||typeof Blob<"u"&&jA.call(Blob)==="[object BlobConstructor]",oee=typeof File=="function"||typeof File<"u"&&jA.call(File)==="[object FileConstructor]";function mb(e){return tee&&(e instanceof ArrayBuffer||nee(e))||ree&&e instanceof Blob||oee&&e instanceof File}function lp(e,t){if(!e||typeof e!="object")return!1;if(Array.isArray(e)){for(let n=0,r=e.length;n=0&&e.num{delete this.acks[t];for(let s=0;s{this.io.clearTimeoutFn(i),n.apply(this,[null,...s])}}emitWithAck(t,...n){const r=this.flags.timeout!==void 0||this._opts.ackTimeout!==void 0;return new Promise((o,i)=>{n.push((s,l)=>r?s?i(s):o(l):o(s)),this.emit(t,...n)})}_addToQueue(t){let n;typeof t[t.length-1]=="function"&&(n=t.pop());const r={id:this._queueSeq++,tryCount:0,pending:!1,args:t,flags:Object.assign({fromQueue:!0},this.flags)};t.push((o,...i)=>r!==this._queue[0]?void 0:(o!==null?r.tryCount>this._opts.retries&&(this._queue.shift(),n&&n(o)):(this._queue.shift(),n&&n(null,...i)),r.pending=!1,this._drainQueue())),this._queue.push(r),this._drainQueue()}_drainQueue(t=!1){if(!this.connected||this._queue.length===0)return;const n=this._queue[0];n.pending&&!t||(n.pending=!0,n.tryCount++,this.flags=n.flags,this.emit.apply(this,n.args))}packet(t){t.nsp=this.nsp,this.io._packet(t)}onopen(){typeof this.auth=="function"?this.auth(t=>{this._sendConnectPacket(t)}):this._sendConnectPacket(this.auth)}_sendConnectPacket(t){this.packet({type:lt.CONNECT,data:this._pid?Object.assign({pid:this._pid,offset:this._lastOffset},t):t})}onerror(t){this.connected||this.emitReserved("connect_error",t)}onclose(t,n){this.connected=!1,delete this.id,this.emitReserved("disconnect",t,n)}onpacket(t){if(t.nsp===this.nsp)switch(t.type){case lt.CONNECT:t.data&&t.data.sid?this.onconnect(t.data.sid,t.data.pid):this.emitReserved("connect_error",new Error("It seems you are trying to reach a Socket.IO server in v2.x with a v3.x client, but they are not compatible (more information here: https://socket.io/docs/v3/migrating-from-2-x-to-3-0/)"));break;case lt.EVENT:case lt.BINARY_EVENT:this.onevent(t);break;case lt.ACK:case lt.BINARY_ACK:this.onack(t);break;case lt.DISCONNECT:this.ondisconnect();break;case lt.CONNECT_ERROR:this.destroy();const r=new Error(t.data.message);r.data=t.data.data,this.emitReserved("connect_error",r);break}}onevent(t){const n=t.data||[];t.id!=null&&n.push(this.ack(t.id)),this.connected?this.emitEvent(n):this.receiveBuffer.push(Object.freeze(n))}emitEvent(t){if(this._anyListeners&&this._anyListeners.length){const n=this._anyListeners.slice();for(const r of n)r.apply(this,t)}super.emit.apply(this,t),this._pid&&t.length&&typeof t[t.length-1]=="string"&&(this._lastOffset=t[t.length-1])}ack(t){const n=this;let r=!1;return function(...o){r||(r=!0,n.packet({type:lt.ACK,id:t,data:o}))}}onack(t){const n=this.acks[t.id];typeof 
n=="function"&&(n.apply(this,t.data),delete this.acks[t.id])}onconnect(t,n){this.id=t,this.recovered=n&&this._pid===n,this._pid=n,this.connected=!0,this.emitBuffered(),this.emitReserved("connect"),this._drainQueue(!0)}emitBuffered(){this.receiveBuffer.forEach(t=>this.emitEvent(t)),this.receiveBuffer=[],this.sendBuffer.forEach(t=>{this.notifyOutgoingListeners(t),this.packet(t)}),this.sendBuffer=[]}ondisconnect(){this.destroy(),this.onclose("io server disconnect")}destroy(){this.subs&&(this.subs.forEach(t=>t()),this.subs=void 0),this.io._destroy(this)}disconnect(){return this.connected&&this.packet({type:lt.DISCONNECT}),this.destroy(),this.connected&&this.onclose("io client disconnect"),this}close(){return this.disconnect()}compress(t){return this.flags.compress=t,this}get volatile(){return this.flags.volatile=!0,this}timeout(t){return this.flags.timeout=t,this}onAny(t){return this._anyListeners=this._anyListeners||[],this._anyListeners.push(t),this}prependAny(t){return this._anyListeners=this._anyListeners||[],this._anyListeners.unshift(t),this}offAny(t){if(!this._anyListeners)return this;if(t){const n=this._anyListeners;for(let r=0;r0&&e.jitter<=1?e.jitter:0,this.attempts=0}Oc.prototype.duration=function(){var e=this.ms*Math.pow(this.factor,this.attempts++);if(this.jitter){var t=Math.random(),n=Math.floor(t*this.jitter*e);e=Math.floor(t*10)&1?e+n:e-n}return Math.min(e,this.max)|0};Oc.prototype.reset=function(){this.attempts=0};Oc.prototype.setMin=function(e){this.ms=e};Oc.prototype.setMax=function(e){this.max=e};Oc.prototype.setJitter=function(e){this.jitter=e};class nw extends qt{constructor(t,n){var r;super(),this.nsps={},this.subs=[],t&&typeof t=="object"&&(n=t,t=void 0),n=n||{},n.path=n.path||"/socket.io",this.opts=n,Sg(this,n),this.reconnection(n.reconnection!==!1),this.reconnectionAttempts(n.reconnectionAttempts||1/0),this.reconnectionDelay(n.reconnectionDelay||1e3),this.reconnectionDelayMax(n.reconnectionDelayMax||5e3),this.randomizationFactor((r=n.randomizationFactor)!==null&&r!==void 0?r:.5),this.backoff=new Oc({min:this.reconnectionDelay(),max:this.reconnectionDelayMax(),jitter:this.randomizationFactor()}),this.timeout(n.timeout==null?2e4:n.timeout),this._readyState="closed",this.uri=t;const o=n.parser||dee;this.encoder=new o.Encoder,this.decoder=new o.Decoder,this._autoConnect=n.autoConnect!==!1,this._autoConnect&&this.open()}reconnection(t){return arguments.length?(this._reconnection=!!t,this):this._reconnection}reconnectionAttempts(t){return t===void 0?this._reconnectionAttempts:(this._reconnectionAttempts=t,this)}reconnectionDelay(t){var n;return t===void 0?this._reconnectionDelay:(this._reconnectionDelay=t,(n=this.backoff)===null||n===void 0||n.setMin(t),this)}randomizationFactor(t){var n;return t===void 0?this._randomizationFactor:(this._randomizationFactor=t,(n=this.backoff)===null||n===void 0||n.setJitter(t),this)}reconnectionDelayMax(t){var n;return t===void 0?this._reconnectionDelayMax:(this._reconnectionDelayMax=t,(n=this.backoff)===null||n===void 0||n.setMax(t),this)}timeout(t){return arguments.length?(this._timeout=t,this):this._timeout}maybeReconnectOnOpen(){!this._reconnecting&&this._reconnection&&this.backoff.attempts===0&&this.reconnect()}open(t){if(~this._readyState.indexOf("open"))return this;this.engine=new FA(this.uri,this.opts);const n=this.engine,r=this;this._readyState="opening",this.skipReconnect=!1;const 
o=mo(n,"open",function(){r.onopen(),t&&t()}),i=l=>{this.cleanup(),this._readyState="closed",this.emitReserved("error",l),t?t(l):this.maybeReconnectOnOpen()},s=mo(n,"error",i);if(this._timeout!==!1){const l=this._timeout,u=this.setTimeoutFn(()=>{o(),i(new Error("timeout")),n.close()},l);this.opts.autoUnref&&u.unref(),this.subs.push(()=>{this.clearTimeoutFn(u)})}return this.subs.push(o),this.subs.push(s),this}connect(t){return this.open(t)}onopen(){this.cleanup(),this._readyState="open",this.emitReserved("open");const t=this.engine;this.subs.push(mo(t,"ping",this.onping.bind(this)),mo(t,"data",this.ondata.bind(this)),mo(t,"error",this.onerror.bind(this)),mo(t,"close",this.onclose.bind(this)),mo(this.decoder,"decoded",this.ondecoded.bind(this)))}onping(){this.emitReserved("ping")}ondata(t){try{this.decoder.add(t)}catch(n){this.onclose("parse error",n)}}ondecoded(t){pb(()=>{this.emitReserved("packet",t)},this.setTimeoutFn)}onerror(t){this.emitReserved("error",t)}socket(t,n){let r=this.nsps[t];return r?this._autoConnect&&!r.active&&r.connect():(r=new BA(this,t,n),this.nsps[t]=r),r}_destroy(t){const n=Object.keys(this.nsps);for(const r of n)if(this.nsps[r].active)return;this._close()}_packet(t){const n=this.encoder.encode(t);for(let r=0;rt()),this.subs.length=0,this.decoder.destroy()}_close(){this.skipReconnect=!0,this._reconnecting=!1,this.onclose("forced close"),this.engine&&this.engine.close()}disconnect(){return this._close()}onclose(t,n){this.cleanup(),this.backoff.reset(),this._readyState="closed",this.emitReserved("close",t,n),this._reconnection&&!this.skipReconnect&&this.reconnect()}reconnect(){if(this._reconnecting||this.skipReconnect)return this;const t=this;if(this.backoff.attempts>=this._reconnectionAttempts)this.backoff.reset(),this.emitReserved("reconnect_failed"),this._reconnecting=!1;else{const n=this.backoff.duration();this._reconnecting=!0;const r=this.setTimeoutFn(()=>{t.skipReconnect||(this.emitReserved("reconnect_attempt",t.backoff.attempts),!t.skipReconnect&&t.open(o=>{o?(t._reconnecting=!1,t.reconnect(),this.emitReserved("reconnect_error",o)):t.onreconnect()}))},n);this.opts.autoUnref&&r.unref(),this.subs.push(()=>{this.clearTimeoutFn(r)})}}onreconnect(){const t=this.backoff.attempts;this._reconnecting=!1,this.backoff.reset(),this.emitReserved("reconnect",t)}}const mu={};function cp(e,t){typeof e=="object"&&(t=e,e=void 0),t=t||{};const n=eee(e,t.path||"/socket.io"),r=n.source,o=n.id,i=n.path,s=mu[o]&&i in mu[o].nsps,l=t.forceNew||t["force new connection"]||t.multiplex===!1||s;let u;return l?u=new nw(r,t):(mu[o]||(mu[o]=new nw(r,t)),u=mu[o]),n.query&&!t.query&&(t.query=n.queryKey),u.socket(n.path,t)}Object.assign(cp,{Manager:nw,Socket:BA,io:cp,connect:cp});const zA="Progress",_g=100,[hee,Ete]=Tn(zA),[pee,mee]=hee(zA),UA=d.forwardRef((e,t)=>{const{__scopeProgress:n,value:r,max:o,getValueLabel:i=yee,...s}=e,l=rw(o)?o:_g,u=WA(r,l)?r:null,f=dm(u)?i(u,l):void 0;return d.createElement(pee,{scope:n,value:u,max:l},d.createElement(Ae.div,Y({"aria-valuemax":l,"aria-valuemin":0,"aria-valuenow":dm(u)?u:void 0,"aria-valuetext":f,role:"progressbar","data-state":VA(u,l),"data-value":u??void 0,"data-max":l},s,{ref:t})))});UA.propTypes={max(e,t,n){const r=e[t],o=String(r);return r&&!rw(r)?new Error(wee(o,n)):null},value(e,t,n){const r=e[t],o=String(r),i=rw(e.max)?e.max:_g;return r!=null&&!WA(r,i)?new Error(xee(o,n)):null}};const gee="ProgressIndicator",vee=d.forwardRef((e,t)=>{var n;const{__scopeProgress:r,...o}=e,i=mee(gee,r);return 
d.createElement(Ae.div,Y({"data-state":VA(i.value,i.max),"data-value":(n=i.value)!==null&&n!==void 0?n:void 0,"data-max":i.max},o,{ref:t}))});function yee(e,t){return`${Math.round(e/t*100)}%`}function VA(e,t){return e==null?"indeterminate":e===t?"complete":"loading"}function dm(e){return typeof e=="number"}function rw(e){return dm(e)&&!isNaN(e)&&e>0}function WA(e,t){return dm(e)&&!isNaN(e)&&e<=t&&e>=0}function wee(e,t){return`Invalid prop \`max\` of value \`${e}\` supplied to \`${t}\`. Only numbers greater than 0 are valid max values. Defaulting to \`${_g}\`.`}function xee(e,t){return`Invalid prop \`value\` of value \`${e}\` supplied to \`${t}\`. The \`value\` prop must be: + - a positive number + - less than the value passed to \`max\` (or ${_g} if no \`max\` prop is set) + - \`null\` if the progress is indeterminate. + +Defaulting to \`null\`.`}const HA=UA,bee=vee,KA=d.forwardRef(({className:e,value:t,...n},r)=>v.jsx(HA,{ref:r,className:xe("relative h-2 w-full overflow-hidden rounded-full bg-primary/20",e),...n,children:v.jsx(bee,{className:"h-full w-full flex-1 bg-primary transition-all",style:{transform:`translateX(-${100-(t||0)}%)`}})}));KA.displayName=HA.displayName;const See="",Ji=cp(See),_ee=()=>{const[e,t,n]=xt(u=>[u.settings,u.isInpainting,u.isSD()]),[r,o]=d.useState(!1),[i,s]=d.useState(0),l=Math.min(Math.round(i/e.sdSteps*100),100);return d.useEffect(()=>(Ji.on("connect",()=>{o(!0)}),Ji.on("disconnect",()=>{o(!1)}),Ji.on("diffusion_progress",u=>{u&&s(u.step+1)}),Ji.on("diffusion_finish",()=>{s(0)}),()=>{Ji.off("connect"),Ji.off("disconnect"),Ji.off("diffusion_progress"),Ji.off("diffusion_finish")}),[]),v.jsxs("div",{className:"z-10 fixed bg-background w-[220px] left-1/2 -translate-x-1/2 top-[68px] h-[32px] flex justify-center items-center gap-[18px] border-[1px] border-[solid] rounded-[14px] pl-[8px] pr-[8px]",style:{visibility:r&&t&&n?"visible":"hidden"},children:[v.jsx(KA,{value:l}),v.jsxs("div",{className:"w-[45px] flex justify-center font-nums",children:[l,"%"]})]})},Eee=()=>{const[e,t]=xt(n=>[n.file,n.updateSettings]);return d.useEffect(()=>{(async()=>{const r=await Wj();t({model:r})})()},[]),v.jsxs(v.Fragment,{children:[v.jsxs("div",{className:"flex gap-3 absolute top-[68px] left-[24px] items-center",children:[v.jsx(hJ,{}),v.jsx(Gq,{})]}),v.jsx(Uq,{}),v.jsx(_ee,{}),v.jsx(_J,{}),e?v.jsx(Kq,{file:e}):v.jsx(v.Fragment,{})]})},Cee=()=>{const[e,t]=d.useState(window.innerWidth),n=d.useCallback(()=>{t(window.innerWidth)},[]);if(d.useEffect(()=>(window.addEventListener("resize",n),()=>{window.removeEventListener("resize",n)})),e<768)return"mobile";if(e>=768&&e<1224)return"tablet";if(e>=1224)return"desktop"};function $ee(e){const[t]=d.useState(`file-upload-${Math.random().toString()}`),n=Cee();return v.jsx("div",{className:"absolute flex w-screen h-screen justify-center items-center pointer-events-none",children:v.jsx("label",{htmlFor:t,className:"grid bg-background border-[2px] border-[dashed] rounded-lg min-w-[600px] pointer-events-auto",children:v.jsx("div",{className:"grid p-16 w-full h-full",children:v.jsx("p",{className:"text-center",children:"Sollte das Bild nicht geladen werden, drücken sie die F5-Taste, oder klicken sie rechts oben auf den Button"})})})})}const GA="ToastProvider",[vb,Ree,Pee]=Wd("Toast"),[YA,Cte]=Tn("Toast",[Pee]),[Tee,Eg]=YA(GA),XA=e=>{const{__scopeToast:t,label:n="Notification",duration:r=5e3,swipeDirection:o="right",swipeThreshold:i=50,children:s}=e,[l,u]=d.useState(null),[f,m]=d.useState(0),p=d.useRef(!1),g=d.useRef(!1);return 
d.createElement(vb.Provider,{scope:t},d.createElement(Tee,{scope:t,label:n,duration:r,swipeDirection:o,swipeThreshold:i,toastCount:f,viewport:l,onViewportChange:u,onToastAdd:d.useCallback(()=>m(y=>y+1),[]),onToastRemove:d.useCallback(()=>m(y=>y-1),[]),isFocusedToastEscapeKeyDownRef:p,isClosePausedRef:g},s))};XA.propTypes={label(e){if(e.label&&typeof e.label=="string"&&!e.label.trim()){const t=`Invalid prop \`label\` supplied to \`${GA}\`. Expected non-empty \`string\`.`;return new Error(t)}return null}};const kee="ToastViewport",Aee=["F8"],ow="toast.viewportPause",iw="toast.viewportResume",Mee=d.forwardRef((e,t)=>{const{__scopeToast:n,hotkey:r=Aee,label:o="Notifications ({hotkey})",...i}=e,s=Eg(kee,n),l=Ree(n),u=d.useRef(null),f=d.useRef(null),m=d.useRef(null),p=d.useRef(null),g=Ve(t,p,s.onViewportChange),y=r.join("+").replace(/Key/g,"").replace(/Digit/g,""),x=s.toastCount>0;d.useEffect(()=>{const E=_=>{var b;r.every(R=>_[R]||_.code===R)&&((b=p.current)===null||b===void 0||b.focus())};return document.addEventListener("keydown",E),()=>document.removeEventListener("keydown",E)},[r]),d.useEffect(()=>{const E=u.current,_=p.current;if(x&&E&&_){const b=()=>{if(!s.isClosePausedRef.current){const O=new CustomEvent(ow);_.dispatchEvent(O),s.isClosePausedRef.current=!0}},C=()=>{if(s.isClosePausedRef.current){const O=new CustomEvent(iw);_.dispatchEvent(O),s.isClosePausedRef.current=!1}},R=O=>{!E.contains(O.relatedTarget)&&C()},k=()=>{E.contains(document.activeElement)||C()};return E.addEventListener("focusin",b),E.addEventListener("focusout",R),E.addEventListener("pointermove",b),E.addEventListener("pointerleave",k),window.addEventListener("blur",b),window.addEventListener("focus",C),()=>{E.removeEventListener("focusin",b),E.removeEventListener("focusout",R),E.removeEventListener("pointermove",b),E.removeEventListener("pointerleave",k),window.removeEventListener("blur",b),window.removeEventListener("focus",C)}}},[x,s.isClosePausedRef]);const S=d.useCallback(({tabbingDirection:E})=>{const b=l().map(C=>{const R=C.ref.current,k=[R,...Yee(R)];return E==="forwards"?k:k.reverse()});return(E==="forwards"?b.reverse():b).flat()},[l]);return d.useEffect(()=>{const E=p.current;if(E){const _=b=>{const C=b.altKey||b.ctrlKey||b.metaKey;if(b.key==="Tab"&&!C){const I=document.activeElement,z=b.shiftKey;if(b.target===E&&z){var k;(k=f.current)===null||k===void 0||k.focus();return}const K=S({tabbingDirection:z?"backwards":"forwards"}),te=K.findIndex(U=>U===I);if(Y0(K.slice(te+1)))b.preventDefault();else{var O,A;z?(O=f.current)===null||O===void 0||O.focus():(A=m.current)===null||A===void 0||A.focus()}}};return E.addEventListener("keydown",_),()=>E.removeEventListener("keydown",_)}},[l,S]),d.createElement(Rz,{ref:u,role:"region","aria-label":o.replace("{hotkey}",y),tabIndex:-1,style:{pointerEvents:x?void 0:"none"}},x&&d.createElement(u$,{ref:f,onFocusFromOutsideViewport:()=>{const E=S({tabbingDirection:"forwards"});Y0(E)}}),d.createElement(vb.Slot,{scope:n},d.createElement(Ae.ol,Y({tabIndex:-1},i,{ref:g}))),x&&d.createElement(u$,{ref:m,onFocusFromOutsideViewport:()=>{const E=S({tabbingDirection:"backwards"});Y0(E)}}))}),Oee="ToastFocusProxy",u$=d.forwardRef((e,t)=>{const{__scopeToast:n,onFocusFromOutsideViewport:r,...o}=e,i=Eg(Oee,n);return d.createElement(Xm,Y({"aria-hidden":!0,tabIndex:0},o,{ref:t,style:{position:"fixed"},onFocus:s=>{var l;const u=s.relatedTarget;!((l=i.viewport)!==null&&l!==void 
0&&l.contains(u))&&r()}}))}),Cg="Toast",Nee="toast.swipeStart",Dee="toast.swipeMove",Iee="toast.swipeCancel",Lee="toast.swipeEnd",Fee=d.forwardRef((e,t)=>{const{forceMount:n,open:r,defaultOpen:o,onOpenChange:i,...s}=e,[l=!0,u]=eo({prop:r,defaultProp:o,onChange:i});return d.createElement(xn,{present:n||l},d.createElement(ZA,Y({open:l},s,{ref:t,onClose:()=>u(!1),onPause:Lt(e.onPause),onResume:Lt(e.onResume),onSwipeStart:fe(e.onSwipeStart,f=>{f.currentTarget.setAttribute("data-swipe","start")}),onSwipeMove:fe(e.onSwipeMove,f=>{const{x:m,y:p}=f.detail.delta;f.currentTarget.setAttribute("data-swipe","move"),f.currentTarget.style.setProperty("--radix-toast-swipe-move-x",`${m}px`),f.currentTarget.style.setProperty("--radix-toast-swipe-move-y",`${p}px`)}),onSwipeCancel:fe(e.onSwipeCancel,f=>{f.currentTarget.setAttribute("data-swipe","cancel"),f.currentTarget.style.removeProperty("--radix-toast-swipe-move-x"),f.currentTarget.style.removeProperty("--radix-toast-swipe-move-y"),f.currentTarget.style.removeProperty("--radix-toast-swipe-end-x"),f.currentTarget.style.removeProperty("--radix-toast-swipe-end-y")}),onSwipeEnd:fe(e.onSwipeEnd,f=>{const{x:m,y:p}=f.detail.delta;f.currentTarget.setAttribute("data-swipe","end"),f.currentTarget.style.removeProperty("--radix-toast-swipe-move-x"),f.currentTarget.style.removeProperty("--radix-toast-swipe-move-y"),f.currentTarget.style.setProperty("--radix-toast-swipe-end-x",`${m}px`),f.currentTarget.style.setProperty("--radix-toast-swipe-end-y",`${p}px`),u(!1)})})))}),[jee,Bee]=YA(Cg,{onClose(){}}),ZA=d.forwardRef((e,t)=>{const{__scopeToast:n,type:r="foreground",duration:o,open:i,onClose:s,onEscapeKeyDown:l,onPause:u,onResume:f,onSwipeStart:m,onSwipeMove:p,onSwipeCancel:g,onSwipeEnd:y,...x}=e,S=Eg(Cg,n),[E,_]=d.useState(null),b=Ve(t,U=>_(U)),C=d.useRef(null),R=d.useRef(null),k=o||S.duration,O=d.useRef(0),A=d.useRef(k),I=d.useRef(0),{onToastAdd:z,onToastRemove:H}=S,ie=Lt(()=>{var U;(E==null?void 0:E.contains(document.activeElement))&&((U=S.viewport)===null||U===void 0||U.focus()),s()}),K=d.useCallback(U=>{!U||U===1/0||(window.clearTimeout(I.current),O.current=new Date().getTime(),I.current=window.setTimeout(ie,U))},[ie]);d.useEffect(()=>{const U=S.viewport;if(U){const re=()=>{K(A.current),f==null||f()},V=()=>{const J=new Date().getTime()-O.current;A.current=A.current-J,window.clearTimeout(I.current),u==null||u()};return U.addEventListener(ow,V),U.addEventListener(iw,re),()=>{U.removeEventListener(ow,V),U.removeEventListener(iw,re)}}},[S.viewport,k,u,f,K]),d.useEffect(()=>{i&&!S.isClosePausedRef.current&&K(k)},[i,k,S.isClosePausedRef,K]),d.useEffect(()=>(z(),()=>H()),[z,H]);const te=d.useMemo(()=>E?eM(E):null,[E]);return 
S.viewport?d.createElement(d.Fragment,null,te&&d.createElement(zee,{__scopeToast:n,role:"status","aria-live":r==="foreground"?"assertive":"polite","aria-atomic":!0},te),d.createElement(jee,{scope:n,onClose:ie},Bs.createPortal(d.createElement(vb.ItemSlot,{scope:n},d.createElement($z,{asChild:!0,onEscapeKeyDown:fe(l,()=>{S.isFocusedToastEscapeKeyDownRef.current||ie(),S.isFocusedToastEscapeKeyDownRef.current=!1})},d.createElement(Ae.li,Y({role:"status","aria-live":"off","aria-atomic":!0,tabIndex:0,"data-state":i?"open":"closed","data-swipe-direction":S.swipeDirection},x,{ref:b,style:{userSelect:"none",touchAction:"none",...e.style},onKeyDown:fe(e.onKeyDown,U=>{U.key==="Escape"&&(l==null||l(U.nativeEvent),U.nativeEvent.defaultPrevented||(S.isFocusedToastEscapeKeyDownRef.current=!0,ie()))}),onPointerDown:fe(e.onPointerDown,U=>{U.button===0&&(C.current={x:U.clientX,y:U.clientY})}),onPointerMove:fe(e.onPointerMove,U=>{if(!C.current)return;const re=U.clientX-C.current.x,V=U.clientY-C.current.y,J=!!R.current,G=["left","right"].includes(S.swipeDirection),Z=["left","up"].includes(S.swipeDirection)?Math.min:Math.max,Q=G?Z(0,re):0,le=G?0:Z(0,V),L=U.pointerType==="touch"?10:2,ue={x:Q,y:le},Ne={originalEvent:U,delta:ue};J?(R.current=ue,Mh(Dee,p,Ne,{discrete:!1})):d$(ue,S.swipeDirection,L)?(R.current=ue,Mh(Nee,m,Ne,{discrete:!1}),U.target.setPointerCapture(U.pointerId)):(Math.abs(re)>L||Math.abs(V)>L)&&(C.current=null)}),onPointerUp:fe(e.onPointerUp,U=>{const re=R.current,V=U.target;if(V.hasPointerCapture(U.pointerId)&&V.releasePointerCapture(U.pointerId),R.current=null,C.current=null,re){const J=U.currentTarget,G={originalEvent:U,delta:re};d$(re,S.swipeDirection,S.swipeThreshold)?Mh(Lee,y,G,{discrete:!0}):Mh(Iee,g,G,{discrete:!0}),J.addEventListener("click",Z=>Z.preventDefault(),{once:!0})}})})))),S.viewport))):null});ZA.propTypes={type(e){if(e.type&&!["foreground","background"].includes(e.type)){const t=`Invalid prop \`type\` supplied to \`${Cg}\`. 
Expected \`foreground | background\`.`;return new Error(t)}return null}};const zee=e=>{const{__scopeToast:t,children:n,...r}=e,o=Eg(Cg,t),[i,s]=d.useState(!1),[l,u]=d.useState(!1);return Kee(()=>s(!0)),d.useEffect(()=>{const f=window.setTimeout(()=>u(!0),1e3);return()=>window.clearTimeout(f)},[]),l?null:d.createElement(jd,{asChild:!0},d.createElement(Xm,r,i&&d.createElement(d.Fragment,null,o.label," ",n)))},Uee=d.forwardRef((e,t)=>{const{__scopeToast:n,...r}=e;return d.createElement(Ae.div,Y({},r,{ref:t}))}),Vee=d.forwardRef((e,t)=>{const{__scopeToast:n,...r}=e;return d.createElement(Ae.div,Y({},r,{ref:t}))}),Wee="ToastAction",qA=d.forwardRef((e,t)=>{const{altText:n,...r}=e;return n?d.createElement(JA,{altText:n,asChild:!0},d.createElement(QA,Y({},r,{ref:t}))):null});qA.propTypes={altText(e){return e.altText?null:new Error(`Missing prop \`altText\` expected on \`${Wee}\``)}};const Hee="ToastClose",QA=d.forwardRef((e,t)=>{const{__scopeToast:n,...r}=e,o=Bee(Hee,n);return d.createElement(JA,{asChild:!0},d.createElement(Ae.button,Y({type:"button"},r,{ref:t,onClick:fe(e.onClick,o.onClose)})))}),JA=d.forwardRef((e,t)=>{const{__scopeToast:n,altText:r,...o}=e;return d.createElement(Ae.div,Y({"data-radix-toast-announce-exclude":"","data-radix-toast-announce-alt":r||void 0},o,{ref:t}))});function eM(e){const t=[];return Array.from(e.childNodes).forEach(r=>{if(r.nodeType===r.TEXT_NODE&&r.textContent&&t.push(r.textContent),Gee(r)){const o=r.ariaHidden||r.hidden||r.style.display==="none",i=r.dataset.radixToastAnnounceExclude==="";if(!o)if(i){const s=r.dataset.radixToastAnnounceAlt;s&&t.push(s)}else t.push(...eM(r))}}),t}function Mh(e,t,n,{discrete:r}){const o=n.originalEvent.currentTarget,i=new CustomEvent(e,{bubbles:!0,cancelable:!0,detail:n});t&&o.addEventListener(e,t,{once:!0}),r?mx(o,i):o.dispatchEvent(i)}const d$=(e,t,n=0)=>{const r=Math.abs(e.x),o=Math.abs(e.y),i=r>o;return t==="left"||t==="right"?i&&r>n:!i&&o>n};function Kee(e=()=>{}){const t=Lt(e);Pn(()=>{let n=0,r=0;return n=window.requestAnimationFrame(()=>r=window.requestAnimationFrame(t)),()=>{window.cancelAnimationFrame(n),window.cancelAnimationFrame(r)}},[t])}function Gee(e){return e.nodeType===e.ELEMENT_NODE}function Yee(e){const t=[],n=document.createTreeWalker(e,NodeFilter.SHOW_ELEMENT,{acceptNode:r=>{const o=r.tagName==="INPUT"&&r.type==="hidden";return r.disabled||r.hidden||o?NodeFilter.FILTER_SKIP:r.tabIndex>=0?NodeFilter.FILTER_ACCEPT:NodeFilter.FILTER_SKIP}});for(;n.nextNode();)t.push(n.currentNode);return t}function Y0(e){const t=document.activeElement;return e.some(n=>n===t?!0:(n.focus(),document.activeElement!==t))}const Xee=XA,tM=Mee,nM=Fee,rM=Uee,oM=Vee,iM=qA,sM=QA,Zee=Xee,aM=d.forwardRef(({className:e,...t},n)=>v.jsx(tM,{ref:n,className:xe("fixed top-0 z-[100] flex max-h-screen w-full flex-col-reverse p-4 sm:bottom-0 sm:right-0 sm:top-auto sm:flex-col md:max-w-[420px]",e),tabIndex:-1,...t}));aM.displayName=tM.displayName;const qee=Fm("group pointer-events-auto relative flex w-full items-center justify-between space-x-2 overflow-hidden rounded-md border p-4 pr-6 shadow-lg transition-all data-[swipe=cancel]:translate-x-0 data-[swipe=end]:translate-x-[var(--radix-toast-swipe-end-x)] data-[swipe=move]:translate-x-[var(--radix-toast-swipe-move-x)] data-[swipe=move]:transition-none data-[state=open]:animate-in data-[state=closed]:animate-out data-[swipe=end]:animate-out data-[state=closed]:fade-out-80 data-[state=closed]:slide-out-to-right-full data-[state=open]:slide-in-from-top-full 
data-[state=open]:sm:slide-in-from-bottom-full",{variants:{variant:{default:"border bg-background text-foreground",destructive:"destructive group border-destructive bg-destructive text-destructive-foreground"}},defaultVariants:{variant:"default"}}),lM=d.forwardRef(({className:e,variant:t,...n},r)=>v.jsx(nM,{ref:r,className:xe(qee({variant:t}),e),tabIndex:-1,...n}));lM.displayName=nM.displayName;const Qee=d.forwardRef(({className:e,...t},n)=>v.jsx(iM,{ref:n,className:xe("inline-flex h-8 shrink-0 items-center justify-center rounded-md border bg-transparent px-3 text-sm font-medium transition-colors hover:bg-secondary focus:outline-none focus:ring-1 focus:ring-ring disabled:pointer-events-none disabled:opacity-50 group-[.destructive]:border-muted/40 group-[.destructive]:hover:border-destructive/30 group-[.destructive]:hover:bg-destructive group-[.destructive]:hover:text-destructive-foreground group-[.destructive]:focus:ring-destructive",e),tabIndex:-1,...t}));Qee.displayName=iM.displayName;const cM=d.forwardRef(({className:e,...t},n)=>v.jsx(sM,{ref:n,className:xe("absolute right-1 top-1 rounded-md p-1 text-foreground/50 opacity-0 transition-opacity hover:text-foreground focus:opacity-100 focus:outline-none focus:ring-1 group-hover:opacity-100 group-[.destructive]:text-red-300 group-[.destructive]:hover:text-red-50 group-[.destructive]:focus:ring-red-400 group-[.destructive]:focus:ring-offset-red-600",e),"toast-close":"",tabIndex:-1,...t,children:v.jsx(CP,{className:"h-4 w-4"})}));cM.displayName=sM.displayName;const uM=d.forwardRef(({className:e,...t},n)=>v.jsx(rM,{ref:n,className:xe("text-sm font-semibold [&+div]:text-xs",e),tabIndex:-1,...t}));uM.displayName=rM.displayName;const dM=d.forwardRef(({className:e,...t},n)=>v.jsx(oM,{ref:n,className:xe("text-sm opacity-90",e),...t,tabIndex:-1}));dM.displayName=oM.displayName;function Jee(){const{toasts:e}=Id();return v.jsxs(Zee,{children:[e.map(function({id:t,title:n,description:r,action:o,...i}){return v.jsxs(lM,{...i,children:[v.jsxs("div",{className:"grid gap-1",children:[n&&v.jsx(uM,{children:n}),r&&v.jsx(dM,{children:r})]}),o,v.jsx(cM,{})]},t)}),v.jsx(aM,{})]})}const ete=["image/jpeg","image/png","image/webp","image/bmp","image/tiff"];function tte(){const[e,t,n,r]=xt(g=>[g.file,g.updateAppState,g.setServerConfig,g.setFile]),o=qj(),i=rH();d.useEffect(()=>{o&&r(o)},[o,r]),d.useEffect(()=>{t({windowSize:i})},[i]),d.useEffect(()=>{(async()=>{const y=await SP();n(y),y.isDesktop&&gF()})()},[]);const s=d.useRef(0),l=d.useCallback(g=>{g.preventDefault(),g.stopPropagation()},[]),u=d.useCallback(g=>{g.preventDefault(),g.stopPropagation(),s.current+=1},[]),f=d.useCallback(g=>{g.preventDefault(),g.stopPropagation(),s.current-=1,s.current>0},[]),m=d.useCallback(g=>{if(g.preventDefault(),g.stopPropagation(),g.dataTransfer.files&&g.dataTransfer.files.length>0){if(!(g.dataTransfer.files.length>1)){const y=g.dataTransfer.files[0],x=y.type;ete.includes(x)&&r(y)}g.dataTransfer.clearData()}},[]),p=d.useCallback(g=>{if(!g.clipboardData)return;const y=g.clipboardData.items,x=[].slice.call(y).filter(_=>_.type.indexOf("image")!==-1);if(x.length===0)return;g.preventDefault(),g.stopPropagation();const E=x[0].getAsFile();E&&r(E)},[]);return 
d.useEffect(()=>(window.addEventListener("dragenter",u),window.addEventListener("dragleave",f),window.addEventListener("dragover",l),window.addEventListener("drop",m),window.addEventListener("paste",p),function(){window.removeEventListener("dragenter",u),window.removeEventListener("dragleave",f),window.removeEventListener("dragover",l),window.removeEventListener("drop",m),window.removeEventListener("paste",p)})),v.jsxs("main",{className:"flex min-h-screen flex-col items-center justify-between w-full bg-[radial-gradient(circle_at_1px_1px,_#8e8e8e8e_1px,_transparent_0)] [background-size:20px_20px] bg-repeat",children:[v.jsx(Jee,{}),v.jsx(eZ,{}),v.jsx(Eee,{}),e?v.jsx(v.Fragment,{}):v.jsx($ee,{onSelection:async g=>{r(g)}})]})}const fm=["light","dark"],yb="(prefers-color-scheme: dark)",nte=typeof window>"u",fM=d.createContext(void 0),rte=e=>d.useContext(fM)?Be.createElement(d.Fragment,null,e.children):Be.createElement(ite,e),ote=["light","dark"],ite=({forcedTheme:e,disableTransitionOnChange:t=!1,enableSystem:n=!0,enableColorScheme:r=!0,storageKey:o="theme",themes:i=ote,defaultTheme:s=n?"system":"light",attribute:l="data-theme",value:u,children:f,nonce:m})=>{const[p,g]=d.useState(()=>f$(o,s)),[y,x]=d.useState(()=>f$(o)),S=u?Object.values(u):i,E=d.useCallback(R=>{let k=R;if(!k)return;R==="system"&&n&&(k=h$());const O=u?u[k]:k,A=t?ate():null,I=document.documentElement;if(l==="class"?(I.classList.remove(...S),O&&I.classList.add(O)):O?I.setAttribute(l,O):I.removeAttribute(l),r){const z=fm.includes(s)?s:null,H=fm.includes(k)?k:z;I.style.colorScheme=H}A==null||A()},[]),_=d.useCallback(R=>{g(R);try{localStorage.setItem(o,R)}catch{}},[e]),b=d.useCallback(R=>{const k=h$(R);x(k),p==="system"&&n&&!e&&E("system")},[p,e]);d.useEffect(()=>{const R=window.matchMedia(yb);return R.addListener(b),b(R),()=>R.removeListener(b)},[b]),d.useEffect(()=>{const R=k=>{k.key===o&&_(k.newValue||s)};return window.addEventListener("storage",R),()=>window.removeEventListener("storage",R)},[_]),d.useEffect(()=>{E(e??p)},[e,p]);const C=d.useMemo(()=>({theme:p,setTheme:_,forcedTheme:e,resolvedTheme:p==="system"?y:p,themes:n?[...i,"system"]:i,systemTheme:n?y:void 0}),[p,_,e,y,n,i]);return Be.createElement(fM.Provider,{value:C},Be.createElement(ste,{forcedTheme:e,disableTransitionOnChange:t,enableSystem:n,enableColorScheme:r,storageKey:o,themes:i,defaultTheme:s,attribute:l,value:u,children:f,attrs:S,nonce:m}),f)},ste=d.memo(({forcedTheme:e,storageKey:t,attribute:n,enableSystem:r,enableColorScheme:o,defaultTheme:i,value:s,attrs:l,nonce:u})=>{const f=i==="system",m=n==="class"?`var d=document.documentElement,c=d.classList;c.remove(${l.map(x=>`'${x}'`).join(",")});`:`var d=document.documentElement,n='${n}',s='setAttribute';`,p=o?fm.includes(i)&&i?`if(e==='light'||e==='dark'||!e)d.style.colorScheme=e||'${i}'`:"if(e==='light'||e==='dark')d.style.colorScheme=e":"",g=(x,S=!1,E=!0)=>{const _=s?s[x]:x,b=S?x+"|| ''":`'${_}'`;let C="";return o&&E&&!S&&fm.includes(x)&&(C+=`d.style.colorScheme = '${x}';`),n==="class"?C+=S||_?`c.add(${b})`:"null":_&&(C+=`d[s](n,${b})`),C},y=e?`!function(){${m}${g(e)}}()`:r?`!function(){try{${m}var e=localStorage.getItem('${t}');if('system'===e||(!e&&${f})){var t='${yb}',m=window.matchMedia(t);if(m.media!==t||m.matches){${g("dark")}}else{${g("light")}}}else if(e){${s?`var x=${JSON.stringify(s)};`:""}${g(s?"x[e]":"e",!0)}}${f?"":"else{"+g(i,!1,!1)+"}"}${p}}catch(e){}}()`:`!function(){try{${m}var e=localStorage.getItem('${t}');if(e){${s?`var 
x=${JSON.stringify(s)};`:""}${g(s?"x[e]":"e",!0)}}else{${g(i,!1,!1)};}${p}}catch(t){}}();`;return Be.createElement("script",{nonce:u,dangerouslySetInnerHTML:{__html:y}})},()=>!0),f$=(e,t)=>{if(nte)return;let n;try{n=localStorage.getItem(e)||void 0}catch{}return n||t},ate=()=>{const e=document.createElement("style");return e.appendChild(document.createTextNode("*{-webkit-transition:none!important;-moz-transition:none!important;-o-transition:none!important;-ms-transition:none!important;transition:none!important}")),document.head.appendChild(e),()=>{window.getComputedStyle(document.body),setTimeout(()=>{document.head.removeChild(e)},1)}},h$=e=>(e||(e=window.matchMedia(yb)),e.matches?"dark":"light"),lte=new mL;X0.createRoot(document.getElementById("root")).render(v.jsx(Be.StrictMode,{children:v.jsx(xL,{client:lte,children:v.jsx(rte,{defaultTheme:"dark",disableTransitionOnChange:!0,children:v.jsx(KU,{children:v.jsx(tte,{})})})})})); diff --git a/inpaint/web_app/index.html b/inpaint/web_app/index.html new file mode 100644 index 0000000..fde5af5 --- /dev/null +++ b/inpaint/web_app/index.html @@ -0,0 +1,13 @@ + + + + + + Image Sorter | Inpaint + + + + +
+ + diff --git a/inpaint/web_config.py b/inpaint/web_config.py new file mode 100644 index 0000000..61921e0 --- /dev/null +++ b/inpaint/web_config.py @@ -0,0 +1,319 @@ +import json +import os +from pathlib import Path + +import mimetypes + +# fix for windows mimetypes registry entries being borked +# see https://github.com/invoke-ai/InvokeAI/discussions/3684#discussioncomment-6391352 +mimetypes.add_type("application/javascript", ".js") +mimetypes.add_type("text/css", ".css") + +from inpaint.schema import ( + Device, + InteractiveSegModel, + RemoveBGModel, + RealESRGANModel, + ApiConfig, +) + +os.environ["GRADIO_ANALYTICS_ENABLED"] = "False" + +from datetime import datetime +from json import JSONDecodeError + +import gradio as gr +from inpaint.download import scan_models +from loguru import logger + +from inpaint.const import * + +_config_file: Path = None + +default_configs = dict( + host="127.0.0.1", + port=8080, + inbrowser=True, + model=DEFAULT_MODEL, + model_dir=DEFAULT_MODEL_DIR, + no_half=False, + low_mem=False, + cpu_offload=False, + disable_nsfw_checker=False, + local_files_only=False, + cpu_textencoder=False, + device=Device.cuda, + input=None, + output_dir=None, + quality=95, + enable_interactive_seg=False, + interactive_seg_model=InteractiveSegModel.vit_b, + interactive_seg_device=Device.cpu, + enable_remove_bg=False, + remove_bg_model=RemoveBGModel.briaai_rmbg_1_4, + enable_anime_seg=False, + enable_realesrgan=False, + realesrgan_device=Device.cpu, + realesrgan_model=RealESRGANModel.realesr_general_x4v3, + enable_gfpgan=False, + gfpgan_device=Device.cpu, + enable_restoreformer=False, + restoreformer_device=Device.cpu, +) + + +class WebConfig(ApiConfig): + model_dir: str = DEFAULT_MODEL_DIR + + +def load_config(p: Path) -> WebConfig: + if p.exists(): + with open(p, "r", encoding="utf-8") as f: + try: + return WebConfig(**{**default_configs, **json.load(f)}) + except JSONDecodeError: + print("Load config file failed, using default configs") + return WebConfig(**default_configs) + else: + return WebConfig(**default_configs) + + +def save_config( + host, + port, + model, + model_dir, + no_half, + low_mem, + cpu_offload, + disable_nsfw_checker, + local_files_only, + cpu_textencoder, + device, + input, + mask_dir, + output_dir, + quality, + enable_interactive_seg, + interactive_seg_model, + interactive_seg_device, + enable_remove_bg, + remove_bg_model, + enable_anime_seg, + enable_realesrgan, + realesrgan_device, + realesrgan_model, + enable_gfpgan, + gfpgan_device, + enable_restoreformer, + restoreformer_device, + inbrowser, +): + config = WebConfig(**locals()) + if str(config.input) == ".": + config.input = None + if str(config.output_dir) == ".": + config.output_dir = None + if str(config.mask_dir) == ".": + config.mask_dir = None + config.model = config.model.strip() + print(config.model_dump_json(indent=4)) + if config.input and not os.path.exists(config.input): + return "[Error] Input file or directory does not exist" + + current_time = datetime.now().strftime("%H:%M:%S") + msg = f"[{current_time}] Successful save config to: {str(_config_file.absolute())}" + logger.info(msg) + try: + with open(_config_file, "w", encoding="utf-8") as f: + f.write(config.model_dump_json(indent=4)) + except Exception as e: + return f"Save configure file failed: {str(e)}" + return msg + + +def change_current_model(new_model): + return new_model + + +def main(config_file: Path): + global _config_file + _config_file = config_file + + init_config = load_config(config_file) + downloaded_models = 
[it.name for it in scan_models()] + + with gr.Blocks() as demo: + with gr.Row(): + with gr.Column(): + gr.Textbox(config_file, label="Config file", interactive=False) + with gr.Column(): + save_btn = gr.Button(value="Save configurations") + message = gr.HTML() + + with gr.Tabs(): + with gr.Tab("Common"): + with gr.Row(): + host = gr.Textbox(init_config.host, label="Host") + port = gr.Number(init_config.port, label="Port", precision=0) + inbrowser = gr.Checkbox(init_config.inbrowser, label=INBROWSER_HELP) + + with gr.Row(): + recommend_model = gr.Dropdown( + ["lama", "mat", "migan"] + DIFFUSION_MODELS, + label="Recommended Models", + ) + downloaded_model = gr.Dropdown( + downloaded_models, label="Downloaded Models" + ) + with gr.Column(): + model = gr.Textbox( + init_config.model, + label="Current Model. Model will be automatically downloaded. " + "You can select a model in Recommended Models or Downloaded Models or manually enter the SD/SDXL model ID from HuggingFace, for example, runwayml/stable-diffusion-inpainting.", + ) + + device = gr.Radio( + Device.values(), label="Device", value=init_config.device + ) + quality = gr.Slider( + value=95, + label=f"Image Quality ({QUALITY_HELP})", + minimum=75, + maximum=100, + step=1, + ) + + no_half = gr.Checkbox(init_config.no_half, label=f"{NO_HALF_HELP}") + cpu_offload = gr.Checkbox( + init_config.cpu_offload, label=f"{CPU_OFFLOAD_HELP}" + ) + low_mem = gr.Checkbox(init_config.low_mem, label=f"{LOW_MEM_HELP}") + cpu_textencoder = gr.Checkbox( + init_config.cpu_textencoder, label=f"{CPU_TEXTENCODER_HELP}" + ) + disable_nsfw_checker = gr.Checkbox( + init_config.disable_nsfw_checker, label=f"{DISABLE_NSFW_HELP}" + ) + local_files_only = gr.Checkbox( + init_config.local_files_only, label=f"{LOCAL_FILES_ONLY_HELP}" + ) + + with gr.Column(): + model_dir = gr.Textbox( + init_config.model_dir, label=f"{MODEL_DIR_HELP}" + ) + input = gr.Textbox( + init_config.input, + label=f"Input file or directory. {INPUT_HELP}", + ) + output_dir = gr.Textbox( + init_config.output_dir, + label=f"Output directory. {OUTPUT_DIR_HELP}", + ) + mask_dir = gr.Textbox( + init_config.mask_dir, + label=f"Mask directory. {MASK_DIR_HELP}", + ) + + with gr.Tab("Plugins"): + with gr.Row(): + enable_interactive_seg = gr.Checkbox( + init_config.enable_interactive_seg, label=INTERACTIVE_SEG_HELP + ) + interactive_seg_model = gr.Radio( + InteractiveSegModel.values(), + label=f"Segment Anything models. 
{INTERACTIVE_SEG_MODEL_HELP}", + value=init_config.interactive_seg_model, + ) + interactive_seg_device = gr.Radio( + Device.values(), + label="Segment Anything Device", + value=init_config.interactive_seg_device, + ) + with gr.Row(): + enable_remove_bg = gr.Checkbox( + init_config.enable_remove_bg, label=REMOVE_BG_HELP + ) + remove_bg_model = gr.Radio( + RemoveBGModel.values(), + label="Remove bg model", + value=init_config.remove_bg_model, + ) + with gr.Row(): + enable_anime_seg = gr.Checkbox( + init_config.enable_anime_seg, label=ANIMESEG_HELP + ) + + with gr.Row(): + enable_realesrgan = gr.Checkbox( + init_config.enable_realesrgan, label=REALESRGAN_HELP + ) + realesrgan_device = gr.Radio( + Device.values(), + label="RealESRGAN Device", + value=init_config.realesrgan_device, + ) + realesrgan_model = gr.Radio( + RealESRGANModel.values(), + label="RealESRGAN model", + value=init_config.realesrgan_model, + ) + with gr.Row(): + enable_gfpgan = gr.Checkbox( + init_config.enable_gfpgan, label=GFPGAN_HELP + ) + gfpgan_device = gr.Radio( + Device.values(), + label="GFPGAN Device", + value=init_config.gfpgan_device, + ) + with gr.Row(): + enable_restoreformer = gr.Checkbox( + init_config.enable_restoreformer, label=RESTOREFORMER_HELP + ) + restoreformer_device = gr.Radio( + Device.values(), + label="RestoreFormer Device", + value=init_config.restoreformer_device, + ) + + downloaded_model.change(change_current_model, [downloaded_model], model) + recommend_model.change(change_current_model, [recommend_model], model) + + save_btn.click( + save_config, + [ + host, + port, + model, + model_dir, + no_half, + low_mem, + cpu_offload, + disable_nsfw_checker, + local_files_only, + cpu_textencoder, + device, + input, + mask_dir, + output_dir, + quality, + enable_interactive_seg, + interactive_seg_model, + interactive_seg_device, + enable_remove_bg, + remove_bg_model, + enable_anime_seg, + enable_realesrgan, + realesrgan_device, + realesrgan_model, + enable_gfpgan, + gfpgan_device, + enable_restoreformer, + restoreformer_device, + inbrowser, + ], + message, + ) + demo.launch(inbrowser=True, show_api=False)
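
For reference, a minimal usage sketch of the config round-trip that web_config.py implements above: load_config falls back to default_configs when the JSON file is missing or malformed, and the "Save configurations" button serializes the current WebConfig back to the same path via save_config. The config file name below and the direct call to main() are illustrative assumptions, not part of the diff; in the project itself this module is presumably invoked through the package CLI rather than called directly.

# Illustrative sketch only; the config file path is a hypothetical example.
from pathlib import Path

from inpaint.web_config import load_config, main

config_path = Path("web_config.json")  # hypothetical location, not defined in the diff

# Launch the Gradio editor; clicking "Save configurations" writes the selected
# options to config_path as JSON (WebConfig.model_dump_json in save_config).
main(config_path)

# The same file can later be read back into a validated WebConfig instance;
# missing or unparsable files fall back to default_configs.
cfg = load_config(config_path)
print(cfg.model, cfg.device, cfg.quality)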