jbilcke-hf committed on
Commit 8822914 · Parent: 3180dc0

Convert AI-Toolkit to a HF Space

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. .gitattributes +3 -0
  2. .gitignore +184 -0
  3. .vscode/launch.json +56 -0
  4. CLAUDE.md +123 -0
  5. Dockerfile +83 -0
  6. FAQ.md +10 -0
  7. LICENSE +21 -0
  8. README.md +482 -6
  9. assets/VAE_test1.jpg +3 -0
  10. assets/glif.svg +40 -0
  11. assets/lora_ease_ui.png +3 -0
  12. build_and_push_docker +29 -0
  13. build_and_push_docker_dev +21 -0
  14. config/examples/extract.example.yml +75 -0
  15. config/examples/generate.example.yaml +60 -0
  16. config/examples/mod_lora_scale.yaml +48 -0
  17. config/examples/modal/modal_train_lora_flux_24gb.yaml +96 -0
  18. config/examples/modal/modal_train_lora_flux_schnell_24gb.yaml +98 -0
  19. config/examples/train_flex_redux.yaml +112 -0
  20. config/examples/train_full_fine_tune_flex.yaml +107 -0
  21. config/examples/train_full_fine_tune_lumina.yaml +99 -0
  22. config/examples/train_lora_chroma_24gb.yaml +104 -0
  23. config/examples/train_lora_flex2_24gb.yaml +165 -0
  24. config/examples/train_lora_flex_24gb.yaml +101 -0
  25. config/examples/train_lora_flux_24gb.yaml +96 -0
  26. config/examples/train_lora_flux_kontext_24gb.yaml +106 -0
  27. config/examples/train_lora_flux_schnell_24gb.yaml +98 -0
  28. config/examples/train_lora_hidream_48.yaml +112 -0
  29. config/examples/train_lora_lumina.yaml +96 -0
  30. config/examples/train_lora_omnigen2_24gb.yaml +94 -0
  31. config/examples/train_lora_qwen_image_24gb.yaml +95 -0
  32. config/examples/train_lora_qwen_image_edit_32gb.yaml +102 -0
  33. config/examples/train_lora_sd35_large_24gb.yaml +97 -0
  34. config/examples/train_lora_wan21_14b_24gb.yaml +101 -0
  35. config/examples/train_lora_wan21_1b_24gb.yaml +90 -0
  36. config/examples/train_lora_wan22_14b_24gb.yaml +111 -0
  37. config/examples/train_slider.example.yml +230 -0
  38. docker-compose.yml +25 -0
  39. extensions/example/ExampleMergeModels.py +129 -0
  40. extensions/example/__init__.py +25 -0
  41. extensions/example/config/config.example.yaml +48 -0
  42. extensions_built_in/.DS_Store +0 -0
  43. extensions_built_in/advanced_generator/Img2ImgGenerator.py +256 -0
  44. extensions_built_in/advanced_generator/PureLoraGenerator.py +102 -0
  45. extensions_built_in/advanced_generator/ReferenceGenerator.py +212 -0
  46. extensions_built_in/advanced_generator/__init__.py +59 -0
  47. extensions_built_in/advanced_generator/config/train.example.yaml +91 -0
  48. extensions_built_in/concept_replacer/ConceptReplacer.py +151 -0
  49. extensions_built_in/concept_replacer/__init__.py +26 -0
  50. extensions_built_in/concept_replacer/config/train.example.yaml +91 -0
.gitattributes CHANGED
@@ -33,3 +33,6 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ assets/lora_ease_ui.png filter=lfs diff=lfs merge=lfs -text
+ assets/VAE_test1.jpg filter=lfs diff=lfs merge=lfs -text
+ toolkit/timestep_weighing/flex_timestep_weights_plot.png filter=lfs diff=lfs merge=lfs -text
.gitignore ADDED
@@ -0,0 +1,184 @@
1
+ # Byte-compiled / optimized / DLL files
2
+ __pycache__/
3
+ *.py[cod]
4
+ *$py.class
5
+
6
+ # C extensions
7
+ *.so
8
+
9
+ # Distribution / packaging
10
+ .Python
11
+ build/
12
+ develop-eggs/
13
+ dist/
14
+ downloads/
15
+ eggs/
16
+ .eggs/
17
+ lib/
18
+ lib64/
19
+ parts/
20
+ sdist/
21
+ var/
22
+ wheels/
23
+ share/python-wheels/
24
+ *.egg-info/
25
+ .installed.cfg
26
+ *.egg
27
+ MANIFEST
28
+
29
+ # PyInstaller
30
+ # Usually these files are written by a python script from a template
31
+ # before PyInstaller builds the exe, so as to inject date/other infos into it.
32
+ *.manifest
33
+ *.spec
34
+
35
+ # Installer logs
36
+ pip-log.txt
37
+ pip-delete-this-directory.txt
38
+
39
+ # Unit test / coverage reports
40
+ htmlcov/
41
+ .tox/
42
+ .nox/
43
+ .coverage
44
+ .coverage.*
45
+ .cache
46
+ nosetests.xml
47
+ coverage.xml
48
+ *.cover
49
+ *.py,cover
50
+ .hypothesis/
51
+ .pytest_cache/
52
+ cover/
53
+
54
+ # Translations
55
+ *.mo
56
+ *.pot
57
+
58
+ # Django stuff:
59
+ *.log
60
+ local_settings.py
61
+ db.sqlite3
62
+ db.sqlite3-journal
63
+
64
+ # Flask stuff:
65
+ instance/
66
+ .webassets-cache
67
+
68
+ # Scrapy stuff:
69
+ .scrapy
70
+
71
+ # Sphinx documentation
72
+ docs/_build/
73
+
74
+ # PyBuilder
75
+ .pybuilder/
76
+ target/
77
+
78
+ # Jupyter Notebook
79
+ .ipynb_checkpoints
80
+
81
+ # IPython
82
+ profile_default/
83
+ ipython_config.py
84
+
85
+ # pyenv
86
+ # For a library or package, you might want to ignore these files since the code is
87
+ # intended to run in multiple environments; otherwise, check them in:
88
+ # .python-version
89
+
90
+ # pipenv
91
+ # According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
92
+ # However, in case of collaboration, if having platform-specific dependencies or dependencies
93
+ # having no cross-platform support, pipenv may install dependencies that don't work, or not
94
+ # install all needed dependencies.
95
+ #Pipfile.lock
96
+
97
+ # poetry
98
+ # Similar to Pipfile.lock, it is generally recommended to include poetry.lock in version control.
99
+ # This is especially recommended for binary packages to ensure reproducibility, and is more
100
+ # commonly ignored for libraries.
101
+ # https://python-poetry.org/docs/basic-usage/#commit-your-poetrylock-file-to-version-control
102
+ #poetry.lock
103
+
104
+ # pdm
105
+ # Similar to Pipfile.lock, it is generally recommended to include pdm.lock in version control.
106
+ #pdm.lock
107
+ # pdm stores project-wide configurations in .pdm.toml, but it is recommended to not include it
108
+ # in version control.
109
+ # https://pdm.fming.dev/#use-with-ide
110
+ .pdm.toml
111
+
112
+ # PEP 582; used by e.g. github.com/David-OConnor/pyflow and github.com/pdm-project/pdm
113
+ __pypackages__/
114
+
115
+ # Celery stuff
116
+ celerybeat-schedule
117
+ celerybeat.pid
118
+
119
+ # SageMath parsed files
120
+ *.sage.py
121
+
122
+ # Environments
123
+ .env
124
+ .venv
125
+ env/
126
+ venv/
127
+ ENV/
128
+ env.bak/
129
+ venv.bak/
130
+
131
+ # Spyder project settings
132
+ .spyderproject
133
+ .spyproject
134
+
135
+ # Rope project settings
136
+ .ropeproject
137
+
138
+ # mkdocs documentation
139
+ /site
140
+
141
+ # mypy
142
+ .mypy_cache/
143
+ .dmypy.json
144
+ dmypy.json
145
+
146
+ # Pyre type checker
147
+ .pyre/
148
+
149
+ # pytype static type analyzer
150
+ .pytype/
151
+
152
+ # Cython debug symbols
153
+ cython_debug/
154
+
155
+ # PyCharm
156
+ # JetBrains specific template is maintained in a separate JetBrains.gitignore that can
157
+ # be found at https://github.com/github/gitignore/blob/main/Global/JetBrains.gitignore
158
+ # and can be added to the global gitignore or merged into this file. For a more nuclear
159
+ # option (not recommended) you can uncomment the following to ignore the entire idea folder.
160
+ .idea/
161
+
162
+ /env.sh
163
+ /models
164
+ /datasets
165
+ /custom/*
166
+ !/custom/.gitkeep
167
+ /.tmp
168
+ /venv.bkp
169
+ /venv.*
170
+ /config/*
171
+ !/config/examples
172
+ !/config/_PUT_YOUR_CONFIGS_HERE).txt
173
+ /output/*
174
+ !/output/.gitkeep
175
+ /extensions/*
176
+ !/extensions/example
177
+ /temp
178
+ /wandb
179
+ .vscode/settings.json
180
+ .DS_Store
181
+ ._.DS_Store
182
+ aitk_db.db
183
+ /notes.md
184
+ /data
.vscode/launch.json ADDED
@@ -0,0 +1,56 @@
1
+ {
2
+ "version": "0.2.0",
3
+ "configurations": [
4
+ {
5
+ "name": "Run current config",
6
+ "type": "python",
7
+ "request": "launch",
8
+ "program": "${workspaceFolder}/run.py",
9
+ "args": [
10
+ "${file}"
11
+ ],
12
+ "env": {
13
+ "CUDA_LAUNCH_BLOCKING": "1",
14
+ "DEBUG_TOOLKIT": "1"
15
+ },
16
+ "console": "integratedTerminal",
17
+ "justMyCode": false
18
+ },
19
+ {
20
+ "name": "Run current config (cuda:1)",
21
+ "type": "python",
22
+ "request": "launch",
23
+ "program": "${workspaceFolder}/run.py",
24
+ "args": [
25
+ "${file}"
26
+ ],
27
+ "env": {
28
+ "CUDA_LAUNCH_BLOCKING": "1",
29
+ "DEBUG_TOOLKIT": "1",
30
+ "CUDA_VISIBLE_DEVICES": "1"
31
+ },
32
+ "console": "integratedTerminal",
33
+ "justMyCode": false
34
+ },
35
+ {
36
+ "name": "Python: Debug Current File",
37
+ "type": "python",
38
+ "request": "launch",
39
+ "program": "${file}",
40
+ "console": "integratedTerminal",
41
+ "justMyCode": false
42
+ },
43
+ {
44
+ "name": "Python: Debug Current File (cuda:1)",
45
+ "type": "python",
46
+ "request": "launch",
47
+ "program": "${file}",
48
+ "console": "integratedTerminal",
49
+ "env": {
50
+ "CUDA_LAUNCH_BLOCKING": "1",
51
+ "CUDA_VISIBLE_DEVICES": "1"
52
+ },
53
+ "justMyCode": false
54
+ },
55
+ ]
56
+ }
CLAUDE.md ADDED
@@ -0,0 +1,123 @@
1
+ # CLAUDE.md
2
+
3
+ This file provides guidance to Claude Code (claude.ai/code) when working with code in this repository.
4
+
5
+ ## Project Overview
6
+
7
+ This is the AI Toolkit by Ostris, packaged as a Hugging Face Space for Docker deployment. It is a comprehensive training suite for diffusion models that supports the latest models on consumer-grade hardware. The toolkit includes both CLI and web UI interfaces for training LoRA models, with a particular focus on FLUX.1 models.
8
+
9
+ ## Architecture
10
+
11
+ ### Core Structure
12
+ - **Main Entry Points**:
13
+ - `run.py` - CLI interface for running training jobs with config files
14
+ - `flux_train_ui.py` - Gradio-based simple training interface
15
+ - `start.sh` - Docker entry point that launches the web UI
16
+
17
+ - **Web UI** (`ui/`): Next.js application with TypeScript
18
+ - Frontend in `src/app/` with API routes
19
+ - Background worker process for job management
20
+ - SQLite database via Prisma for job persistence
21
+
22
+ - **Core Toolkit** (`toolkit/`): Python modules for ML operations
23
+ - Model implementations in `toolkit/models/`
24
+ - Training processes in `jobs/process/`
25
+ - Configuration management and data loading utilities
26
+
27
+ - **Extensions** (`extensions_built_in/`): Modular training components
28
+ - Support for various model types (FLUX, SDXL, SD 1.5, etc.)
29
+ - Different training strategies (LoRA, fine-tuning, etc.)
30
+
31
+ ### Key Configuration
32
+ - Training configs in `config/examples/`, in YAML format (see the sketch after this list)
33
+ - Docker setup supports GPU passthrough with nvidia runtime
34
+ - Environment variables for HuggingFace tokens and authentication
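+
+ As a rough illustration of the overall config shape, here is a minimal sketch based on the examples in `config/examples/`; treat the exact keys and values as assumptions and defer to the example files:
+
+ ```yaml
+ job: extension
+ config:
+   name: "my_first_flux_lora_v1"      # also used for the output folder name
+   process:
+     - type: "sd_trainer"             # core training logic in extensions_built_in/sd_trainer
+       trigger_word: "your_trigger"   # assumed; replaces [trigger] in captions
+       network:
+         type: "lora"
+         linear: 16
+         linear_alpha: 16
+       model:
+         name_or_path: "black-forest-labs/FLUX.1-dev"
+         is_flux: true
+         quantize: true
+ ```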
35
+
36
+ ## Common Development Commands
37
+
38
+ ### Setup and Installation
39
+ ```bash
40
+ # Python environment setup
41
+ python3 -m venv venv
42
+ source venv/bin/activate # or .\venv\Scripts\activate on Windows
43
+ pip3 install --no-cache-dir torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu126
44
+ pip3 install -r requirements.txt
45
+ ```
46
+
47
+ ### Running Training Jobs
48
+ ```bash
49
+ # CLI training with config file
50
+ python run.py config/your_config.yml
51
+
52
+ # Simple Gradio UI for FLUX training
53
+ python flux_train_ui.py
54
+ ```
55
+
56
+ ### Web UI Development
57
+ ```bash
58
+ # Development mode (from ui/ directory)
59
+ cd ui
60
+ npm install
61
+ npm run dev
62
+
63
+ # Production build and start
64
+ npm run build_and_start
65
+
66
+ # Database updates
67
+ npm run update_db
68
+ ```
69
+
70
+ ### Docker Operations
71
+ ```bash
72
+ # Run with docker-compose
73
+ docker-compose up
74
+
75
+ # Build custom image
76
+ docker build -f docker/Dockerfile -t ai-toolkit .
77
+ ```
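+
+ A hypothetical `docker-compose.yml` sketch for GPU passthrough; the service name, volume paths, and values here are assumptions, and the repo's actual `docker-compose.yml` is authoritative:
+
+ ```yaml
+ services:
+   ai-toolkit:
+     image: ostris/aitoolkit:latest     # image pushed by build_and_push_docker
+     ports:
+       - "8675:8675"                    # UI port exposed by the image
+     environment:
+       - HF_TOKEN=${HF_TOKEN}
+       - AI_TOOLKIT_AUTH=${AI_TOOLKIT_AUTH:-password}
+     volumes:
+       - ./datasets:/app/ai-toolkit/datasets   # assumed mount points
+       - ./output:/app/ai-toolkit/output
+       - ./config:/app/ai-toolkit/config
+     deploy:
+       resources:
+         reservations:
+           devices:
+             - driver: nvidia
+               count: all
+               capabilities: [gpu]
+ ```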
78
+
79
+ ## Authentication Requirements
80
+
81
+ ### HuggingFace Access
82
+ - FLUX.1-dev requires accepting license at https://huggingface.co/black-forest-labs/FLUX.1-dev
83
+ - Set `HF_TOKEN` environment variable with READ access token
84
+ - Create `.env` file in root: `HF_TOKEN=your_key_here`
85
+
86
+ ### UI Security
87
+ - Set `AI_TOOLKIT_AUTH` environment variable for UI authentication
88
+ - Default password is "password" if not set
89
+
90
+ ## Training Configuration
91
+
92
+ ### Model Support
93
+ - **FLUX.1-dev**: Requires HF token, non-commercial license
94
+ - **FLUX.1-schnell**: Apache 2.0, needs training adapter
95
+ - **SDXL, SD 1.5**: Standard Stable Diffusion models
96
+ - **Video models**: Various I2V and text-to-video architectures
97
+
98
+ ### Memory Requirements
99
+ - FLUX.1 training requires minimum 24GB VRAM
100
+ - Use `low_vram: true` in config if running with displays attached
101
+ - Supports various quantization options to reduce memory usage
102
+
103
+ ### Dataset Format
104
+ - Images: JPG, JPEG, PNG (no WebP)
105
+ - Captions: `.txt` files with same name as images
106
+ - Use `[trigger]` placeholder in captions, replaced by `trigger_word` config
107
+ - Images auto-resized and bucketed, no manual preprocessing needed
108
+
109
+ ## Key Files to Understand
110
+
111
+ - `run.py:46-85` - Main training job runner and argument parsing
112
+ - `toolkit/job.py` - Job management and configuration loading
113
+ - `ui/src/app/api/jobs/route.ts` - API endpoints for job management
114
+ - `config/examples/train_lora_flux_24gb.yaml` - Standard FLUX training template
115
+ - `extensions_built_in/sd_trainer/SDTrainer.py` - Core training logic
116
+
117
+ ## Development Notes
118
+
119
+ - Jobs run independently of the UI; the UI is only for job management
120
+ - Training can be stopped/resumed via checkpoints
121
+ - Output stored in `output/` directory with samples and models
122
+ - Extensions system allows custom training implementations
123
+ - Multi-GPU support via accelerate library
Dockerfile ADDED
@@ -0,0 +1,83 @@
1
+ FROM nvidia/cuda:12.8.1-devel-ubuntu22.04
2
+
3
+ LABEL authors="jaret"
4
+
5
+ # Set noninteractive to avoid timezone prompts
6
+ ENV DEBIAN_FRONTEND=noninteractive
7
+
8
+ # ref https://en.wikipedia.org/wiki/CUDA
9
+ ENV TORCH_CUDA_ARCH_LIST="8.0 8.6 8.9 9.0 10.0 12.0"
10
+
11
+ # Install dependencies
12
+ RUN apt-get update && apt-get install --no-install-recommends -y \
13
+ git \
14
+ curl \
15
+ build-essential \
16
+ cmake \
17
+ wget \
18
+ python3.10 \
19
+ python3-pip \
20
+ python3-dev \
21
+ python3-setuptools \
22
+ python3-wheel \
23
+ python3-venv \
24
+ ffmpeg \
25
+ tmux \
26
+ htop \
27
+ nvtop \
28
+ python3-opencv \
29
+ openssh-client \
30
+ openssh-server \
31
+ openssl \
32
+ rsync \
33
+ unzip \
34
+ && apt-get clean \
35
+ && rm -rf /var/lib/apt/lists/*
36
+
37
+ # Install nodejs
38
+ WORKDIR /tmp
39
+ RUN curl -sL https://deb.nodesource.com/setup_23.x -o nodesource_setup.sh && \
40
+ bash nodesource_setup.sh && \
41
+ apt-get update && \
42
+ apt-get install -y nodejs && \
43
+ apt-get clean && \
44
+ rm -rf /var/lib/apt/lists/*
45
+
46
+ WORKDIR /app
47
+
48
+ # Alias python3 as python
49
+ RUN ln -s /usr/bin/python3 /usr/bin/python
50
+
51
+ # install pytorch before cache bust to avoid redownloading pytorch
52
+ RUN pip install --pre --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128
53
+
54
+ # Fix cache busting by moving CACHEBUST to right before git clone
55
+ ARG CACHEBUST=1234
56
+ ARG GIT_COMMIT=main
57
+ RUN echo "Cache bust: ${CACHEBUST}" && \
58
+ git clone https://github.com/ostris/ai-toolkit.git && \
59
+ cd ai-toolkit && \
60
+ git checkout ${GIT_COMMIT}
61
+
62
+ WORKDIR /app/ai-toolkit
63
+
64
+ # Install Python dependencies
65
+ RUN pip install --no-cache-dir -r requirements.txt && \
66
+ pip install --pre --no-cache-dir torch torchvision torchaudio --index-url https://download.pytorch.org/whl/nightly/cu128 --force && \
67
+ pip install setuptools==69.5.1 --no-cache-dir
68
+
69
+ # Build UI (generate temporary Prisma client for Next.js build)
70
+ WORKDIR /app/ai-toolkit/ui
71
+ RUN npm install && \
72
+ npx prisma generate --schema prisma/schema.prisma && \
73
+ npm run build
74
+
75
+ # Expose the UI port (the application listens on 8675)
76
+ EXPOSE 8675
77
+
78
+ WORKDIR /
79
+
80
+ COPY start.sh /start.sh
81
+ RUN chmod +x /start.sh
82
+
83
+ CMD ["/start.sh"]
FAQ.md ADDED
@@ -0,0 +1,10 @@
1
+ # FAQ
2
+
3
+ WIP. Will continue to add things as they are needed.
4
+
5
+ ## FLUX.1 Training
6
+
7
+ #### How much VRAM is required to train a LoRA on FLUX.1?
8
+
9
+ 24GB minimum is required.
10
+
LICENSE ADDED
@@ -0,0 +1,21 @@
1
+ MIT License
2
+
3
+ Copyright (c) 2024 Ostris, LLC
4
+
5
+ Permission is hereby granted, free of charge, to any person obtaining a copy
6
+ of this software and associated documentation files (the "Software"), to deal
7
+ in the Software without restriction, including without limitation the rights
8
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
9
+ copies of the Software, and to permit persons to whom the Software is
10
+ furnished to do so, subject to the following conditions:
11
+
12
+ The above copyright notice and this permission notice shall be included in all
13
+ copies or substantial portions of the Software.
14
+
15
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
18
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
20
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
21
+ SOFTWARE.
README.md CHANGED
@@ -1,11 +1,487 @@
1
  ---
2
- title: Ai Toolkit
3
- emoji: 🚀
4
- colorFrom: red
5
- colorTo: gray
6
  sdk: docker
7
- pinned: false
8
  short_description: Ostris AI Toolkit running as a HF space
 
9
  ---
10
 
11
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
1
  ---
2
+ title: AI Toolkit
3
+ emoji: 📹
4
+ colorFrom: gray
5
+ colorTo: red
6
  sdk: docker
7
+ pinned: true
8
  short_description: Ostris AI Toolkit running as a HF space
9
+ app_port: 8675
10
  ---
11
 
12
+ # AI Toolkit by Ostris
13
+
14
+ AI Toolkit is an all-in-one training suite for diffusion models. I try to support all the latest image and video models on consumer-grade hardware. It can be run from a GUI or the CLI, and it is designed to be easy to use while still offering every feature imaginable.
15
+
16
+ ## Support My Work
17
+
18
+ If you enjoy my projects or use them commercially, please consider sponsoring me. Every bit helps! 💖
19
+
20
+ [Sponsor on GitHub](https://github.com/orgs/ostris) | [Support on Patreon](https://www.patreon.com/ostris) | [Donate on PayPal](https://www.paypal.com/donate/?hosted_button_id=9GEFUKC8T9R9W)
21
+
22
+ ### Current Sponsors
23
+
24
+ All of these people / organizations are the ones who selflessly make this project possible. Thank you!!
25
+
26
+ _Last updated: 2025-08-08 17:01 UTC_
27
+
28
+ <p align="center">
29
+ <a href="https://x.com/NuxZoe" target="_blank" rel="noopener noreferrer"><img src="https://pbs.twimg.com/profile_images/1919488160125616128/QAZXTMEj_400x400.png" alt="a16z" width="200" height="200" style="border-radius:8px;margin:5px;display: inline-block;"></a>
30
+ <a href="https://github.com/replicate" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/60410876?v=4" alt="Replicate" width="200" height="200" style="border-radius:8px;margin:5px;display: inline-block;"></a>
31
+ <a href="https://github.com/huggingface" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/25720743?v=4" alt="Hugging Face" width="200" height="200" style="border-radius:8px;margin:5px;display: inline-block;"></a>
32
+ <a href="https://github.com/josephrocca" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/1167575?u=92d92921b4cb5c8c7e225663fed53c4b41897736&v=4" alt="josephrocca" width="200" height="200" style="border-radius:8px;margin:5px;display: inline-block;"></a>
33
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/162524101/81a72689c3754ac5b9e38612ce5ce914/eyJ3IjoyMDB9/1.png?token-hash=JHRjAxd2XxV1aXIUijj-l65pfTnLoefYSvgNPAsw2lI%3D" alt="Prasanth Veerina" width="200" height="200" style="border-radius:8px;margin:5px;display: inline-block;">
34
+ <a href="https://github.com/weights-ai" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/185568492?v=4" alt="Weights" width="200" height="200" style="border-radius:8px;margin:5px;display: inline-block;"></a>
35
+ </p>
36
+ <hr style="width:100%;border:none;height:2px;background:#ddd;margin:30px 0;">
37
+ <p align="center">
38
+ <img src="https://c8.patreon.com/4/200/93304/J" alt="Joseph Rocca" width="150" height="150" style="border-radius:8px;margin:5px;display: inline-block;">
39
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/161471720/dd330b4036d44a5985ed5985c12a5def/eyJ3IjoyMDB9/1.jpeg?token-hash=k1f4Vv7TevzYa9tqlzAjsogYmkZs8nrXQohPCDGJGkc%3D" alt="Vladimir Sotnikov" width="150" height="150" style="border-radius:8px;margin:5px;display: inline-block;">
40
+ <img src="https://c8.patreon.com/4/200/33158543/C" alt="clement Delangue" width="150" height="150" style="border-radius:8px;margin:5px;display: inline-block;">
41
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/8654302/b0f5ebedc62a47c4b56222693e1254e9/eyJ3IjoyMDB9/2.jpeg?token-hash=suI7_QjKUgWpdPuJPaIkElkTrXfItHlL8ZHLPT-w_d4%3D" alt="Misch Strotz" width="150" height="150" style="border-radius:8px;margin:5px;display: inline-block;">
42
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/120239481/49b1ce70d3d24704b8ec34de24ec8f55/eyJ3IjoyMDB9/1.jpeg?token-hash=o0y1JqSXqtGvVXnxb06HMXjQXs6OII9yMMx5WyyUqT4%3D" alt="nitish PNR" width="150" height="150" style="border-radius:8px;margin:5px;display: inline-block;">
43
+ </p>
44
+ <hr style="width:100%;border:none;height:2px;background:#ddd;margin:30px 0;">
45
+ <p align="center">
46
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/2298192/1228b69bd7d7481baf3103315183250d/eyJ3IjoyMDB9/1.jpg?token-hash=opN1e4r4Nnvqbtr8R9HI8eyf9m5F50CiHDOdHzb4UcA%3D" alt="Mohamed Oumoumad" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
47
+ <img src="https://c8.patreon.com/4/200/548524/S" alt="Steve Hanff" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
48
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/152118848/3b15a43d71714552b5ed1c9f84e66adf/eyJ3IjoyMDB9/1.png?token-hash=MKf3sWHz0MFPm_OAFjdsNvxoBfN5B5l54mn1ORdlRy8%3D" alt="Kristjan Retter" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
49
+ <img src="https://c8.patreon.com/4/200/83319230/M" alt="Miguel Lara" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
50
+ <img src="https://c8.patreon.com/4/200/8449560/P" alt="Patron" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
51
+ <a href="https://x.com/NuxZoe" target="_blank" rel="noopener noreferrer"><img src="https://pbs.twimg.com/profile_images/1916482710069014528/RDLnPRSg_400x400.jpg" alt="tungsten" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;"></a>
52
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/169502989/220069e79ce745b29237e94c22a729df/eyJ3IjoyMDB9/1.png?token-hash=E8E2JOqx66k2zMtYUw8Gy57dw-gVqA6OPpdCmWFFSFw%3D" alt="Timothy Bielec" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
53
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/34200989/58ae95ebda0640c8b7a91b4fa31357aa/eyJ3IjoyMDB9/1.jpeg?token-hash=4mVDM1kCYGauYa33zLG14_g0oj9_UjDK_-Qp4zk42GE%3D" alt="Noah Miller" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
54
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/27288932/6c35d2d961ee4e14a7a368c990791315/eyJ3IjoyMDB9/1.jpeg?token-hash=TGIto_PGEG2NEKNyqwzEnRStOkhrjb3QlMhHA3raKJY%3D" alt="David Garrido" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;">
55
+ <a href="https://x.com/RalFingerLP" target="_blank" rel="noopener noreferrer"><img src="https://pbs.twimg.com/profile_images/919595465041162241/ZU7X3T5k_400x400.jpg" alt="RalFinger" width="100" height="100" style="border-radius:8px;margin:5px;display: inline-block;"></a>
56
+ </p>
57
+ <hr style="width:100%;border:none;height:2px;background:#ddd;margin:30px 0;">
58
+ <p align="center">
59
+ <a href="http://www.ir-ltd.net" target="_blank" rel="noopener noreferrer"><img src="https://pbs.twimg.com/profile_images/1602579392198283264/6Tm2GYus_400x400.jpg" alt="IR-Entertainment Ltd" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;"></a>
60
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/9547341/bb35d9a222fd460e862e960ba3eacbaf/eyJ3IjoyMDB9/1.jpeg?token-hash=Q2XGDvkCbiONeWNxBCTeTMOcuwTjOaJ8Z-CAf5xq3Hs%3D" alt="Travis Harrington" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
61
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/98811435/3a3632d1795b4c2b9f8f0270f2f6a650/eyJ3IjoyMDB9/1.jpeg?token-hash=657rzuJ0bZavMRZW3XZ-xQGqm3Vk6FkMZgFJVMCOPdk%3D" alt="EmmanuelMr18" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
62
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/81275465/1e4148fe9c47452b838949d02dd9a70f/eyJ3IjoyMDB9/1.jpeg?token-hash=YAX1ucxybpCIujUCXfdwzUQkttIn3c7pfi59uaFPSwM%3D" alt="Aaron Amortegui" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
63
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/155963250/6f8fd7075c3b4247bfeb054ba49172d6/eyJ3IjoyMDB9/1.png?token-hash=z81EHmdU2cqSrwa9vJmZTV3h0LG-z9Qakhxq34FrYT4%3D" alt="Un Defined" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
64
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/45562978/0de33cf52ec642ae8a2f612cddec4ca6/eyJ3IjoyMDB9/1.jpeg?token-hash=aD4debMD5ZQjqTII6s4zYSgVK2-bdQt9p3eipi0bENs%3D" alt="Jack English" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
65
+ <img src="https://c8.patreon.com/4/200/27791680/J" alt="Jean-Tristan Marin" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
66
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/570742/4ceb33453a5a4745b430a216aba9280f/eyJ3IjoyMDB9/1.jpg?token-hash=nPcJ2zj3sloND9jvbnbYnob2vMXRnXdRuujthqDLWlU%3D" alt="Al H" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
67
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/82763/f99cc484361d4b9d94fe4f0814ada303/eyJ3IjoyMDB9/1.jpeg?token-hash=A3JWlBNL0b24FFWb-FCRDAyhs-OAxg-zrhfBXP_axuU%3D" alt="Doron Adler" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
68
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/103077711/bb215761cc004e80bd9cec7d4bcd636d/eyJ3IjoyMDB9/2.jpeg?token-hash=3U8kdZSUpnmeYIDVK4zK9TTXFpnAud_zOwBRXx18018%3D" alt="John Dopamine" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
69
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/99036356/7ae9c4d80e604e739b68cca12ee2ed01/eyJ3IjoyMDB9/3.png?token-hash=ZhsBMoTOZjJ-Y6h5NOmU5MT-vDb2fjK46JDlpEehkVQ%3D" alt="Noctre" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
70
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/141098579/1a9f0a1249d447a7a0df718a57343912/eyJ3IjoyMDB9/2.png?token-hash=_n-AQmPgY0FP9zCGTIEsr5ka4Y7YuaMkt3qL26ZqGg8%3D" alt="The Local Lab" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
71
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/93348210/5c650f32a0bc481d80900d2674528777/eyJ3IjoyMDB9/1.jpeg?token-hash=0jiknRw3jXqYWW6En8bNfuHgVDj4LI_rL7lSS4-_xlo%3D" alt="Armin Behjati" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
72
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/134129880/680c7e14cd1a4d1a9face921fb010f88/eyJ3IjoyMDB9/1.png?token-hash=5fqqHE6DCTbt7gDQL7VRcWkV71jF7FvWcLhpYl5aMXA%3D" alt="Bharat Prabhakar" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
73
+ <img src="https://c8.patreon.com/4/200/70218846/C" alt="Cosmosis" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
74
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/30931983/54ab4e4ceab946e79a6418d205f9ed51/eyJ3IjoyMDB9/1.png?token-hash=j2phDrgd6IWuqKqNIDbq9fR2B3fMF-GUCQSdETS1w5Y%3D" alt="HestoySeghuro ." width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
75
+ <img src="https://c8.patreon.com/4/200/4105384/J" alt="Jack Blakely" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
76
+ <img src="https://c8.patreon.com/4/200/4541423/S" alt="Sören " width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
77
+ <a href="https://www.youtube.com/@happyme7055" target="_blank" rel="noopener noreferrer"><img src="https://yt3.googleusercontent.com/ytc/AIdro_mFqhIRk99SoEWY2gvSvVp6u1SkCGMkRqYQ1OlBBeoOVp8=s160-c-k-c0x00ffffff-no-rj" alt="Marcus Rass" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;"></a>
78
+ <img src="https://c8.patreon.com/4/200/53077895/M" alt="Marc" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
79
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/157407541/bb9d80cffdab4334ad78366060561520/eyJ3IjoyMDB9/2.png?token-hash=WYz-U_9zabhHstOT5UIa5jBaoFwrwwqyWxWEzIR2m_c%3D" alt="Tokio Studio srl IT10640050968" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
80
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/44568304/a9d83a0e786b41b4bdada150f7c9271c/eyJ3IjoyMDB9/1.jpeg?token-hash=FtxnwrSrknQUQKvDRv2rqPceX2EF23eLq4pNQYM_fmw%3D" alt="Albert Bukoski" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
81
+ <img src="https://c8.patreon.com/4/200/5048649/B" alt="Ben Ward" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
82
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/111904990/08b1cf65be6a4de091c9b73b693b3468/eyJ3IjoyMDB9/1.png?token-hash=_Odz6RD3CxtubEHbUxYujcjw6zAajbo3w8TRz249VBA%3D" alt="Brian Smith" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
83
+ <img src="https://c8.patreon.com/4/200/494309/J" alt="Julian Tsependa" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
84
+ <img src="https://c8.patreon.com/4/200/5602036/K" alt="Kelevra" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
85
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/159203973/36c817f941ac4fa18103a4b8c0cb9cae/eyJ3IjoyMDB9/1.png?token-hash=zkt72HW3EoiIEAn3LSk9gJPBsXfuTVcc4rRBS3CeR8w%3D" alt="Marko jak" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
86
+ <img src="https://c8.patreon.com/4/200/24653779/R" alt="RayHell" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
87
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/76566911/6485eaf5ec6249a7b524ee0b979372f0/eyJ3IjoyMDB9/1.jpeg?token-hash=mwCSkTelDBaengG32NkN0lVl5mRjB-cwo6-a47wnOsU%3D" alt="the biitz" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
88
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/32633822/1ab5612efe80417cbebfe91e871fc052/eyJ3IjoyMDB9/1.png?token-hash=pOS_IU3b3RL5-iL96A3Xqoj2bQ-dDo4RUkBylcMED_s%3D" alt="Zack Abrams" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
89
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/97985240/3d1d0e6905d045aba713e8132cab4a30/eyJ3IjoyMDB9/1.png?token-hash=fRavvbO_yqWKA_OsJb5DzjfKZ1Yt-TG-ihMoeVBvlcM%3D" alt="עומר מכלוף" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
90
+ <a href="https://github.com/julien-blanchon" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/11278197?v=4" alt="Blanchon" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;"></a>
91
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/11198131/e696d9647feb4318bcf16243c2425805/eyJ3IjoyMDB9/1.jpeg?token-hash=c2c2p1SaiX86iXAigvGRvzm4jDHvIFCg298A49nIfUM%3D" alt="Nicholas Agranoff" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
92
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/785333/bdb9ede5765d42e5a2021a86eebf0d8f/eyJ3IjoyMDB9/2.jpg?token-hash=l_rajMhxTm6wFFPn7YdoKBxeUqhdRXKdy6_8SGCuNsE%3D" alt="Sapjes " width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
93
+ <img src="https://c8.patreon.com/4/200/2446176/S" alt="Scott VanKirk" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
94
+ <img src="https://c8.patreon.com/4/200/83034/W" alt="william tatum" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
95
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/138787189/2b5662dcb638466282ac758e3ac651b4/eyJ3IjoyMDB9/1.png?token-hash=zwj7MScO18vhDxhKt6s5q4gdeNJM3xCLuhSt8zlqlZs%3D" alt="Антон Антонио" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
96
+ <img src="https://c8.patreon.com/4/200/30530914/T" alt="Techer " width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
97
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/25209707/36ae876d662d4d85aaf162b6d67d31e7/eyJ3IjoyMDB9/1.png?token-hash=Zows_A6uqlY5jClhfr4Y3QfMnDKVkS3mbxNHUDkVejo%3D" alt="fjioq8" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
98
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/46680573/ee3d99c04a674dd5a8e1ecfb926db6a2/eyJ3IjoyMDB9/1.jpeg?token-hash=cgD4EXyfZMPnXIrcqWQ5jGqzRUfqjPafb9yWfZUPB4Q%3D" alt="Neil Murray" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
99
+ <img src="https://ostris.com/wp-content/uploads/2025/08/supporter_default.jpg" alt="Joakim Sällström" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
100
+ <img src="https://c8.patreon.com/4/200/63510241/A" alt="Andrew Park" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
101
+ <a href="https://github.com/Spikhalskiy" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/532108?u=2464983638afea8caf4cd9f0e4a7bc3e6a63bb0a&v=4" alt="Dmitry Spikhalsky" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;"></a>
102
+ <img src="https://c8.patreon.com/4/200/88567307/E" alt="el Chavo" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
103
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/117569999/55f75c57f95343e58402529cec852b26/eyJ3IjoyMDB9/1.jpeg?token-hash=squblHZH4-eMs3gI46Uqu1oTOK9sQ-0gcsFdZcB9xQg%3D" alt="James Thompson" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
104
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/66157709/6fe70df085e24464995a1a9293a53760/eyJ3IjoyMDB9/1.jpeg?token-hash=eqe0wvg6JfbRUGMKpL_x3YPI5Ppf18aUUJe2EzADU-g%3D" alt="Joey Santana" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
105
+ <img src="https://ostris.com/wp-content/uploads/2025/08/supporter_default.jpg" alt="Heikki Rinkinen" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
106
+ <img src="https://c8.patreon.com/4/200/6175608/B" alt="Bobbie " width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
107
+ <a href="https://github.com/Slartibart23" target="_blank" rel="noopener noreferrer"><img src="https://avatars.githubusercontent.com/u/133593860?u=31217adb2522fb295805824ffa7e14e8f0fca6fa&v=4" alt="Slarti" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;"></a>
108
+ <img src="https://ostris.com/wp-content/uploads/2025/08/supporter_default.jpg" alt="Tommy Falkowski" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
109
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/28533016/e8f6044ccfa7483f87eeaa01c894a773/eyJ3IjoyMDB9/2.png?token-hash=ak-h3JWB50hyenCavcs32AAPw6nNhmH2nBFKpdk5hvM%3D" alt="William Tatum" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
110
+ <img src="https://ostris.com/wp-content/uploads/2025/08/supporter_default.jpg" alt="Karol Stępień" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
111
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/156564939/17dbfd45c59d4cf29853d710cb0c5d6f/eyJ3IjoyMDB9/1.png?token-hash=e6wXA_S8cgJeEDI9eJK934eB0TiM8mxJm9zW_VH0gDU%3D" alt="Hans Untch" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
112
+ <img src="https://c8.patreon.com/4/200/59408413/B" alt="ByteC" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
113
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/3712451/432e22a355494ec0a1ea1927ff8d452e/eyJ3IjoyMDB9/7.jpeg?token-hash=OpQ9SAfVQ4Un9dSYlGTHuApZo5GlJ797Mo0DtVtMOSc%3D" alt="David Shorey" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
114
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/53634141/c1441f6c605344bbaef885d4272977bb/eyJ3IjoyMDB9/1.JPG?token-hash=Aizd6AxQhY3n6TBE5AwCVeSwEBbjALxQmu6xqc08qBo%3D" alt="Jana Spacelight" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
115
+ <img src="https://c8.patreon.com/4/200/11180426/J" alt="jarrett towe" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
116
+ <img src="https://c8.patreon.com/4/200/21828017/J" alt="Jim" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
117
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/63232055/2300b4ab370341b5b476902c9b8218ee/eyJ3IjoyMDB9/1.png?token-hash=R9Nb4O0aLBRwxT1cGHUMThlvf6A2MD5SO88lpZBdH7M%3D" alt="Marek P" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
118
+ <img src="https://c8.patreon.com/4/200/9944625/P" alt="Pomoe " width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
119
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/25047900/423e4cb73aba457f8f9c6e5582eddaeb/eyJ3IjoyMDB9/1.jpeg?token-hash=81RvQXBbT66usxqtyWum9Ul4oBn3qHK1cM71IvthC-U%3D" alt="Ruairi Robinson" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
120
+ <img src="https://c10.patreonusercontent.com/4/patreon-media/p/user/178476551/0b9e83efcd234df5a6bea30d59e6c1cd/eyJ3IjoyMDB9/1.png?token-hash=3XoYMrMxk-K6GelM22mE-FwkjFulX9hpIL7QI3wO2jI%3D" alt="Timmy" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
121
+ <img src="https://c8.patreon.com/4/200/10876902/T" alt="Tyssel" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
122
+ <img src="https://ostris.com/wp-content/uploads/2025/08/supporter_default.jpg" alt="Juan Franco" width="60" height="60" style="border-radius:8px;margin:5px;display: inline-block;">
123
+ </p>
124
+
125
+ ---
126
+
127
+
128
+
129
+
130
+ ## Installation
131
+
132
+ Requirements:
133
+ - python >3.10
134
+ - Nvidia GPU with enough VRAM for what you want to train
135
+ - python venv
136
+ - git
137
+
138
+
139
+ Linux:
140
+ ```bash
141
+ git clone https://github.com/ostris/ai-toolkit.git
142
+ cd ai-toolkit
143
+ python3 -m venv venv
144
+ source venv/bin/activate
145
+ # install torch first
146
+ pip3 install --no-cache-dir torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu126
147
+ pip3 install -r requirements.txt
148
+ ```
149
+
150
+ Windows:
151
+
152
+ If you are having issues on Windows, I recommend using the easy install script at [https://github.com/Tavris1/AI-Toolkit-Easy-Install](https://github.com/Tavris1/AI-Toolkit-Easy-Install)
153
+
154
+ ```bash
155
+ git clone https://github.com/ostris/ai-toolkit.git
156
+ cd ai-toolkit
157
+ python -m venv venv
158
+ .\venv\Scripts\activate
159
+ pip install --no-cache-dir torch==2.7.0 torchvision==0.22.0 torchaudio==2.7.0 --index-url https://download.pytorch.org/whl/cu126
160
+ pip install -r requirements.txt
161
+ ```
162
+
163
+
164
+ # AI Toolkit UI
165
+
166
+ <img src="https://ostris.com/wp-content/uploads/2025/02/toolkit-ui.jpg" alt="AI Toolkit UI" width="100%">
167
+
168
+ The AI Toolkit UI is a web interface for the AI Toolkit. It lets you easily start, stop, and monitor jobs, and train models with a few clicks. You can also set an access token for the UI to prevent unauthorized access, so it is mostly safe to run on an exposed server.
169
+
170
+ ## Running the UI
171
+
172
+ Requirements:
173
+ - Node.js > 18
174
+
175
+ The UI does not need to be kept running for the jobs to run; it is only needed to start, stop, and monitor jobs. The commands below
+ will install or update the UI and its dependencies and start the UI.
177
+
178
+ ```bash
179
+ cd ui
180
+ npm run build_and_start
181
+ ```
182
+
183
+ You can now access the UI at `http://localhost:8675` or `http://<your-ip>:8675` if you are running it on a server.
184
+
185
+ ## Securing the UI
186
+
187
+ If you are hosting the UI on a cloud provider or any network that is not secure, I highly recommend securing it with an auth token.
+ You can do this by setting the environment variable `AI_TOOLKIT_AUTH` to a super secure password. This token will be required to access
+ the UI. You can set it when starting the UI like so:
190
+
191
+ ```bash
192
+ # Linux
193
+ AI_TOOLKIT_AUTH=super_secure_password npm run build_and_start
194
+
195
+ # Windows
196
+ set AI_TOOLKIT_AUTH=super_secure_password && npm run build_and_start
197
+
198
+ # Windows Powershell
199
+ $env:AI_TOOLKIT_AUTH="super_secure_password"; npm run build_and_start
200
+ ```
201
+
202
+
203
+ ## FLUX.1 Training
204
+
205
+ ### Tutorial
206
+
207
+ To get started quickly, check out [@araminta_k](https://x.com/araminta_k)'s tutorial on [Finetuning Flux Dev on a 3090](https://www.youtube.com/watch?v=HzGW_Kyermg) with 24GB VRAM.
208
+
209
+
210
+ ### Requirements
211
+ You currently need a GPU with **at least 24GB of VRAM** to train FLUX.1. If you are using the same GPU to drive
+ your monitors, you probably need to set the flag `low_vram: true` in the config file under `model:`. This will quantize
+ the model on the CPU and should allow training with monitors attached. Users have gotten it to work on Windows with WSL,
+ but there are some reports of a bug when running on Windows natively.
+ I have only tested on Linux for now. This is still extremely experimental,
+ and a lot of quantizing and tricks had to happen to get it to fit on 24GB at all.
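+
+ For reference, a minimal sketch of where that flag sits; apart from `low_vram`, the keys mirror the schnell example below, and the exact values are assumptions:
+
+ ```yaml
+ model:
+   name_or_path: "black-forest-labs/FLUX.1-dev"
+   is_flux: true
+   quantize: true
+   low_vram: true  # quantize on the CPU so training fits with monitors attached
+ ```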
217
+
218
+ ### FLUX.1-dev
219
+
220
+ FLUX.1-dev has a non-commercial license, which means anything you train will inherit the
+ non-commercial license. It is also a gated model, so you need to accept the license on HF before using it;
+ otherwise, this will fail. Here are the required steps to set up access:
223
+
224
+ 1. Sign into HF and accept the model access here [black-forest-labs/FLUX.1-dev](https://huggingface.co/black-forest-labs/FLUX.1-dev)
225
+ 2. Make a file named `.env` in the root of this folder
226
+ 3. [Get a READ key from huggingface](https://huggingface.co/settings/tokens/new?) and add it to the `.env` file like so `HF_TOKEN=your_key_here`
227
+
228
+ ### FLUX.1-schnell
229
+
230
+ FLUX.1-schnell is Apache 2.0. Anything trained on it can be licensed however you want, and it does not require an HF_TOKEN to train.
+ However, it does require a special adapter to train with, [ostris/FLUX.1-schnell-training-adapter](https://huggingface.co/ostris/FLUX.1-schnell-training-adapter).
+ It is also highly experimental. For best overall quality, training on FLUX.1-dev is recommended.
233
+
234
+ To use it, you just need to add the assistant adapter to the `model` section of your config file like so:
235
+
236
+ ```yaml
237
+ model:
+   name_or_path: "black-forest-labs/FLUX.1-schnell"
+   assistant_lora_path: "ostris/FLUX.1-schnell-training-adapter"
+   is_flux: true
+   quantize: true
242
+ ```
243
+
244
+ You also need to adjust your sample steps, since schnell does not require as many:
245
+
246
+ ```yaml
247
+ sample:
+   guidance_scale: 1 # schnell does not do guidance
+   sample_steps: 4 # 1 - 4 works well
250
+ ```
251
+
252
+ ### Training
253
+ 1. Copy the example config file located at `config/examples/train_lora_flux_24gb.yaml` (`config/examples/train_lora_flux_schnell_24gb.yaml` for schnell) to the `config` folder and rename it to `whatever_you_want.yml`
254
+ 2. Edit the file following the comments in the file
255
+ 3. Run the file like so `python run.py config/whatever_you_want.yml`
256
+
257
+ An output folder, named using the `name` and training folder from the config file, will be created when you start. It will have all
+ checkpoints and sample images in it. You can stop the training at any time using Ctrl+C, and when you resume, it will pick back up
+ from the last checkpoint.
260
+
261
+ IMPORTANT: If you press Ctrl+C while it is saving, it will likely corrupt that checkpoint, so wait until it is done saving.
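+
+ How often checkpoints are written, and how many are kept, is controlled by the `save:` block of the config. A minimal sketch, with key names taken from the example configs and the values here being assumptions:
+
+ ```yaml
+ save:
+   dtype: float16             # precision used when saving
+   save_every: 250            # write a checkpoint every N steps
+   max_step_saves_to_keep: 4  # older step checkpoints are pruned
+ ```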
262
+
263
+ ### Need help?
264
+
265
+ Please do not open a bug report unless it is a bug in the code. You are welcome to [join my Discord](https://discord.gg/VXmU2f5WEU)
+ and ask for help there. However, please refrain from PMing me directly with general questions or support requests. Ask in the Discord
+ and I will answer when I can.
268
+
269
+ ## Gradio UI
270
+
271
+ To get started training locally with a custom UI, once you have followed the steps above and `ai-toolkit` is installed:
272
+
273
+ ```bash
274
+ cd ai-toolkit #in case you are not yet in the ai-toolkit folder
275
+ huggingface-cli login #provide a `write` token to publish your LoRA at the end
276
+ python flux_train_ui.py
277
+ ```
278
+
279
+ This will launch a UI that lets you upload your images, caption them, train, and publish your LoRA
280
+ ![image](assets/lora_ease_ui.png)
281
+
282
+
283
+ ## Training in RunPod
284
+ Example RunPod template: **runpod/pytorch:2.2.0-py3.10-cuda12.1.1-devel-ubuntu22.04**
285
+ > You need a minimum of 24GB VRAM, pick a GPU by your preference.
286
+
287
+ #### Example config ($0.5/hr):
288
+ - 1x A40 (48 GB VRAM)
289
+ - 19 vCPU 100 GB RAM
290
+
291
+ #### Custom overrides (you need some storage to clone FLUX.1, store datasets, store trained models and samples):
292
+ - ~120 GB Disk
293
+ - ~120 GB Pod Volume
294
+ - Start Jupyter Notebook
295
+
296
+ ### 1. Setup
297
+ ```
298
+ git clone https://github.com/ostris/ai-toolkit.git
299
+ cd ai-toolkit
300
+ git submodule update --init --recursive
301
+ python -m venv venv
302
+ source venv/bin/activate
303
+ pip install torch
304
+ pip install -r requirements.txt
305
+ pip install --upgrade accelerate transformers diffusers huggingface_hub #Optional, run it if you run into issues
306
+ ```
307
+ ### 2. Upload your dataset
308
+ - Create a new folder in the root, name it `dataset` or whatever you like.
309
+ - Drag and drop your .jpg, .jpeg, or .png images and .txt files inside the newly created dataset folder.
310
+
311
+ ### 3. Login into Hugging Face with an Access Token
312
+ - Get a READ token from [here](https://huggingface.co/settings/tokens) and request access to Flux.1-dev model from [here](https://huggingface.co/black-forest-labs/FLUX.1-dev).
313
+ - Run ```huggingface-cli login``` and paste your token.
314
+
315
+ ### 4. Training
316
+ - Copy an example config file located at ```config/examples``` to the config folder and rename it to ```whatever_you_want.yml```.
317
+ - Edit the config following the comments in the file.
318
+ - Change ```folder_path: "/path/to/images/folder"``` to your dataset path like ```folder_path: "/workspace/ai-toolkit/your-dataset"```.
319
+ - Run the file: ```python run.py config/whatever_you_want.yml```.
320
+
321
+ ### Screenshot from RunPod
322
+ <img width="1728" alt="RunPod Training Screenshot" src="https://github.com/user-attachments/assets/53a1b8ef-92fa-4481-81a7-bde45a14a7b5">
323
+
324
+ ## Training in Modal
325
+
326
+ ### 1. Setup
327
+ #### ai-toolkit:
328
+ ```
329
+ git clone https://github.com/ostris/ai-toolkit.git
330
+ cd ai-toolkit
331
+ git submodule update --init --recursive
332
+ python -m venv venv
333
+ source venv/bin/activate
334
+ pip install torch
335
+ pip install -r requirements.txt
336
+ pip install --upgrade accelerate transformers diffusers huggingface_hub #Optional, run it if you run into issues
337
+ ```
338
+ #### Modal:
339
+ - Run `pip install modal` to install the modal Python package.
340
+ - Run `modal setup` to authenticate (if this doesn’t work, try `python -m modal setup`).
341
+
342
+ #### Hugging Face:
343
+ - Get a READ token from [here](https://huggingface.co/settings/tokens) and request access to Flux.1-dev model from [here](https://huggingface.co/black-forest-labs/FLUX.1-dev).
344
+ - Run `huggingface-cli login` and paste your token.
345
+
346
+ ### 2. Upload your dataset
347
+ - Drag and drop your dataset folder containing the .jpg, .jpeg, or .png images and .txt files into `ai-toolkit`.
348
+
349
+ ### 3. Configs
350
+ - Copy an example config file located at ```config/examples/modal``` to the `config` folder and rename it to ```whatever_you_want.yml```.
351
+ - Edit the config following the comments in the file, **<ins>be careful and follow the example `/root/ai-toolkit` paths</ins>**.
352
+
353
+ ### 4. Edit run_modal.py
354
+ - Set your entire local `ai-toolkit` path at `code_mount = modal.Mount.from_local_dir` like:
355
+
356
+ ```
357
+ code_mount = modal.Mount.from_local_dir("/Users/username/ai-toolkit", remote_path="/root/ai-toolkit")
358
+ ```
359
+ - Choose a `GPU` and `Timeout` in `@app.function` _(default is A100 40GB and 2 hour timeout)_.
360
+
361
+ ### 5. Training
362
+ - Run the config file in your terminal: `modal run run_modal.py --config-file-list-str=/root/ai-toolkit/config/whatever_you_want.yml`.
363
+ - You can monitor your training in your local terminal, or on [modal.com](https://modal.com/).
364
+ - Models, samples and optimizer will be stored in `Storage > flux-lora-models`.
365
+
366
+ ### 6. Saving the model
367
+ - Check contents of the volume by running `modal volume ls flux-lora-models`.
368
+ - Download the content by running `modal volume get flux-lora-models your-model-name`.
369
+ - Example: `modal volume get flux-lora-models my_first_flux_lora_v1`.
370
+
371
+ ### Screenshot from Modal
372
+
373
+ <img width="1728" alt="Modal Traning Screenshot" src="https://github.com/user-attachments/assets/7497eb38-0090-49d6-8ad9-9c8ea7b5388b">
374
+
375
+ ---
376
+
377
+ ## Dataset Preparation
378
+
379
+ Datasets generally need to be a folder containing images and associated text files. Currently, the only supported
+ formats are jpg, jpeg, and png; webp currently has issues. The text files should be named the same as the images
+ but with a `.txt` extension, for example `image2.jpg` and `image2.txt`. The text file should contain only the caption.
+ You can add the word `[trigger]` in the caption file, and if you have `trigger_word` in your config, it will be automatically
+ replaced.
384
+
385
+ Images are never upscaled but they are downscaled and placed in buckets for batching. **You do not need to crop/resize your images**.
386
+ The loader will automatically resize them and can handle varying aspect ratios.
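+
+ As a point of reference, a dataset entry in a training config looks roughly like the sketch below. It is based on the files in `config/examples/`; treat the exact keys and values as assumptions and check those examples for the authoritative list:
+
+ ```yaml
+ datasets:
+   - folder_path: "/path/to/images/folder"  # folder with images and matching .txt captions
+     caption_ext: "txt"
+     caption_dropout_rate: 0.05             # assumed: occasionally train without captions
+     resolution: [512, 768, 1024]           # images are bucketed at these sizes
+ ```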
387
+
388
+
389
+ ## Training Specific Layers
390
+
391
+ To train specific layers with LoRA, you can use the `only_if_contains` network kwargs. For instance, if you want to train only the 2 layers
392
+ used by The Last Ben, [mentioned in this post](https://x.com/__TheBen/status/1829554120270987740), you can adjust your
393
+ network kwargs like so:
394
+
395
+ ```yaml
396
+ network:
+   type: "lora"
+   linear: 128
+   linear_alpha: 128
+   network_kwargs:
+     only_if_contains:
+       - "transformer.single_transformer_blocks.7.proj_out"
+       - "transformer.single_transformer_blocks.20.proj_out"
404
+ ```
405
+
406
+ The naming conventions of the layers are in diffusers format, so checking the state dict of a model will reveal
407
+ the suffix of the name of the layers you want to train. You can also use this method to only train specific groups of weights.
408
+ For instance to only train the `single_transformer` for FLUX.1, you can use the following:
409
+
410
+ ```yaml
411
+ network:
+   type: "lora"
+   linear: 128
+   linear_alpha: 128
+   network_kwargs:
+     only_if_contains:
+       - "transformer.single_transformer_blocks."
418
+ ```
419
+
420
+ You can also exclude layers by name using the `ignore_if_contains` network kwarg. So, to exclude all of the single transformer blocks:
421
+
422
+
423
+ ```yaml
424
+ network:
+   type: "lora"
+   linear: 128
+   linear_alpha: 128
+   network_kwargs:
+     ignore_if_contains:
+       - "transformer.single_transformer_blocks."
431
+ ```
432
+
433
+ `ignore_if_contains` takes priority over `only_if_contains`. So if a weight is covered by both,
+ it will be ignored.
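+
+ For example, the following sketch combines both kwargs, using only layer names already quoted above: it targets all single transformer blocks but skips block 7's `proj_out`, because the ignore rule wins for any weight matched by both lists.
+
+ ```yaml
+ network:
+   type: "lora"
+   linear: 128
+   linear_alpha: 128
+   network_kwargs:
+     only_if_contains:
+       - "transformer.single_transformer_blocks."
+     ignore_if_contains:
+       - "transformer.single_transformer_blocks.7.proj_out"  # excluded even though it matches only_if_contains
+ ```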
435
+
436
+ ## LoKr Training
437
+
438
+ To learn more about LoKr, see [KohakuBlueleaf/LyCORIS](https://github.com/KohakuBlueleaf/LyCORIS/blob/main/docs/Guidelines.md). To train a LoKr model, adjust the network type in the config file like so:
439
+
440
+ ```yaml
441
+ network:
442
+ type: "lokr"
443
+ lokr_full_rank: true
444
+ lokr_factor: 8
445
+ ```
446
+
447
+ Everything else should work the same, including layer targeting.
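+
+ For example, a LoKr network combined with the layer targeting from the previous section might look like this (a sketch reusing the kwargs shown above):
+
+ ```yaml
+ network:
+   type: "lokr"
+   lokr_full_rank: true
+   lokr_factor: 8
+   network_kwargs:
+     only_if_contains:
+       - "transformer.single_transformer_blocks."
+ ```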
448
+
449
+
450
+ ## Updates
451
+
452
+ Only larger updates are listed here. Smaller daily updates are usually omitted.
453
+
454
+ ### Jul 17, 2025
455
+ - Made it easy to add control images to the samples in the UI
456
+
457
+ ### Jul 11, 2025
458
+ - Added better video config settings to the UI for video models.
459
+ - Added Wan I2V training to the UI
460
+
461
+ ### June 29, 2025
462
+ - Fixed issue where Kontext forced sizes on sampling
463
+
464
+ ### June 26, 2025
465
+ - Added support for FLUX.1 Kontext training
466
+ - Added support for instruction dataset training
467
+
468
+ ### June 25, 2025
469
+ - Added support for OmniGen2 training
470
+
471
+ ### June 17, 2025
472
+ - Performance optimizations for batch preparation
473
+ - Added some docs via a popup for items in the simple UI explaining what settings do. Still a WIP
474
+
475
+ ### June 16, 2025
476
+ - Hide control images in the UI when viewing datasets
477
+ - WIP on mean flow loss
478
+
479
+ ### June 12, 2025
480
+ - Fixed issue that resulted in blank captions in the dataloader
481
+
482
+ ### June 10, 2025
483
+ - Decided to keep track of updates in the readme
484
+ - Added support for SDXL in the UI
485
+ - Added support for SD 1.5 in the UI
486
+ - Fixed UI Wan 2.1 14b name bug
487
+ - Added support for conv training in the UI for models that support it
assets/VAE_test1.jpg ADDED

Git LFS Details

  • SHA256: 879fcb537d039408d7aada297b7397420132684f0106edacc1205fb5cc839476
  • Pointer size: 132 Bytes
  • Size of remote file: 1.51 MB
assets/glif.svg ADDED
assets/lora_ease_ui.png ADDED

Git LFS Details

  • SHA256: f647b9fe90cc96db2aa84d1cb25a73b60ffcc5394822f99e9dac27d373f89d79
  • Pointer size: 131 Bytes
  • Size of remote file: 349 kB
build_and_push_docker ADDED
@@ -0,0 +1,29 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+
3
+ # Extract version from version.py
4
+ if [ -f "version.py" ]; then
5
+ VERSION=$(python3 -c "from version import VERSION; print(VERSION)")
6
+ echo "Building version: $VERSION"
7
+ else
8
+ echo "Error: version.py not found. Please create a version.py file with VERSION defined."
9
+ exit 1
10
+ fi
11
+
12
+ echo "Docker builds from the repo, not this dir. Make sure changes are pushed to the repo."
13
+ echo "Building version: $VERSION and latest"
14
+ # wait 2 seconds
15
+ sleep 2
16
+
17
+ # Build the image with cache busting
18
+ docker build --build-arg CACHEBUST=$(date +%s) -t aitoolkit:$VERSION -f docker/Dockerfile .
19
+
20
+ # Tag with version and latest
21
+ docker tag aitoolkit:$VERSION ostris/aitoolkit:$VERSION
22
+ docker tag aitoolkit:$VERSION ostris/aitoolkit:latest
23
+
24
+ # Push both tags
25
+ echo "Pushing images to Docker Hub..."
26
+ docker push ostris/aitoolkit:$VERSION
27
+ docker push ostris/aitoolkit:latest
28
+
29
+ echo "Successfully built and pushed ostris/aitoolkit:$VERSION and ostris/aitoolkit:latest"
build_and_push_docker_dev ADDED
@@ -0,0 +1,21 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env bash
2
+
3
+ VERSION=dev
4
+ GIT_COMMIT=dev
5
+
6
+ echo "Docker builds from the repo, not this dir. Make sure changes are pushed to the repo."
7
+ echo "Building version: $VERSION"
8
+ # wait 2 seconds
9
+ sleep 2
10
+
11
+ # Build the image with cache busting
12
+ docker build --build-arg CACHEBUST=$(date +%s) -t aitoolkit:$VERSION -f docker/Dockerfile .
13
+
14
+ # Tag with the dev version
15
+ docker tag aitoolkit:$VERSION ostris/aitoolkit:$VERSION
16
+
17
+ # Push the dev tag
18
+ echo "Pushing images to Docker Hub..."
19
+ docker push ostris/aitoolkit:$VERSION
20
+
21
+ echo "Successfully built and pushed ostris/aitoolkit:$VERSION"
config/examples/extract.example.yml ADDED
@@ -0,0 +1,75 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ # this is in yaml format. You can use json if you prefer
3
+ # I like both but yaml is easier to read and write
4
+ # plus it has comments which is nice for documentation
5
+ job: extract # tells the runner what to do
6
+ config:
7
+ # the name will be used to create a folder in the output folder
8
+ # it will also replace any [name] token in the rest of this config
9
+ name: name_of_your_model
10
+ # can be hugging face model, a .ckpt, or a .safetensors
11
+ base_model: "/path/to/base/model.safetensors"
12
+ # can be hugging face model, a .ckpt, or a .safetensors
13
+ extract_model: "/path/to/model/to/extract/trained.safetensors"
14
+ # we will create a folder here with the name above. This will create /path/to/output/folder/name_of_your_model
15
+ output_folder: "/path/to/output/folder"
16
+ is_v2: false
17
+ dtype: fp16 # saved dtype
18
+ device: cpu # cpu, cuda:0, etc
19
+
20
+ # processes can be chained like this to run multiple in a row
21
+ # they must all use the same models above, but great for testing different
22
+ # sizes and types of extractions. It is much faster as we already have the models loaded
23
+ process:
24
+ # process 1
25
+ - type: locon # locon or lora (locon is lycoris)
26
+ filename: "[name]_64_32.safetensors" # will be put in output folder
27
+ dtype: fp16
28
+ mode: fixed
29
+ linear: 64
30
+ conv: 32
31
+
32
+ # process 2
33
+ - type: locon
34
+ output_path: "/absolute/path/for/this/output.safetensors" # can be absolute
35
+ mode: ratio
36
+ linear: 0.2
37
+ conv: 0.2
38
+
39
+ # process 3
40
+ - type: locon
41
+ filename: "[name]_ratio_02.safetensors"
42
+ mode: quantile
43
+ linear: 0.5
44
+ conv: 0.5
45
+
46
+ # process 4
47
+ - type: lora # traditional lora extraction (lierla) with linear layers only
48
+ filename: "[name]_4.safetensors"
49
+ mode: fixed # fixed, ratio, quantile supported for lora as well
50
+ linear: 4 # lora dim or rank
51
+ # no conv for lora
52
+
53
+ # process 5
54
+ - type: lora
55
+ filename: "[name]_q05.safetensors"
56
+ mode: quantile
57
+ linear: 0.5
58
+
59
+ # you can put any information you want here, and it will be saved in the model
60
+ # the below is an example. I recommend doing trigger words at a minimum
61
+ # in the metadata. The software will include this plus some other information
62
+ meta:
63
+ name: "[name]" # [name] gets replaced with the name above
64
+ description: A short description of your model
65
+ trigger_words:
66
+ - put
67
+ - trigger
68
+ - words
69
+ - here
70
+ version: '0.1'
71
+ creator:
72
+ name: Your Name
73
74
+ website: https://yourwebsite.com
75
+ any: All meta data above is arbitrary, it can be whatever you want.
config/examples/generate.example.yaml ADDED
@@ -0,0 +1,60 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+
3
+ job: generate # tells the runner what to do
4
+ config:
5
+ name: "generate" # this is not really used anywhere currently but required by runner
6
+ process:
7
+ # process 1
8
+ - type: to_folder # process images to a folder
9
+ output_folder: "output/gen"
10
+ device: cuda:0 # cpu, cuda:0, etc
11
+ generate:
12
+ # these are your defaults you can override most of them with flags
13
+ sampler: "ddpm" # ignored for now, will add later though ddpm is used regardless for now
14
+ width: 1024
15
+ height: 1024
16
+ neg: "cartoon, fake, drawing, illustration, cgi, animated, anime"
17
+ seed: -1 # -1 is random
18
+ guidance_scale: 7
19
+ sample_steps: 20
20
+ ext: ".png" # .png, .jpg, .jpeg, .webp
21
+
22
+ # here are the flags you can use for prompts. Always start with
23
+ # your prompt first then add these flags after. You can use as many
24
+ # like
25
+ # photo of a baseball --n painting, ugly --w 1024 --h 1024 --seed 42 --cfg 7 --steps 20
26
+ # we will try to support all sd-scripts flags where we can
27
+
28
+ # FROM SD-SCRIPTS
29
+ # --n Treat everything until the next option as a negative prompt.
30
+ # --w Specify the width of the generated image.
31
+ # --h Specify the height of the generated image.
32
+ # --d Specify the seed for the generated image.
33
+ # --l Specify the CFG scale for the generated image.
34
+ # --s Specify the number of steps during generation.
35
+
36
+ # OURS and some QOL additions
37
+ # --p2 Prompt for the second text encoder (SDXL only)
38
+ # --n2 Negative prompt for the second text encoder (SDXL only)
39
+ # --gr Specify the guidance rescale for the generated image (SDXL only)
40
+ # --seed Specify the seed for the generated image same as --d
41
+ # --cfg Specify the CFG scale for the generated image same as --l
42
+ # --steps Specify the number of steps during generation same as --s
43
+
44
+ prompt_file: false # if true a txt file will be created next to images with prompt strings used
45
+ # prompts can also be a path to a text file with one prompt per line
46
+ # prompts: "/path/to/prompts.txt"
47
+ prompts:
48
+ - "photo of batman"
49
+ - "photo of superman"
50
+ - "photo of spiderman"
51
+ - "photo of a superhero --n batman superman spiderman"
52
+
53
+ model:
54
+ # huggingface name, path relative to the project, or absolute path to .safetensors or .ckpt
55
+ # name_or_path: "runwayml/stable-diffusion-v1-5"
56
+ name_or_path: "/mnt/Models/stable-diffusion/models/stable-diffusion/Ostris/Ostris_Real_v1.safetensors"
57
+ is_v2: false # for v2 models
58
+ is_v_pred: false # for v-prediction models (most v2 models)
59
+ is_xl: false # for SDXL models
60
+ dtype: bf16
config/examples/mod_lora_scale.yaml ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ job: mod
3
+ config:
4
+ name: name_of_your_model_v1
5
+ process:
6
+ - type: rescale_lora
7
+ # path to your current lora model
8
+ input_path: "/path/to/lora/lora.safetensors"
9
+ # output path for your new lora model, can be the same as input_path to replace
10
+ output_path: "/path/to/lora/output_lora_v1.safetensors"
11
+ # replaces meta with the meta below (plus minimum meta fields)
12
+ # if false, we will leave the meta alone except for updating hashes (sd-script hashes)
13
+ replace_meta: true
14
+ # how to adjust, we can scale the up_down weights or the alpha
15
+ # up_down is the default and probably the best, they will both net the same outputs
16
+ # would only affect rare NaN cases and maybe merging with old merge tools
17
+ scale_target: 'up_down'
18
+ # precision to save, fp16 is the default and standard
19
+ save_dtype: fp16
20
+ # current_weight is the ideal weight you use as a multiplier when using the lora
21
+ # IE in automatic1111 <lora:my_lora:6.0> the 6.0 is the current_weight
22
+ # you can do negatives here too if you want to flip the lora
23
+ current_weight: 6.0
24
+ # target_weight is the ideal weight you use as a multiplier when using the lora
25
+ # instead of the one above. IE in automatic1111 instead of using <lora:my_lora:6.0>
26
+ # we want to use <lora:my_lora:1.0> so 1.0 is the target_weight
27
+ target_weight: 1.0
28
+
29
+ # base model for the lora
30
+ # this is just used to add meta so automatic111 knows which model it is for
31
+ # assume v1.5 if these are not set
32
+ is_xl: false
33
+ is_v2: false
34
+ meta:
35
+ # this is only used if you set replace_meta to true above
36
+ name: "[name]" # [name] gets replaced with the name above
37
+ description: A short description of your lora
38
+ trigger_words:
39
+ - put
40
+ - trigger
41
+ - words
42
+ - here
43
+ version: '0.1'
44
+ creator:
45
+ name: Your Name
46
47
+ website: https://yourwebsite.com
48
+ any: All meta data above is arbitrary, it can be whatever you want.
config/examples/modal/modal_train_lora_flux_24gb.yaml ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flux_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "/root/ai-toolkit/modal_output" # must match MOUNT_DIR from run_modal.py
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ datasets:
25
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
26
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
27
+ # images will automatically be resized and bucketed into the resolution specified
28
+ # on windows, escape back slashes with another backslash so
29
+ # "C:\\path\\to\\images\\folder"
30
+ # your dataset must be placed in /ai-toolkit and /root is for modal to find the dir:
31
+ - folder_path: "/root/ai-toolkit/your-dataset"
32
+ caption_ext: "txt"
33
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
34
+ shuffle_tokens: false # shuffle caption order, split by commas
35
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
36
+ resolution: [ 512, 768, 1024 ] # flux enjoys multiple resolutions
37
+ train:
38
+ batch_size: 1
39
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
40
+ gradient_accumulation_steps: 1
41
+ train_unet: true
42
+ train_text_encoder: false # probably won't work with flux
43
+ gradient_checkpointing: true # need this on unless you have a ton of vram
44
+ noise_scheduler: "flowmatch" # for training only
45
+ optimizer: "adamw8bit"
46
+ lr: 1e-4
47
+ # uncomment this to skip the pre training sample
48
+ # skip_first_sample: true
49
+ # uncomment to completely disable sampling
50
+ # disable_sampling: true
51
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
52
+ # linear_timesteps: true
53
+
54
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
55
+ ema_config:
56
+ use_ema: true
57
+ ema_decay: 0.99
58
+
59
+ # will probably need this if gpu supports it for flux, other dtypes may not work correctly
60
+ dtype: bf16
61
+ model:
62
+ # huggingface model name or path
63
+ # if you get an error, or get stuck while downloading,
64
+ # check https://github.com/ostris/ai-toolkit/issues/84, download the model locally and
65
+ # place it like "/root/ai-toolkit/FLUX.1-dev"
66
+ name_or_path: "black-forest-labs/FLUX.1-dev"
67
+ is_flux: true
68
+ quantize: true # run 8bit mixed precision
69
+ # low_vram: true # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
70
+ sample:
71
+ sampler: "flowmatch" # must match train.noise_scheduler
72
+ sample_every: 250 # sample every this many steps
73
+ width: 1024
74
+ height: 1024
75
+ prompts:
76
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
77
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
78
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
79
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
80
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
81
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
82
+ - "a bear building a log cabin in the snow covered mountains"
83
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
84
+ - "hipster man with a beard, building a chair, in a wood shop"
85
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
86
+ - "a man holding a sign that says, 'this is a sign'"
87
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
88
+ neg: "" # not used on flux
89
+ seed: 42
90
+ walk_seed: true
91
+ guidance_scale: 4
92
+ sample_steps: 20
93
+ # you can add any additional meta info here. [name] is replaced with config name at top
94
+ meta:
95
+ name: "[name]"
96
+ version: '1.0'
config/examples/modal/modal_train_lora_flux_schnell_24gb.yaml ADDED
@@ -0,0 +1,98 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flux_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "/root/ai-toolkit/modal_output" # must match MOUNT_DIR from run_modal.py
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ datasets:
25
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
26
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
27
+ # images will automatically be resized and bucketed into the resolution specified
28
+ # on windows, escape back slashes with another backslash so
29
+ # "C:\\path\\to\\images\\folder"
30
+ # your dataset must be placed in /ai-toolkit and /root is for modal to find the dir:
31
+ - folder_path: "/root/ai-toolkit/your-dataset"
32
+ caption_ext: "txt"
33
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
34
+ shuffle_tokens: false # shuffle caption order, split by commas
35
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
36
+ resolution: [ 512, 768, 1024 ] # flux enjoys multiple resolutions
37
+ train:
38
+ batch_size: 1
39
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
40
+ gradient_accumulation_steps: 1
41
+ train_unet: true
42
+ train_text_encoder: false # probably won't work with flux
43
+ gradient_checkpointing: true # need this on unless you have a ton of vram
44
+ noise_scheduler: "flowmatch" # for training only
45
+ optimizer: "adamw8bit"
46
+ lr: 1e-4
47
+ # uncomment this to skip the pre training sample
48
+ # skip_first_sample: true
49
+ # uncomment to completely disable sampling
50
+ # disable_sampling: true
51
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
52
+ # linear_timesteps: true
53
+
54
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
55
+ ema_config:
56
+ use_ema: true
57
+ ema_decay: 0.99
58
+
59
+ # will probably need this if gpu supports it for flux, other dtypes may not work correctly
60
+ dtype: bf16
61
+ model:
62
+ # huggingface model name or path
63
+ # if you get an error, or get stuck while downloading,
64
+ # check https://github.com/ostris/ai-toolkit/issues/84, download the models locally and
65
+ # place them like "/root/ai-toolkit/FLUX.1-schnell" and "/root/ai-toolkit/FLUX.1-schnell-training-adapter"
66
+ name_or_path: "black-forest-labs/FLUX.1-schnell"
67
+ assistant_lora_path: "ostris/FLUX.1-schnell-training-adapter" # Required for flux schnell training
68
+ is_flux: true
69
+ quantize: true # run 8bit mixed precision
70
+ # low_vram is painfully slow to fuse in the adapter avoid it unless absolutely necessary
71
+ # low_vram: true # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
72
+ sample:
73
+ sampler: "flowmatch" # must match train.noise_scheduler
74
+ sample_every: 250 # sample every this many steps
75
+ width: 1024
76
+ height: 1024
77
+ prompts:
78
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
79
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
80
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
81
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
82
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
83
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
84
+ - "a bear building a log cabin in the snow covered mountains"
85
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
86
+ - "hipster man with a beard, building a chair, in a wood shop"
87
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
88
+ - "a man holding a sign that says, 'this is a sign'"
89
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
90
+ neg: "" # not used on flux
91
+ seed: 42
92
+ walk_seed: true
93
+ guidance_scale: 1 # schnell does not do guidance
94
+ sample_steps: 4 # 1 - 4 works well
95
+ # you can add any additional meta info here. [name] is replaced with config name at top
96
+ meta:
97
+ name: "[name]"
98
+ version: '1.0'
config/examples/train_flex_redux.yaml ADDED
@@ -0,0 +1,112 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flex_redux_finetune_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ adapter:
14
+ type: "redux"
15
+ # you can finetune an existing adapter or start from scratch. Set to null to start from scratch
16
+ name_or_path: '/local/path/to/redux_adapter_to_finetune.safetensors'
17
+ # name_or_path: null
18
+ # image_encoder_path: 'google/siglip-so400m-patch14-384' # Flux.1 redux adapter
19
+ image_encoder_path: 'google/siglip2-so400m-patch16-512' # Flex.1 512 redux adapter
20
+ # image_encoder_arch: 'siglip' # for Flux.1
21
+ image_encoder_arch: 'siglip2'
22
+ # You need a control input for each sample. Best to do squares for both images
23
+ test_img_path:
24
+ - "/path/to/x_01.jpg"
25
+ - "/path/to/x_02.jpg"
26
+ - "/path/to/x_03.jpg"
27
+ - "/path/to/x_04.jpg"
28
+ - "/path/to/x_05.jpg"
29
+ - "/path/to/x_06.jpg"
30
+ - "/path/to/x_07.jpg"
31
+ - "/path/to/x_08.jpg"
32
+ - "/path/to/x_09.jpg"
33
+ - "/path/to/x_10.jpg"
34
+ clip_layer: 'last_hidden_state'
35
+ train: true
36
+ save:
37
+ dtype: bf16 # precision to save
38
+ save_every: 250 # save every this many steps
39
+ max_step_saves_to_keep: 4
40
+ datasets:
41
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
42
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
43
+ # images will automatically be resized and bucketed into the resolution specified
44
+ # on windows, escape back slashes with another backslash so
45
+ # "C:\\path\\to\\images\\folder"
46
+ - folder_path: "/path/to/images/folder"
47
+ # clip_image_path is a directory containing your control images. They must have the same filename as their train image. (extension does not matter)
48
+ # for normal redux, we are just recreating the same image, so you can use the same folder path above
49
+ clip_image_path: "/path/to/control/images/folder"
50
+ caption_ext: "txt"
51
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
52
+ resolution: [ 512, 768, 1024 ] # flex enjoys multiple resolutions
53
+ train:
54
+ # this is what I used for the 24GB card, but feel free to adjust
55
+ # total batch size is 6 here
56
+ batch_size: 3
57
+ gradient_accumulation: 2
58
+
59
+ # captions are not needed for this training, we cache a blank prompt and rely on the vision encoder
60
+ unload_text_encoder: true
61
+
62
+ loss_type: "mse"
63
+ train_unet: true
64
+ train_text_encoder: false
65
+ steps: 4000000 # I set this very high and stop when I like the results
66
+ content_or_style: balanced # content, style, balanced
67
+ gradient_checkpointing: true
68
+ noise_scheduler: "flowmatch" # or "ddpm", "lms", "euler_a"
69
+ timestep_type: "flux_shift"
70
+ optimizer: "adamw8bit"
71
+ lr: 1e-4
72
+
73
+ # this is for Flex.1, comment this out for FLUX.1-dev
74
+ bypass_guidance_embedding: true
75
+
76
+ dtype: bf16
77
+ ema_config:
78
+ use_ema: true
79
+ ema_decay: 0.99
80
+ model:
81
+ name_or_path: "ostris/Flex.1-alpha"
82
+ is_flux: true
83
+ quantize: true
84
+ text_encoder_bits: 8
85
+ sample:
86
+ sampler: "flowmatch" # must match train.noise_scheduler
87
+ sample_every: 250 # sample every this many steps
88
+ width: 1024
89
+ height: 1024
90
+ # I leave half blank to test prompted and unprompted
91
+ prompts:
92
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
93
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
94
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
95
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
96
+ - "a bear building a log cabin in the snow covered mountains"
97
+ - ""
98
+ - ""
99
+ - ""
100
+ - ""
101
+ - ""
102
+ neg: ""
103
+ seed: 42
104
+ walk_seed: true
105
+ guidance_scale: 4
106
+ sample_steps: 25
107
+ network_multiplier: 1.0
108
+
109
+ # you can add any additional meta info here. [name] is replaced with config name at top
110
+ meta:
111
+ name: "[name]"
112
+ version: '1.0'
config/examples/train_full_fine_tune_flex.yaml ADDED
@@ -0,0 +1,107 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ # This configuration requires 48GB of VRAM or more to operate
3
+ job: extension
4
+ config:
5
+ # this name will be the folder and filename name
6
+ name: "my_first_flex_finetune_v1"
7
+ process:
8
+ - type: 'sd_trainer'
9
+ # root folder to save training sessions/samples/weights
10
+ training_folder: "output"
11
+ # uncomment to see performance stats in the terminal every N steps
12
+ # performance_log_every: 1000
13
+ device: cuda:0
14
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
15
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
16
+ # trigger_word: "p3r5on"
17
+ save:
18
+ dtype: bf16 # precision to save
19
+ save_every: 250 # save every this many steps
20
+ max_step_saves_to_keep: 2 # how many intermittent saves to keep
21
+ save_format: 'diffusers' # 'diffusers'
22
+ datasets:
23
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
24
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
25
+ # images will automatically be resized and bucketed into the resolution specified
26
+ # on windows, escape back slashes with another backslash so
27
+ # "C:\\path\\to\\images\\folder"
28
+ - folder_path: "/path/to/images/folder"
29
+ caption_ext: "txt"
30
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
31
+ shuffle_tokens: false # shuffle caption order, split by commas
32
+ # cache_latents_to_disk: true # leave this true unless you know what you're doing
33
+ resolution: [ 512, 768, 1024 ] # flex enjoys multiple resolutions
34
+ train:
35
+ batch_size: 1
36
+ # IMPORTANT! For Flex, you must bypass the guidance embedder during training
37
+ bypass_guidance_embedding: true
38
+
39
+ # can be 'sigmoid', 'linear', or 'lognorm_blend'
40
+ timestep_type: 'sigmoid'
41
+
42
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
43
+ gradient_accumulation: 1
44
+ train_unet: true
45
+ train_text_encoder: false # probably won't work with flex
46
+ gradient_checkpointing: true # need this on unless you have a ton of vram
47
+ noise_scheduler: "flowmatch" # for training only
48
+ optimizer: "adafactor"
49
+ lr: 3e-5
50
+
51
+ # Parameter swapping can reduce vram requirements. Set factor from 1.0 to 0.0.
52
+ # 0.1 is 10% of parameters active at each step. Only works with adafactor
53
+
54
+ # do_paramiter_swapping: true
55
+ # paramiter_swapping_factor: 0.9
56
+
57
+ # uncomment this to skip the pre training sample
58
+ # skip_first_sample: true
59
+ # uncomment to completely disable sampling
60
+ # disable_sampling: true
61
+
62
+ # ema will smooth out learning, but could slow it down. Recommended to leave on if you have the vram
63
+ ema_config:
64
+ use_ema: true
65
+ ema_decay: 0.99
66
+
67
+ # will probably need this if gpu supports it for flex, other dtypes may not work correctly
68
+ dtype: bf16
69
+ model:
70
+ # huggingface model name or path
71
+ name_or_path: "ostris/Flex.1-alpha"
72
+ is_flux: true # flex is flux architecture
73
+ # full finetuning quantized models is a crapshoot and results in subpar outputs
74
+ # quantize: true
75
+ # you can quantize just the T5 text encoder here to save vram
76
+ quantize_te: true
77
+ # only train the transformer blocks
78
+ only_if_contains:
79
+ - "transformer.transformer_blocks."
80
+ - "transformer.single_transformer_blocks."
81
+ sample:
82
+ sampler: "flowmatch" # must match train.noise_scheduler
83
+ sample_every: 250 # sample every this many steps
84
+ width: 1024
85
+ height: 1024
86
+ prompts:
87
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
88
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
89
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
90
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
91
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
92
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
93
+ - "a bear building a log cabin in the snow covered mountains"
94
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
95
+ - "hipster man with a beard, building a chair, in a wood shop"
96
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
97
+ - "a man holding a sign that says, 'this is a sign'"
98
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
99
+ neg: "" # not used on flex
100
+ seed: 42
101
+ walk_seed: true
102
+ guidance_scale: 4
103
+ sample_steps: 25
104
+ # you can add any additional meta info here. [name] is replaced with config name at top
105
+ meta:
106
+ name: "[name]"
107
+ version: '1.0'
config/examples/train_full_fine_tune_lumina.yaml ADDED
@@ -0,0 +1,99 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ # This configuration requires 24GB of VRAM or more to operate
3
+ job: extension
4
+ config:
5
+ # this name will be the folder and filename name
6
+ name: "my_first_lumina_finetune_v1"
7
+ process:
8
+ - type: 'sd_trainer'
9
+ # root folder to save training sessions/samples/weights
10
+ training_folder: "output"
11
+ # uncomment to see performance stats in the terminal every N steps
12
+ # performance_log_every: 1000
13
+ device: cuda:0
14
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
15
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
16
+ # trigger_word: "p3r5on"
17
+ save:
18
+ dtype: bf16 # precision to save
19
+ save_every: 250 # save every this many steps
20
+ max_step_saves_to_keep: 2 # how many intermittent saves to keep
21
+ save_format: 'diffusers' # 'diffusers'
22
+ datasets:
23
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
24
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
25
+ # images will automatically be resized and bucketed into the resolution specified
26
+ # on windows, escape back slashes with another backslash so
27
+ # "C:\\path\\to\\images\\folder"
28
+ - folder_path: "/path/to/images/folder"
29
+ caption_ext: "txt"
30
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
31
+ shuffle_tokens: false # shuffle caption order, split by commas
32
+ # cache_latents_to_disk: true # leave this true unless you know what you're doing
33
+ resolution: [ 512, 768, 1024 ] # lumina2 enjoys multiple resolutions
34
+ train:
35
+ batch_size: 1
36
+
37
+ # can be 'sigmoid', 'linear', or 'lumina2_shift'
38
+ timestep_type: 'lumina2_shift'
39
+
40
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
41
+ gradient_accumulation: 1
42
+ train_unet: true
43
+ train_text_encoder: false # probably won't work with lumina2
44
+ gradient_checkpointing: true # need this on unless you have a ton of vram
45
+ noise_scheduler: "flowmatch" # for training only
46
+ optimizer: "adafactor"
47
+ lr: 3e-5
48
+
49
+ # Parameter swapping can reduce vram requirements. Set factor from 1.0 to 0.0.
50
+ # 0.1 is 10% of parameters active at each step. Only works with adafactor
51
+
52
+ # do_paramiter_swapping: true
53
+ # paramiter_swapping_factor: 0.9
54
+
55
+ # uncomment this to skip the pre training sample
56
+ # skip_first_sample: true
57
+ # uncomment to completely disable sampling
58
+ # disable_sampling: true
59
+
60
+ # ema will smooth out learning, but could slow it down. Recommended to leave on if you have the vram
61
+ # ema_config:
62
+ # use_ema: true
63
+ # ema_decay: 0.99
64
+
65
+ # will probably need this if gpu supports it for lumina2, other dtypes may not work correctly
66
+ dtype: bf16
67
+ model:
68
+ # huggingface model name or path
69
+ name_or_path: "Alpha-VLLM/Lumina-Image-2.0"
70
+ is_lumina2: true # lumina2 architecture
71
+ # you can quantize just the Gemma2 text encoder here to save vram
72
+ quantize_te: true
73
+ sample:
74
+ sampler: "flowmatch" # must match train.noise_scheduler
75
+ sample_every: 250 # sample every this many steps
76
+ width: 1024
77
+ height: 1024
78
+ prompts:
79
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
80
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
81
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
82
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
83
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
84
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
85
+ - "a bear building a log cabin in the snow covered mountains"
86
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
87
+ - "hipster man with a beard, building a chair, in a wood shop"
88
+ - "photo of a cat that is half black and half orange tabby, split down the middle. The cat has on a blue tophat. They are holding a martini glass with a pink ball of yarn in it with green knitting needles sticking out, in one paw. In the other paw, they are holding a DVD case for a movie titled, \"This is a test\" that has a golden robot on it. In the background is a busy night club with a giant mushroom man dancing with a bear."
89
+ - "a man holding a sign that says, 'this is a sign'"
90
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
91
+ neg: ""
92
+ seed: 42
93
+ walk_seed: true
94
+ guidance_scale: 4.0
95
+ sample_steps: 25
96
+ # you can add any additional meta info here. [name] is replaced with config name at top
97
+ meta:
98
+ name: "[name]"
99
+ version: '1.0'
config/examples/train_lora_chroma_24gb.yaml ADDED
@@ -0,0 +1,104 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_chroma_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ - folder_path: "/path/to/images/folder"
35
+ caption_ext: "txt"
36
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
37
+ shuffle_tokens: false # shuffle caption order, split by commas
38
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
39
+ resolution: [ 512, 768, 1024 ] # chroma enjoys multiple resolutions
40
+ train:
41
+ batch_size: 1
42
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
43
+ gradient_accumulation: 1
44
+ train_unet: true
45
+ train_text_encoder: false # probably won't work with chroma
46
+ gradient_checkpointing: true # need this on unless you have a ton of vram
47
+ noise_scheduler: "flowmatch" # for training only
48
+ optimizer: "adamw8bit"
49
+ lr: 1e-4
50
+ # uncomment this to skip the pre training sample
51
+ # skip_first_sample: true
52
+ # uncomment to completely disable sampling
53
+ # disable_sampling: true
54
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
55
+ # linear_timesteps: true
56
+
57
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
58
+ ema_config:
59
+ use_ema: true
60
+ ema_decay: 0.99
61
+
62
+ # will probably need this if gpu supports it for chroma, other dtypes may not work correctly
63
+ dtype: bf16
64
+ model:
65
+ # Download the whichever model you prefer from the Chroma repo
66
+ # https://huggingface.co/lodestones/Chroma/tree/main
67
+ # point to it here.
68
+ # name_or_path: "/path/to/chroma/chroma-unlocked-vVERSION.safetensors"
69
+
70
+ # using lodestones/Chroma will automatically use the latest version
71
+ name_or_path: "lodestones/Chroma"
72
+
73
+ # # You can also select a version of Chroma like so
74
+ # name_or_path: "lodestones/Chroma/v28"
75
+
76
+ arch: "chroma"
77
+ quantize: true # run 8bit mixed precision
78
+ sample:
79
+ sampler: "flowmatch" # must match train.noise_scheduler
80
+ sample_every: 250 # sample every this many steps
81
+ width: 1024
82
+ height: 1024
83
+ prompts:
84
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
85
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
86
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
87
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
88
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
89
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
90
+ - "a bear building a log cabin in the snow covered mountains"
91
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
92
+ - "hipster man with a beard, building a chair, in a wood shop"
93
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
94
+ - "a man holding a sign that says, 'this is a sign'"
95
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
96
+ neg: "" # negative prompt, optional
97
+ seed: 42
98
+ walk_seed: true
99
+ guidance_scale: 4
100
+ sample_steps: 25
101
+ # you can add any additional meta info here. [name] is replaced with config name at top
102
+ meta:
103
+ name: "[name]"
104
+ version: '1.0'
config/examples/train_lora_flex2_24gb.yaml ADDED
@@ -0,0 +1,165 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Note, Flex2 is a highly experimental WIP model. Finetuning a model with built in controls and inpainting has not
2
+ # been done before, so you will be experimenting with me on how to do it. This is my recommended setup, but this is highly
3
+ # subject to change as we learn more about how Flex2 works.
4
+
5
+ ---
6
+ job: extension
7
+ config:
8
+ # this name will be the folder and filename name
9
+ name: "my_first_flex2_lora_v1"
10
+ process:
11
+ - type: 'sd_trainer'
12
+ # root folder to save training sessions/samples/weights
13
+ training_folder: "output"
14
+ # uncomment to see performance stats in the terminal every N steps
15
+ # performance_log_every: 1000
16
+ device: cuda:0
17
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
18
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
19
+ # trigger_word: "p3r5on"
20
+ network:
21
+ type: "lora"
22
+ linear: 32
23
+ linear_alpha: 32
24
+ save:
25
+ dtype: float16 # precision to save
26
+ save_every: 250 # save every this many steps
27
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
28
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
29
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
30
+ # hf_repo_id: your-username/your-model-slug
31
+ # hf_private: true #whether the repo is private or public
32
+ datasets:
33
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
34
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
35
+ # images will automatically be resized and bucketed into the resolution specified
36
+ # on windows, escape back slashes with another backslash so
37
+ # "C:\\path\\to\\images\\folder"
38
+ - folder_path: "/path/to/images/folder"
39
+ # Flex2 is trained with controls and inpainting. If you want the model to truly understand how the
40
+ # controls function with your dataset, it is a good idea to keep doing controls during training.
41
+ # this will automatically generate the controls for you before training. The current script is not
42
+ # fully optimized so this could be rather slow for large datasets, but it caches them to disk so it
43
+ # only needs to be done once. If you want to skip this step, you can set the controls to [] and it will
44
+ controls:
45
+ - "depth"
46
+ - "line"
47
+ - "pose"
48
+ - "inpaint"
49
+
50
+ # you can make custom inpainting images as well. These images must be webp or png format with an alpha.
51
+ # just erase the part of the image you want to inpaint and save it as a webp or png. Again, erase your
52
+ # train target, i.e. the person if you are training a person. The automatic controls above with inpaint will
53
+ # just run a background remover mask and erase the foreground, which works well for subjects.
54
+
55
+ # inpaint_path: "/my/impaint/images"
56
+
57
+ # you can also specify existing control image pairs. It can handle multiple groups and will randomly
58
+ # select one for each step.
59
+
60
+ # control_path:
61
+ # - "/my/custom/control/images"
62
+ # - "/my/custom/control/images2"
63
+
64
+ caption_ext: "txt"
65
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
66
+ resolution: [ 512, 768, 1024 ] # flex2 enjoys multiple resolutions
67
+ train:
68
+ batch_size: 1
69
+ # IMPORTANT! For Flex2, you must bypass the guidance embedder during training
70
+ bypass_guidance_embedding: true
71
+
72
+ steps: 3000 # total number of steps to train 500 - 4000 is a good range
73
+ gradient_accumulation: 1
74
+ train_unet: true
75
+ train_text_encoder: false # probably won't work with flex2
76
+ gradient_checkpointing: true # need this on unless you have a ton of vram
77
+ noise_scheduler: "flowmatch" # for training only
78
+ # shift works well for training fast and learning composition and style.
79
+ # for just subject, you may want to change this to sigmoid
80
+ timestep_type: 'shift' # 'linear', 'sigmoid', 'shift'
81
+ optimizer: "adamw8bit"
82
+ lr: 1e-4
83
+
84
+ optimizer_params:
85
+ weight_decay: 1e-5
86
+ # uncomment this to skip the pre training sample
87
+ # skip_first_sample: true
88
+ # uncomment to completely disable sampling
89
+ # disable_sampling: true
90
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
91
+ # linear_timesteps: true
92
+
93
+ # ema will smooth out learning, but could slow it down. Defaults off
94
+ ema_config:
95
+ use_ema: false
96
+ ema_decay: 0.99
97
+
98
+ # will probably need this if gpu supports it for flex, other dtypes may not work correctly
99
+ dtype: bf16
100
+ model:
101
+ # huggingface model name or path
102
+ name_or_path: "ostris/Flex.2-preview"
103
+ arch: "flex2"
104
+ quantize: true # run 8bit mixed precision
105
+ quantize_te: true
106
+
107
+ # you can pass special training info for controls to the model here
108
+ # percentages are decimal based so 0.0 is 0% and 1.0 is 100% of the time.
109
+ model_kwargs:
110
+ # inverts the inpainting mask, good to learn outpainting as well, recommended 0.0 for characters
111
+ invert_inpaint_mask_chance: 0.5
112
+ # this will do a normal t2i training step without inpaint when dropped out. Recommended if you want
113
+ # your lora to be able to inference with and without inpainting.
114
+ inpaint_dropout: 0.5
115
+ # randomly drops out the control image. Dropout recommended if you want it to work without controls as well.
116
+ control_dropout: 0.5
117
+ # does a random inpaint blob. Usually a good idea to keep. Without it, the model will learn to always 100%
118
+ # fill the inpaint area with your subject. This is not always a good thing.
119
+ inpaint_random_chance: 0.5
120
+ # generates random inpaint blobs if you did not provide an inpaint image for your dataset. Inpaint breaks down fast
121
+ # if you are not training with it. Controls are a little more robust and can be left out,
122
+ # but when in doubt, always leave this on
123
+ do_random_inpainting: false
124
+ # does random blurring of the inpaint mask. Helps prevent weird edge artifacts for real world inpainting. Leave on.
125
+ random_blur_mask: true
126
+ # applies a small amount of random dilation and restriction to the inpaint mask. Helps with edge artifacts.
127
+ # Leave on.
128
+ random_dialate_mask: true
129
+ sample:
130
+ sampler: "flowmatch" # must match train.noise_scheduler
131
+ sample_every: 250 # sample every this many steps
132
+ width: 1024
133
+ height: 1024
134
+ prompts:
135
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
136
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
137
+
138
+ # you can use a single inpaint or single control image on your samples.
139
+ # for controls, the ctrl_idx is 1, the images can be any name and image format.
140
+ # use either a pose/line/depth image or whatever you are training with. An example is
141
+ # - "photo of [trigger] --ctrl_idx 1 --ctrl_img /path/to/control/image.jpg"
142
+
143
+ # for an inpainting image, it must be png/webp. Erase the part of the image you want to inpaint
144
+ # IMPORTANT! the inpaint images must be ctrl_idx 0 and have .inpaint.{ext} in the name for this to work right.
145
+ # - "photo of [trigger] --ctrl_idx 0 --ctrl_img /path/to/inpaint/image.inpaint.png"
146
+
147
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
148
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
149
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
150
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
151
+ - "a bear building a log cabin in the snow covered mountains"
152
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
153
+ - "hipster man with a beard, building a chair, in a wood shop"
154
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
155
+ - "a man holding a sign that says, 'this is a sign'"
156
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
157
+ neg: "" # not used on flex2
158
+ seed: 42
159
+ walk_seed: true
160
+ guidance_scale: 4
161
+ sample_steps: 25
162
+ # you can add any additional meta info here. [name] is replaced with config name at top
163
+ meta:
164
+ name: "[name]"
165
+ version: '1.0'
config/examples/train_lora_flex_24gb.yaml ADDED
@@ -0,0 +1,101 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flex_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ - folder_path: "/path/to/images/folder"
35
+ caption_ext: "txt"
36
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
37
+ shuffle_tokens: false # shuffle caption order, split by commas
38
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
39
+ resolution: [ 512, 768, 1024 ] # flex enjoys multiple resolutions
40
+ train:
41
+ batch_size: 1
42
+ # IMPORTANT! For Flex, you must bypass the guidance embedder during training
43
+ bypass_guidance_embedding: true
44
+
45
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
46
+ gradient_accumulation: 1
47
+ train_unet: true
48
+ train_text_encoder: false # probably won't work with flex
49
+ gradient_checkpointing: true # need this on unless you have a ton of vram
50
+ noise_scheduler: "flowmatch" # for training only
51
+ optimizer: "adamw8bit"
52
+ lr: 1e-4
53
+ # uncomment this to skip the pre training sample
54
+ # skip_first_sample: true
55
+ # uncomment to completely disable sampling
56
+ # disable_sampling: true
57
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
58
+ # linear_timesteps: true
59
+
60
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
61
+ ema_config:
62
+ use_ema: true
63
+ ema_decay: 0.99
64
+
65
+ # will probably need this if gpu supports it for flex, other dtypes may not work correctly
66
+ dtype: bf16
67
+ model:
68
+ # huggingface model name or path
69
+ name_or_path: "ostris/Flex.1-alpha"
70
+ is_flux: true
71
+ quantize: true # run 8bit mixed precision
72
+ quantize_kwargs:
73
+ exclude:
74
+ - "*time_text_embed*" # exclude the time text embedder from quantization
75
+ sample:
76
+ sampler: "flowmatch" # must match train.noise_scheduler
77
+ sample_every: 250 # sample every this many steps
78
+ width: 1024
79
+ height: 1024
80
+ prompts:
81
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
82
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
83
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
84
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
85
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
86
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
87
+ - "a bear building a log cabin in the snow covered mountains"
88
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
89
+ - "hipster man with a beard, building a chair, in a wood shop"
90
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
91
+ - "a man holding a sign that says, 'this is a sign'"
92
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
93
+ neg: "" # not used on flex
94
+ seed: 42
95
+ walk_seed: true
96
+ guidance_scale: 4
97
+ sample_steps: 25
98
+ # you can add any additional meta info here. [name] is replaced with config name at top
99
+ meta:
100
+ name: "[name]"
101
+ version: '1.0'
config/examples/train_lora_flux_24gb.yaml ADDED
@@ -0,0 +1,96 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flux_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ - folder_path: "/path/to/images/folder"
35
+ caption_ext: "txt"
36
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
37
+ shuffle_tokens: false # shuffle caption order, split by commas
38
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
39
+ resolution: [ 512, 768, 1024 ] # flux enjoys multiple resolutions
40
+ train:
41
+ batch_size: 1
42
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
43
+ gradient_accumulation_steps: 1
44
+ train_unet: true
45
+ train_text_encoder: false # probably won't work with flux
46
+ gradient_checkpointing: true # need this on unless you have a ton of vram
47
+ noise_scheduler: "flowmatch" # for training only
48
+ optimizer: "adamw8bit"
49
+ lr: 1e-4
50
+ # uncomment this to skip the pre training sample
51
+ # skip_first_sample: true
52
+ # uncomment to completely disable sampling
53
+ # disable_sampling: true
54
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
55
+ # linear_timesteps: true
56
+
57
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
58
+ ema_config:
59
+ use_ema: true
60
+ ema_decay: 0.99
61
+
62
+ # will probably need this if gpu supports it for flux, other dtypes may not work correctly
63
+ dtype: bf16
64
+ model:
65
+ # huggingface model name or path
66
+ name_or_path: "black-forest-labs/FLUX.1-dev"
67
+ is_flux: true
68
+ quantize: true # run 8bit mixed precision
69
+ # low_vram: true # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
70
+ sample:
71
+ sampler: "flowmatch" # must match train.noise_scheduler
72
+ sample_every: 250 # sample every this many steps
73
+ width: 1024
74
+ height: 1024
75
+ prompts:
76
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
77
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
78
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
79
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
80
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
81
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
82
+ - "a bear building a log cabin in the snow covered mountains"
83
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
84
+ - "hipster man with a beard, building a chair, in a wood shop"
85
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
86
+ - "a man holding a sign that says, 'this is a sign'"
87
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
88
+ neg: "" # not used on flux
89
+ seed: 42
90
+ walk_seed: true
91
+ guidance_scale: 4
92
+ sample_steps: 20
93
+ # you can add any additional meta info here. [name] is replaced with config name at top
94
+ meta:
95
+ name: "[name]"
96
+ version: '1.0'
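To make the dataset comments in this config concrete, here is a minimal sketch of a datasets entry assembled only from keys used above; the folder path, file names, and trigger word are placeholders, not files shipped with the toolkit.

# trigger_word: "p3r5on"               # optional, set at the process level as shown above
datasets:
  - folder_path: "/data/my_subject"    # contains img_001.jpg + img_001.txt, img_002.png + img_002.txt, ...
    caption_ext: "txt"
    caption_dropout_rate: 0.05
    cache_latents_to_disk: true
    resolution: [ 512, 768, 1024 ]
# img_001.txt could read: "[trigger] sitting at a cafe, candid photo"
# and [trigger] would be swapped for p3r5on when the caption is used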
config/examples/train_lora_flux_kontext_24gb.yaml ADDED
@@ -0,0 +1,106 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flux_kontext_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ - folder_path: "/path/to/images/folder"
35
+ # control path is the input images for kontext for a paired dataset. These are the source images you want to change.
36
+ # You can comment this out and only use normal images if you don't have a paired dataset.
37
+ # Control images need to match the filenames on the folder path but in
38
+ # a different folder. These do not need captions.
39
+ control_path: "/path/to/control/folder"
40
+ caption_ext: "txt"
41
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
42
+ shuffle_tokens: false # shuffle caption order, split by commas
43
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
44
+ # Kontext runs images in at 2x the latent size. It may OOM at 1024 resolution with 24GB vram.
45
+ resolution: [ 512, 768 ] # flux enjoys multiple resolutions
46
+ # resolution: [ 512, 768, 1024 ]
47
+ train:
48
+ batch_size: 1
49
+ steps: 3000 # total number of steps to train 500 - 4000 is a good range
50
+ gradient_accumulation_steps: 1
51
+ train_unet: true
52
+ train_text_encoder: false # probably won't work with flux
53
+ gradient_checkpointing: true # need this on unless you have a ton of vram
54
+ noise_scheduler: "flowmatch" # for training only
55
+ optimizer: "adamw8bit"
56
+ lr: 1e-4
57
+ timestep_type: "weighted" # sigmoid, linear, or weighted.
58
+ # uncomment this to skip the pre training sample
59
+ # skip_first_sample: true
60
+ # uncomment to completely disable sampling
61
+ # disable_sampling: true
62
+
63
+ # ema will smooth out learning, but could slow it down.
64
+
65
+ # ema_config:
66
+ # use_ema: true
67
+ # ema_decay: 0.99
68
+
69
+ # will probably need this if gpu supports it for flux, other dtypes may not work correctly
70
+ dtype: bf16
71
+ model:
72
+ # huggingface model name or path. This model is gated.
73
+ # visit https://huggingface.co/black-forest-labs/FLUX.1-Kontext-dev to accept the terms and conditions
74
+ # and then you can use this model.
75
+ name_or_path: "black-forest-labs/FLUX.1-Kontext-dev"
76
+ arch: "flux_kontext"
77
+ quantize: true # run 8bit mixed precision
78
+ # low_vram: true # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
79
+ sample:
80
+ sampler: "flowmatch" # must match train.noise_scheduler
81
+ sample_every: 250 # sample every this many steps
82
+ width: 1024
83
+ height: 1024
84
+ prompts:
85
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
86
+ # the --ctrl_img path is the one loaded to apply the kontext editing to
87
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
88
+ - "make the person smile --ctrl_img /path/to/control/folder/person1.jpg"
89
+ - "give the person an afro --ctrl_img /path/to/control/folder/person1.jpg"
90
+ - "turn this image into a cartoon --ctrl_img /path/to/control/folder/person1.jpg"
91
+ - "put this person in an action film --ctrl_img /path/to/control/folder/person1.jpg"
92
+ - "make this person a rapper in a rap music video --ctrl_img /path/to/control/folder/person1.jpg"
93
+ - "make the person smile --ctrl_img /path/to/control/folder/person1.jpg"
94
+ - "give the person an afro --ctrl_img /path/to/control/folder/person1.jpg"
95
+ - "turn this image into a cartoon --ctrl_img /path/to/control/folder/person1.jpg"
96
+ - "put this person in an action film --ctrl_img /path/to/control/folder/person1.jpg"
97
+ - "make this person a rapper in a rap music video --ctrl_img /path/to/control/folder/person1.jpg"
98
+ neg: "" # not used on flux
99
+ seed: 42
100
+ walk_seed: true
101
+ guidance_scale: 4
102
+ sample_steps: 20
103
+ # you can add any additional meta info here. [name] is replaced with config name at top
104
+ meta:
105
+ name: "[name]"
106
+ version: '1.0'
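As a rough sketch of the paired layout described by the dataset comments above (folder names are placeholders): the control folder mirrors the target folder by filename, and only the target images carry captions.

datasets:
  - folder_path: "/data/kontext/after"    # person1.jpg + person1.txt  (edited result + caption)
    control_path: "/data/kontext/before"  # person1.jpg                (source image, no caption needed)
    caption_ext: "txt"
    caption_dropout_rate: 0.05
    resolution: [ 512, 768 ]              # keep resolutions modest; Kontext runs at 2x the latent size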
config/examples/train_lora_flux_schnell_24gb.yaml ADDED
@@ -0,0 +1,98 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_flux_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ - folder_path: "/path/to/images/folder"
35
+ caption_ext: "txt"
36
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
37
+ shuffle_tokens: false # shuffle caption order, split by commas
38
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
39
+ resolution: [ 512, 768, 1024 ] # flux enjoys multiple resolutions
40
+ train:
41
+ batch_size: 1
42
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
43
+ gradient_accumulation_steps: 1
44
+ train_unet: true
45
+ train_text_encoder: false # probably won't work with flux
46
+ gradient_checkpointing: true # need this on unless you have a ton of vram
47
+ noise_scheduler: "flowmatch" # for training only
48
+ optimizer: "adamw8bit"
49
+ lr: 1e-4
50
+ # uncomment this to skip the pre training sample
51
+ # skip_first_sample: true
52
+ # uncomment to completely disable sampling
53
+ # disable_sampling: true
54
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
55
+ # linear_timesteps: true
56
+
57
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
58
+ ema_config:
59
+ use_ema: true
60
+ ema_decay: 0.99
61
+
62
+ # will probably need this if gpu supports it for flux, other dtypes may not work correctly
63
+ dtype: bf16
64
+ model:
65
+ # huggingface model name or path
66
+ name_or_path: "black-forest-labs/FLUX.1-schnell"
67
+ assistant_lora_path: "ostris/FLUX.1-schnell-training-adapter" # Required for flux schnell training
68
+ is_flux: true
69
+ quantize: true # run 8bit mixed precision
70
+ # low_vram makes fusing in the adapter painfully slow; avoid it unless absolutely necessary
71
+ # low_vram: true # uncomment this if the GPU is connected to your monitors. It will use less vram to quantize, but is slower.
72
+ sample:
73
+ sampler: "flowmatch" # must match train.noise_scheduler
74
+ sample_every: 250 # sample every this many steps
75
+ width: 1024
76
+ height: 1024
77
+ prompts:
78
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
79
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
80
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
81
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
82
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
83
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
84
+ - "a bear building a log cabin in the snow covered mountains"
85
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
86
+ - "hipster man with a beard, building a chair, in a wood shop"
87
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
88
+ - "a man holding a sign that says, 'this is a sign'"
89
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
90
+ neg: "" # not used on flux
91
+ seed: 42
92
+ walk_seed: true
93
+ guidance_scale: 1 # schnell does not do guidance
94
+ sample_steps: 4 # 1 - 4 works well
95
+ # you can add any additional meta info here. [name] is replaced with config name at top
96
+ meta:
97
+ name: "[name]"
98
+ version: '1.0'
config/examples/train_lora_hidream_48.yaml ADDED
@@ -0,0 +1,112 @@
1
+ # HiDream training is still highly experimental. The settings here will take ~35.2GB of vram to train.
2
+ # It is not possible to train on a single 24GB card yet, but I am working on it. If you have more VRAM
3
+ # I highly recommend first disabling quantization on the model itself if you can. You can leave the TEs quantized.
4
+ # HiDream has a mixture of experts that may take special training considerations that I do not
5
+ # have implemented properly. The current implementation seems to work well for LoRA training, but
6
+ # may not be effective for longer training runs. The implementation could change in future updates
7
+ # so your results may vary when this happens.
8
+
9
+ ---
10
+ job: extension
11
+ config:
12
+ # this name will be the folder and filename name
13
+ name: "my_first_hidream_lora_v1"
14
+ process:
15
+ - type: 'sd_trainer'
16
+ # root folder to save training sessions/samples/weights
17
+ training_folder: "output"
18
+ # uncomment to see performance stats in the terminal every N steps
19
+ # performance_log_every: 1000
20
+ device: cuda:0
21
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
22
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
23
+ # trigger_word: "p3r5on"
24
+ network:
25
+ type: "lora"
26
+ linear: 32
27
+ linear_alpha: 32
28
+ network_kwargs:
29
+ # it is probably best to ignore the mixture of experts since only 2 are active in each block. Training them does work, but I wouldn't recommend it.
30
+ # proper training of it is not fully implemented
31
+ ignore_if_contains:
32
+ - "ff_i.experts"
33
+ - "ff_i.gate"
34
+ save:
35
+ dtype: bfloat16 # precision to save
36
+ save_every: 250 # save every this many steps
37
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
38
+ datasets:
39
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
40
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
41
+ # images will automatically be resized and bucketed into the resolution specified
42
+ # on windows, escape back slashes with another backslash so
43
+ # "C:\\path\\to\\images\\folder"
44
+ - folder_path: "/path/to/images/folder"
45
+ caption_ext: "txt"
46
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
47
+ resolution: [ 512, 768, 1024 ] # hidream enjoys multiple resolutions
48
+ train:
49
+ batch_size: 1
50
+ steps: 3000 # total number of steps to train 500 - 4000 is a good range
51
+ gradient_accumulation_steps: 1
52
+ train_unet: true
53
+ train_text_encoder: false # won't work with hidream
54
+ gradient_checkpointing: true # need this on unless you have a ton of vram
55
+ noise_scheduler: "flowmatch" # for training only
56
+ timestep_type: shift # sigmoid, shift, linear
57
+ optimizer: "adamw8bit"
58
+ lr: 2e-4
59
+ # uncomment this to skip the pre training sample
60
+ # skip_first_sample: true
61
+ # uncomment to completely disable sampling
62
+ # disable_sampling: true
63
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
64
+ # linear_timesteps: true
65
+
66
+ # ema will smooth out learning, but could slow it down. Defaults off
67
+ ema_config:
68
+ use_ema: false
69
+ ema_decay: 0.99
70
+
71
+ # will probably need this if gpu supports it for hidream, other dtypes may not work correctly
72
+ dtype: bf16
73
+ model:
74
+ # the transformer will get grabbed from this hf repo
75
+ # warning ONLY train on Full. The dev and fast models are distilled and will break
76
+ name_or_path: "HiDream-ai/HiDream-I1-Full"
77
+ # the extras will be grabbed from this hf repo. (text encoder, vae)
78
+ extras_name_or_path: "HiDream-ai/HiDream-I1-Full"
79
+ arch: "hidream"
80
+ # both need to be quantized to train on 48GB currently
81
+ quantize: true
82
+ quantize_te: true
83
+ model_kwargs:
84
+ # llama is a gated model. It defaults to the unsloth version, but you can set the llama path here
85
+ llama_model_path: "unsloth/Meta-Llama-3.1-8B-Instruct"
86
+ sample:
87
+ sampler: "flowmatch" # must match train.noise_scheduler
88
+ sample_every: 250 # sample every this many steps
89
+ width: 1024
90
+ height: 1024
91
+ prompts:
92
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
93
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
94
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
95
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
96
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
97
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
98
+ - "a bear building a log cabin in the snow covered mountains"
99
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
100
+ - "hipster man with a beard, building a chair, in a wood shop"
101
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
102
+ - "a man holding a sign that says, 'this is a sign'"
103
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
104
+ neg: ""
105
+ seed: 42
106
+ walk_seed: true
107
+ guidance_scale: 4
108
+ sample_steps: 25
109
+ # you can add any additional meta info here. [name] is replaced with config name at top
110
+ meta:
111
+ name: "[name]"
112
+ version: '1.0'
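If you would rather not pull the Llama text encoder from the default unsloth mirror, the model_kwargs block above can point elsewhere; a sketch assuming a locally downloaded copy of the gated weights (the path is hypothetical):

model_kwargs:
  # hypothetical local download of the gated Llama 3.1 8B Instruct weights
  llama_model_path: "/models/Meta-Llama-3.1-8B-Instruct"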
config/examples/train_lora_lumina.yaml ADDED
@@ -0,0 +1,96 @@
1
+ ---
2
+ # This configuration requires 20GB of VRAM or more to operate
3
+ job: extension
4
+ config:
5
+ # this name will be the folder and filename name
6
+ name: "my_first_lumina_lora_v1"
7
+ process:
8
+ - type: 'sd_trainer'
9
+ # root folder to save training sessions/samples/weights
10
+ training_folder: "output"
11
+ # uncomment to see performance stats in the terminal every N steps
12
+ # performance_log_every: 1000
13
+ device: cuda:0
14
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
15
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
16
+ # trigger_word: "p3r5on"
17
+ network:
18
+ type: "lora"
19
+ linear: 16
20
+ linear_alpha: 16
21
+ save:
22
+ dtype: bf16 # precision to save
23
+ save_every: 250 # save every this many steps
24
+ max_step_saves_to_keep: 2 # how many intermittent saves to keep
25
+ save_format: 'diffusers' # 'diffusers'
26
+ datasets:
27
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
28
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
29
+ # images will automatically be resized and bucketed into the resolution specified
30
+ # on windows, escape back slashes with another backslash so
31
+ # "C:\\path\\to\\images\\folder"
32
+ - folder_path: "/path/to/images/folder"
33
+ caption_ext: "txt"
34
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
35
+ shuffle_tokens: false # shuffle caption order, split by commas
36
+ # cache_latents_to_disk: true # leave this true unless you know what you're doing
37
+ resolution: [ 512, 768, 1024 ] # lumina2 enjoys multiple resolutions
38
+ train:
39
+ batch_size: 1
40
+
41
+ # can be 'sigmoid', 'linear', or 'lumina2_shift'
42
+ timestep_type: 'lumina2_shift'
43
+
44
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
45
+ gradient_accumulation: 1
46
+ train_unet: true
47
+ train_text_encoder: false # probably won't work with lumina2
48
+ gradient_checkpointing: true # need this on unless you have a ton of vram
49
+ noise_scheduler: "flowmatch" # for training only
50
+ optimizer: "adamw8bit"
51
+ lr: 1e-4
52
+ # uncomment this to skip the pre training sample
53
+ # skip_first_sample: true
54
+ # uncomment to completely disable sampling
55
+ # disable_sampling: true
56
+
57
+ # ema will smooth out learning, but could slow it down. Recommended to leave on if you have the vram
58
+ ema_config:
59
+ use_ema: true
60
+ ema_decay: 0.99
61
+
62
+ # will probably need this if gpu supports it for lumina2, other dtypes may not work correctly
63
+ dtype: bf16
64
+ model:
65
+ # huggingface model name or path
66
+ name_or_path: "Alpha-VLLM/Lumina-Image-2.0"
67
+ is_lumina2: true # lumina2 architecture
68
+ # you can quantize just the Gemma2 text encoder here to save vram
69
+ quantize_te: true
70
+ sample:
71
+ sampler: "flowmatch" # must match train.noise_scheduler
72
+ sample_every: 250 # sample every this many steps
73
+ width: 1024
74
+ height: 1024
75
+ prompts:
76
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
77
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
78
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
79
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
80
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
81
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
82
+ - "a bear building a log cabin in the snow covered mountains"
83
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
84
+ - "hipster man with a beard, building a chair, in a wood shop"
85
+ - "photo of a cat that is half black and half orange tabby, split down the middle. The cat has on a blue tophat. They are holding a martini glass with a pink ball of yarn in it with green knitting needles sticking out, in one paw. In the other paw, they are holding a DVD case for a movie titled, \"This is a test\" that has a golden robot on it. In the background is a busy night club with a giant mushroom man dancing with a bear."
86
+ - "a man holding a sign that says, 'this is a sign'"
87
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
88
+ neg: ""
89
+ seed: 42
90
+ walk_seed: true
91
+ guidance_scale: 4.0
92
+ sample_steps: 25
93
+ # you can add any additional meta info here. [name] is replaced with config name at top
94
+ meta:
95
+ name: "[name]"
96
+ version: '1.0'
config/examples/train_lora_omnigen2_24gb.yaml ADDED
@@ -0,0 +1,94 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_omnigen2_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 16
19
+ linear_alpha: 16
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ - folder_path: "/path/to/images/folder"
35
+ caption_ext: "txt"
36
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
37
+ shuffle_tokens: false # shuffle caption order, split by commas
38
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
39
+ resolution: [ 512, 768, 1024 ] # omnigen2 should work with multiple resolutions
40
+ train:
41
+ batch_size: 1
42
+ steps: 3000 # total number of steps to train 500 - 4000 is a good range
43
+ gradient_accumulation: 1
44
+ train_unet: true
45
+ train_text_encoder: false # probably won't work with omnigen2
46
+ gradient_checkpointing: true # need this on unless you have a ton of vram
47
+ noise_scheduler: "flowmatch" # for training only
48
+ optimizer: "adamw8bit"
49
+ lr: 1e-4
50
+ timestep_type: 'sigmoid' # sigmoid, linear, shift
51
+ # uncomment this to skip the pre training sample
52
+ # skip_first_sample: true
53
+ # uncomment to completely disable sampling
54
+ # disable_sampling: true
55
+
56
+ # ema will smooth out learning, but could slow it down.
57
+ # ema_config:
58
+ # use_ema: true
59
+ # ema_decay: 0.99
60
+
61
+ # will probably need this if gpu supports it for omnigen2, other dtypes may not work correctly
62
+ dtype: bf16
63
+ model:
64
+ name_or_path: "OmniGen2/OmniGen2"
65
+ arch: "omnigen2"
66
+ quantize_te: true # quantize only the TE
67
+ # quantize: true # quantize transformer
68
+ sample:
69
+ sampler: "flowmatch" # must match train.noise_scheduler
70
+ sample_every: 250 # sample every this many steps
71
+ width: 1024
72
+ height: 1024
73
+ prompts:
74
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
75
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
76
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
77
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
78
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
79
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
80
+ - "a bear building a log cabin in the snow covered mountains"
81
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
82
+ - "hipster man with a beard, building a chair, in a wood shop"
83
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
84
+ - "a man holding a sign that says, 'this is a sign'"
85
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
86
+ neg: "" # negative prompt, optional
87
+ seed: 42
88
+ walk_seed: true
89
+ guidance_scale: 4
90
+ sample_steps: 25
91
+ # you can add any additional meta info here. [name] is replaced with config name at top
92
+ meta:
93
+ name: "[name]"
94
+ version: '1.0'
config/examples/train_lora_qwen_image_24gb.yaml ADDED
@@ -0,0 +1,95 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_qwen_image_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # Trigger words will not work when caching text embeddings
16
+ # trigger_word: "p3r5on"
17
+ network:
18
+ type: "lora"
19
+ linear: 16
20
+ linear_alpha: 16
21
+ save:
22
+ dtype: float16 # precision to save
23
+ save_every: 250 # save every this many steps
24
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
25
+ datasets:
26
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
27
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
28
+ # images will automatically be resized and bucketed into the resolution specified
29
+ # on windows, escape back slashes with another backslash so
30
+ # "C:\\path\\to\\images\\folder"
31
+ - folder_path: "/path/to/images/folder"
32
+ caption_ext: "txt"
33
+ # default_caption: "a person" # if caching text embeddings and you don't have captions, this will get cached
34
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
35
+ shuffle_tokens: false # shuffle caption order, split by commas
36
+ cache_latents_to_disk: true # leave this true unless you have a large dataset
37
+ # if you OOM, 1024 may be too much, but should work
38
+ resolution: [ 512, 768, 1024 ] # qwen image enjoys multiple resolutions
39
+ train:
40
+ batch_size: 1
41
+ # caching text embeddings is required for 24GB
42
+ cache_text_embeddings: true
43
+
44
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
45
+ gradient_accumulation: 1
46
+ train_unet: true
47
+ train_text_encoder: false # probably won't work with qwen image
48
+ gradient_checkpointing: true # need this on unless you have a ton of vram
49
+ noise_scheduler: "flowmatch" # for training only
50
+ optimizer: "adamw8bit"
51
+ lr: 1e-4
52
+ # uncomment this to skip the pre training sample
53
+ # skip_first_sample: true
54
+ # uncomment to completely disable sampling
55
+ # disable_sampling: true
56
+ dtype: bf16
57
+ model:
58
+ # huggingface model name or path
59
+ name_or_path: "Qwen/Qwen-Image"
60
+ arch: "qwen_image"
61
+ quantize: true
62
+ # qtype_te: "qfloat8" Default float8 quantization
63
+ # to use the ARA, use the | pipe to point to an hf path, or a local path if you have one.
64
+ # 3bit is required for 24GB
65
+ qtype: "uint3|ostris/accuracy_recovery_adapters/qwen_image_torchao_uint3.safetensors"
66
+ quantize_te: true
67
+ qtype_te: "qfloat8"
68
+ low_vram: true
69
+ sample:
70
+ sampler: "flowmatch" # must match train.noise_scheduler
71
+ sample_every: 250 # sample every this many steps
72
+ width: 1024
73
+ height: 1024
74
+ prompts:
75
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
76
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
77
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
78
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
79
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
80
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
81
+ - "a bear building a log cabin in the snow covered mountains"
82
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
83
+ - "hipster man with a beard, building a chair, in a wood shop"
84
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
85
+ - "a man holding a sign that says, 'this is a sign'"
86
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
87
+ neg: ""
88
+ seed: 42
89
+ walk_seed: true
90
+ guidance_scale: 3
91
+ sample_steps: 25
92
+ # you can add any additional meta info here. [name] is replaced with config name at top
93
+ meta:
94
+ name: "[name]"
95
+ version: '1.0'
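The qtype value above uses the pipe syntax described in the comments: base quantization dtype, then |, then the accuracy recovery adapter. A sketch of the same setting pointed at a locally downloaded adapter file (the local path is a placeholder):

model:
  quantize: true
  # hf form, as used above:
  # qtype: "uint3|ostris/accuracy_recovery_adapters/qwen_image_torchao_uint3.safetensors"
  # local form, if you already have the adapter file on disk (placeholder path):
  qtype: "uint3|/models/ara/qwen_image_torchao_uint3.safetensors"
  quantize_te: true
  qtype_te: "qfloat8"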
config/examples/train_lora_qwen_image_edit_32gb.yaml ADDED
@@ -0,0 +1,102 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_qwen_image_edit_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # Trigger words will not work when caching text embeddings
16
+ # trigger_word: "p3r5on"
17
+ network:
18
+ type: "lora"
19
+ linear: 16
20
+ linear_alpha: 16
21
+ save:
22
+ dtype: float16 # precision to save
23
+ save_every: 250 # save every this many steps
24
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
25
+ datasets:
26
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
27
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
28
+ # images will automatically be resized and bucketed into the resolution specified
29
+ # on windows, escape back slashes with another backslash so
30
+ # "C:\\path\\to\\images\\folder"
31
+ - folder_path: "/path/to/images/folder"
32
+ control_path: "/path/to/control/images/folder"
33
+ caption_ext: "txt"
34
+ # default_caption: "a person" # if caching text embeddings and you don't have captions, this will get cached
35
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
36
+ resolution: [ 512, 768, 1024 ] # qwen image enjoys multiple resolutions
37
+ train:
38
+ batch_size: 1
39
+ # caching text embeddings is required for 32GB
40
+ cache_text_embeddings: true
41
+
42
+ steps: 3000 # total number of steps to train 500 - 4000 is a good range
43
+ gradient_accumulation: 1
44
+ timestep_type: "weighted"
45
+ train_unet: true
46
+ train_text_encoder: false # probably won't work with qwen image
47
+ gradient_checkpointing: true # need the on unless you have a ton of vram
48
+ noise_scheduler: "flowmatch" # for training only
49
+ optimizer: "adamw8bit"
50
+ lr: 1e-4
51
+ # uncomment this to skip the pre training sample
52
+ # skip_first_sample: true
53
+ # uncomment to completely disable sampling
54
+ # disable_sampling: true
55
+ dtype: bf16
56
+ model:
57
+ # huggingface model name or path
58
+ name_or_path: "Qwen/Qwen-Image-Edit"
59
+ arch: "qwen_image_edit"
60
+ quantize: true
61
+ # qtype_te: "qfloat8" Default float8 qquantization
62
+ # to use the ARA use the | pipe to point to hf path, or a local path if you have one.
63
+ # 3bit is required for 32GB
64
+ qtype: "uint3|qwen_image_edit_torchao_uint3.safetensors"
65
+ quantize_te: true
66
+ qtype_te: "qfloat8"
67
+ low_vram: true
68
+ sample:
69
+ sampler: "flowmatch" # must match train.noise_scheduler
70
+ sample_every: 250 # sample every this many steps
71
+ width: 1024
72
+ height: 1024
73
+ samples:
74
+ - prompt: "do the thing to it"
75
+ ctrl_img: "/path/to/control/image.jpg"
76
+ - prompt: "do the thing to it"
77
+ ctrl_img: "/path/to/control/image.jpg"
78
+ - prompt: "do the thing to it"
79
+ ctrl_img: "/path/to/control/image.jpg"
80
+ - prompt: "do the thing to it"
81
+ ctrl_img: "/path/to/control/image.jpg"
82
+ - prompt: "do the thing to it"
83
+ ctrl_img: "/path/to/control/image.jpg"
84
+ - prompt: "do the thing to it"
85
+ ctrl_img: "/path/to/control/image.jpg"
86
+ - prompt: "do the thing to it"
87
+ ctrl_img: "/path/to/control/image.jpg"
88
+ - prompt: "do the thing to it"
89
+ ctrl_img: "/path/to/control/image.jpg"
90
+ - prompt: "do the thing to it"
91
+ ctrl_img: "/path/to/control/image.jpg"
92
+ - prompt: "do the thing to it"
93
+ ctrl_img: "/path/to/control/image.jpg"
94
+ neg: ""
95
+ seed: 42
96
+ walk_seed: true
97
+ guidance_scale: 3
98
+ sample_steps: 25
99
+ # you can add any additional meta info here. [name] is replaced with config name at top
100
+ meta:
101
+ name: "[name]"
102
+ version: '1.0'
config/examples/train_lora_sd35_large_24gb.yaml ADDED
@@ -0,0 +1,97 @@
1
+ ---
2
+ # NOTE!! THIS IS CURRENTLY EXPERIMENTAL AND UNDER DEVELOPMENT. SOME THINGS WILL CHANGE
3
+ job: extension
4
+ config:
5
+ # this name will be the folder and filename name
6
+ name: "my_first_sd3l_lora_v1"
7
+ process:
8
+ - type: 'sd_trainer'
9
+ # root folder to save training sessions/samples/weights
10
+ training_folder: "output"
11
+ # uncomment to see performance stats in the terminal every N steps
12
+ # performance_log_every: 1000
13
+ device: cuda:0
14
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
15
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
16
+ # trigger_word: "p3r5on"
17
+ network:
18
+ type: "lora"
19
+ linear: 16
20
+ linear_alpha: 16
21
+ save:
22
+ dtype: float16 # precision to save
23
+ save_every: 250 # save every this many steps
24
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
25
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
26
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
27
+ # hf_repo_id: your-username/your-model-slug
28
+ # hf_private: true #whether the repo is private or public
29
+ datasets:
30
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
31
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
32
+ # images will automatically be resized and bucketed into the resolution specified
33
+ # on windows, escape back slashes with another backslash so
34
+ # "C:\\path\\to\\images\\folder"
35
+ - folder_path: "/path/to/images/folder"
36
+ caption_ext: "txt"
37
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
38
+ shuffle_tokens: false # shuffle caption order, split by commas
39
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
40
+ resolution: [ 1024 ]
41
+ train:
42
+ batch_size: 1
43
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
44
+ gradient_accumulation_steps: 1
45
+ train_unet: true
46
+ train_text_encoder: false # May not fully work with SD3 yet
47
+ gradient_checkpointing: true # need the on unless you have a ton of vram
48
+ noise_scheduler: "flowmatch"
49
+ timestep_type: "linear" # linear or sigmoid
50
+ optimizer: "adamw8bit"
51
+ lr: 1e-4
52
+ # uncomment this to skip the pre training sample
53
+ # skip_first_sample: true
54
+ # uncomment to completely disable sampling
55
+ # disable_sampling: true
56
+ # uncomment to use new bell curved weighting. Experimental but may produce better results
57
+ # linear_timesteps: true
58
+
59
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
60
+ ema_config:
61
+ use_ema: true
62
+ ema_decay: 0.99
63
+
64
+ # will probably need this if gpu supports it for sd3, other dtypes may not work correctly
65
+ dtype: bf16
66
+ model:
67
+ # huggingface model name or path
68
+ name_or_path: "stabilityai/stable-diffusion-3.5-large"
69
+ is_v3: true
70
+ quantize: true # run 8bit mixed precision
71
+ sample:
72
+ sampler: "flowmatch" # must match train.noise_scheduler
73
+ sample_every: 250 # sample every this many steps
74
+ width: 1024
75
+ height: 1024
76
+ prompts:
77
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
78
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
79
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
80
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
81
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, lazer lights, holding a martini"
82
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
83
+ - "a bear building a log cabin in the snow covered mountains"
84
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
85
+ - "hipster man with a beard, building a chair, in a wood shop"
86
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
87
+ - "a man holding a sign that says, 'this is a sign'"
88
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
89
+ neg: ""
90
+ seed: 42
91
+ walk_seed: true
92
+ guidance_scale: 4
93
+ sample_steps: 25
94
+ # you can add any additional meta info here. [name] is replaced with config name at top
95
+ meta:
96
+ name: "[name]"
97
+ version: '1.0'
config/examples/train_lora_wan21_14b_24gb.yaml ADDED
@@ -0,0 +1,101 @@
1
+ # IMPORTANT: The Wan2.1 14B model is huge. This config should work on 24GB GPUs. It cannot
2
+ # support keeping the text encoder on GPU while training with 24GB, so it is only good
3
+ # for training on a single prompt, for example a person with a trigger word.
4
+ # To train on captions, you need more vram for now.
5
+ ---
6
+ job: extension
7
+ config:
8
+ # this name will be the folder and filename name
9
+ name: "my_first_wan21_14b_lora_v1"
10
+ process:
11
+ - type: 'sd_trainer'
12
+ # root folder to save training sessions/samples/weights
13
+ training_folder: "output"
14
+ # uncomment to see performance stats in the terminal every N steps
15
+ # performance_log_every: 1000
16
+ device: cuda:0
17
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
18
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
19
+ # this is probably needed for 24GB cards when offloading TE to CPU
20
+ trigger_word: "p3r5on"
21
+ network:
22
+ type: "lora"
23
+ linear: 32
24
+ linear_alpha: 32
25
+ save:
26
+ dtype: float16 # precision to save
27
+ save_every: 250 # save every this many steps
28
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
29
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
30
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
31
+ # hf_repo_id: your-username/your-model-slug
32
+ # hf_private: true #whether the repo is private or public
33
+ datasets:
34
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
35
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
36
+ # images will automatically be resized and bucketed into the resolution specified
37
+ # on windows, escape back slashes with another backslash so
38
+ # "C:\\path\\to\\images\\folder"
39
+ # AI-Toolkit does not currently support video datasets; we will train on 1 frame at a time
40
+ # it works well for characters, but not as well for "actions"
41
+ - folder_path: "/path/to/images/folder"
42
+ caption_ext: "txt"
43
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
44
+ shuffle_tokens: false # shuffle caption order, split by commas
45
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
46
+ resolution: [ 632 ] # will be around 480p
47
+ train:
48
+ batch_size: 1
49
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
50
+ gradient_accumulation: 1
51
+ train_unet: true
52
+ train_text_encoder: false # probably won't work with wan
53
+ gradient_checkpointing: true # need the on unless you have a ton of vram
54
+ noise_scheduler: "flowmatch" # for training only
55
+ timestep_type: 'sigmoid'
56
+ optimizer: "adamw8bit"
57
+ lr: 1e-4
58
+ optimizer_params:
59
+ weight_decay: 1e-4
60
+ # uncomment this to skip the pre training sample
61
+ # skip_first_sample: true
62
+ # uncomment to completely disable sampling
63
+ # disable_sampling: true
64
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
65
+ ema_config:
66
+ use_ema: true
67
+ ema_decay: 0.99
68
+ dtype: bf16
69
+ # required for 24GB cards
70
+ # this will encode your trigger word and use those embeddings for every image in the dataset
71
+ unload_text_encoder: true
72
+ model:
73
+ # huggingface model name or path
74
+ name_or_path: "Wan-AI/Wan2.1-T2V-14B-Diffusers"
75
+ arch: 'wan21'
76
+ # these settings will save as much vram as possible
77
+ quantize: true
78
+ quantize_te: true
79
+ low_vram: true
80
+ sample:
81
+ sampler: "flowmatch"
82
+ sample_every: 250 # sample every this many steps
83
+ width: 832
84
+ height: 480
85
+ num_frames: 40
86
+ fps: 15
87
+ # samples take a long time, so use them sparingly
88
+ # samples will be animated webp files, if you don't see them animated, open in a browser.
89
+ prompts:
90
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
91
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
92
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
93
+ neg: ""
94
+ seed: 42
95
+ walk_seed: true
96
+ guidance_scale: 5
97
+ sample_steps: 30
98
+ # you can add any additional meta info here. [name] is replaced with config name at top
99
+ meta:
100
+ name: "[name]"
101
+ version: '1.0'
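Because unload_text_encoder pre-encodes only the trigger word and reuses that embedding for every image, the caption files are effectively ignored in this 24GB setup; the essential pairing is just the trigger word plus the flag, sketched below (the trigger word is a placeholder).

trigger_word: "p3r5on"        # process-level, as above
train:
  unload_text_encoder: true   # encodes "p3r5on" once and uses that embedding for every image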
config/examples/train_lora_wan21_1b_24gb.yaml ADDED
@@ -0,0 +1,90 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ # this name will be the folder and filename name
5
+ name: "my_first_wan21_1b_lora_v1"
6
+ process:
7
+ - type: 'sd_trainer'
8
+ # root folder to save training sessions/samples/weights
9
+ training_folder: "output"
10
+ # uncomment to see performance stats in the terminal every N steps
11
+ # performance_log_every: 1000
12
+ device: cuda:0
13
+ # if a trigger word is specified, it will be added to captions of training data if it does not already exist
14
+ # alternatively, in your captions you can add [trigger] and it will be replaced with the trigger word
15
+ # trigger_word: "p3r5on"
16
+ network:
17
+ type: "lora"
18
+ linear: 32
19
+ linear_alpha: 32
20
+ save:
21
+ dtype: float16 # precision to save
22
+ save_every: 250 # save every this many steps
23
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
24
+ push_to_hub: false #change this to True to push your trained model to Hugging Face.
25
+ # You can either set up a HF_TOKEN env variable or you'll be prompted to log-in
26
+ # hf_repo_id: your-username/your-model-slug
27
+ # hf_private: true #whether the repo is private or public
28
+ datasets:
29
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
30
+ # for instance image2.jpg and image2.txt. Only jpg, jpeg, and png are supported currently
31
+ # images will automatically be resized and bucketed into the resolution specified
32
+ # on windows, escape back slashes with another backslash so
33
+ # "C:\\path\\to\\images\\folder"
34
+ # AI-Toolkit does not currently support video datasets, we will train on 1 frame at a time
35
+ # it works well for characters, but not as well for "actions"
36
+ - folder_path: "/path/to/images/folder"
37
+ caption_ext: "txt"
38
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
39
+ shuffle_tokens: false # shuffle caption order, split by commas
40
+ cache_latents_to_disk: true # leave this true unless you know what you're doing
41
+ resolution: [ 632 ] # will be around 480p
42
+ train:
43
+ batch_size: 1
44
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
45
+ gradient_accumulation: 1
46
+ train_unet: true
47
+ train_text_encoder: false # probably won't work with wan
48
+ gradient_checkpointing: true # need this on unless you have a ton of vram
49
+ noise_scheduler: "flowmatch" # for training only
50
+ timestep_type: 'sigmoid'
51
+ optimizer: "adamw8bit"
52
+ lr: 1e-4
53
+ optimizer_params:
54
+ weight_decay: 1e-4
55
+ # uncomment this to skip the pre training sample
56
+ # skip_first_sample: true
57
+ # uncomment to completely disable sampling
58
+ # disable_sampling: true
59
+ # ema will smooth out learning, but could slow it down. Recommended to leave on.
60
+ ema_config:
61
+ use_ema: true
62
+ ema_decay: 0.99
63
+ dtype: bf16
64
+ model:
65
+ # huggingface model name or path
66
+ name_or_path: "Wan-AI/Wan2.1-T2V-1.3B-Diffusers"
67
+ arch: 'wan21'
68
+ quantize_te: true # saves vram
69
+ sample:
70
+ sampler: "flowmatch"
71
+ sample_every: 250 # sample every this many steps
72
+ width: 832
73
+ height: 480
74
+ num_frames: 40
75
+ fps: 15
76
+ # samples take a long time. so use them sparingly
77
+ # samples will be animated webp files, if you don't see them animated, open in a browser.
78
+ prompts:
79
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
80
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"\
81
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
82
+ neg: ""
83
+ seed: 42
84
+ walk_seed: true
85
+ guidance_scale: 5
86
+ sample_steps: 30
87
+ # you can add any additional meta info here. [name] is replaced with config name at top
88
+ meta:
89
+ name: "[name]"
90
+ version: '1.0'
config/examples/train_lora_wan22_14b_24gb.yaml ADDED
@@ -0,0 +1,111 @@
1
+ # this example focuses mainly on training Wan2.2 14b on images. It will work for video as well by increasing
2
+ # the number of frames in the dataset and samples. Training on and generating video is very VRAM intensive.
3
+ ---
4
+ job: extension
5
+ config:
6
+ # this name will be the folder and filename name
7
+ name: "my_first_wan22_14b_lora_v1"
8
+ process:
9
+ - type: 'sd_trainer'
10
+ # root folder to save training sessions/samples/weights
11
+ training_folder: "output"
12
+ # uncomment to see performance stats in the terminal every N steps
13
+ # performance_log_every: 1000
14
+ device: cuda:0
15
+ # Use a trigger word if train.unload_text_encoder is true; however, do not use a trigger word if caching text embeddings
16
+ # trigger_word: "p3r5on"
17
+ network:
18
+ type: "lora"
19
+ linear: 32
20
+ linear_alpha: 32
21
+ save:
22
+ dtype: float16 # precision to save
23
+ save_every: 250 # save every this many steps
24
+ max_step_saves_to_keep: 4 # how many intermittent saves to keep
25
+ datasets:
26
+ # datasets are a folder of images. captions need to be txt files with the same name as the image
27
+ # for instance image2.jpg and image2.txt.
28
+ # "C:\\path\\to\\images\\folder"
29
+ - folder_path: "/path/to/images/or/video/folder"
30
+ caption_ext: "txt"
31
+ caption_dropout_rate: 0.05 # will drop out the caption 5% of time
32
+ # number of frames to extract from your video. It will automatically extract them evenly spaced
33
+ # set to 1 frame for images
34
+ num_frames: 1
35
+ resolution: [ 512, 768, 1024]
36
+ train:
37
+ batch_size: 1
38
+ steps: 2000 # total number of steps to train 500 - 4000 is a good range
39
+ gradient_accumulation: 1
40
+ train_unet: true
41
+ train_text_encoder: false # probably won't work with wan
42
+ gradient_checkpointing: true # need the on unless you have a ton of vram
43
+ noise_scheduler: "flowmatch" # for training only
44
+ timestep_type: 'linear'
45
+ optimizer: "adamw8bit"
46
+ lr: 1e-4
47
+ optimizer_params:
48
+ weight_decay: 1e-4
49
+ # uncomment this to skip the pre training sample
50
+ # skip_first_sample: true
51
+ # uncomment to completely disable sampling
52
+ # disable_sampling: true
53
+ dtype: bf16
54
+
55
+ # IMPORTANT: this is for the Wan 2.2 MoE. It will switch between training one stage and the other every this many steps
56
+ switch_boundary_every: 10
57
+
58
+ # required for 24GB cards. You must do either unload_text_encoder or cache_text_embeddings but not both
59
+
60
+ # this will encode your trigger word and use those embeddings for every image in the dataset, captions will be ignored
61
+ # unload_text_encoder: true
62
+
63
+ # this will cache all captions in your dataset.
64
+ cache_text_embeddings: true
65
+
66
+ model:
67
+ # huggingface model name or path, this one is bf16, vs the float32 of the official repo
68
+ name_or_path: "ai-toolkit/Wan2.2-T2V-A14B-Diffusers-bf16"
69
+ arch: 'wan22_14b'
70
+ quantize: true
71
+ # This will pull and use a custom Accuracy Recovery Adapter to train at 4bit
72
+ qtype: "uint4|ostris/accuracy_recovery_adapters/wan22_14b_t2i_torchao_uint4.safetensors"
73
+ quantize_te: true
74
+ qtype_te: "qfloat8"
75
+ low_vram: true
76
+ model_kwargs:
77
+ # you can train high noise, low noise, or both. With low vram it will automatically unload the one not being trained.
78
+ train_high_noise: true
79
+ train_low_noise: true
80
+ sample:
81
+ sampler: "flowmatch"
82
+ sample_every: 250 # sample every this many steps
83
+ width: 1024
84
+ height: 1024
85
+ # set to 1 for images
86
+ num_frames: 1
87
+ fps: 16
88
+ # samples take a long time. so use them sparingly
89
+ # samples will be animated webp files; if you don't see them animated, open them in a browser.
90
+ prompts:
91
+ # you can add [trigger] to the prompts here and it will be replaced with the trigger word
92
+ # - "[trigger] holding a sign that says 'I LOVE PROMPTS!'"
93
+ - "woman with red hair, playing chess at the park, bomb going off in the background"
94
+ - "a woman holding a coffee cup, in a beanie, sitting at a cafe"
95
+ - "a horse is a DJ at a night club, fish eye lens, smoke machine, laser lights, holding a martini"
96
+ - "a man showing off his cool new t shirt at the beach, a shark is jumping out of the water in the background"
97
+ - "a bear building a log cabin in the snow covered mountains"
98
+ - "woman playing the guitar, on stage, singing a song, laser lights, punk rocker"
99
+ - "hipster man with a beard, building a chair, in a wood shop"
100
+ - "photo of a man, white background, medium shot, modeling clothing, studio lighting, white backdrop"
101
+ - "a man holding a sign that says, 'this is a sign'"
102
+ - "a bulldog, in a post apocalyptic world, with a shotgun, in a leather jacket, in a desert, with a motorcycle"
103
+ neg: ""
104
+ seed: 42
105
+ walk_seed: true
106
+ guidance_scale: 3.5
107
+ sample_steps: 25
108
+ # you can add any additional meta info here. [name] is replaced with config name at top
109
+ meta:
110
+ name: "[name]"
111
+ version: '1.0'
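
A quick way to check a dataset folder against the convention described in the config above (every image or video has a caption file with the same basename and the configured caption_ext) is a short script like the one below. This is an illustrative sketch, not part of AI-Toolkit; the function name and the extension list are my own choices.

import os

MEDIA_EXTS = {".jpg", ".jpeg", ".png", ".webp", ".mp4"}  # assumed set, adjust to your data

def check_dataset(folder: str, caption_ext: str = "txt") -> None:
    # report media files that are missing a matching caption file
    missing = []
    for name in sorted(os.listdir(folder)):
        stem, ext = os.path.splitext(name)
        if ext.lower() in MEDIA_EXTS and not os.path.isfile(os.path.join(folder, f"{stem}.{caption_ext}")):
            missing.append(name)
    print(f"{len(missing)} file(s) without captions")
    for name in missing:
        print("  missing caption for", name)

check_dataset("/path/to/images/or/video/folder")

Catching missing captions before a run matters more than usual here, because cache_text_embeddings encodes every caption in the dataset up front.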
config/examples/train_slider.example.yml ADDED
@@ -0,0 +1,230 @@
1
+ ---
2
+ # This is in yaml format. You can use json if you prefer
3
+ # I like both but yaml is easier to write
4
+ # Plus it has comments which is nice for documentation
5
+ # This is the config I use on my sliders, It is solid and tested
6
+ job: train
7
+ config:
8
+ # the name will be used to create a folder in the output folder
9
+ # it will also replace any [name] token in the rest of this config
10
+ name: detail_slider_v1
11
+ # folder will be created with name above in folder below
12
+ # it can be relative to the project root or absolute
13
+ training_folder: "output/LoRA"
14
+ device: cuda:0 # cpu, cuda:0, etc
15
+ # for tensorboard logging, we will make a subfolder for this job
16
+ log_dir: "output/.tensorboard"
17
+ # you can stack processes for other jobs. It is not tested with sliders though
18
+ # just use one for now
19
+ process:
20
+ - type: slider # tells runner to run the slider process
21
+ # network is the LoRA network for a slider. I recommend leaving this as is
22
+ network:
23
+ # network type lierla is traditional LoRA that works everywhere, only linear layers
24
+ type: "lierla"
25
+ # rank / dim of the network. Bigger is not always better. Especially for sliders. 8 is good
26
+ linear: 8
27
+ linear_alpha: 4 # Do about half of rank
28
+ # training config
29
+ train:
30
+ # this is also used in sampling. Stick with ddpm unless you know what you are doing
31
+ noise_scheduler: "ddpm" # or "ddpm", "lms", "euler_a"
32
+ # how many steps to train. More is not always better. I rarely go over 1000
33
+ steps: 500
34
+ # I have had good results with 4e-4 to 1e-4 at 500 steps
35
+ lr: 2e-4
36
+ # enables gradient checkpoint, saves vram, leave it on
37
+ gradient_checkpointing: true
38
+ # train the unet. I recommend leaving this true
39
+ train_unet: true
40
+ # train the text encoder. I don't recommend this unless you have a special use case
41
+ # for sliders we are adjusting representation of the concept (unet),
42
+ # not the description of it (text encoder)
43
+ train_text_encoder: false
44
+ # same as from sd-scripts, not fully tested but should speed up training
45
+ min_snr_gamma: 5.0
46
+ # just leave unless you know what you are doing
47
+ # also supports "dadaptation" but set lr to 1 if you use that,
48
+ # but it learns too fast and I don't recommend it
49
+ optimizer: "adamw"
50
+ # only constant for now
51
+ lr_scheduler: "constant"
52
+ # we randomly denoise a random number of steps from 1 to this number
53
+ # while training. Just leave it
54
+ max_denoising_steps: 40
55
+ # works great at 1. I do 1 even with my 4090.
56
+ # higher may not work right with newer single batch stacking code anyway
57
+ batch_size: 1
58
+ # bf16 works best if your GPU supports it (modern)
59
+ dtype: bf16 # fp32, bf16, fp16
60
+ # if you have it, use it. It is faster and better
61
+ # torch 2.0 doesn't need xformers anymore, only use it if you have a lower version
62
+ # xformers: true
63
+ # I don't recommend using unless you are trying to make a darker lora. Then do 0.1 MAX
64
+ # although, the way we train sliders is comparative, so it probably won't work anyway
65
+ noise_offset: 0.0
66
+ # noise_offset: 0.0357 # SDXL was trained with offset of 0.0357. So use that when training on SDXL
67
+
68
+ # the model to train the LoRA network on
69
+ model:
70
+ # huggingface name, path relative to the project root, or absolute path to .safetensors or .ckpt
71
+ name_or_path: "runwayml/stable-diffusion-v1-5"
72
+ is_v2: false # for v2 models
73
+ is_v_pred: false # for v-prediction models (most v2 models)
74
+ # has some issues with the dual text encoder and the way we train sliders
75
+ # it works, but weights probably need to be higher to see the effect.
76
+ is_xl: false # for SDXL models
77
+
78
+ # saving config
79
+ save:
80
+ dtype: float16 # precision to save. I recommend float16
81
+ save_every: 50 # save every this many steps
82
+ # this will remove older step saves once there are more than this number
83
+ # allows you to save more often in case of a crash without filling up your drive
84
+ max_step_saves_to_keep: 2
85
+
86
+ # sampling config
87
+ sample:
88
+ # must match train.noise_scheduler, this is not used here
89
+ # but may be in future and in other processes
90
+ sampler: "ddpm"
91
+ # sample every this many steps
92
+ sample_every: 20
93
+ # image size
94
+ width: 512
95
+ height: 512
96
+ # prompts to use for sampling. Do as many as you want, but it slows down training
97
+ # pick ones that will best represent the concept you are trying to adjust
98
+ # allows some flags after the prompt
99
+ # --m [number] # network multiplier. LoRA weight. -3 for the negative slide, 3 for the positive
100
+ # slide are good tests. will inherit sample.network_multiplier if not set
101
+ # --n [string] # negative prompt, will inherit sample.neg if not set
102
+ # Only 75 tokens allowed currently
103
+ # I like to do a wide positive and negative spread so I can see a good range and stop
104
+ # early if the network is breaking down
105
+ prompts:
106
+ - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m -5"
107
+ - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m -3"
108
+ - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m 3"
109
+ - "a woman in a coffee shop, black hat, blonde hair, blue jacket --m 5"
110
+ - "a golden retriever sitting on a leather couch, --m -5"
111
+ - "a golden retriever sitting on a leather couch --m -3"
112
+ - "a golden retriever sitting on a leather couch --m 3"
113
+ - "a golden retriever sitting on a leather couch --m 5"
114
+ - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m -5"
115
+ - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m -3"
116
+ - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m 3"
117
+ - "a man with a beard and red flannel shirt, wearing vr goggles, walking into traffic --m 5"
118
+ # negative prompt used on all prompts above as default if they don't have one
119
+ neg: "cartoon, fake, drawing, illustration, cgi, animated, anime, monochrome"
120
+ # seed for sampling. 42 is the answer for everything
121
+ seed: 42
122
+ # walks the seed so s1 is 42, s2 is 43, s3 is 44, etc
123
+ # will start over on next sample_every so s1 is always seed
124
+ # works well if you use same prompt but want different results
125
+ walk_seed: false
126
+ # cfg scale (4 to 10 is good)
127
+ guidance_scale: 7
128
+ # sampler steps (20 to 30 is good)
129
+ sample_steps: 20
130
+ # default network multiplier for all prompts
131
+ # since we are training a slider, I recommend overriding this with --m [number]
132
+ # in the prompts above to get both sides of the slider
133
+ network_multiplier: 1.0
134
+
135
+ # logging information
136
+ logging:
137
+ log_every: 10 # log every this many steps
138
+ use_wandb: false # not supported yet
139
+ verbose: false # probably don't need this unless you are debugging
140
+
141
+ # slider training config, best for last
142
+ slider:
143
+ # resolutions to train on. [ width, height ]. This is less important for sliders
144
+ # as we are not teaching the model anything it doesn't already know
145
+ # but must be a size it understands [ 512, 512 ] for sd_v1.5 and [ 768, 768 ] for sd_v2.1
146
+ # and [ 1024, 1024 ] for sd_xl
147
+ # you can do as many as you want here
148
+ resolutions:
149
+ - [ 512, 512 ]
150
+ # - [ 512, 768 ]
151
+ # - [ 768, 768 ]
152
+ # slider training uses 4 combined steps for a single round. This will do it in one gradient
153
+ # step. It is highly optimized and shouldn't take any more vram than doing it without,
154
+ # since we break down batches for gradient accumulation now. so just leave it on.
155
+ batch_full_slide: true
156
+ # These are the concepts to train on. You can do as many as you want here,
157
+ # but they can conflict and outweigh each other. Other than experimenting, I recommend
158
+ # just doing one for good results
159
+ targets:
160
+ # target_class is the base concept we are adjusting the representation of
161
+ # for example, if we are adjusting the representation of a person, we would use "person"
162
+ # if we are adjusting the representation of a cat, we would use "cat" It is not
163
+ # a keyword necessarily but what the model understands the concept to represent.
164
+ # "person" will affect men, women, children, etc but will not affect cats, dogs, etc
165
+ # it is the models base general understanding of the concept and everything it represents
166
+ # you can leave it blank to affect everything. In this example, we are adjusting
167
+ # detail, so we will leave it blank to affect everything
168
+ - target_class: ""
169
+ # positive is the prompt for the positive side of the slider.
170
+ # It is the concept that will be excited and amplified in the model when we slide the slider
171
+ # to the positive side and forgotten / inverted when we slide
172
+ # the slider to the negative side. It is generally best to include the target_class in
173
+ # the prompt. You want it to be the extreme of what you want to train on. For example,
174
+ # if you want to train on fat people, you would use "an extremely fat, morbidly obese person"
175
+ # as the prompt. Not just "fat person"
176
+ # max 75 tokens for now
177
+ positive: "high detail, 8k, intricate, detailed, high resolution, high res, high quality"
178
+ # negative is the prompt for the negative side of the slider and works the same as positive
179
+ # it does not necessarily work the same as a negative prompt when generating images
180
+ # these need to be polar opposites.
181
+ # max 76 tokens for now
182
+ negative: "blurry, boring, fuzzy, low detail, low resolution, low res, low quality"
183
+ # the loss for this target is multiplied by this number.
184
+ # if you are doing more than one target it may be good to set less important ones
185
+ # to a lower number like 0.1 so they don't outweigh the primary target
186
+ weight: 1.0
187
+ # shuffle the prompts split by the comma. We will run every combination randomly
188
+ # this will make the LoRA more robust. You probably want this on unless prompt order
189
+ # is important for some reason
190
+ shuffle: true
191
+
192
+
193
+ # anchors are prompts that we will try to hold on to while training the slider
194
+ # these are NOT necessary and can prevent the slider from converging if not done right
195
+ # leave them off if you are having issues, but they can help lock the network
196
+ # on certain concepts to help prevent catastrophic forgetting
197
+ # you want these to generate an image that is not your target_class, but close to it
198
+ # is fine as long as it does not directly overlap it.
199
+ # For example, if you are training on a person smiling,
200
+ # you could use "a person with a face mask" as an anchor. It is a person, the image is the same
201
+ # regardless if they are smiling or not, however, the closer the concept is to the target_class
202
+ # the less the multiplier needs to be. Keep multipliers less than 1.0 for anchors usually
203
+ # for close concepts, you want to be closer to 0.1 or 0.2
204
+ # these will slow down training. I am leaving them off for the demo
205
+
206
+ # anchors:
207
+ # - prompt: "a woman"
208
+ # neg_prompt: "animal"
209
+ # # the multiplier applied to the LoRA when this is run.
210
+ # # higher will give it more weight but also help keep the lora from collapsing
211
+ # multiplier: 1.0
212
+ # - prompt: "a man"
213
+ # neg_prompt: "animal"
214
+ # multiplier: 1.0
215
+ # - prompt: "a person"
216
+ # neg_prompt: "animal"
217
+ # multiplier: 1.0
218
+
219
+ # You can put any information you want here, and it will be saved in the model.
220
+ # The below is an example, but you can put your grocery list in it if you want.
221
+ # It is saved in the model so be aware of that. The software will include this
222
+ # plus some other information for you automatically
223
+ meta:
224
+ # [name] gets replaced with the name above
225
+ name: "[name]"
226
+ # version: '1.0'
227
+ # creator:
228
+ # name: Your Name
229
+ # email: [email protected]
230
+ # website: https://your.website
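
The sample prompts in this config embed per-prompt flags: --m sets the network multiplier (the slider weight) and --n overrides the negative prompt, falling back to sample.network_multiplier and sample.neg when absent. A minimal sketch of how such a prompt string can be split into its parts is shown below; it is illustrative only and not the toolkit's actual parser.

def parse_sample_prompt(prompt: str, default_multiplier: float = 1.0, default_neg: str = ""):
    # returns (positive_prompt, network_multiplier, negative_prompt)
    parts = prompt.split("--")
    positive = parts[0].strip().rstrip(",")
    multiplier, neg = default_multiplier, default_neg
    for chunk in parts[1:]:
        chunk = chunk.strip()
        if chunk.startswith("m "):
            multiplier = float(chunk[2:].strip())
        elif chunk.startswith("n "):
            neg = chunk[2:].strip()
    return positive, multiplier, neg

print(parse_sample_prompt("a golden retriever sitting on a leather couch --m -5"))
# ('a golden retriever sitting on a leather couch', -5.0, '')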
docker-compose.yml ADDED
@@ -0,0 +1,25 @@
1
+ version: "3.8"
2
+
3
+ services:
4
+ ai-toolkit:
5
+ image: ostris/aitoolkit:latest
6
+ restart: unless-stopped
7
+ ports:
8
+ - "8675:8675"
9
+ volumes:
10
+ - ~/.cache/huggingface/hub:/root/.cache/huggingface/hub
11
+ - ./aitk_db.db:/app/ai-toolkit/aitk_db.db
12
+ - ./datasets:/app/ai-toolkit/datasets
13
+ - ./output:/app/ai-toolkit/output
14
+ - ./config:/app/ai-toolkit/config
15
+ environment:
16
+ - AI_TOOLKIT_AUTH=${AI_TOOLKIT_AUTH:-password}
17
+ - NODE_ENV=production
18
+ - TZ=UTC
19
+ deploy:
20
+ resources:
21
+ reservations:
22
+ devices:
23
+ - driver: nvidia
24
+ count: all
25
+ capabilities: [gpu]
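
With this file in place, the stack is typically started with AI_TOOLKIT_AUTH=your-password docker compose up -d and reached on port 8675. The bind mounts keep the database, datasets, config, and outputs on the host so they survive container rebuilds, and the deploy.resources block assumes the NVIDIA Container Toolkit is installed so the container can see the GPU.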
extensions/example/ExampleMergeModels.py ADDED
@@ -0,0 +1,129 @@
1
+ import torch
2
+ import gc
3
+ from collections import OrderedDict
4
+ from typing import TYPE_CHECKING
5
+ from jobs.process import BaseExtensionProcess
6
+ from toolkit.config_modules import ModelConfig
7
+ from toolkit.stable_diffusion_model import StableDiffusion
8
+ from toolkit.train_tools import get_torch_dtype
9
+ from tqdm import tqdm
10
+
11
+ # Type check imports. Prevents circular imports
12
+ if TYPE_CHECKING:
13
+ from jobs import ExtensionJob
14
+
15
+
16
+ # extend standard config classes to add weight
17
+ class ModelInputConfig(ModelConfig):
18
+ def __init__(self, **kwargs):
19
+ super().__init__(**kwargs)
20
+ self.weight = kwargs.get('weight', 1.0)
21
+ # overwrite default dtype unless user specifies otherwise
22
+ # float32 will give us better precision in the merging functions
23
+ self.dtype: str = kwargs.get('dtype', 'float32')
24
+
25
+
26
+ def flush():
27
+ torch.cuda.empty_cache()
28
+ gc.collect()
29
+
30
+
31
+ # this is our main class process
32
+ class ExampleMergeModels(BaseExtensionProcess):
33
+ def __init__(
34
+ self,
35
+ process_id: int,
36
+ job: 'ExtensionJob',
37
+ config: OrderedDict
38
+ ):
39
+ super().__init__(process_id, job, config)
40
+ # this is the setup process, do not do process intensive stuff here, just variable setup and
41
+ # checking requirements. This is called before the run() function
42
+ # no loading models or anything like that, it is just for setting up the process
43
+ # all of your process intensive stuff should be done in the run() function
44
+ # config will have everything from the process item in the config file
45
+
46
+ # convenience methods exist on BaseProcess to get config values
47
+ # if required is set to true and the value is not found it will throw an error
48
+ # you can pass a default value to get_conf() as well if it was not in the config file
49
+ # as well as a type to cast the value to
50
+ self.save_path = self.get_conf('save_path', required=True)
51
+ self.save_dtype = self.get_conf('save_dtype', default='float16', as_type=get_torch_dtype)
52
+ self.device = self.get_conf('device', default='cpu', as_type=torch.device)
53
+
54
+ # build models to merge list
55
+ models_to_merge = self.get_conf('models_to_merge', required=True, as_type=list)
56
+ # build list of ModelInputConfig objects. I find it is a good idea to make a class for each config
57
+ # this way you can add methods to it and it is easier to read and code. There are a lot of
58
+ # inbuilt config classes located in toolkit.config_modules as well
59
+ self.models_to_merge = [ModelInputConfig(**model) for model in models_to_merge]
60
+ # setup is complete. Don't load anything else here, just setup variables and stuff
61
+
62
+ # this is the entire run process. Be sure to call super().run() first
63
+ def run(self):
64
+ # always call first
65
+ super().run()
66
+ print(f"Running process: {self.__class__.__name__}")
67
+
68
+ # let's adjust our weights first to normalize them so the total is 1.0
69
+ total_weight = sum([model.weight for model in self.models_to_merge])
70
+ weight_adjust = 1.0 / total_weight
71
+ for model in self.models_to_merge:
72
+ model.weight *= weight_adjust
73
+
74
+ output_model: StableDiffusion = None
75
+ # let's do the merge, it is a good idea to use tqdm to show progress
76
+ for model_config in tqdm(self.models_to_merge, desc="Merging models"):
77
+ # setup model class with our helper class
78
+ sd_model = StableDiffusion(
79
+ device=self.device,
80
+ model_config=model_config,
81
+ dtype="float32"
82
+ )
83
+ # load the model
84
+ sd_model.load_model()
85
+
86
+ # adjust the weight of the text encoder
87
+ if isinstance(sd_model.text_encoder, list):
88
+ # sdxl model
89
+ for text_encoder in sd_model.text_encoder:
90
+ for key, value in text_encoder.state_dict().items():
91
+ value *= model_config.weight
92
+ else:
93
+ # normal model
94
+ for key, value in sd_model.text_encoder.state_dict().items():
95
+ value *= model_config.weight
96
+ # adjust the weights of the unet
97
+ for key, value in sd_model.unet.state_dict().items():
98
+ value *= model_config.weight
99
+
100
+ if output_model is None:
101
+ # use this one as the base
102
+ output_model = sd_model
103
+ else:
104
+ # merge the models
105
+ # text encoder
106
+ if isinstance(output_model.text_encoder, list):
107
+ # sdxl model
108
+ for i, text_encoder in enumerate(output_model.text_encoder):
109
+ for key, value in text_encoder.state_dict().items():
110
+ value += sd_model.text_encoder[i].state_dict()[key]
111
+ else:
112
+ # normal model
113
+ for key, value in output_model.text_encoder.state_dict().items():
114
+ value += sd_model.text_encoder.state_dict()[key]
115
+ # unet
116
+ for key, value in output_model.unet.state_dict().items():
117
+ value += sd_model.unet.state_dict()[key]
118
+
119
+ # remove the model to free memory
120
+ del sd_model
121
+ flush()
122
+
123
+ # merge loop is done, let's save the model
124
+ print(f"Saving merged model to {self.save_path}")
125
+ output_model.save(self.save_path, meta=self.meta, save_dtype=self.save_dtype)
126
+ print(f"Saved merged model to {self.save_path}")
127
+ # do cleanup here
128
+ del output_model
129
+ flush()
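
Stripped of the StableDiffusion wrapper, the merge above is just a weighted average over matching state-dict tensors, with the weights normalized to sum to 1.0 first. A self-contained sketch of that core idea (plain tensor dicts, not the extension's API):

import torch

def merge_state_dicts(state_dicts, weights):
    # weighted average of tensors that share the same keys
    total = sum(weights)
    weights = [w / total for w in weights]  # normalize so the weights sum to 1.0
    merged = {}
    for key in state_dicts[0]:
        merged[key] = sum(w * sd[key].float() for sd, w in zip(state_dicts, weights))
    return merged

a = {"w": torch.ones(2, 2)}
b = {"w": torch.zeros(2, 2)}
print(merge_state_dicts([a, b], [1.0, 1.0])["w"])  # every element is 0.5

The extension does the same thing in place on the loaded models, which avoids holding a second copy of every tensor in memory.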
extensions/example/__init__.py ADDED
@@ -0,0 +1,25 @@
1
+ # This is an example extension for custom training. It is great for experimenting with new ideas.
2
+ from toolkit.extension import Extension
3
+
4
+
5
+ # We make a subclass of Extension
6
+ class ExampleMergeExtension(Extension):
7
+ # uid must be unique, it is how the extension is identified
8
+ uid = "example_merge_extension"
9
+
10
+ # name is the name of the extension for printing
11
+ name = "Example Merge Extension"
12
+
13
+ # This is where your process class is loaded
14
+ # keep your imports in here so they don't slow down the rest of the program
15
+ @classmethod
16
+ def get_process(cls):
17
+ # import your process class here so it is only loaded when needed and return it
18
+ from .ExampleMergeModels import ExampleMergeModels
19
+ return ExampleMergeModels
20
+
21
+
22
+ AI_TOOLKIT_EXTENSIONS = [
23
+ # you can put a list of extensions here
24
+ ExampleMergeExtension
25
+ ]
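
The file above is the whole registration contract: a module exposes AI_TOOLKIT_EXTENSIONS, and each entry supplies a unique uid plus a lazily imported process class. A rough sketch of how such a registry can be resolved against the type field of a process entry in a config (illustrative only; the real lookup lives in toolkit.extension):

def build_process_lookup(extension_modules):
    # map each extension uid to its lazily imported process class
    lookup = {}
    for module in extension_modules:
        for ext in getattr(module, "AI_TOOLKIT_EXTENSIONS", []):
            lookup[ext.uid] = ext.get_process()
    return lookup

# a process entry like {"type": "example_merge_extension", ...} would then resolve via
# process_cls = lookup[process_config["type"]]

Keeping the import inside get_process() is what lets every extension be enumerated cheaply without paying the import cost of processes that never run.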
extensions/example/config/config.example.yaml ADDED
@@ -0,0 +1,48 @@
1
+ ---
2
+ # Always include at least one example config file to show how to use your extension.
3
+ # use plenty of comments so users know how to use it and what everything does
4
+
5
+ # all extensions will use this job name
6
+ job: extension
7
+ config:
8
+ name: 'my_awesome_merge'
9
+ process:
10
+ # Put your example processes here. This will be passed
11
+ # to your extension process in the config argument.
12
+ # the type MUST match your extension uid
13
+ - type: "example_merge_extension"
14
+ # save path for the merged model
15
+ save_path: "output/merge/[name].safetensors"
16
+ # save type
17
+ dtype: fp16
18
+ # device to run it on
19
+ device: cuda:0
20
+ # input models can only be SD1.x and SD2.x models for this example (currently)
21
+ models_to_merge:
22
+ # weights are relative, total weights will be normalized
23
+ # for example. If you have 2 models with weight 1.0, they will
24
+ # both be weighted 0.5. If you have 1 model with weight 1.0 and
25
+ # another with weight 2.0, the first will be weighted 1/3 and the
26
+ # second will be weighted 2/3
27
+ - name_or_path: "input/model1.safetensors"
28
+ weight: 1.0
29
+ - name_or_path: "input/model2.safetensors"
30
+ weight: 1.0
31
+ - name_or_path: "input/model3.safetensors"
32
+ weight: 0.3
33
+ - name_or_path: "input/model4.safetensors"
34
+ weight: 1.0
35
+
36
+
37
+ # you can put any information you want here, and it will be saved in the model
38
+ # the below is an example. I recommend doing trigger words at a minimum
39
+ # in the metadata. The software will include this plus some other information
40
+ meta:
41
+ name: "[name]" # [name] gets replaced with the name above
42
+ description: A short description of your model
43
+ version: '0.1'
44
+ creator:
45
+ name: Your Name
46
47
+ website: https://yourwebsite.com
48
+ any: All metadata above is arbitrary; it can be whatever you want.
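
To make the weight normalization described above concrete: the four models are weighted 1.0, 1.0, 0.3 and 1.0, the total is 3.3, so the effective merge weights come out to roughly 0.303, 0.303, 0.091 and 0.303.

print([round(w / 3.3, 3) for w in (1.0, 1.0, 0.3, 1.0)])  # [0.303, 0.303, 0.091, 0.303]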
extensions_built_in/.DS_Store ADDED
Binary file (8.2 kB).
 
extensions_built_in/advanced_generator/Img2ImgGenerator.py ADDED
@@ -0,0 +1,256 @@
1
+ import math
2
+ import os
3
+ import random
4
+ from collections import OrderedDict
5
+ from typing import List
6
+
7
+ import numpy as np
8
+ from PIL import Image
9
+ from diffusers import T2IAdapter
10
+ from diffusers.utils.torch_utils import randn_tensor
11
+ from torch.utils.data import DataLoader
12
+ from diffusers import StableDiffusionXLImg2ImgPipeline, PixArtSigmaPipeline
13
+ from tqdm import tqdm
14
+
15
+ from toolkit.config_modules import ModelConfig, GenerateImageConfig, preprocess_dataset_raw_config, DatasetConfig
16
+ from toolkit.data_transfer_object.data_loader import FileItemDTO, DataLoaderBatchDTO
17
+ from toolkit.sampler import get_sampler
18
+ from toolkit.stable_diffusion_model import StableDiffusion
19
+ import gc
20
+ import torch
21
+ from jobs.process import BaseExtensionProcess
22
+ from toolkit.data_loader import get_dataloader_from_datasets
23
+ from toolkit.train_tools import get_torch_dtype
24
+ from controlnet_aux.midas import MidasDetector
25
+ from diffusers.utils import load_image
26
+ from torchvision.transforms import ToTensor
27
+
28
+
29
+ def flush():
30
+ torch.cuda.empty_cache()
31
+ gc.collect()
32
+
33
+
34
+
35
+
36
+
37
+ class GenerateConfig:
38
+
39
+ def __init__(self, **kwargs):
40
+ self.prompts: List[str]
41
+ self.sampler = kwargs.get('sampler', 'ddpm')
42
+ self.neg = kwargs.get('neg', '')
43
+ self.seed = kwargs.get('seed', -1)
44
+ self.walk_seed = kwargs.get('walk_seed', False)
45
+ self.guidance_scale = kwargs.get('guidance_scale', 7)
46
+ self.sample_steps = kwargs.get('sample_steps', 20)
47
+ self.guidance_rescale = kwargs.get('guidance_rescale', 0.0)
48
+ self.ext = kwargs.get('ext', 'png')
49
+ self.denoise_strength = kwargs.get('denoise_strength', 0.5)
50
+ self.trigger_word = kwargs.get('trigger_word', None)
51
+
52
+
53
+ class Img2ImgGenerator(BaseExtensionProcess):
54
+
55
+ def __init__(self, process_id: int, job, config: OrderedDict):
56
+ super().__init__(process_id, job, config)
57
+ self.output_folder = self.get_conf('output_folder', required=True)
58
+ self.copy_inputs_to = self.get_conf('copy_inputs_to', None)
59
+ self.device = self.get_conf('device', 'cuda')
60
+ self.model_config = ModelConfig(**self.get_conf('model', required=True))
61
+ self.generate_config = GenerateConfig(**self.get_conf('generate', required=True))
62
+ self.is_latents_cached = True
63
+ raw_datasets = self.get_conf('datasets', None)
64
+ if raw_datasets is not None and len(raw_datasets) > 0:
65
+ raw_datasets = preprocess_dataset_raw_config(raw_datasets)
66
+ self.datasets = None
67
+ self.datasets_reg = None
68
+ self.dtype = self.get_conf('dtype', 'float16')
69
+ self.torch_dtype = get_torch_dtype(self.dtype)
70
+ self.params = []
71
+ if raw_datasets is not None and len(raw_datasets) > 0:
72
+ for raw_dataset in raw_datasets:
73
+ dataset = DatasetConfig(**raw_dataset)
74
+ is_caching = dataset.cache_latents or dataset.cache_latents_to_disk
75
+ if not is_caching:
76
+ self.is_latents_cached = False
77
+ if dataset.is_reg:
78
+ if self.datasets_reg is None:
79
+ self.datasets_reg = []
80
+ self.datasets_reg.append(dataset)
81
+ else:
82
+ if self.datasets is None:
83
+ self.datasets = []
84
+ self.datasets.append(dataset)
85
+
86
+ self.progress_bar = None
87
+ self.sd = StableDiffusion(
88
+ device=self.device,
89
+ model_config=self.model_config,
90
+ dtype=self.dtype,
91
+ )
92
+ print(f"Using device {self.device}")
93
+ self.data_loader: DataLoader = None
94
+ self.adapter: T2IAdapter = None
95
+
96
+ def to_pil(self, img):
97
+ # image comes in -1 to 1. convert to a PIL RGB image
98
+ img = (img + 1) / 2
99
+ img = img.clamp(0, 1)
100
+ img = img[0].permute(1, 2, 0).cpu().numpy()
101
+ img = (img * 255).astype(np.uint8)
102
+ image = Image.fromarray(img)
103
+ return image
104
+
105
+ def run(self):
106
+ with torch.no_grad():
107
+ super().run()
108
+ print("Loading model...")
109
+ self.sd.load_model()
110
+ device = torch.device(self.device)
111
+
112
+ if self.model_config.is_xl:
113
+ pipe = StableDiffusionXLImg2ImgPipeline(
114
+ vae=self.sd.vae,
115
+ unet=self.sd.unet,
116
+ text_encoder=self.sd.text_encoder[0],
117
+ text_encoder_2=self.sd.text_encoder[1],
118
+ tokenizer=self.sd.tokenizer[0],
119
+ tokenizer_2=self.sd.tokenizer[1],
120
+ scheduler=get_sampler(self.generate_config.sampler),
121
+ ).to(device, dtype=self.torch_dtype)
122
+ elif self.model_config.is_pixart:
123
+ pipe = self.sd.pipeline.to(device, dtype=self.torch_dtype)
124
+ else:
125
+ raise NotImplementedError("Only SDXL and PixArt models are supported")
126
+ pipe.set_progress_bar_config(disable=True)
127
+
128
+ # pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
129
+ # midas_depth = torch.compile(midas_depth, mode="reduce-overhead", fullgraph=True)
130
+
131
+ self.data_loader = get_dataloader_from_datasets(self.datasets, 1, self.sd)
132
+
133
+ num_batches = len(self.data_loader)
134
+ pbar = tqdm(total=num_batches, desc="Generating images")
135
+ seed = self.generate_config.seed
136
+ # load images from datasets, use tqdm
137
+ for i, batch in enumerate(self.data_loader):
138
+ batch: DataLoaderBatchDTO = batch
139
+
140
+ gen_seed = seed if seed > 0 else random.randint(0, 2 ** 32 - 1)
141
+ generator = torch.manual_seed(gen_seed)
142
+
143
+ file_item: FileItemDTO = batch.file_items[0]
144
+ img_path = file_item.path
145
+ img_filename = os.path.basename(img_path)
146
+ img_filename_no_ext = os.path.splitext(img_filename)[0]
147
+ img_filename = img_filename_no_ext + '.' + self.generate_config.ext
148
+ output_path = os.path.join(self.output_folder, img_filename)
149
+ output_caption_path = os.path.join(self.output_folder, img_filename_no_ext + '.txt')
150
+
151
+ if self.copy_inputs_to is not None:
152
+ output_inputs_path = os.path.join(self.copy_inputs_to, img_filename)
153
+ output_inputs_caption_path = os.path.join(self.copy_inputs_to, img_filename_no_ext + '.txt')
154
+ else:
155
+ output_inputs_path = None
156
+ output_inputs_caption_path = None
157
+
158
+ caption = batch.get_caption_list()[0]
159
+ if self.generate_config.trigger_word is not None:
160
+ caption = caption.replace('[trigger]', self.generate_config.trigger_word)
161
+
162
+ img: torch.Tensor = batch.tensor.clone()
163
+ image = self.to_pil(img)
164
+
165
+ # image.save(output_depth_path)
166
+ if self.model_config.is_pixart:
167
+ pipe: PixArtSigmaPipeline = pipe
168
+
169
+ # Encode the full image once
170
+ encoded_image = pipe.vae.encode(
171
+ pipe.image_processor.preprocess(image).to(device=pipe.device, dtype=pipe.dtype))
172
+ if hasattr(encoded_image, "latent_dist"):
173
+ latents = encoded_image.latent_dist.sample(generator)
174
+ elif hasattr(encoded_image, "latents"):
175
+ latents = encoded_image.latents
176
+ else:
177
+ raise AttributeError("Could not access latents of provided encoder_output")
178
+ latents = pipe.vae.config.scaling_factor * latents
179
+
180
+ # latents = self.sd.encode_images(img)
181
+
182
+ # self.sd.noise_scheduler.set_timesteps(self.generate_config.sample_steps)
183
+ # start_step = math.floor(self.generate_config.sample_steps * self.generate_config.denoise_strength)
184
+ # timestep = self.sd.noise_scheduler.timesteps[start_step].unsqueeze(0)
185
+ # timestep = timestep.to(device, dtype=torch.int32)
186
+ # latent = latent.to(device, dtype=self.torch_dtype)
187
+ # noise = torch.randn_like(latent, device=device, dtype=self.torch_dtype)
188
+ # latent = self.sd.add_noise(latent, noise, timestep)
189
+ # timesteps_to_use = self.sd.noise_scheduler.timesteps[start_step + 1:]
190
+ batch_size = 1
191
+ num_images_per_prompt = 1
192
+
193
+ shape = (batch_size, pipe.transformer.config.in_channels, image.height // pipe.vae_scale_factor,
194
+ image.width // pipe.vae_scale_factor)
195
+ noise = randn_tensor(shape, generator=generator, device=pipe.device, dtype=pipe.dtype)
196
+
197
+ # noise = torch.randn_like(latents, device=device, dtype=self.torch_dtype)
198
+ num_inference_steps = self.generate_config.sample_steps
199
+ strength = self.generate_config.denoise_strength
200
+ # Get timesteps
201
+ init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
202
+ t_start = max(num_inference_steps - init_timestep, 0)
203
+ pipe.scheduler.set_timesteps(num_inference_steps, device="cpu")
204
+ timesteps = pipe.scheduler.timesteps[t_start:]
205
+ timestep = timesteps[:1].repeat(batch_size * num_images_per_prompt)
206
+ latents = pipe.scheduler.add_noise(latents, noise, timestep)
207
+
208
+ gen_images = pipe.__call__(
209
+ prompt=caption,
210
+ negative_prompt=self.generate_config.neg,
211
+ latents=latents,
212
+ timesteps=timesteps,
213
+ width=image.width,
214
+ height=image.height,
215
+ num_inference_steps=num_inference_steps,
216
+ num_images_per_prompt=num_images_per_prompt,
217
+ guidance_scale=self.generate_config.guidance_scale,
218
+ # strength=self.generate_config.denoise_strength,
219
+ use_resolution_binning=False,
220
+ output_type="np"
221
+ ).images[0]
222
+ gen_images = (gen_images * 255).clip(0, 255).astype(np.uint8)
223
+ gen_images = Image.fromarray(gen_images)
224
+ else:
225
+ pipe: StableDiffusionXLImg2ImgPipeline = pipe
226
+
227
+ gen_images = pipe.__call__(
228
+ prompt=caption,
229
+ negative_prompt=self.generate_config.neg,
230
+ image=image,
231
+ num_inference_steps=self.generate_config.sample_steps,
232
+ guidance_scale=self.generate_config.guidance_scale,
233
+ strength=self.generate_config.denoise_strength,
234
+ ).images[0]
235
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
236
+ gen_images.save(output_path)
237
+
238
+ # save caption
239
+ with open(output_caption_path, 'w') as f:
240
+ f.write(caption)
241
+
242
+ if output_inputs_path is not None:
243
+ os.makedirs(os.path.dirname(output_inputs_path), exist_ok=True)
244
+ image.save(output_inputs_path)
245
+ with open(output_inputs_caption_path, 'w') as f:
246
+ f.write(caption)
247
+
248
+ pbar.update(1)
249
+ batch.cleanup()
250
+
251
+ pbar.close()
252
+ print("Done generating images")
253
+ # cleanup
254
+ del self.sd
255
+ gc.collect()
256
+ torch.cuda.empty_cache()
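
The strength handling in the PixArt branch follows the usual img2img recipe: with the default sample_steps of 20 and denoise_strength of 0.5, init_timestep = min(int(20 * 0.5), 20) = 10 and t_start = 20 - 10 = 10, so only the last 10 scheduler timesteps are run and the latents are noised to the level of the first of them. A tiny standalone check of that arithmetic:

def img2img_timestep_window(num_inference_steps: int, strength: float):
    # how many denoising steps actually run for a given strength
    init_timestep = min(int(num_inference_steps * strength), num_inference_steps)
    t_start = max(num_inference_steps - init_timestep, 0)
    return t_start, num_inference_steps - t_start

print(img2img_timestep_window(20, 0.5))   # (10, 10)
print(img2img_timestep_window(20, 1.0))   # (0, 20)  full denoise, input image mostly ignored
print(img2img_timestep_window(20, 0.25))  # (15, 5)  light touch-up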
extensions_built_in/advanced_generator/PureLoraGenerator.py ADDED
@@ -0,0 +1,102 @@
1
+ import os
2
+ from collections import OrderedDict
3
+
4
+ from toolkit.config_modules import ModelConfig, GenerateImageConfig, SampleConfig, LoRMConfig
5
+ from toolkit.lorm import ExtractMode, convert_diffusers_unet_to_lorm
6
+ from toolkit.sd_device_states_presets import get_train_sd_device_state_preset
7
+ from toolkit.stable_diffusion_model import StableDiffusion
8
+ import gc
9
+ import torch
10
+ from jobs.process import BaseExtensionProcess
11
+ from toolkit.train_tools import get_torch_dtype
12
+
13
+
14
+ def flush():
15
+ torch.cuda.empty_cache()
16
+ gc.collect()
17
+
18
+
19
+ class PureLoraGenerator(BaseExtensionProcess):
20
+
21
+ def __init__(self, process_id: int, job, config: OrderedDict):
22
+ super().__init__(process_id, job, config)
23
+ self.output_folder = self.get_conf('output_folder', required=True)
24
+ self.device = self.get_conf('device', 'cuda')
25
+ self.device_torch = torch.device(self.device)
26
+ self.model_config = ModelConfig(**self.get_conf('model', required=True))
27
+ self.generate_config = SampleConfig(**self.get_conf('sample', required=True))
28
+ self.dtype = self.get_conf('dtype', 'float16')
29
+ self.torch_dtype = get_torch_dtype(self.dtype)
30
+ lorm_config = self.get_conf('lorm', None)
31
+ self.lorm_config = LoRMConfig(**lorm_config) if lorm_config is not None else None
32
+
33
+ self.device_state_preset = get_train_sd_device_state_preset(
34
+ device=torch.device(self.device),
35
+ )
36
+
37
+ self.progress_bar = None
38
+ self.sd = StableDiffusion(
39
+ device=self.device,
40
+ model_config=self.model_config,
41
+ dtype=self.dtype,
42
+ )
43
+
44
+ def run(self):
45
+ super().run()
46
+ print("Loading model...")
47
+ with torch.no_grad():
48
+ self.sd.load_model()
49
+ self.sd.unet.eval()
50
+ self.sd.unet.to(self.device_torch)
51
+ if isinstance(self.sd.text_encoder, list):
52
+ for te in self.sd.text_encoder:
53
+ te.eval()
54
+ te.to(self.device_torch)
55
+ else:
56
+ self.sd.text_encoder.eval()
57
+ self.sd.to(self.device_torch)
58
+
59
+ print(f"Converting to LoRM UNet")
60
+ # replace the unet with LoRMUnet
61
+ convert_diffusers_unet_to_lorm(
62
+ self.sd.unet,
63
+ config=self.lorm_config,
64
+ )
65
+
66
+ sample_folder = os.path.join(self.output_folder)
67
+ gen_img_config_list = []
68
+
69
+ sample_config = self.generate_config
70
+ start_seed = sample_config.seed
71
+ current_seed = start_seed
72
+ for i in range(len(sample_config.prompts)):
73
+ if sample_config.walk_seed:
74
+ current_seed = start_seed + i
75
+
76
+ filename = f"[time]_[count].{self.generate_config.ext}"
77
+ output_path = os.path.join(sample_folder, filename)
78
+ prompt = sample_config.prompts[i]
79
+ extra_args = {}
80
+ gen_img_config_list.append(GenerateImageConfig(
81
+ prompt=prompt, # it will autoparse the prompt
82
+ width=sample_config.width,
83
+ height=sample_config.height,
84
+ negative_prompt=sample_config.neg,
85
+ seed=current_seed,
86
+ guidance_scale=sample_config.guidance_scale,
87
+ guidance_rescale=sample_config.guidance_rescale,
88
+ num_inference_steps=sample_config.sample_steps,
89
+ network_multiplier=sample_config.network_multiplier,
90
+ output_path=output_path,
91
+ output_ext=sample_config.ext,
92
+ adapter_conditioning_scale=sample_config.adapter_conditioning_scale,
93
+ **extra_args
94
+ ))
95
+
96
+ # send to be generated
97
+ self.sd.generate_images(gen_img_config_list, sampler=sample_config.sampler)
98
+ print("Done generating images")
99
+ # cleanup
100
+ del self.sd
101
+ gc.collect()
102
+ torch.cuda.empty_cache()
extensions_built_in/advanced_generator/ReferenceGenerator.py ADDED
@@ -0,0 +1,212 @@
1
+ import os
2
+ import random
3
+ from collections import OrderedDict
4
+ from typing import List
5
+
6
+ import numpy as np
7
+ from PIL import Image
8
+ from diffusers import T2IAdapter
9
+ from torch.utils.data import DataLoader
10
+ from diffusers import StableDiffusionXLAdapterPipeline, StableDiffusionAdapterPipeline
11
+ from tqdm import tqdm
12
+
13
+ from toolkit.config_modules import ModelConfig, GenerateImageConfig, preprocess_dataset_raw_config, DatasetConfig
14
+ from toolkit.data_transfer_object.data_loader import FileItemDTO, DataLoaderBatchDTO
15
+ from toolkit.sampler import get_sampler
16
+ from toolkit.stable_diffusion_model import StableDiffusion
17
+ import gc
18
+ import torch
19
+ from jobs.process import BaseExtensionProcess
20
+ from toolkit.data_loader import get_dataloader_from_datasets
21
+ from toolkit.train_tools import get_torch_dtype
22
+ from controlnet_aux.midas import MidasDetector
23
+ from diffusers.utils import load_image
24
+
25
+
26
+ def flush():
27
+ torch.cuda.empty_cache()
28
+ gc.collect()
29
+
30
+
31
+ class GenerateConfig:
32
+
33
+ def __init__(self, **kwargs):
34
+ self.prompts: List[str]
35
+ self.sampler = kwargs.get('sampler', 'ddpm')
36
+ self.neg = kwargs.get('neg', '')
37
+ self.seed = kwargs.get('seed', -1)
38
+ self.walk_seed = kwargs.get('walk_seed', False)
39
+ self.t2i_adapter_path = kwargs.get('t2i_adapter_path', None)
40
+ self.guidance_scale = kwargs.get('guidance_scale', 7)
41
+ self.sample_steps = kwargs.get('sample_steps', 20)
42
+ self.prompt_2 = kwargs.get('prompt_2', None)
43
+ self.neg_2 = kwargs.get('neg_2', None)
44
+ self.prompts = kwargs.get('prompts', None)
45
+ self.guidance_rescale = kwargs.get('guidance_rescale', 0.0)
46
+ self.ext = kwargs.get('ext', 'png')
47
+ self.adapter_conditioning_scale = kwargs.get('adapter_conditioning_scale', 1.0)
48
+ if kwargs.get('shuffle', False):
49
+ # shuffle the prompts
50
+ random.shuffle(self.prompts)
51
+
52
+
53
+ class ReferenceGenerator(BaseExtensionProcess):
54
+
55
+ def __init__(self, process_id: int, job, config: OrderedDict):
56
+ super().__init__(process_id, job, config)
57
+ self.output_folder = self.get_conf('output_folder', required=True)
58
+ self.device = self.get_conf('device', 'cuda')
59
+ self.model_config = ModelConfig(**self.get_conf('model', required=True))
60
+ self.generate_config = GenerateConfig(**self.get_conf('generate', required=True))
61
+ self.is_latents_cached = True
62
+ raw_datasets = self.get_conf('datasets', None)
63
+ if raw_datasets is not None and len(raw_datasets) > 0:
64
+ raw_datasets = preprocess_dataset_raw_config(raw_datasets)
65
+ self.datasets = None
66
+ self.datasets_reg = None
67
+ self.dtype = self.get_conf('dtype', 'float16')
68
+ self.torch_dtype = get_torch_dtype(self.dtype)
69
+ self.params = []
70
+ if raw_datasets is not None and len(raw_datasets) > 0:
71
+ for raw_dataset in raw_datasets:
72
+ dataset = DatasetConfig(**raw_dataset)
73
+ is_caching = dataset.cache_latents or dataset.cache_latents_to_disk
74
+ if not is_caching:
75
+ self.is_latents_cached = False
76
+ if dataset.is_reg:
77
+ if self.datasets_reg is None:
78
+ self.datasets_reg = []
79
+ self.datasets_reg.append(dataset)
80
+ else:
81
+ if self.datasets is None:
82
+ self.datasets = []
83
+ self.datasets.append(dataset)
84
+
85
+ self.progress_bar = None
86
+ self.sd = StableDiffusion(
87
+ device=self.device,
88
+ model_config=self.model_config,
89
+ dtype=self.dtype,
90
+ )
91
+ print(f"Using device {self.device}")
92
+ self.data_loader: DataLoader = None
93
+ self.adapter: T2IAdapter = None
94
+
95
+ def run(self):
96
+ super().run()
97
+ print("Loading model...")
98
+ self.sd.load_model()
99
+ device = torch.device(self.device)
100
+
101
+ if self.generate_config.t2i_adapter_path is not None:
102
+ self.adapter = T2IAdapter.from_pretrained(
103
+ self.generate_config.t2i_adapter_path,
104
+ torch_dtype=self.torch_dtype,
105
+ variant="fp16"
106
+ ).to(device)
107
+
108
+ midas_depth = MidasDetector.from_pretrained(
109
+ "valhalla/t2iadapter-aux-models", filename="dpt_large_384.pt", model_type="dpt_large"
110
+ ).to(device)
111
+
112
+ if self.model_config.is_xl:
113
+ pipe = StableDiffusionXLAdapterPipeline(
114
+ vae=self.sd.vae,
115
+ unet=self.sd.unet,
116
+ text_encoder=self.sd.text_encoder[0],
117
+ text_encoder_2=self.sd.text_encoder[1],
118
+ tokenizer=self.sd.tokenizer[0],
119
+ tokenizer_2=self.sd.tokenizer[1],
120
+ scheduler=get_sampler(self.generate_config.sampler),
121
+ adapter=self.adapter,
122
+ ).to(device, dtype=self.torch_dtype)
123
+ else:
124
+ pipe = StableDiffusionAdapterPipeline(
125
+ vae=self.sd.vae,
126
+ unet=self.sd.unet,
127
+ text_encoder=self.sd.text_encoder,
128
+ tokenizer=self.sd.tokenizer,
129
+ scheduler=get_sampler(self.generate_config.sampler),
130
+ safety_checker=None,
131
+ feature_extractor=None,
132
+ requires_safety_checker=False,
133
+ adapter=self.adapter,
134
+ ).to(device, dtype=self.torch_dtype)
135
+ pipe.set_progress_bar_config(disable=True)
136
+
137
+ pipe.unet = torch.compile(pipe.unet, mode="reduce-overhead", fullgraph=True)
138
+ # midas_depth = torch.compile(midas_depth, mode="reduce-overhead", fullgraph=True)
139
+
140
+ self.data_loader = get_dataloader_from_datasets(self.datasets, 1, self.sd)
141
+
142
+ num_batches = len(self.data_loader)
143
+ pbar = tqdm(total=num_batches, desc="Generating images")
144
+ seed = self.generate_config.seed
145
+ # load images from datasets, use tqdm
146
+ for i, batch in enumerate(self.data_loader):
147
+ batch: DataLoaderBatchDTO = batch
148
+
149
+ file_item: FileItemDTO = batch.file_items[0]
150
+ img_path = file_item.path
151
+ img_filename = os.path.basename(img_path)
152
+ img_filename_no_ext = os.path.splitext(img_filename)[0]
153
+ output_path = os.path.join(self.output_folder, img_filename)
154
+ output_caption_path = os.path.join(self.output_folder, img_filename_no_ext + '.txt')
155
+ output_depth_path = os.path.join(self.output_folder, img_filename_no_ext + '.depth.png')
156
+
157
+ caption = batch.get_caption_list()[0]
158
+
159
+ img: torch.Tensor = batch.tensor.clone()
160
+ # image comes in -1 to 1. convert to a PIL RGB image
161
+ img = (img + 1) / 2
162
+ img = img.clamp(0, 1)
163
+ img = img[0].permute(1, 2, 0).cpu().numpy()
164
+ img = (img * 255).astype(np.uint8)
165
+ image = Image.fromarray(img)
166
+
167
+ width, height = image.size
168
+ min_res = min(width, height)
169
+
170
+ if self.generate_config.walk_seed:
171
+ seed = seed + 1
172
+
173
+ if self.generate_config.seed == -1:
174
+ # random
175
+ seed = random.randint(0, 1000000)
176
+
177
+ torch.manual_seed(seed)
178
+ torch.cuda.manual_seed(seed)
179
+
180
+ # generate depth map
181
+ image = midas_depth(
182
+ image,
183
+ detect_resolution=min_res, # do 512 ?
184
+ image_resolution=min_res
185
+ )
186
+
187
+ # image.save(output_depth_path)
188
+
189
+ gen_images = pipe(
190
+ prompt=caption,
191
+ negative_prompt=self.generate_config.neg,
192
+ image=image,
193
+ num_inference_steps=self.generate_config.sample_steps,
194
+ adapter_conditioning_scale=self.generate_config.adapter_conditioning_scale,
195
+ guidance_scale=self.generate_config.guidance_scale,
196
+ ).images[0]
197
+ os.makedirs(os.path.dirname(output_path), exist_ok=True)
198
+ gen_images.save(output_path)
199
+
200
+ # save caption
201
+ with open(output_caption_path, 'w') as f:
202
+ f.write(caption)
203
+
204
+ pbar.update(1)
205
+ batch.cleanup()
206
+
207
+ pbar.close()
208
+ print("Done generating images")
209
+ # cleanup
210
+ del self.sd
211
+ gc.collect()
212
+ torch.cuda.empty_cache()
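
Both generators convert dataset tensors back to PIL images the same way: the batch tensor is in [-1, 1], so it is rescaled to [0, 1], clamped, permuted to height-width-channel order, and quantized to uint8. A standalone version of that conversion, exercised on a random tensor rather than a real batch:

import numpy as np
import torch
from PIL import Image

def tensor_to_pil(img: torch.Tensor) -> Image.Image:
    # img: (1, 3, H, W) in [-1, 1]
    img = ((img + 1) / 2).clamp(0, 1)
    arr = (img[0].permute(1, 2, 0).cpu().numpy() * 255).astype(np.uint8)
    return Image.fromarray(arr)

pil = tensor_to_pil(torch.rand(1, 3, 64, 64) * 2 - 1)
print(pil.size, pil.mode)  # (64, 64) RGB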
extensions_built_in/advanced_generator/__init__.py ADDED
@@ -0,0 +1,59 @@
1
+ # This is an example extension for custom training. It is great for experimenting with new ideas.
2
+ from toolkit.extension import Extension
3
+
4
+
5
+ # This is a generation extension (not a trainer)
6
+ class AdvancedReferenceGeneratorExtension(Extension):
7
+ # uid must be unique, it is how the extension is identified
8
+ uid = "reference_generator"
9
+
10
+ # name is the name of the extension for printing
11
+ name = "Reference Generator"
12
+
13
+ # This is where your process class is loaded
14
+ # keep your imports in here so they don't slow down the rest of the program
15
+ @classmethod
16
+ def get_process(cls):
17
+ # import your process class here so it is only loaded when needed and return it
18
+ from .ReferenceGenerator import ReferenceGenerator
19
+ return ReferenceGenerator
20
+
21
+
22
+ # This is a generation extension (not a trainer)
23
+ class PureLoraGenerator(Extension):
24
+ # uid must be unique, it is how the extension is identified
25
+ uid = "pure_lora_generator"
26
+
27
+ # name is the name of the extension for printing
28
+ name = "Pure LoRA Generator"
29
+
30
+ # This is where your process class is loaded
31
+ # keep your imports in here so they don't slow down the rest of the program
32
+ @classmethod
33
+ def get_process(cls):
34
+ # import your process class here so it is only loaded when needed and return it
35
+ from .PureLoraGenerator import PureLoraGenerator
36
+ return PureLoraGenerator
37
+
38
+
39
+ # This is a generation extension (not a trainer)
40
+ class Img2ImgGeneratorExtension(Extension):
41
+ # uid must be unique, it is how the extension is identified
42
+ uid = "batch_img2img"
43
+
44
+ # name is the name of the extension for printing
45
+ name = "Img2ImgGeneratorExtension"
46
+
47
+ # This is where your process class is loaded
48
+ # keep your imports in here so they don't slow down the rest of the program
49
+ @classmethod
50
+ def get_process(cls):
51
+ # import your process class here so it is only loaded when needed and return it
52
+ from .Img2ImgGenerator import Img2ImgGenerator
53
+ return Img2ImgGenerator
54
+
55
+
56
+ AI_TOOLKIT_EXTENSIONS = [
57
+ # you can put a list of extensions here
58
+ AdvancedReferenceGeneratorExtension, PureLoraGenerator, Img2ImgGeneratorExtension
59
+ ]
extensions_built_in/advanced_generator/config/train.example.yaml ADDED
@@ -0,0 +1,91 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ name: test_v1
5
+ process:
6
+ - type: 'textual_inversion_trainer'
7
+ training_folder: "out/TI"
8
+ device: cuda:0
9
+ # for tensorboard logging
10
+ log_dir: "out/.tensorboard"
11
+ embedding:
12
+ trigger: "your_trigger_here"
13
+ tokens: 12
14
+ init_words: "man with short brown hair"
15
+ save_format: "safetensors" # 'safetensors' or 'pt'
16
+ save:
17
+ dtype: float16 # precision to save
18
+ save_every: 100 # save every this many steps
19
+ max_step_saves_to_keep: 5 # only affects step counts
20
+ datasets:
21
+ - folder_path: "/path/to/dataset"
22
+ caption_ext: "txt"
23
+ default_caption: "[trigger]"
24
+ buckets: true
25
+ resolution: 512
26
+ train:
27
+ noise_scheduler: "ddpm" # or "ddpm", "lms", "euler_a"
28
+ steps: 3000
29
+ weight_jitter: 0.0
30
+ lr: 5e-5
31
+ train_unet: false
32
+ gradient_checkpointing: true
33
+ train_text_encoder: false
34
+ optimizer: "adamw"
35
+ # optimizer: "prodigy"
36
+ optimizer_params:
37
+ weight_decay: 1e-2
38
+ lr_scheduler: "constant"
39
+ max_denoising_steps: 1000
40
+ batch_size: 4
41
+ dtype: bf16
42
+ xformers: true
43
+ min_snr_gamma: 5.0
44
+ # skip_first_sample: true
45
+ noise_offset: 0.0 # not needed for this
46
+ model:
47
+ # objective reality v2
48
+ name_or_path: "https://civitai.com/models/128453?modelVersionId=142465"
49
+ is_v2: false # for v2 models
50
+ is_xl: false # for SDXL models
51
+ is_v_pred: false # for v-prediction models (most v2 models)
52
+ sample:
53
+ sampler: "ddpm" # must match train.noise_scheduler
54
+ sample_every: 100 # sample every this many steps
55
+ width: 512
56
+ height: 512
57
+ prompts:
58
+ - "photo of [trigger] laughing"
59
+ - "photo of [trigger] smiling"
60
+ - "[trigger] close up"
61
+ - "dark scene [trigger] frozen"
62
+ - "[trigger] nighttime"
63
+ - "a painting of [trigger]"
64
+ - "a drawing of [trigger]"
65
+ - "a cartoon of [trigger]"
66
+ - "[trigger] pixar style"
67
+ - "[trigger] costume"
68
+ neg: ""
69
+ seed: 42
70
+ walk_seed: false
71
+ guidance_scale: 7
72
+ sample_steps: 20
73
+ network_multiplier: 1.0
74
+
75
+ logging:
76
+ log_every: 10 # log every this many steps
77
+ use_wandb: false # not supported yet
78
+ verbose: false
79
+
80
+ # You can put any information you want here, and it will be saved in the model.
81
+ # The below is an example, but you can put your grocery list in it if you want.
82
+ # It is saved in the model so be aware of that. The software will include this
83
+ # plus some other information for you automatically
84
+ meta:
85
+ # [name] gets replaced with the name above
86
+ name: "[name]"
87
+ # version: '1.0'
88
+ # creator:
89
+ # name: Your Name
90
+ # email: [email protected]
91
+ # website: https://your.website
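
Several of these example configs lean on the [trigger] placeholder: default_caption and the sample prompts use it, and it is swapped for the configured trigger word at run time (the Img2ImgGenerator above does the same for dataset captions). The substitution itself is just a string replace; the helper below is mine, shown only to make the behavior explicit.

def apply_trigger(text: str, trigger: str) -> str:
    return text.replace("[trigger]", trigger)

print(apply_trigger("photo of [trigger] laughing", "your_trigger_here"))
# photo of your_trigger_here laughing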
extensions_built_in/concept_replacer/ConceptReplacer.py ADDED
@@ -0,0 +1,151 @@
1
+ import random
2
+ from collections import OrderedDict
3
+ from torch.utils.data import DataLoader
4
+ from toolkit.prompt_utils import concat_prompt_embeds, split_prompt_embeds
5
+ from toolkit.stable_diffusion_model import StableDiffusion, BlankNetwork
6
+ from toolkit.train_tools import get_torch_dtype, apply_snr_weight
7
+ import gc
8
+ import torch
9
+ from jobs.process import BaseSDTrainProcess
10
+
11
+
12
+ def flush():
13
+ torch.cuda.empty_cache()
14
+ gc.collect()
15
+
16
+
17
+ class ConceptReplacementConfig:
18
+ def __init__(self, **kwargs):
19
+ self.concept: str = kwargs.get('concept', '')
20
+ self.replacement: str = kwargs.get('replacement', '')
21
+
22
+
23
+ class ConceptReplacer(BaseSDTrainProcess):
24
+
25
+ def __init__(self, process_id: int, job, config: OrderedDict, **kwargs):
26
+ super().__init__(process_id, job, config, **kwargs)
27
+ replacement_list = self.config.get('replacements', [])
28
+ self.replacement_list = [ConceptReplacementConfig(**x) for x in replacement_list]
29
+
30
+ def before_model_load(self):
31
+ pass
32
+
33
+ def hook_before_train_loop(self):
34
+ self.sd.vae.eval()
35
+ self.sd.vae.to(self.device_torch)
36
+
37
+ # textual inversion
38
+ if self.embedding is not None:
39
+ # set text encoder to train. Not sure if this is necessary but diffusers example did it
40
+ self.sd.text_encoder.train()
41
+
42
+ def hook_train_loop(self, batch):
43
+ with torch.no_grad():
44
+ dtype = get_torch_dtype(self.train_config.dtype)
45
+ noisy_latents, noise, timesteps, conditioned_prompts, imgs = self.process_general_training_batch(batch)
46
+ network_weight_list = batch.get_network_weight_list()
47
+
48
+ # have a blank network so we can wrap it in a context and set multipliers without checking every time
49
+ if self.network is not None:
50
+ network = self.network
51
+ else:
52
+ network = BlankNetwork()
53
+
54
+ batch_replacement_list = []
55
+ # get a random replacement for each prompt
56
+ for prompt in conditioned_prompts:
57
+ replacement = random.choice(self.replacement_list)
58
+ batch_replacement_list.append(replacement)
59
+
60
+ # build out prompts
61
+ concept_prompts = []
62
+ replacement_prompts = []
63
+ for idx, replacement in enumerate(batch_replacement_list):
64
+ prompt = conditioned_prompts[idx]
65
+
66
+ # insert shuffled concept at beginning and end of prompt
67
+ shuffled_concept = [x.strip() for x in replacement.concept.split(',')]
68
+ random.shuffle(shuffled_concept)
69
+ shuffled_concept = ', '.join(shuffled_concept)
70
+ concept_prompts.append(f"{shuffled_concept}, {prompt}, {shuffled_concept}")
71
+
72
+ # insert replacement at beginning and end of prompt
73
+ shuffled_replacement = [x.strip() for x in replacement.replacement.split(',')]
74
+ random.shuffle(shuffled_replacement)
75
+ shuffled_replacement = ', '.join(shuffled_replacement)
76
+ replacement_prompts.append(f"{shuffled_replacement}, {prompt}, {shuffled_replacement}")
77
+
78
+ # predict the replacement without network
79
+ conditional_embeds = self.sd.encode_prompt(replacement_prompts).to(self.device_torch, dtype=dtype)
80
+
81
+ replacement_pred = self.sd.predict_noise(
82
+ latents=noisy_latents.to(self.device_torch, dtype=dtype),
83
+ conditional_embeddings=conditional_embeds.to(self.device_torch, dtype=dtype),
84
+ timestep=timesteps,
85
+ guidance_scale=1.0,
86
+ )
87
+
88
+ del conditional_embeds
89
+ replacement_pred = replacement_pred.detach()
90
+
91
+ self.optimizer.zero_grad()
92
+ flush()
93
+
94
+ # text encoding
95
+ grad_on_text_encoder = False
96
+ if self.train_config.train_text_encoder:
97
+ grad_on_text_encoder = True
98
+
99
+ if self.embedding:
100
+ grad_on_text_encoder = True
101
+
102
+ # set the weights
103
+ network.multiplier = network_weight_list
104
+
105
+ # activate network if it exists
106
+ with network:
107
+ with torch.set_grad_enabled(grad_on_text_encoder):
108
+ # embed the prompts
109
+ conditional_embeds = self.sd.encode_prompt(concept_prompts).to(self.device_torch, dtype=dtype)
110
+ if not grad_on_text_encoder:
111
+ # detach the embeddings
112
+ conditional_embeds = conditional_embeds.detach()
113
+ self.optimizer.zero_grad()
114
+ flush()
115
+
116
+ noise_pred = self.sd.predict_noise(
117
+ latents=noisy_latents.to(self.device_torch, dtype=dtype),
118
+ conditional_embeddings=conditional_embeds.to(self.device_torch, dtype=dtype),
119
+ timestep=timesteps,
120
+ guidance_scale=1.0,
121
+ )
122
+
123
+ loss = torch.nn.functional.mse_loss(noise_pred.float(), replacement_pred.float(), reduction="none")
124
+ loss = loss.mean([1, 2, 3])
125
+
126
+ if self.train_config.min_snr_gamma is not None and self.train_config.min_snr_gamma > 0.000001:
127
+ # add min_snr_gamma
128
+ loss = apply_snr_weight(loss, timesteps, self.sd.noise_scheduler, self.train_config.min_snr_gamma)
129
+
130
+ loss = loss.mean()
131
+
132
+ # backpropagate the loss here to free activation memory
133
+ loss.backward()
134
+ flush()
135
+
136
+ # apply gradients
137
+ self.optimizer.step()
138
+ self.optimizer.zero_grad()
139
+ self.lr_scheduler.step()
140
+
141
+ if self.embedding is not None:
142
+ # Let's make sure we don't update any embedding weights besides the newly added token
143
+ self.embedding.restore_embeddings()
144
+
145
+ loss_dict = OrderedDict(
146
+ {'loss': loss.item()}
147
+ )
148
+ # reset network multiplier
149
+ network.multiplier = 1.0
150
+
151
+ return loss_dict
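Taken together, `hook_train_loop` above implements a distillation-style objective: the base model's prediction for the replacement-augmented prompt (computed without the network and then detached) is the target, and the network is trained so that the concept-augmented prompt yields the same prediction. The sketch below restates only that core idea; `predict_noise` here is a hypothetical stand-in for the toolkit's model call, not its real API.

```python
# Minimal sketch of the concept-replacement objective used above.
# `predict_noise` is a hypothetical callable; only the structure of the
# prompt augmentation and the loss mirrors hook_train_loop.
import random
import torch
import torch.nn.functional as F


def shuffle_terms(text: str) -> str:
    # same prompt augmentation as above: shuffle the comma-separated terms
    terms = [t.strip() for t in text.split(",")]
    random.shuffle(terms)
    return ", ".join(terms)


def wrap_prompt(prompt: str, terms: str) -> str:
    # insert the shuffled terms at the beginning and end of the prompt
    shuffled = shuffle_terms(terms)
    return f"{shuffled}, {prompt}, {shuffled}"


def concept_replacement_loss(predict_noise, noisy_latents, timesteps,
                             concept_embeds, replacement_embeds):
    # target: the base model's prediction for the replacement prompt (frozen)
    with torch.no_grad():
        target = predict_noise(noisy_latents, replacement_embeds, timesteps)
    # prediction: the adapted model's prediction for the concept prompt
    pred = predict_noise(noisy_latents, concept_embeds, timesteps)
    # per-sample MSE over (C, H, W), then the batch mean
    loss = F.mse_loss(pred.float(), target.float(), reduction="none")
    return loss.mean(dim=[1, 2, 3]).mean()
```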
extensions_built_in/concept_replacer/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ # This extension registers the Concept Replacer training process. Extensions like this are a good starting point for experimenting with new training ideas.
2
+ from toolkit.extension import Extension
3
+
4
+
5
+ # This is for generic training (LoRA, Dreambooth, FineTuning)
6
+ class ConceptReplacerExtension(Extension):
7
+ # uid must be unique; it is how the extension is identified
8
+ uid = "concept_replacer"
9
+
10
+ # name is the name of the extension for printing
11
+ name = "Concept Replacer"
12
+
13
+ # This is where your process class is loaded
14
+ # keep your imports in here so they don't slow down the rest of the program
15
+ @classmethod
16
+ def get_process(cls):
17
+ # import your process class here so it is only loaded when needed and return it
18
+ from .ConceptReplacer import ConceptReplacer
19
+ return ConceptReplacer
20
+
21
+
22
+
23
+ AI_TOOLKIT_EXTENSIONS = [
24
+ # you can put a list of extensions here
25
+ ConceptReplacerExtension,
26
+ ]
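The pattern above, a module-level `AI_TOOLKIT_EXTENSIONS` list plus a `get_process()` that defers the heavy import, implies a loader that walks the extension packages and gathers these lists. That loader lives in `toolkit/extension.py` and is not shown in this diff, so the snippet below is only a rough sketch of how such a registry could be consumed; the function name and the package walk are assumptions.

```python
# Hypothetical sketch of consuming AI_TOOLKIT_EXTENSIONS lists; the real
# loader in toolkit/extension.py may differ.
import importlib
import pkgutil


def collect_extensions(package_name: str = "extensions_built_in"):
    """Import each extension subpackage and gather its AI_TOOLKIT_EXTENSIONS."""
    extensions = []
    package = importlib.import_module(package_name)
    for mod_info in pkgutil.iter_modules(package.__path__):
        module = importlib.import_module(f"{package_name}.{mod_info.name}")
        extensions.extend(getattr(module, "AI_TOOLKIT_EXTENSIONS", []))
    return extensions


# The process class itself is only imported when a job asks for it:
#   process_cls = ConceptReplacerExtension.get_process()
```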
extensions_built_in/concept_replacer/config/train.example.yaml ADDED
@@ -0,0 +1,91 @@
1
+ ---
2
+ job: extension
3
+ config:
4
+ name: test_v1
5
+ process:
6
+ - type: 'concept_replacer' # matches the extension uid; most other settings here are carried over from the textual inversion example
7
+ training_folder: "out/TI"
8
+ device: cuda:0
9
+ # for tensorboard logging
10
+ log_dir: "out/.tensorboard"
11
+ embedding:
12
+ trigger: "your_trigger_here"
13
+ tokens: 12
14
+ init_words: "man with short brown hair"
15
+ save_format: "safetensors" # 'safetensors' or 'pt'
16
+ save:
17
+ dtype: float16 # precision to save
18
+ save_every: 100 # save every this many steps
19
+ max_step_saves_to_keep: 5 # how many step saves to keep; only applies to step-based saves
20
+ datasets:
21
+ - folder_path: "/path/to/dataset"
22
+ caption_ext: "txt"
23
+ default_caption: "[trigger]"
24
+ buckets: true
25
+ resolution: 512
26
+ train:
27
+ noise_scheduler: "ddpm" # or "lms", "euler_a"
28
+ steps: 3000
29
+ weight_jitter: 0.0
30
+ lr: 5e-5
31
+ train_unet: false
32
+ gradient_checkpointing: true
33
+ train_text_encoder: false
34
+ optimizer: "adamw"
35
+ # optimizer: "prodigy"
36
+ optimizer_params:
37
+ weight_decay: 1e-2
38
+ lr_scheduler: "constant"
39
+ max_denoising_steps: 1000
40
+ batch_size: 4
41
+ dtype: bf16
42
+ xformers: true
43
+ min_snr_gamma: 5.0
44
+ # skip_first_sample: true
45
+ noise_offset: 0.0 # not needed for this
46
+ model:
47
+ # objective reality v2
48
+ name_or_path: "https://civitai.com/models/128453?modelVersionId=142465"
49
+ is_v2: false # for v2 models
50
+ is_xl: false # for SDXL models
51
+ is_v_pred: false # for v-prediction models (most v2 models)
52
+ sample:
53
+ sampler: "ddpm" # must match train.noise_scheduler
54
+ sample_every: 100 # sample every this many steps
55
+ width: 512
56
+ height: 512
57
+ prompts:
58
+ - "photo of [trigger] laughing"
59
+ - "photo of [trigger] smiling"
60
+ - "[trigger] close up"
61
+ - "dark scene [trigger] frozen"
62
+ - "[trigger] nighttime"
63
+ - "a painting of [trigger]"
64
+ - "a drawing of [trigger]"
65
+ - "a cartoon of [trigger]"
66
+ - "[trigger] pixar style"
67
+ - "[trigger] costume"
68
+ neg: ""
69
+ seed: 42
70
+ walk_seed: false
71
+ guidance_scale: 7
72
+ sample_steps: 20
73
+ network_multiplier: 1.0
74
+
75
+ logging:
76
+ log_every: 10 # log every this many steps
77
+ use_wandb: false # not supported yet
78
+ verbose: false
79
+
80
+ # You can put any information you want here, and it will be saved in the model.
81
+ # The below is an example, but you can put your grocery list in it if you want.
82
+ # It is saved in the model so be aware of that. The software will include this
83
+ # plus some other information for you automatically
84
+ meta:
85
+ # [name] gets replaced with the name above
86
+ name: "[name]"
87
+ # version: '1.0'
88
+ # creator:
89
+ # name: Your Name
90
+ # email: [email protected]
91
+ # website: https://your.website
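One caveat on the example above: it is largely carried over from the textual inversion example, while `ConceptReplacer.__init__` additionally reads a `replacements` list whose entries feed `ConceptReplacementConfig` (`concept` and `replacement` keys). The snippet below is a hedged illustration of that structure as the process ends up seeing it; the values and the launch command in the comment are assumptions, not taken from this diff.

```python
# Illustration of the process-level keys ConceptReplacer reads (see its
# __init__ earlier in this diff). Values are made up; everything else in the
# YAML above (save, datasets, train, model, sample, ...) stays as shown.
from collections import OrderedDict

process_config = OrderedDict(
    type="concept_replacer",  # assumed to need to match the extension uid
    replacements=[
        # each entry becomes ConceptReplacementConfig(concept=..., replacement=...)
        {"concept": "photo of a golden retriever", "replacement": "photo of a tabby cat"},
        {"concept": "oil painting", "replacement": "watercolor painting"},
    ],
)

# A config built this way is normally launched through the toolkit's run
# script, e.g. `python run.py config/your_concept_replacer.yaml`.
```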