diff --git a/backend/args.py b/backend/args.py
index 3254b7f8..0302dfc1 100644
--- a/backend/args.py
+++ b/backend/args.py
@@ -56,8 +56,6 @@
 parser.add_argument("--cuda-malloc", action="store_true")
 parser.add_argument("--cuda-stream", action="store_true")
 parser.add_argument("--pin-shared-memory", action="store_true")
-parser.add_argument("--i-am-lllyasviel", action="store_true")
-
 args = parser.parse_known_args()[0]
 
 # Some dynamic args that may be changed by webui rather than cmd flags.
diff --git a/backend/diffusion_engine/flux.py b/backend/diffusion_engine/flux.py
index 010022ad..714acaba 100644
--- a/backend/diffusion_engine/flux.py
+++ b/backend/diffusion_engine/flux.py
@@ -7,7 +7,7 @@ from backend.patcher.vae import VAE
 from backend.patcher.unet import UnetPatcher
 from backend.text_processing.classic_engine import ClassicTextProcessingEngine
 from backend.text_processing.t5_engine import T5TextProcessingEngine
-from backend.args import dynamic_args, args
+from backend.args import dynamic_args
 from backend.modules.k_prediction import PredictionFlux
 from backend import memory_management
 
@@ -16,9 +16,6 @@ class Flux(ForgeDiffusionEngine):
     matched_guesses = [model_list.Flux]
 
     def __init__(self, estimated_config, huggingface_components):
-        if not args.i_am_lllyasviel:
-            raise NotImplementedError('Flux is not implemented yet!')
-
         super().__init__(estimated_config, huggingface_components)
         self.is_inpaint = False
 
@@ -68,9 +65,6 @@ class Flux(ForgeDiffusionEngine):
 
         self.use_distilled_cfg_scale = True
 
-        # WebUI Legacy
-        self.first_stage_model = vae.first_stage_model
-
     def set_clip_skip(self, clip_skip):
         self.text_processing_engine_l.clip_skip = clip_skip
 
@@ -93,7 +87,7 @@ class Flux(ForgeDiffusionEngine):
 
     @torch.inference_mode()
     def get_prompt_lengths_on_ui(self, prompt):
-        _, token_count = self.text_processing_engine_t5.process_texts([prompt])
+        token_count = len(self.text_processing_engine_t5.tokenize([prompt])[0])
         return token_count, max(255, token_count)
 
     @torch.inference_mode()
diff --git a/backend/diffusion_engine/sd15.py b/backend/diffusion_engine/sd15.py
index f16d046c..af47eb53 100644
--- a/backend/diffusion_engine/sd15.py
+++ b/backend/diffusion_engine/sd15.py
@@ -53,7 +53,6 @@ class StableDiffusion(ForgeDiffusionEngine):
 
         # WebUI Legacy
         self.is_sd1 = True
-        self.first_stage_model = vae.first_stage_model
 
     def set_clip_skip(self, clip_skip):
         self.text_processing_engine.clip_skip = clip_skip
diff --git a/backend/diffusion_engine/sd20.py b/backend/diffusion_engine/sd20.py
index f9af0b53..adb69528 100644
--- a/backend/diffusion_engine/sd20.py
+++ b/backend/diffusion_engine/sd20.py
@@ -53,7 +53,6 @@ class StableDiffusion2(ForgeDiffusionEngine):
 
         # WebUI Legacy
         self.is_sd2 = True
-        self.first_stage_model = vae.first_stage_model
 
     def set_clip_skip(self, clip_skip):
         self.text_processing_engine.clip_skip = clip_skip
diff --git a/backend/diffusion_engine/sdxl.py b/backend/diffusion_engine/sdxl.py
index fe3a3796..0873da18 100644
--- a/backend/diffusion_engine/sdxl.py
+++ b/backend/diffusion_engine/sdxl.py
@@ -72,7 +72,6 @@ class StableDiffusionXL(ForgeDiffusionEngine):
 
         # WebUI Legacy
         self.is_sdxl = True
-        self.first_stage_model = vae.first_stage_model
 
     def set_clip_skip(self, clip_skip):
         self.text_processing_engine_l.clip_skip = clip_skip
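Note: `get_prompt_lengths_on_ui` now counts tokens by calling the T5 tokenizer directly instead of going through `process_texts` (which this patch removes from `t5_engine.py` further down). A standalone sketch of the same count, assuming any HuggingFace T5 tokenizer; the checkpoint id below is illustrative, not what Forge actually loads:

```python
# Count prompt tokens the way the new code does: no truncation, no special tokens.
from transformers import T5TokenizerFast

tokenizer = T5TokenizerFast.from_pretrained("google/t5-v1_1-base")  # assumed model id

def prompt_lengths(prompt: str):
    token_count = len(tokenizer([prompt], truncation=False, add_special_tokens=False)["input_ids"][0])
    return token_count, max(255, token_count)
```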
diff --git a/backend/loader.py b/backend/loader.py
index 4d60ceb8..c8f8a256 100644
--- a/backend/loader.py
+++ b/backend/loader.py
@@ -3,6 +3,7 @@ import torch
 import logging
 import importlib
 
+import backend.args
 import huggingface_guess
 
 from diffusers import DiffusionPipeline
@@ -69,9 +70,10 @@ def load_huggingface_component(guess, component_name, lib_name, cls_name, repo_p
             config = read_arbitrary_config(config_path)
 
             dtype = memory_management.text_encoder_dtype()
-            sd_dtype = state_dict['transformer.encoder.block.0.layer.0.SelfAttention.k.weight'].dtype
+            sd_dtype = memory_management.state_dict_dtype(state_dict)
 
             if sd_dtype in [torch.float8_e4m3fn, torch.float8_e5m2]:
+                print(f'Using Detected T5 Data Type: {sd_dtype}')
                 dtype = sd_dtype
 
             with modeling_utils.no_init_weights():
@@ -81,32 +83,60 @@ def load_huggingface_component(guess, component_name, lib_name, cls_name, repo_p
         load_state_dict(model, state_dict, log_name=cls_name, ignore_errors=['transformer.encoder.embed_tokens.weight'])
         return model
 
-    if cls_name == 'UNet2DConditionModel':
+    if cls_name in ['UNet2DConditionModel', 'FluxTransformer2DModel']:
+        model_loader = None
+        if cls_name == 'UNet2DConditionModel':
+            model_loader = lambda c: IntegratedUNet2DConditionModel.from_config(c)
+        if cls_name == 'FluxTransformer2DModel':
+            from backend.nn.flux import IntegratedFluxTransformer2DModel
+            model_loader = lambda c: IntegratedFluxTransformer2DModel(**c)
+
         unet_config = guess.unet_config.copy()
         state_dict_size = memory_management.state_dict_size(state_dict)
-        ini_dtype = memory_management.unet_dtype(model_params=state_dict_size)
-        ini_device = memory_management.unet_inital_load_device(parameters=state_dict_size, dtype=ini_dtype)
-        to_args = dict(device=ini_device, dtype=ini_dtype)
+        state_dict_dtype = memory_management.state_dict_dtype(state_dict)
 
-        with using_forge_operations(**to_args):
-            model = IntegratedUNet2DConditionModel.from_config(unet_config).to(**to_args)
-            model._internal_dict = unet_config
+        storage_dtype = memory_management.unet_dtype(model_params=state_dict_size, supported_dtypes=guess.supported_inference_dtypes)
+
+        unet_storage_dtype_overwrite = backend.args.dynamic_args.get('forge_unet_storage_dtype')
+
+        if unet_storage_dtype_overwrite is not None:
+            storage_dtype = unet_storage_dtype_overwrite
+        else:
+            if state_dict_dtype in [torch.float8_e4m3fn, torch.float8_e5m2, 'nf4', 'fp4']:
+                print(f'Using Detected UNet Type: {state_dict_dtype}')
+                storage_dtype = state_dict_dtype
+                if state_dict_dtype in ['nf4', 'fp4']:
+                    print(f'Using pre-quant state dict!')
+
+        load_device = memory_management.get_torch_device()
+        computation_dtype = memory_management.get_computation_dtype(load_device, supported_dtypes=guess.supported_inference_dtypes)
+        offload_device = memory_management.unet_offload_device()
+
+        if storage_dtype in ['nf4', 'fp4']:
+            initial_device = memory_management.unet_inital_load_device(parameters=state_dict_size, dtype=computation_dtype)
+            with using_forge_operations(device=initial_device, dtype=computation_dtype, manual_cast_enabled=False, bnb_dtype=storage_dtype):
+                model = model_loader(unet_config)
+        else:
+            initial_device = memory_management.unet_inital_load_device(parameters=state_dict_size, dtype=storage_dtype)
+            need_manual_cast = storage_dtype != computation_dtype
+            to_args = dict(device=initial_device, dtype=storage_dtype)
+
+            with using_forge_operations(**to_args, manual_cast_enabled=need_manual_cast):
+                model = model_loader(unet_config).to(**to_args)
 
         load_state_dict(model, state_dict)
-        return model
 
-    if cls_name == 'FluxTransformer2DModel':
-        from backend.nn.flux import IntegratedFluxTransformer2DModel
-        unet_config = guess.unet_config.copy()
-        state_dict_size = memory_management.state_dict_size(state_dict)
-        ini_dtype = memory_management.unet_dtype(model_params=state_dict_size)
-        ini_device = memory_management.unet_inital_load_device(parameters=state_dict_size, dtype=ini_dtype)
-        to_args = dict(device=ini_device, dtype=ini_dtype)
-
-        with using_forge_operations(**to_args):
-            model = IntegratedFluxTransformer2DModel(**unet_config).to(**to_args)
+        if hasattr(model, '_internal_dict'):
+            model._internal_dict = unet_config
+        else:
             model.config = unet_config
 
-        load_state_dict(model, state_dict)
+        model.storage_dtype = storage_dtype
+        model.computation_dtype = computation_dtype
+        model.load_device = load_device
+        model.initial_device = initial_device
+        model.offload_device = offload_device
+
         return model
 
     print(f'Skipped: {component_name} = {lib_name}.{cls_name}')
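Note: the storage-dtype policy the loader now applies can be restated as a small pure function. This is a sketch under assumed names (`default_dtype` stands in for the result of `memory_management.unet_dtype(...)`), not the loader itself:

```python
import torch

def pick_storage_dtype(detected_dtype, default_dtype, override=None):
    # an explicit 'forge_unet_storage_dtype' override always wins
    if override is not None:
        return override
    # fp8 and bitsandbytes pre-quantized checkpoints keep their detected type
    if detected_dtype in [torch.float8_e4m3fn, torch.float8_e5m2, 'nf4', 'fp4']:
        return detected_dtype
    # otherwise fall back to the usual heuristic
    return default_dtype
```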
diff --git a/backend/memory_management.py b/backend/memory_management.py
index 15749987..050d7ea8 100644
--- a/backend/memory_management.py
+++ b/backend/memory_management.py
@@ -8,7 +8,7 @@ import platform
 from enum import Enum
 
 from backend import stream
-from backend.args import args, dynamic_args
+from backend.args import args
 
 cpu = torch.device('cpu')
 
@@ -281,12 +281,8 @@ except:
     print("Could not pick default device.")
 
 if 'rtx' in torch_device_name.lower():
-    if not args.pin_shared_memory:
-        print('Hint: your device supports --pin-shared-memory for potential speed improvements.')
     if not args.cuda_malloc:
         print('Hint: your device supports --cuda-malloc for potential speed improvements.')
-    if not args.cuda_stream:
-        print('Hint: your device supports --cuda-stream for potential speed improvements.')
 
 current_loaded_models = []
 
@@ -305,8 +301,54 @@ def state_dict_size(sd, exclude_device=None):
     return module_mem
 
 
+def state_dict_dtype(state_dict):
+    for k in state_dict.keys():
+        if 'bitsandbytes__nf4' in k:
+            return 'nf4'
+        if 'bitsandbytes__fp4' in k:
+            return 'fp4'
+
+    dtype_counts = {}
+
+    for tensor in state_dict.values():
+        dtype = tensor.dtype
+        if dtype in dtype_counts:
+            dtype_counts[dtype] += 1
+        else:
+            dtype_counts[dtype] = 1
+
+    major_dtype = None
+    max_count = 0
+
+    for dtype, count in dtype_counts.items():
+        if count > max_count:
+            max_count = count
+            major_dtype = dtype
+
+    return major_dtype
+
+
 def module_size(module, exclude_device=None):
-    return state_dict_size(module.state_dict(), exclude_device=exclude_device)
+    module_mem = 0
+    for p in module.parameters():
+        t = p.data
+
+        if exclude_device is not None:
+            if t.device == exclude_device:
+                continue
+
+        element_size = t.element_size()
+
+        if getattr(p, 'quant_type', None) in ['fp4', 'nf4']:
+            if element_size > 1:
+                # not quantized yet: project a bit more than 0.5 byte per weight to cover quant state parameters
+                element_size = 0.55
+            else:
+                # already quantized: two 4-bit weights are packed per stored byte, so a bit more than 1.0 per stored element covers quant state parameters
+                element_size = 1.1
+
+        module_mem += t.nelement() * element_size
+    return module_mem
 
 
 class LoadedModel:
@@ -587,11 +629,6 @@ def unet_inital_load_device(parameters, dtype):
 
 
 def unet_dtype(device=None, model_params=0, supported_dtypes=[torch.float16, torch.bfloat16, torch.float32]):
-    unet_storage_dtype_overwrite = dynamic_args.get('forge_unet_storage_dtype')
-
-    if unet_storage_dtype_overwrite is not None:
-        return unet_storage_dtype_overwrite
-
     if args.unet_in_bf16:
         return torch.bfloat16
 
@@ -1040,6 +1077,18 @@ def should_use_bf16(device=None, model_params=0, prioritize_performance=True, ma
     return False
 
 
+def can_install_bnb():
+    if not torch.cuda.is_available():
+        return False
+
+    cuda_version = tuple(int(x) for x in torch.version.cuda.split('.'))
+
+    if cuda_version >= (11, 7):
+        return True
+
+    return False
+
+
 def soft_empty_cache(force=False):
     global cpu_state
     if cpu_state == CPUState.MPS:
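Note: `state_dict_dtype` is a majority vote over tensor dtypes, with bitsandbytes pre-quantized checkpoints short-circuited by key name. An equivalent sketch using `collections.Counter`:

```python
from collections import Counter

def state_dict_dtype_sketch(state_dict):
    # pre-quantized bitsandbytes checkpoints carry their quant state under these key names
    if any('bitsandbytes__nf4' in k for k in state_dict):
        return 'nf4'
    if any('bitsandbytes__fp4' in k for k in state_dict):
        return 'fp4'
    # otherwise the most common tensor dtype wins
    counts = Counter(t.dtype for t in state_dict.values())
    return counts.most_common(1)[0][0] if counts else None
```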
diff --git a/backend/modules/k_model.py b/backend/modules/k_model.py
index b178ae94..3c7d5ee8 100644
--- a/backend/modules/k_model.py
+++ b/backend/modules/k_model.py
@@ -5,16 +5,13 @@ from backend.modules.k_prediction import k_prediction_from_diffusers_scheduler
 
 
 class KModel(torch.nn.Module):
-    def __init__(self, model, diffusers_scheduler, storage_dtype, computation_dtype, k_predictor=None):
+    def __init__(self, model, diffusers_scheduler, k_predictor=None):
         super().__init__()
 
-        self.storage_dtype = storage_dtype
-        self.computation_dtype = computation_dtype
+        self.storage_dtype = model.storage_dtype
+        self.computation_dtype = model.computation_dtype
 
-        need_manual_cast = self.storage_dtype != self.computation_dtype
-        operations.shift_manual_cast(model, enabled=need_manual_cast)
-
-        print(f'K-Model Created: {dict(storage_dtype=storage_dtype, computation_dtype=computation_dtype, manual_cast=need_manual_cast)}')
+        print(f'K-Model Created: {dict(storage_dtype=self.storage_dtype, computation_dtype=self.computation_dtype)}')
 
         self.diffusion_model = model
diff --git a/backend/nn/clip.py b/backend/nn/clip.py
index 42005d85..c353275f 100644
--- a/backend/nn/clip.py
+++ b/backend/nn/clip.py
@@ -10,4 +10,3 @@ class IntegratedCLIP(torch.nn.Module):
         if add_text_projection:
             embed_dim = config.hidden_size
             self.transformer.text_projection = torch.nn.Linear(embed_dim, embed_dim, bias=False)
-            self.transformer.text_projection.weight.copy_(torch.eye(embed_dim))
diff --git a/backend/nn/unet.py b/backend/nn/unet.py
index b8d905c6..f4a3d22c 100644
--- a/backend/nn/unet.py
+++ b/backend/nn/unet.py
@@ -433,9 +433,9 @@ class ResBlock(TimestepBlock):
     def _forward(self, x, emb, transformer_options={}):
         if self.updown:
             in_rest, in_conv = self.in_layers[:-1], self.in_layers[-1]
-            if "groupnorm_wrapper" in transformer_options:
+            if "group_norm_wrapper" in transformer_options:
                 in_norm, in_rest = in_rest[0], in_rest[1:]
-                h = transformer_options["groupnorm_wrapper"](in_norm, x, transformer_options)
+                h = transformer_options["group_norm_wrapper"](in_norm, x, transformer_options)
                 h = in_rest(h)
             else:
                 h = in_rest(x)
@@ -443,9 +443,9 @@ class ResBlock(TimestepBlock):
             x = self.x_upd(x)
             h = in_conv(h)
         else:
-            if "groupnorm_wrapper" in transformer_options:
+            if "group_norm_wrapper" in transformer_options:
                 in_norm = self.in_layers[0]
-                h = transformer_options["groupnorm_wrapper"](in_norm, x, transformer_options)
+                h = transformer_options["group_norm_wrapper"](in_norm, x, transformer_options)
                 h = self.in_layers[1:](h)
             else:
                 h = self.in_layers(x)
@@ -456,8 +456,8 @@ class ResBlock(TimestepBlock):
             emb_out = emb_out[..., None]
         if self.use_scale_shift_norm:
             out_norm, out_rest = self.out_layers[0], self.out_layers[1:]
-            if "groupnorm_wrapper" in transformer_options:
-                h = transformer_options["groupnorm_wrapper"](out_norm, h, transformer_options)
+            if "group_norm_wrapper" in transformer_options:
+                h = transformer_options["group_norm_wrapper"](out_norm, h, transformer_options)
             else:
                 h = out_norm(h)
             if emb_out is not None:
@@ -470,8 +470,8 @@ class ResBlock(TimestepBlock):
             if self.exchange_temb_dims:
                 emb_out = rearrange(emb_out, "b t c ... -> b c t ...")
             h = h + emb_out
-            if "groupnorm_wrapper" in transformer_options:
-                h = transformer_options["groupnorm_wrapper"](self.out_layers[0], h, transformer_options)
+            if "group_norm_wrapper" in transformer_options:
+                h = transformer_options["group_norm_wrapper"](self.out_layers[0], h, transformer_options)
                 h = self.out_layers[1:](h)
             else:
                 h = self.out_layers(h)
@@ -752,9 +752,9 @@ class IntegratedUNet2DConditionModel(nn.Module, ConfigMixin):
         transformer_options["block"] = ("last", 0)
 
         for block_modifier in block_modifiers:
             h = block_modifier(h, 'before', transformer_options)
 
-        if "groupnorm_wrapper" in transformer_options:
+        if "group_norm_wrapper" in transformer_options:
             out_norm, out_rest = self.out[0], self.out[1:]
-            h = transformer_options["groupnorm_wrapper"](out_norm, h, transformer_options)
+            h = transformer_options["group_norm_wrapper"](out_norm, h, transformer_options)
             h = out_rest(h)
         else:
             h = self.out(h)
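Note: every call site above passes the same three arguments, so a `group_norm_wrapper` is a callable `(norm_layer, h, transformer_options) -> Tensor`. A do-nothing wrapper as illustration only; the function name is hypothetical:

```python
def passthrough_group_norm_wrapper(norm_layer, h, transformer_options):
    # a real wrapper could shard, batch, or otherwise intercept the GroupNorm call here
    return norm_layer(h)

# registered through the renamed patcher helper (see backend/patcher/unet.py below):
# unet_patcher.set_group_norm_wrapper(passthrough_group_norm_wrapper)
```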
diff --git a/backend/operations.py b/backend/operations.py
index db411dee..06a56688 100644
--- a/backend/operations.py
+++ b/backend/operations.py
@@ -1,3 +1,5 @@
+# Copyright Forge 2024
+
 import time
 import torch
 import contextlib
@@ -8,7 +10,7 @@ from backend import stream, memory_management
 stash = {}
 
 
-def weights_manual_cast(layer, x, skip_dtype=False):
+def weights_manual_cast(layer, x, skip_weight_dtype=False, skip_bias_dtype=False):
     weight, bias, signal = None, None, None
     non_blocking = True
 
@@ -18,21 +20,28 @@ def weights_manual_cast(layer, x, skip_dtype=False):
     target_dtype = x.dtype
     target_device = x.device
 
-    if skip_dtype:
-        target_dtype = None
+    if skip_weight_dtype:
+        weight_args = dict(device=target_device, non_blocking=non_blocking)
+    else:
+        weight_args = dict(device=target_device, dtype=target_dtype, non_blocking=non_blocking)
+
+    if skip_bias_dtype:
+        bias_args = dict(device=target_device, non_blocking=non_blocking)
+    else:
+        bias_args = dict(device=target_device, dtype=target_dtype, non_blocking=non_blocking)
 
     if stream.should_use_stream():
         with stream.stream_context()(stream.mover_stream):
             if layer.weight is not None:
-                weight = layer.weight.to(device=target_device, dtype=target_dtype, non_blocking=non_blocking)
+                weight = layer.weight.to(**weight_args)
             if layer.bias is not None:
-                bias = layer.bias.to(device=target_device, dtype=target_dtype, non_blocking=non_blocking)
+                bias = layer.bias.to(**bias_args)
             signal = stream.mover_stream.record_event()
     else:
         if layer.weight is not None:
-            weight = layer.weight.to(device=target_device, dtype=target_dtype, non_blocking=non_blocking)
+            weight = layer.weight.to(**weight_args)
         if layer.bias is not None:
-            bias = layer.bias.to(device=target_device, dtype=target_dtype, non_blocking=non_blocking)
+            bias = layer.bias.to(**bias_args)
 
     return weight, bias, signal
 
@@ -72,19 +81,27 @@ def cleanup_cache():
 
 current_device = None
 current_dtype = None
 current_manual_cast_enabled = False
+current_bnb_dtype = None
 
 
 class ForgeOperations:
-    class Linear(torch.nn.Linear):
-
+    class Linear(torch.nn.Module):
         def __init__(self, *args, **kwargs):
-            kwargs['device'] = current_device
-            kwargs['dtype'] = current_dtype
-            super().__init__(*args, **kwargs)
+            super().__init__()
+            self.dummy = torch.nn.Parameter(torch.empty(1, device=current_device, dtype=current_dtype))
+            self.weight = None
+            self.bias = None
             self.parameters_manual_cast = current_manual_cast_enabled
 
-        def reset_parameters(self):
-            return None
+        def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+            if hasattr(self, 'dummy'):
+                if prefix + 'weight' in state_dict:
+                    self.weight = torch.nn.Parameter(state_dict[prefix + 'weight'].to(self.dummy))
+                if prefix + 'bias' in state_dict:
+                    self.bias = torch.nn.Parameter(state_dict[prefix + 'bias'].to(self.dummy))
+                del self.dummy
+            else:
+                super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
 
         def forward(self, x):
             if self.parameters_manual_cast:
@@ -92,7 +109,7 @@ class ForgeOperations:
                 with main_stream_worker(weight, bias, signal):
                     return torch.nn.functional.linear(x, weight, bias)
             else:
-                return super().forward(x)
+                return torch.nn.functional.linear(x, self.weight, self.bias)
 
     class Conv2d(torch.nn.Conv2d):
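Note: the rewritten `Linear` never allocates weights up front; a one-element `dummy` parameter records the target device/dtype, and real parameters are materialized straight from the checkpoint. A self-contained sketch of the pattern (not the Forge class itself):

```python
import torch

class LazyLinear(torch.nn.Module):
    def __init__(self, device=None, dtype=None):
        super().__init__()
        # placeholder that only records where/how loaded weights should land
        self.dummy = torch.nn.Parameter(torch.empty(1, device=device, dtype=dtype))
        self.weight = None
        self.bias = None

    def _load_from_state_dict(self, state_dict, prefix, *args, **kwargs):
        # materialize parameters from the checkpoint, cast like `dummy`
        if prefix + 'weight' in state_dict:
            self.weight = torch.nn.Parameter(state_dict[prefix + 'weight'].to(self.dummy))
        if prefix + 'bias' in state_dict:
            self.bias = torch.nn.Parameter(state_dict[prefix + 'bias'].to(self.dummy))
        del self.dummy

    def forward(self, x):
        return torch.nn.functional.linear(x, self.weight, self.bias)

layer = LazyLinear(dtype=torch.float16)
layer.load_state_dict({'weight': torch.randn(8, 8)}, strict=False)
```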
@@ -269,21 +286,61 @@ class ForgeOperations:
 
         def forward(self, x):
             if self.parameters_manual_cast:
-                weight, bias, signal = weights_manual_cast(self, x, skip_dtype=True)
+                weight, bias, signal = weights_manual_cast(self, x, skip_weight_dtype=True, skip_bias_dtype=True)
                 with main_stream_worker(weight, bias, signal):
                     return torch.nn.functional.embedding(x, weight, self.padding_idx, self.max_norm, self.norm_type, self.scale_grad_by_freq, self.sparse)
             else:
                 return super().forward(x)
 
 
-@contextlib.contextmanager
-def using_forge_operations(operations=None, device=None, dtype=None, manual_cast_enabled=False):
-    global current_device, current_dtype, current_manual_cast_enabled
+try:
+    from backend.operations_bnb import ForgeLoader4Bit, ForgeParams4bit, functional_linear_4bits
 
-    current_device, current_dtype, current_manual_cast_enabled = device, dtype, manual_cast_enabled
+    class ForgeOperationsBNB4bits(ForgeOperations):
+        class Linear(ForgeLoader4Bit):
+            def __init__(self, *args, **kwargs):
+                super().__init__(device=current_device, dtype=current_dtype, quant_type=current_bnb_dtype)
+                self.parameters_manual_cast = current_manual_cast_enabled
+
+            def forward(self, x):
+                self.weight.quant_state = self.quant_state
+
+                if self.bias is not None and self.bias.dtype != x.dtype:
+                    # This cast could also be applied to all non-BNB ops, since its cost is very low:
+                    # it runs only once, and most Linear layers have no bias anyway.
+                    self.bias.data = self.bias.data.to(x.dtype)
+
+                if not self.parameters_manual_cast:
+                    return functional_linear_4bits(x, self.weight, self.bias)
+                elif not self.weight.bnb_quantized:
+                    assert x.device.type == 'cuda', 'BNB Must Use CUDA as Computation Device!'
+                    layer_original_device = self.weight.device
+                    self.weight = self.weight._quantize(x.device)
+                    bias = self.bias.to(x.device) if self.bias is not None else None
+                    out = functional_linear_4bits(x, self.weight, bias)
+                    self.weight = self.weight.to(layer_original_device)
+                    return out
+                else:
+                    weight, bias, signal = weights_manual_cast(self, x, skip_weight_dtype=True, skip_bias_dtype=True)
+                    with main_stream_worker(weight, bias, signal):
+                        return functional_linear_4bits(x, weight, bias)
+
+    bnb_avaliable = True
+except:
+    bnb_avaliable = False
+
+
+@contextlib.contextmanager
+def using_forge_operations(operations=None, device=None, dtype=None, manual_cast_enabled=False, bnb_dtype=None):
+    global current_device, current_dtype, current_manual_cast_enabled, current_bnb_dtype
+
+    current_device, current_dtype, current_manual_cast_enabled, current_bnb_dtype = device, dtype, manual_cast_enabled, bnb_dtype
 
     if operations is None:
-        operations = ForgeOperations
+        if bnb_avaliable and bnb_dtype in ['nf4', 'fp4']:
+            operations = ForgeOperationsBNB4bits
+        else:
+            operations = ForgeOperations
 
     op_names = ['Linear', 'Conv1d', 'Conv2d', 'Conv3d', 'ConvTranspose1d', 'ConvTranspose2d', 'ConvTranspose3d', 'GroupNorm', 'LayerNorm', 'Embedding']
     backups = {op_name: getattr(torch.nn, op_name) for op_name in op_names}
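Note: inside the context manager the `torch.nn` op classes are temporarily swapped, so any module constructed in the `with` block picks up the patched `Linear` (the BNB variant when bitsandbytes imports successfully and `bnb_dtype` is 'nf4'/'fp4'). A hedged usage sketch; the constructor arguments to `Linear` are ignored by the patched class, which sizes itself from the checkpoint later:

```python
import torch
from backend.operations import using_forge_operations

with using_forge_operations(device=torch.device('cpu'), dtype=torch.float16, bnb_dtype='nf4'):
    layer = torch.nn.Linear(64, 64)  # actually the patched Linear while inside the block

# real weights arrive later via load_state_dict, straight from the checkpoint tensors
```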
diff --git a/backend/operations_bnb.py b/backend/operations_bnb.py
index 826c64da..92984533 100644
--- a/backend/operations_bnb.py
+++ b/backend/operations_bnb.py
@@ -1,1095 +1,120 @@
-# Copyright (c) Facebook, Inc. and its affiliates.
-#
-# This source code is licensed under the MIT license found in the
-# LICENSE file in the root directory of this source tree.
-
-import copy
-from typing import Any, Dict, Optional, TypeVar, Union, overload
-import warnings
+# Copyright Forge 2024
 
 import torch
-from torch import Tensor, device, dtype, nn
-import torch.nn.functional as F
+import bitsandbytes as bnb
 
-try:
-    import bitsandbytes as bnb
-    from bitsandbytes.autograd._functions import get_tile_inds, undo_layout
-    from bitsandbytes.functional import QuantState
-    from bitsandbytes.optim import GlobalOptimManager
-    from bitsandbytes.utils import (
-        INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING,
-        LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING,
-        OutlierTracer,
+from bitsandbytes.nn.modules import Params4bit, QuantState
+
+
+def functional_linear_4bits(x, weight, bias):
+    out = bnb.matmul_4bit(x, weight.t(), bias=bias, quant_state=weight.quant_state)
+    out = out.to(x)
+    return out
+
+
+def copy_quant_state(state: QuantState, device: torch.device = None) -> QuantState:
+    if state is None:
+        return None
+
+    device = device or state.absmax.device
+
+    state2 = (
+        QuantState(
+            absmax=state.state2.absmax.to(device),
+            shape=state.state2.shape,
+            code=state.state2.code.to(device),
+            blocksize=state.state2.blocksize,
+            quant_type=state.state2.quant_type,
+            dtype=state.state2.dtype,
+        )
+        if state.nested
+        else None
     )
-    bnb_avaliable = True
-except:
-    bnb_avaliable = False
-
-T = TypeVar("T", bound="torch.nn.Module")
+    return QuantState(
+        absmax=state.absmax.to(device),
+        shape=state.shape,
+        code=state.code.to(device),
+        blocksize=state.blocksize,
+        quant_type=state.quant_type,
+        dtype=state.dtype,
+        offset=state.offset.to(device) if state.nested else None,
+        state2=state2,
+    )
-
-
-class StableEmbedding(torch.nn.Embedding):
-    """
-    Custom embedding layer designed to improve stability during training for NLP tasks by using 32-bit optimizer states. It is designed to reduce gradient variations that can result from quantization. This embedding layer is initialized with Xavier uniform initialization followed by layer normalization.
-
-    Example:
-
-    ```
-    # Initialize StableEmbedding layer with vocabulary size 1000, embedding dimension 300
-    embedding_layer = StableEmbedding(num_embeddings=1000, embedding_dim=300)
-
-    # Reset embedding parameters
-    embedding_layer.reset_parameters()
-
-    # Perform a forward pass with input tensor
-    input_tensor = torch.tensor([1, 2, 3])
-    output_embedding = embedding_layer(input_tensor)
-    ```
-
-    Attributes:
-        norm (`torch.nn.LayerNorm`): Layer normalization applied after the embedding.
-
-    Methods:
-        reset_parameters(): Reset embedding parameters using Xavier uniform initialization.
-        forward(input: Tensor) -> Tensor: Forward pass through the stable embedding layer.
-    """
-
-    def __init__(
-        self,
-        num_embeddings: int,
-        embedding_dim: int,
-        padding_idx: Optional[int] = None,
-        max_norm: Optional[float] = None,
-        norm_type: float = 2.0,
-        scale_grad_by_freq: bool = False,
-        sparse: bool = False,
-        _weight: Optional[Tensor] = None,
-        device=None,
-        dtype=None,
-    ) -> None:
-        """
-        Args:
-            num_embeddings (`int`):
-                The number of unique embeddings (vocabulary size).
-            embedding_dim (`int`):
-                The dimensionality of the embedding.
-            padding_idx (`Optional[int]`):
-                Pads the output with zeros at the given index.
-            max_norm (`Optional[float]`):
-                Renormalizes embeddings to have a maximum L2 norm.
-            norm_type (`float`, defaults to `2.0`):
-                The p-norm to compute for the `max_norm` option.
-            scale_grad_by_freq (`bool`, defaults to `False`):
-                Scale gradient by frequency during backpropagation.
-            sparse (`bool`, defaults to `False`):
-                Computes dense gradients. Set to `True` to compute sparse gradients instead.
-            _weight (`Optional[Tensor]`):
-                Pretrained embeddings.
-        """
-        super().__init__(
-            num_embeddings,
-            embedding_dim,
-            padding_idx,
-            max_norm,
-            norm_type,
-            scale_grad_by_freq,
-            sparse,
-            _weight,
-            device,
-            dtype,
-        )
-        self.norm = torch.nn.LayerNorm(embedding_dim, device=device)
-        GlobalOptimManager.get_instance().register_module_override(self, "weight", {"optim_bits": 32})
-
-    def reset_parameters(self) -> None:
-        torch.nn.init.xavier_uniform_(self.weight)
-        self._fill_padding_idx_with_zero()
-
-    """ !!! This is a redefinition of _fill_padding_idx_with_zero in torch.nn.Embedding
-    to make the Layer compatible with Pytorch < 1.9.
-    This means that if this changes in future PyTorch releases this need to change too
-    which is cumbersome. However, with this we can ensure compatibility with previous
-    PyTorch releases.
-    """
-
-    def _fill_padding_idx_with_zero(self) -> None:
-        if self.padding_idx is not None:
-            with torch.no_grad():
-                self.weight[self.padding_idx].fill_(0)
-
-    def forward(self, input: Tensor) -> Tensor:
-        emb = F.embedding(
-            input,
-            self.weight,
-            self.padding_idx,
-            self.max_norm,
-            self.norm_type,
-            self.scale_grad_by_freq,
-            self.sparse,
-        )
-
-        # always apply layer norm in full precision
-        emb = emb.to(torch.get_default_dtype())
-
-        return self.norm(emb).to(self.weight.dtype)
-
-
-class Embedding(torch.nn.Embedding):
-    """
-    Embedding class to store and retrieve word embeddings from their indices.
-    """
-
-    def __init__(
-        self,
-        num_embeddings: int,
-        embedding_dim: int,
-        padding_idx: Optional[int] = None,
-        max_norm: Optional[float] = None,
-        norm_type: float = 2.0,
-        scale_grad_by_freq: bool = False,
-        sparse: bool = False,
-        _weight: Optional[Tensor] = None,
-        device: Optional[device] = None,
-    ) -> None:
-        """
-        Args:
-            num_embeddings (`int`):
-                The number of unique embeddings (vocabulary size).
-            embedding_dim (`int`):
-                The dimensionality of the embedding.
-            padding_idx (`Optional[int]`):
-                Pads the output with zeros at the given index.
-            max_norm (`Optional[float]`):
-                Renormalizes embeddings to have a maximum L2 norm.
-            norm_type (`float`, defaults to `2.0`):
-                The p-norm to compute for the `max_norm` option.
-            scale_grad_by_freq (`bool`, defaults to `False`):
-                Scale gradient by frequency during backpropagation.
-            sparse (`bool`, defaults to `False`):
-                Computes dense gradients. Set to `True` to compute sparse gradients instead.
-            _weight (`Optional[Tensor]`):
-                Pretrained embeddings.
-        """
-        super().__init__(
-            num_embeddings,
-            embedding_dim,
-            padding_idx,
-            max_norm,
-            norm_type,
-            scale_grad_by_freq,
-            sparse,
-            _weight,
-            device=device,
-        )
-        GlobalOptimManager.get_instance().register_module_override(self, "weight", {"optim_bits": 32})
-
-    def reset_parameters(self) -> None:
-        torch.nn.init.xavier_uniform_(self.weight)
-        self._fill_padding_idx_with_zero()
-
-    """ !!! This is a redefinition of _fill_padding_idx_with_zero in torch.nn.Embedding
-    to make the Layer compatible with Pytorch < 1.9.
-    This means that if this changes in future PyTorch releases this need to change too
-    which is cumbersome. However, with this we can ensure compatibility with previous
-    PyTorch releases.
-    """
-
-    def _fill_padding_idx_with_zero(self) -> None:
-        if self.padding_idx is not None:
-            with torch.no_grad():
-                self.weight[self.padding_idx].fill_(0)
-
-    def forward(self, input: Tensor) -> Tensor:
-        emb = F.embedding(
-            input,
-            self.weight,
-            self.padding_idx,
-            self.max_norm,
-            self.norm_type,
-            self.scale_grad_by_freq,
-            self.sparse,
-        )
-
-        return emb
-
-
-class Params4bit(torch.nn.Parameter):
-    def __new__(
-        cls,
-        data: Optional[torch.Tensor] = None,
-        requires_grad=False,  # quantized weights should be frozen by default
-        quant_state: Optional[QuantState] = None,
-        blocksize: int = 64,
-        compress_statistics: bool = True,
-        quant_type: str = "fp4",
-        quant_storage: torch.dtype = torch.uint8,
-        module: Optional["Linear4bit"] = None,
-        bnb_quantized: bool = False,
-    ) -> "Params4bit":
-        if data is None:
-            data = torch.empty(0)
-
-        self = torch.Tensor._make_subclass(cls, data, requires_grad)
-        self.blocksize = blocksize
-        self.compress_statistics = compress_statistics
-        self.quant_type = quant_type
-        self.quant_state = quant_state
-        self.quant_storage = quant_storage
-        self.bnb_quantized = bnb_quantized
-        self.data = data
-        self.module = module
-        return self
-
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        state["data"] = self.data
-        state["requires_grad"] = self.requires_grad
-        return state
-
-    def __setstate__(self, state):
-        self.requires_grad = state["requires_grad"]
-        self.blocksize = state["blocksize"]
-        self.compress_statistics = state["compress_statistics"]
-        self.quant_type = state["quant_type"]
-        self.quant_state = state["quant_state"]
-        self.data = state["data"]
-        self.quant_storage = state["quant_storage"]
-        self.bnb_quantized = state["bnb_quantized"]
-        self.module = state["module"]
-
-    def __deepcopy__(self, memo):
-        new_instance = type(self).__new__(type(self))
-        state = self.__getstate__()
-        new_instance.__setstate__(state)
-        new_instance.quant_state = copy.deepcopy(state["quant_state"])
-        new_instance.data = copy.deepcopy(state["data"])
-        return new_instance
-
-    def __copy__(self):
-        new_instance = type(self).__new__(type(self))
-        state = self.__getstate__()
-        new_instance.__setstate__(state)
-        return new_instance
-
-    @classmethod
-    def from_prequantized(
-        cls,
-        data: torch.Tensor,
-        quantized_stats: Dict[str, Any],
-        requires_grad: bool = False,
-        device="cuda",
-        module: Optional["Linear4bit"] = None,
-        **kwargs,
-    ) -> "Params4bit":
-        self = torch.Tensor._make_subclass(cls, data.to(device))
-        self.requires_grad = requires_grad
-        self.quant_state = QuantState.from_dict(qs_dict=quantized_stats, device=device)
-        self.blocksize = self.quant_state.blocksize
-        self.compress_statistics = self.quant_state.nested
-        self.quant_type = self.quant_state.quant_type
-        self.bnb_quantized = True
-
-        self.quant_storage = data.dtype
-        self.module = module
-
-        if self.module is not None:
-            self.module.quant_state = self.quant_state
-
-        return self
-
-    def _quantize(self, device):
-        w = self.data.contiguous().to(device)
-        w_4bit, quant_state = bnb.functional.quantize_4bit(
-            w,
-            blocksize=self.blocksize,
-            compress_statistics=self.compress_statistics,
-            quant_type=self.quant_type,
-            quant_storage=self.quant_storage,
-        )
-        self.data = w_4bit
-        self.quant_state = quant_state
-        if self.module is not None:
-            self.module.quant_state = quant_state
-        self.bnb_quantized = True
-        return self
-
-    def cuda(self, device: Optional[Union[int, device, str]] = None, non_blocking: bool = False):
-        return self.to(device="cuda" if device is None else device, non_blocking=non_blocking)
-
-    @overload
-    def to(
-        self: T,
-        device: Optional[Union[int, device]] = ...,
-        dtype: Optional[Union[dtype, str]] = ...,
-        non_blocking: bool = ...,
-    ) -> T:
-        ...
-
-    @overload
-    def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T:
-        ...
-
-    @overload
-    def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T:
-        ...
-
+class ForgeParams4bit(Params4bit):
     def to(self, *args, **kwargs):
         device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
-
         if device is not None and device.type == "cuda" and not self.bnb_quantized:
             return self._quantize(device)
         else:
-            if self.quant_state is not None:
-                self.quant_state.to(device)
-
-            new_param = Params4bit(
-                super().to(device=device, dtype=dtype, non_blocking=non_blocking),
+            n = ForgeParams4bit(
+                torch.nn.Parameter.to(self, device=device, dtype=dtype, non_blocking=non_blocking),
                 requires_grad=self.requires_grad,
-                quant_state=self.quant_state,
+                quant_state=copy_quant_state(self.quant_state, device),
                 blocksize=self.blocksize,
                 compress_statistics=self.compress_statistics,
                 quant_type=self.quant_type,
                 quant_storage=self.quant_storage,
+                bnb_quantized=self.bnb_quantized,
+                module=self.module
             )
-
-            return new_param
+            self.module.quant_state = n.quant_state
+            return n
 
 
-def fix_4bit_weight_quant_state_from_module(module: Union["Embedding4bit", "Linear4bit"]):
-    if getattr(module.weight, "quant_state", None) is not None:
-        return
-
-    if getattr(module, "quant_state", None) is None:
-        warnings.warn(
-            "FP4 quantization state not initialized. Please call .cuda() or .to(device) on the LinearFP4 layer first.",
-        )
-
-    # the quant state got lost when the parameter got converted. This happens for example for fsdp
-    # since we registered the module, we can recover the state here
-    assert module.weight.shape[1] == 1
-    if not isinstance(module.weight, Params4bit):
-        module.weight = Params4bit(module.weight, quant_storage=module.quant_storage, bnb_quantized=True)
-    module.weight.quant_state = module.quant_state
-
-
-class Linear4bit(nn.Linear):
-    """
-    This class is the base module for the 4-bit quantization algorithm presented in [QLoRA](https://arxiv.org/abs/2305.14314).
-    QLoRA 4-bit linear layers uses blockwise k-bit quantization under the hood, with the possibility of selecting various
-    compute datatypes such as FP4 and NF4.
-
-    In order to quantize a linear layer one should first load the original fp16 / bf16 weights into
-    the Linear4bit module, then call `quantized_module.to("cuda")` to quantize the fp16 / bf16 weights.
-
-    Example:
-
-    ```python
-    import torch
-    import torch.nn as nn
-
-    import bitsandbytes as bnb
-    from bnb.nn import Linear4bit
-
-    fp16_model = nn.Sequential(
-        nn.Linear(64, 64),
-        nn.Linear(64, 64)
-    )
-
-    quantized_model = nn.Sequential(
-        Linear4bit(64, 64),
-        Linear4bit(64, 64)
-    )
-
-    quantized_model.load_state_dict(fp16_model.state_dict())
-    quantized_model = quantized_model.to(0)  # Quantization happens here
-    ```
-    """
-
-    def __init__(
-        self,
-        input_features,
-        output_features,
-        bias=True,
-        compute_dtype=None,
-        compress_statistics=True,
-        quant_type="fp4",
-        quant_storage=torch.uint8,
-        device=None,
-    ):
-        """
-        Initialize Linear4bit class.
-
-        Args:
-            input_features (`str`):
-                Number of input features of the linear layer.
-            output_features (`str`):
-                Number of output features of the linear layer.
-            bias (`bool`, defaults to `True`):
-                Whether the linear class uses the bias term as well.
-        """
-        super().__init__(input_features, output_features, bias, device)
-        self.weight = Params4bit(
-            self.weight.data,
-            requires_grad=False,
-            compress_statistics=compress_statistics,
-            quant_type=quant_type,
-            quant_storage=quant_storage,
-            module=self,
-        )
-        # self.persistent_buffers = []  # TODO consider as way to save quant state
-        self.compute_dtype = compute_dtype
-        self.compute_type_is_set = False
+class ForgeLoader4Bit(torch.nn.Module):
+    def __init__(self, *, device, dtype, quant_type, **kwargs):
+        super().__init__()
+        self.dummy = torch.nn.Parameter(torch.empty(1, device=device, dtype=dtype))
+        self.weight = None
         self.quant_state = None
-        self.quant_storage = quant_storage
-
-    def set_compute_type(self, x):
-        if x.dtype in [torch.float32, torch.bfloat16]:
-            # the input is in a dtype that is safe to compute in, we switch
-            # to this type for speed and stability
-            self.compute_dtype = x.dtype
-        elif x.dtype == torch.float16:
-            # we take the compoute dtype passed into the layer
-            if self.compute_dtype == torch.float32 and (x.numel() == x.shape[-1]):
-                # single batch inference with input torch.float16 and compute_dtype float32 -> slow inference when it could be fast
-                # warn the user about this
-                warnings.warn(
-                    "Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference.",
-                )
-                warnings.filterwarnings("ignore", message=".*inference.")
-            if self.compute_dtype == torch.float32 and (x.numel() != x.shape[-1]):
-                warnings.warn(
-                    "Input type into Linear4bit is torch.float16, but bnb_4bit_compute_dtype=torch.float32 (default). This will lead to slow inference or training speed.",
-                )
-                warnings.filterwarnings("ignore", message=".*inference or training")
-
-    def _save_to_state_dict(self, destination, prefix, keep_vars):
-        """
-        save weight and bias,
-        then fill state_dict with components of quant_state
-        """
-        super()._save_to_state_dict(destination, prefix, keep_vars)  # saving weight and bias
-
-        if getattr(self.weight, "quant_state", None) is not None:
-            for k, v in self.weight.quant_state.as_dict(packed=True).items():
-                destination[prefix + "weight." + k] = v if keep_vars else v.detach()
-
-    def forward(self, x: torch.Tensor):
-        fix_4bit_weight_quant_state_from_module(self)
-
-        # weights are cast automatically as Int8Params, but the bias has to be cast manually
-        if self.bias is not None and self.bias.dtype != x.dtype:
-            self.bias.data = self.bias.data.to(x.dtype)
-
-        if not self.compute_type_is_set:
-            self.set_compute_type(x)
-            self.compute_type_is_set = True
-
-        inp_dtype = x.dtype
-        if self.compute_dtype is not None:
-            x = x.to(self.compute_dtype)
-
-        bias = None if self.bias is None else self.bias.to(self.compute_dtype)
-        out = bnb.matmul_4bit(x, self.weight.t(), bias=bias, quant_state=self.weight.quant_state)
-
-        out = out.to(inp_dtype)
-
-        return out
-
-
-class LinearFP4(Linear4bit):
-    """
-    Implements the FP4 data type.
-    """
-
-    def __init__(
-        self,
-        input_features,
-        output_features,
-        bias=True,
-        compute_dtype=None,
-        compress_statistics=True,
-        quant_storage=torch.uint8,
-        device=None,
-    ):
-        """
-        Args:
-            input_features (`str`):
-                Number of input features of the linear layer.
-            output_features (`str`):
-                Number of output features of the linear layer.
-            bias (`bool`, defaults to `True`):
-                Whether the linear class uses the bias term as well.
-        """
-        super().__init__(
-            input_features,
-            output_features,
-            bias,
-            compute_dtype,
-            compress_statistics,
-            "fp4",
-            quant_storage,
-            device,
-        )
-
-
-class LinearNF4(Linear4bit):
-    """Implements the NF4 data type.
-
-    Constructs a quantization data type where each bin has equal area under a standard normal distribution N(0, 1) that
-    is normalized into the range [-1, 1].
-
-    For more information read the paper: QLoRA: Efficient Finetuning of Quantized LLMs (https://arxiv.org/abs/2305.14314)
-
-    Implementation of the NF4 data type in bitsandbytes can be found in the `create_normal_map` function in
-    the `functional.py` file: https://github.com/TimDettmers/bitsandbytes/blob/main/bitsandbytes/functional.py#L236.
-    """
-
-    def __init__(
-        self,
-        input_features,
-        output_features,
-        bias=True,
-        compute_dtype=None,
-        compress_statistics=True,
-        quant_storage=torch.uint8,
-        device=None,
-    ):
-        """
-        Args:
-            input_features (`str`):
-                Number of input features of the linear layer.
-            output_features (`str`):
-                Number of output features of the linear layer.
-            bias (`bool`, defaults to `True`):
-                Whether the linear class uses the bias term as well.
-        """
-        super().__init__(
-            input_features,
-            output_features,
-            bias,
-            compute_dtype,
-            compress_statistics,
-            "nf4",
-            quant_storage,
-            device,
-        )
-
-
-class Int8Params(torch.nn.Parameter):
-    def __new__(
-        cls,
-        data=None,
-        requires_grad=True,
-        has_fp16_weights=False,
-        CB=None,
-        SCB=None,
-    ):
-        if data is None:
-            data = torch.empty(0)
-        obj = torch.Tensor._make_subclass(cls, data, requires_grad)
-        obj.CB = CB
-        obj.SCB = SCB
-        obj.has_fp16_weights = has_fp16_weights
-        return obj
-
-    def cuda(self, device):
-        if self.has_fp16_weights:
-            return super().cuda(device)
-        else:
-            # we store the 8-bit rows-major weight
-            # we convert this weight to the turning/ampere weight during the first inference pass
-            B = self.data.contiguous().half().cuda(device)
-            CB, CBt, SCB, SCBt, coo_tensorB = bnb.functional.double_quant(B)
-            del CBt
-            del SCBt
-            self.data = CB
-            self.CB = CB
-            self.SCB = SCB
-
-        return self
-
-    def __deepcopy__(self, memo):
-        # adjust this if new arguments are added to the constructor
-        new_instance = type(self).__new__(
-            type(self),
-            data=copy.deepcopy(self.data, memo),
-            requires_grad=self.requires_grad,
-            has_fp16_weights=self.has_fp16_weights,
-            CB=copy.deepcopy(self.CB, memo),
-            SCB=copy.deepcopy(self.SCB, memo),
-        )
-        return new_instance
-
-    @overload
-    def to(
-        self: T,
-        device: Optional[Union[int, device]] = ...,
-        dtype: Optional[Union[dtype, str]] = ...,
-        non_blocking: bool = ...,
-    ) -> T:
-        ...
-
-    @overload
-    def to(self: T, dtype: Union[dtype, str], non_blocking: bool = ...) -> T:
-        ...
-
-    @overload
-    def to(self: T, tensor: Tensor, non_blocking: bool = ...) -> T:
-        ...
-
-    def to(self, *args, **kwargs):
-        device, dtype, non_blocking, convert_to_format = torch._C._nn._parse_to(*args, **kwargs)
-
-        if device is not None and device.type == "cuda" and self.data.device.type == "cpu":
-            return self.cuda(device)
-        else:
-            new_param = Int8Params(
-                super().to(device=device, dtype=dtype, non_blocking=non_blocking),
-                requires_grad=self.requires_grad,
-                has_fp16_weights=self.has_fp16_weights,
-            )
-            new_param.CB = self.CB
-            new_param.SCB = self.SCB
-
-            return new_param
-
-
-def maybe_rearrange_weight(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
-    weight = state_dict.get(f"{prefix}weight")
-    if weight is None:
-        # if the state dict has no weights for this layer (e.g., LoRA finetuning), do nothing
-        return
-    weight_format = state_dict.pop(f"{prefix}weight_format", "row")
-
-    if isinstance(weight_format, torch.Tensor):
-        weight_format = weight_format.item()
-
-    # For new weights format storage type, we explicitly check
-    # if weights_format is on the mapping
-    if isinstance(weight_format, int) and weight_format not in INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING:
-        raise ValueError(f"Expected supported weight format - got {weight_format}")
-    elif isinstance(weight_format, int) and weight_format in INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING:
-        weight_format = INVERSE_LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING[weight_format]
-
-    if weight_format != "row":
-        tile_indices = get_tile_inds(weight_format, weight.device)
-        state_dict[f"{prefix}weight"] = undo_layout(weight, tile_indices)
-
-
-class Embedding8bit(nn.Embedding):
-    """
-    This class implements [LLM.int8()](https://arxiv.org/abs/2208.07339) algorithm for embedding layer
-
-    Quantization API is similar to Linear8bitLt:
-    ```python
-    import torch
-    import torch.nn as nn
-
-    from bitsandbytes.nn import Embedding8bit
-
-    fp16_module = nn.Embedding(128, 64)
-    int8_module = Embedding8bit(128, 64)
-
-    int8_module.load_state_dict(fp16_module.state_dict())
-
-    int8_module = int8_module.to(0)  # Quantization happens here
-    ```
-    """
-
-    def __init__(self, num_embeddings, embedding_dim, device=None, dtype=None):
-        super().__init__(num_embeddings, embedding_dim, device=device, dtype=dtype)
-        self.dtype = self.weight.data.dtype
-
-        self.weight = Int8Params(self.weight.data, has_fp16_weights=False, requires_grad=False)
-
-    def _save_to_state_dict(self, destination, prefix, keep_vars):
-        raise NotImplementedError("Saving Embedding8bit module is not implemented")
-
-    def forward(self, input: Tensor) -> Tensor:
-        if not hasattr(self.weight, "SCB"):
-            raise RuntimeError("Embedding layer is not quantized. Please call .cuda() or .to(device) first.")
-
-        rows = self.weight.data
-        row_stats = self.weight.SCB
-
-        assert rows.shape == (self.num_embeddings, self.embedding_dim)
-        assert row_stats.shape == (self.num_embeddings,)
-
-        compressed_output = F.embedding(input, rows)
-        compressed_output_stats = F.embedding(input, row_stats.view(self.num_embeddings, 1))
-
-        output = compressed_output * (compressed_output_stats / 127.0)
-
-        return output.to(self.dtype)
-
-
-class Embedding4bit(nn.Embedding):
-    """
-    This is the base class similar to Linear4bit. It implements the 4-bit quantization algorithm presented in
-    [QLoRA](https://arxiv.org/abs/2305.14314) for embeddings.
-
-    Quantization API is similar to Linear4bit:
-    ```python
-    import torch
-    import torch.nn as nn
-
-    from bitsandbytes.nn import Embedding4bit
-
-    fp16_module = nn.Embedding(128, 64)
-    quantized_module = Embedding4bit(128, 64)
-
-    quantized_module.load_state_dict(fp16_module.state_dict())
-
-    quantized_module = quantized_module.to(0)  # Quantization happens here
-    ```
-    """
-
-    def __init__(
-        self,
-        num_embeddings,
-        embedding_dim,
-        dtype=None,
-        quant_type="fp4",
-        quant_storage=torch.uint8,
-        device=None,
-    ):
-        super().__init__(num_embeddings, embedding_dim, device=device, dtype=dtype)
-        self.dtype = self.weight.data.dtype
-
-        self.weight = Params4bit(
-            self.weight.data,
-            requires_grad=False,
-            compress_statistics=None,
-            quant_type=quant_type,
-            quant_storage=quant_storage,
-            module=self,
-        )
-
-        blocksize = self.weight.blocksize
-
-        if embedding_dim % blocksize != 0:
-            warnings.warn(
-                f"Embedding size {embedding_dim} is not divisible by block size {blocksize}. "
-                "This will lead to slow inference.",
-            )
-
-    def _forward_with_partial_dequantize(self, input: Tensor):
-        assert self.embedding_dim % self.weight.quant_state.blocksize == 0
-
-        w_4bit_uint8 = self.weight.data.view(torch.uint8).view(self.num_embeddings * self.embedding_dim // 2, 1)
-
-        output_4bit = torch.nn.functional.embedding(
-            weight=w_4bit_uint8.view(self.num_embeddings, self.embedding_dim // 2),
-            input=input,
-        ).view(-1, 1)
-        assert output_4bit.shape == (input.numel() * self.embedding_dim // 2, 1)
-
-        blocks_per_emb = self.embedding_dim // self.weight.blocksize
-
-        absmax = self.weight.quant_state.absmax
-        assert absmax.shape == (self.num_embeddings * blocks_per_emb,)
-
-        output_absmax = torch.nn.functional.embedding(
-            weight=absmax.view(self.num_embeddings, blocks_per_emb),
-            input=input,
-        ).view(
-            -1,
-        )
-        assert output_absmax.shape == (input.numel() * blocks_per_emb,)
-
-        output_quant_state = copy.deepcopy(self.weight.quant_state)
-        output_quant_state.absmax = output_absmax
-        output_quant_state.shape = torch.Size((*input.shape, self.embedding_dim))
-
-        output = bnb.functional.dequantize_4bit(output_4bit, output_quant_state)
-        assert output.shape == (*input.shape, self.embedding_dim)
-
-        return output.to(self.dtype)
-
-    def _save_to_state_dict(self, destination, prefix, keep_vars):
-        raise NotImplementedError("Saving Embedding4bit module is not implemented")
-
-    def forward(self, input: Tensor) -> Tensor:
-        fix_4bit_weight_quant_state_from_module(self)
-
-        if self.embedding_dim % self.weight.quant_state.blocksize == 0:
-            return self._forward_with_partial_dequantize(input)
-
-        dequantized_weight = bnb.functional.dequantize_4bit(self.weight.data, self.weight.quant_state)
-
-        return torch.nn.functional.embedding(
-            weight=dequantized_weight,
-            input=input,
-        ).to(self.dtype)
-
-
-class EmbeddingFP4(Embedding4bit):
-    def __init__(
-        self,
-        num_embeddings,
-        embedding_dim,
-        dtype=None,
-        quant_storage=torch.uint8,
-        device=None,
-    ):
-        super().__init__(
-            num_embeddings,
-            embedding_dim,
-            dtype=dtype,
-            quant_type="fp4",
-            quant_storage=quant_storage,
-            device=device,
-        )
-
-
-class EmbeddingNF4(Embedding4bit):
-    def __init__(
-        self,
-        num_embeddings,
-        embedding_dim,
-        dtype=None,
-        quant_storage=torch.uint8,
-        device=None,
-    ):
-        super().__init__(
-            num_embeddings,
-            embedding_dim,
-            dtype=dtype,
-            quant_type="nf4",
-            quant_storage=quant_storage,
-            device=device,
-        )
-
-
-class Linear8bitLt(nn.Linear):
-    """
-    This class is the base module for the [LLM.int8()](https://arxiv.org/abs/2208.07339) algorithm.
-    To read more about it, have a look at the paper.
-
-    In order to quantize a linear layer one should first load the original fp16 / bf16 weights into
-    the Linear8bitLt module, then call `int8_module.to("cuda")` to quantize the fp16 weights.
-
-    Example:
-
-    ```python
-    import torch
-    import torch.nn as nn
-
-    import bitsandbytes as bnb
-    from bnb.nn import Linear8bitLt
-
-    fp16_model = nn.Sequential(
-        nn.Linear(64, 64),
-        nn.Linear(64, 64)
-    )
-
-    int8_model = nn.Sequential(
-        Linear8bitLt(64, 64, has_fp16_weights=False),
-        Linear8bitLt(64, 64, has_fp16_weights=False)
-    )
-
-    int8_model.load_state_dict(fp16_model.state_dict())
-    int8_model = int8_model.to(0)  # Quantization happens here
-    ```
-    """
-
-    def __init__(
-        self,
-        input_features: int,
-        output_features: int,
-        bias=True,
-        has_fp16_weights=True,
-        memory_efficient_backward=False,
-        threshold=0.0,
-        index=None,
-        device=None,
-    ):
-        """
-        Initialize Linear8bitLt class.
-
-        Args:
-            input_features (`int`):
-                Number of input features of the linear layer.
-            output_features (`int`):
-                Number of output features of the linear layer.
-            bias (`bool`, defaults to `True`):
-                Whether the linear class uses the bias term as well.
-        """
-        super().__init__(input_features, output_features, bias, device)
-        assert not memory_efficient_backward, "memory_efficient_backward is no longer required and the argument is deprecated in 0.37.0 and will be removed in 0.39.0"
-        self.state = bnb.MatmulLtState()
-        self.index = index
-
-        self.state.threshold = threshold
-        self.state.has_fp16_weights = has_fp16_weights
-        self.state.memory_efficient_backward = memory_efficient_backward
-        if threshold > 0.0 and not has_fp16_weights:
-            self.state.use_pool = True
-
-        self.weight = Int8Params(self.weight.data, has_fp16_weights=has_fp16_weights, requires_grad=has_fp16_weights)
-        self._register_load_state_dict_pre_hook(maybe_rearrange_weight)
+        self.bias = None
+        self.quant_type = quant_type
 
     def _save_to_state_dict(self, destination, prefix, keep_vars):
         super()._save_to_state_dict(destination, prefix, keep_vars)
+        quant_state = getattr(self.weight, "quant_state", None)
+        if quant_state is not None:
+            for k, v in quant_state.as_dict(packed=True).items():
+                destination[prefix + "weight." + k] = v if keep_vars else v.detach()
+        return
 
-        # we only need to save SCB as extra data, because CB for quantized weights is already stored in weight.data
-        scb_name = "SCB"
+    def _load_from_state_dict(self, state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs):
+        quant_state_keys = {k[len(prefix + "weight."):] for k in state_dict.keys() if k.startswith(prefix + "weight.")}
 
-        # case 1: .cuda was called, SCB is in self.weight
-        param_from_weight = getattr(self.weight, scb_name)
-        # case 2: self.init_8bit_state was called, SCB is in self.state
-        param_from_state = getattr(self.state, scb_name)
-        # case 3: SCB is in self.state, weight layout reordered after first forward()
-        layout_reordered = self.state.CxB is not None
+        if any('bitsandbytes' in k for k in quant_state_keys):
+            quant_state_dict = {k: state_dict[prefix + "weight." + k] for k in quant_state_keys}
 
-        key_name = prefix + f"{scb_name}"
-        format_name = prefix + "weight_format"
+            self.weight = ForgeParams4bit.from_prequantized(
+                data=state_dict[prefix + 'weight'],
+                quantized_stats=quant_state_dict,
+                requires_grad=False,
+                device=self.dummy.device,
+                module=self
+            )
+            self.quant_state = self.weight.quant_state
 
-        if not self.state.has_fp16_weights:
-            if param_from_weight is not None:
-                destination[key_name] = param_from_weight if keep_vars else param_from_weight.detach()
-                destination[format_name] = torch.tensor(0, dtype=torch.uint8)
-            elif param_from_state is not None and not layout_reordered:
-                destination[key_name] = param_from_state if keep_vars else param_from_state.detach()
-                destination[format_name] = torch.tensor(0, dtype=torch.uint8)
-            elif param_from_state is not None:
-                destination[key_name] = param_from_state if keep_vars else param_from_state.detach()
-                weights_format = self.state.formatB
-                # At this point `weights_format` is an str
-                if weights_format not in LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING:
-                    raise ValueError(f"Unrecognized weights format {weights_format}")
+            if prefix + 'bias' in state_dict:
+                self.bias = torch.nn.Parameter(state_dict[prefix + 'bias'].to(self.dummy))
 
-                weights_format = LINEAR_8BIT_WEIGHTS_FORMAT_MAPPING[weights_format]
+            del self.dummy
+        elif hasattr(self, 'dummy'):
+            if prefix + 'weight' in state_dict:
+                self.weight = ForgeParams4bit(
+                    state_dict[prefix + 'weight'].to(self.dummy),
+                    requires_grad=False,
+                    compress_statistics=True,
+                    quant_type=self.quant_type,
+                    quant_storage=torch.uint8,
+                    module=self,
+                )
+                self.quant_state = self.weight.quant_state
 
-                destination[format_name] = torch.tensor(weights_format, dtype=torch.uint8)
+            if prefix + 'bias' in state_dict:
+                self.bias = torch.nn.Parameter(state_dict[prefix + 'bias'].to(self.dummy))
 
-    def _load_from_state_dict(
-        self,
-        state_dict,
-        prefix,
-        local_metadata,
-        strict,
-        missing_keys,
-        unexpected_keys,
-        error_msgs,
-    ):
-        super()._load_from_state_dict(
-            state_dict,
-            prefix,
-            local_metadata,
-            strict,
-            missing_keys,
-            unexpected_keys,
-            error_msgs,
-        )
-        unexpected_copy = list(unexpected_keys)
-
-        for key in unexpected_copy:
-            input_name = key[len(prefix):]
-            if input_name == "SCB":
-                if self.weight.SCB is None:
-                    # buffers not yet initialized, can't access them directly without quantizing first
-                    raise RuntimeError(
-                        "Loading a quantized checkpoint into non-quantized Linear8bitLt is "
-                        "not supported. Please call module.cuda() before module.load_state_dict()",
-                    )
-
-                input_param = state_dict[key]
-                self.weight.SCB.copy_(input_param)
-
-                if self.state.SCB is not None:
-                    self.state.SCB = self.weight.SCB
-
-                unexpected_keys.remove(key)
-
-    def init_8bit_state(self):
-        self.state.CB = self.weight.CB
-        self.state.SCB = self.weight.SCB
-        self.weight.CB = None
-        self.weight.SCB = None
-
-    def forward(self, x: torch.Tensor):
-        self.state.is_training = self.training
-        if self.weight.CB is not None:
-            self.init_8bit_state()
-
-        # weights are cast automatically as Int8Params, but the bias has to be cast manually
-        if self.bias is not None and self.bias.dtype != x.dtype:
-            self.bias.data = self.bias.data.to(x.dtype)
-
-        out = bnb.matmul(x, self.weight, bias=self.bias, state=self.state)
-
-        if not self.state.has_fp16_weights:
-            if self.state.CB is not None and self.state.CxB is not None:
-                # we converted 8-bit row major to turing/ampere format in the first inference pass
-                # we no longer need the row-major weight
-                del self.state.CB
-                self.weight.data = self.state.CxB
-        return out
-
-
-class OutlierAwareLinear(nn.Linear):
-    def __init__(self, input_features, output_features, bias=True, device=None):
-        super().__init__(input_features, output_features, bias, device)
-        self.outlier_dim = None
-        self.is_quantized = False
-
-    def forward_with_outliers(self, x, outlier_idx):
-        raise NotImplementedError("Please override the `forward_with_outliers(self, x, outlier_idx)` function")
-
-    def quantize_weight(self, w, outlier_idx):
-        raise NotImplementedError("Please override the `quantize_weights(self, w, outlier_idx)` function")
-
-    def forward(self, x):
-        if self.outlier_dim is None:
-            tracer = OutlierTracer.get_instance()
-            if not tracer.is_initialized():
-                print("Please use OutlierTracer.initialize(model) before using the OutlierAwareLinear layer")
-            outlier_idx = tracer.get_outliers(self.weight)
-            # print(outlier_idx, tracer.get_hvalue(self.weight))
-            self.outlier_dim = outlier_idx
-
-        if not self.is_quantized:
-            w = self.quantize_weight(self.weight, self.outlier_dim)
-            self.weight.data.copy_(w)
-            self.is_quantized = True
-
-
-class SwitchBackLinearBnb(nn.Linear):
-    def __init__(
-        self,
-        input_features,
-        output_features,
-        bias=True,
-        has_fp16_weights=True,
-        memory_efficient_backward=False,
-        threshold=0.0,
-        index=None,
-        device=None,
-    ):
-        super().__init__(input_features, output_features, bias, device)
-        self.state = bnb.MatmulLtState()
-        self.index = index
-
-        self.state.threshold = threshold
-        self.state.has_fp16_weights = has_fp16_weights
-        self.state.memory_efficient_backward = memory_efficient_backward
-        if threshold > 0.0 and not has_fp16_weights:
-            self.state.use_pool = True
-
-        self.weight = Int8Params(self.weight.data, has_fp16_weights=has_fp16_weights, requires_grad=has_fp16_weights)
-
-    def init_8bit_state(self):
-        self.state.CB = self.weight.CB
-        self.state.SCB = self.weight.SCB
-        self.weight.CB = None
-        self.weight.SCB = None
-
-    def forward(self, x):
-        self.state.is_training = self.training
-
-        if self.weight.CB is not None:
-            self.init_8bit_state()
-
-        out = bnb.matmul_mixed(x.half(), self.weight.half(), bias=None, state=self.state) + self.bias
+            del self.dummy
+        else:
+            super()._load_from_state_dict(state_dict, prefix, local_metadata, strict, missing_keys, unexpected_keys, error_msgs)
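Note: the replacement `operations_bnb.py` keeps only two ideas: `ForgeParams4bit` quantizes lazily on its first move to a CUDA device, and `ForgeLoader4Bit` rebuilds the quant state from `weight.*bitsandbytes*` keys when a pre-quantized checkpoint is loaded. A usage sketch for the first idea, assuming bitsandbytes is installed and a CUDA GPU is present:

```python
import torch
from backend.operations_bnb import ForgeParams4bit, functional_linear_4bits

w = torch.randn(64, 64, dtype=torch.float16)
p = ForgeParams4bit(w, requires_grad=False, quant_type='nf4')
p = p.to('cuda')  # first CUDA transfer runs _quantize; bnb_quantized becomes True

x = torch.randn(1, 64, dtype=torch.float16, device='cuda')
y = functional_linear_4bits(x, p, bias=None)  # matmul against the packed 4-bit weight
```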
class ControlLora(ControlNet): controlnet_config = model.diffusion_model.config.copy() controlnet_config.pop("out_channels") controlnet_config["hint_channels"] = self.control_weights["input_hint_block.0.weight"].shape[1] - controlnet_config["dtype"] = dtype = model.storage_dtype + + dtype = model.storage_dtype + + if dtype in ['nf4', 'fp4']: + dtype = torch.float16 + + controlnet_config["dtype"] = dtype self.manual_cast_dtype = model.computation_dtype - with using_forge_operations(operations=ControlLoraOps): + with using_forge_operations(operations=ControlLoraOps, dtype=dtype): self.control_model = cldm.ControlNet(**controlnet_config) self.control_model.to(device=memory_management.get_torch_device(), dtype=dtype) diff --git a/backend/patcher/unet.py b/backend/patcher/unet.py index 49a84a95..8e5155a3 100644 --- a/backend/patcher/unet.py +++ b/backend/patcher/unet.py @@ -3,20 +3,18 @@ import torch from backend.modules.k_model import KModel from backend.patcher.base import ModelPatcher -from backend import memory_management class UnetPatcher(ModelPatcher): @classmethod def from_model(cls, model, diffusers_scheduler, config, k_predictor=None): - parameters = memory_management.module_size(model) - unet_dtype = memory_management.unet_dtype(model_params=parameters) - load_device = memory_management.get_torch_device() - initial_load_device = memory_management.unet_inital_load_device(parameters, unet_dtype) - computation_dtype = memory_management.get_computation_dtype(load_device, supported_dtypes=config.supported_inference_dtypes) - model.to(device=initial_load_device, dtype=unet_dtype) - model = KModel(model=model, diffusers_scheduler=diffusers_scheduler, k_predictor=k_predictor, storage_dtype=unet_dtype, computation_dtype=computation_dtype) - return UnetPatcher(model, load_device=load_device, offload_device=memory_management.unet_offload_device(), current_device=initial_load_device) + model = KModel(model=model, diffusers_scheduler=diffusers_scheduler, k_predictor=k_predictor) + return UnetPatcher( + model, + load_device=model.diffusion_model.load_device, + offload_device=model.diffusion_model.offload_device, + current_device=model.diffusion_model.initial_device + ) def __init__(self, *args, **kwargs): super().__init__(*args, **kwargs) @@ -169,8 +167,8 @@ class UnetPatcher(ModelPatcher): self.append_transformer_option('controlnet_conditioning_modifiers', modifier, ensure_uniqueness) return - def set_groupnorm_wrapper(self, wrapper): - self.set_transformer_option('groupnorm_wrapper', wrapper) + def set_group_norm_wrapper(self, wrapper): + self.set_transformer_option('group_norm_wrapper', wrapper) return def set_controlnet_model_function_wrapper(self, wrapper): diff --git a/backend/text_processing/t5_engine.py b/backend/text_processing/t5_engine.py index 49b9e89e..dfeead3a 100644 --- a/backend/text_processing/t5_engine.py +++ b/backend/text_processing/t5_engine.py @@ -1,9 +1,7 @@ -import math import torch from collections import namedtuple from backend.text_processing import parsing, emphasis -from backend.text_processing.textual_inversion import EmbeddingDatabase from backend import memory_management @@ -50,9 +48,6 @@ class T5TextProcessingEngine: if mult != 1.0: self.token_mults[ident] = mult - def get_target_prompt_token_count(self, token_count): - return token_count - def tokenize(self, texts): tokenized = self.tokenizer(texts, truncation=False, add_special_tokens=False)["input_ids"] return tokenized @@ -112,45 +107,33 @@ class T5TextProcessingEngine: return chunks, token_count - def 
process_texts(self, texts): - token_count = 0 - + def __call__(self, texts): + zs = [] cache = {} - batch_chunks = [] + for line in texts: if line in cache: - chunks = cache[line] + line_z_values = cache[line] else: - chunks, current_token_count = self.tokenize_line(line) - token_count = max(current_token_count, token_count) + chunks, token_count = self.tokenize_line(line) + line_z_values = [] + for chunk in chunks: + tokens = chunk.tokens + multipliers = chunk.multipliers + z = self.process_tokens([tokens], [multipliers])[0] + line_z_values.append(z) + cache[line] = line_z_values - cache[line] = chunks + zs.extend(line_z_values) - batch_chunks.append(chunks) + return torch.stack(zs) - return batch_chunks, token_count - - def __call__(self, texts): - batch_chunks, token_count = self.process_texts(texts) - chunk_count = max([len(x) for x in batch_chunks]) - - zs = [] - - for i in range(chunk_count): - batch_chunk = [chunks[i] for chunks in batch_chunks] - tokens = [x.tokens for x in batch_chunk] - multipliers = [x.multipliers for x in batch_chunk] - z = self.process_tokens(tokens, multipliers) - zs.append(z) - - return torch.hstack(zs) - - def process_tokens(self, remade_batch_tokens, batch_multipliers): - tokens = torch.asarray(remade_batch_tokens) + def process_tokens(self, batch_tokens, batch_multipliers): + tokens = torch.asarray(batch_tokens) z = self.encode_with_transformers(tokens) - self.emphasis.tokens = remade_batch_tokens + self.emphasis.tokens = batch_tokens self.emphasis.multipliers = torch.asarray(batch_multipliers).to(z) self.emphasis.z = z self.emphasis.after_transformers() diff --git a/modules/hashes.py b/modules/hashes.py index d22e5fad..d5ecc771 100644 --- a/modules/hashes.py +++ b/modules/hashes.py @@ -8,7 +8,7 @@ dump_cache = modules.cache.dump_cache cache = modules.cache.cache -def calculate_sha256(filename): +def calculate_sha256_real(filename): hash_sha256 = hashlib.sha256() blksize = 1024 * 1024 @@ -19,6 +19,17 @@ def calculate_sha256(filename): return hash_sha256.hexdigest() +def calculate_sha256(filename): + return forge_fake_calculate_sha256(filename) + + +def forge_fake_calculate_sha256(filename): + basename = os.path.basename(filename) + hash_sha256 = hashlib.sha256() + hash_sha256.update(basename.encode('utf-8')) + return hash_sha256.hexdigest() + + def sha256_from_cache(filename, title, use_addnet_hash=False): hashes = cache("hashes-addnet") if use_addnet_hash else cache("hashes") try: @@ -49,11 +60,7 @@ def sha256(filename, title, use_addnet_hash=False): return None print(f"Calculating sha256 for {filename}: ", end='') - if use_addnet_hash: - with open(filename, "rb") as file: - sha256_value = addnet_hash_safetensors(file) - else: - sha256_value = calculate_sha256(filename) + sha256_value = forge_fake_calculate_sha256(filename) print(f"{sha256_value}") hashes[title] = { diff --git a/modules/launch_utils.py b/modules/launch_utils.py index 6cca114b..af83e62d 100644 --- a/modules/launch_utils.py +++ b/modules/launch_utils.py @@ -361,8 +361,8 @@ def requirements_met(requirements_file): def prepare_environment(): - torch_index_url = os.environ.get('TORCH_INDEX_URL', "https://download.pytorch.org/whl/cu121") - torch_command = os.environ.get('TORCH_COMMAND', f"pip install torch==2.1.2 torchvision==0.16.2 --extra-index-url {torch_index_url}") + torch_index_url = os.environ.get('TORCH_INDEX_URL', "https://download.pytorch.org/whl/cu124") + torch_command = os.environ.get('TORCH_COMMAND', f"pip install torch==2.4.0 torchvision==0.19.0 --extra-index-url 
{torch_index_url}") if args.use_ipex: if platform.system() == "Windows": # The "Nuullll/intel-extension-for-pytorch" wheels were built from IPEX source for Intel Arc GPU: https://github.com/intel/intel-extension-for-pytorch/tree/xpu-main @@ -386,7 +386,7 @@ def prepare_environment(): requirements_file = os.environ.get('REQS_FILE', "requirements_versions.txt") requirements_file_for_npu = os.environ.get('REQS_FILE_FOR_NPU', "requirements_npu.txt") - xformers_package = os.environ.get('XFORMERS_PACKAGE', 'xformers==0.0.23.post1') + xformers_package = os.environ.get('XFORMERS_PACKAGE', 'xformers==0.0.27.post2') clip_package = os.environ.get('CLIP_PACKAGE', "https://github.com/openai/CLIP/archive/d50d76daa670286dd6cacf3bcd80b5e4823fc8e1.zip") openclip_package = os.environ.get('OPENCLIP_PACKAGE', "https://github.com/mlfoundations/open_clip/archive/bb6e834e9c70d9c27d0dc3ecedeebeaeb1ffad6b.zip") diff --git a/modules/processing.py b/modules/processing.py index 90311c07..3571f575 100644 --- a/modules/processing.py +++ b/modules/processing.py @@ -780,11 +780,12 @@ need_global_unload = False def process_images(p: StableDiffusionProcessing) -> Processed: global need_global_unload - if need_global_unload: - need_global_unload = False + p.sd_model, just_reloaded = forge_model_reload() + + if need_global_unload and not just_reloaded: memory_management.unload_all_models() - p.sd_model = forge_model_reload() + need_global_unload = False if p.scripts is not None: p.scripts.before_process(p) diff --git a/modules/sd_models.py b/modules/sd_models.py index 450f80ff..8686c264 100644 --- a/modules/sd_models.py +++ b/modules/sd_models.py @@ -159,7 +159,7 @@ def setup_model(): def checkpoint_tiles(use_short=False): - return [x.short_title if use_short else x.title for x in checkpoints_list.values()] + return [x.short_title if use_short else x.name for x in checkpoints_list.values()] def list_models(): @@ -475,7 +475,7 @@ def forge_model_reload(): current_hash = str(model_data.forge_loading_parameters) if model_data.forge_hash == current_hash: - return model_data.sd_model + return model_data.sd_model, False print('Loading Model: ' + str(model_data.forge_loading_parameters)) @@ -536,4 +536,4 @@ def forge_model_reload(): model_data.forge_hash = current_hash - return sd_model + return sd_model, True diff --git a/modules/shared.py b/modules/shared.py index 2dd07c6a..af5ec283 100644 --- a/modules/shared.py +++ b/modules/shared.py @@ -16,7 +16,7 @@ parser = shared_cmd_options.parser batch_cond_uncond = True # old field, unused now in favor of shared.opts.batch_cond_uncond parallel_processing_allowed = True -styles_filename = cmd_opts.styles_file = cmd_opts.styles_file if len(cmd_opts.styles_file) > 0 else [os.path.join(data_path, 'styles.csv')] +styles_filename = cmd_opts.styles_file = cmd_opts.styles_file if len(cmd_opts.styles_file) > 0 else [os.path.join(data_path, 'styles.csv'), os.path.join(data_path, 'styles_integrated.csv')] config_filename = cmd_opts.ui_settings_file hide_dirs = {"visible": not cmd_opts.hide_ui_dir_config} diff --git a/modules/ui.py b/modules/ui.py index fa57e8a1..618e9da9 100644 --- a/modules/ui.py +++ b/modules/ui.py @@ -307,8 +307,7 @@ def create_ui(): elif category == "cfg": with gr.Row(): - from backend.args import args - distilled_cfg_scale = gr.Slider(minimum=0.0, maximum=30.0, step=0.5, label='Distilled CFG Scale', value=3.5, elem_id="txt2img_distilled_cfg_scale", visible=args.i_am_lllyasviel) + distilled_cfg_scale = gr.Slider(minimum=0.0, maximum=30.0, step=0.5, label='Distilled CFG Scale', 
value=3.5, elem_id="txt2img_distilled_cfg_scale") with gr.Row(): cfg_scale = gr.Slider(minimum=1.0, maximum=30.0, step=0.5, label='CFG Scale', value=7.0, elem_id="txt2img_cfg_scale") cfg_scale.change(lambda x: gr.update(interactive=(x != 1)), inputs=[cfg_scale], outputs=[toprow.negative_prompt], queue=False, show_progress=False) @@ -649,8 +648,7 @@ def create_ui(): elif category == "cfg": with gr.Row(): - from backend.args import args as backend_args - distilled_cfg_scale = gr.Slider(minimum=0.0, maximum=30.0, step=0.5, label='Distilled CFG Scale', value=3.5, elem_id="img2img_distilled_cfg_scale", visible=backend_args.i_am_lllyasviel) + distilled_cfg_scale = gr.Slider(minimum=0.0, maximum=30.0, step=0.5, label='Distilled CFG Scale', value=3.5, elem_id="img2img_distilled_cfg_scale") with gr.Row(): cfg_scale = gr.Slider(minimum=1.0, maximum=30.0, step=0.5, label='CFG Scale', value=7.0, elem_id="img2img_cfg_scale") image_cfg_scale = gr.Slider(minimum=0, maximum=3.0, step=0.05, label='Image CFG Scale', value=1.5, elem_id="img2img_image_cfg_scale", visible=False) diff --git a/modules_forge/bnb_installer.py b/modules_forge/bnb_installer.py new file mode 100644 index 00000000..0b24bc15 --- /dev/null +++ b/modules_forge/bnb_installer.py @@ -0,0 +1,21 @@ +import pkg_resources + +from modules.launch_utils import run_pip + +target_bitsandbytes_version = '0.43.3' + + +def try_install_bnb(): + try: + bitsandbytes_version = pkg_resources.get_distribution('bitsandbytes').version + except Exception: + bitsandbytes_version = None + + try: + if bitsandbytes_version != target_bitsandbytes_version: + run_pip( + f"install -U bitsandbytes=={target_bitsandbytes_version}", + f"bitsandbytes=={target_bitsandbytes_version}", + ) + except Exception as e: + print(f'Cannot install bitsandbytes. 
Skipped: {e}') diff --git a/modules_forge/forge_version.py b/modules_forge/forge_version.py index 221bb42c..4b3b87bf 100644 --- a/modules_forge/forge_version.py +++ b/modules_forge/forge_version.py @@ -1 +1 @@ -version = '1.0.2v1.10.1' +version = '2.0.0v1.10.1' diff --git a/modules_forge/initialization.py b/modules_forge/initialization.py index 5e43b44e..22133f47 100644 --- a/modules_forge/initialization.py +++ b/modules_forge/initialization.py @@ -54,6 +54,10 @@ def initialize_forge(): torch.zeros((1, 1)).to(device, torch.float32) memory_management.soft_empty_cache() + if memory_management.can_install_bnb(): + from modules_forge.bnb_installer import try_install_bnb + try_install_bnb() + import modules_forge.patch_basic modules_forge.patch_basic.patch_all_basics() diff --git a/modules_forge/main_entry.py b/modules_forge/main_entry.py index 3799ad9b..1becebbd 100644 --- a/modules_forge/main_entry.py +++ b/modules_forge/main_entry.py @@ -1,21 +1,32 @@ import torch import gradio as gr -from modules import shared_items, shared, ui_common, sd_models, processing +from gradio.context import Context +from modules import shared_items, shared, ui_common, sd_models, processing, infotext_utils from modules import sd_vae as sd_vae_module from backend import memory_management, stream total_vram = int(memory_management.total_vram) +ui_forge_preset: gr.Radio = None + ui_checkpoint: gr.Dropdown = None ui_vae: gr.Dropdown = None +ui_vae_refresh_button: gr.Button = None ui_clip_skip: gr.Slider = None +ui_forge_unet_storage_dtype_options: gr.Radio = None +ui_forge_async_loading: gr.Radio = None +ui_forge_pin_shared_memory: gr.Radio = None +ui_forge_inference_memory: gr.Slider = None + forge_unet_storage_dtype_options = { - 'None': None, - 'fp8e4m3': torch.float8_e4m3fn, - 'fp8e5m2': torch.float8_e5m2, + 'Auto': None, + 'nf4': 'nf4', + 'fp8e4': torch.float8_e4m3fn, + 'fp4': 'fp4', + 'fp8e5': torch.float8_e5m2, } @@ -28,22 +39,24 @@ def bind_to_opts(comp, k, save=False, callback=None): callback() return - comp.change(on_change, inputs=[comp], show_progress=False) + comp.change(on_change, inputs=[comp], queue=False, show_progress=False) return def make_checkpoint_manager_ui(): - global ui_checkpoint, ui_vae, ui_clip_skip + global ui_checkpoint, ui_vae, ui_clip_skip, ui_forge_unet_storage_dtype_options, ui_forge_async_loading, ui_forge_pin_shared_memory, ui_forge_inference_memory, ui_forge_preset, ui_vae_refresh_button if shared.opts.sd_model_checkpoint in [None, 'None', 'none', '']: if len(sd_models.checkpoints_list) == 0: sd_models.list_models() if len(sd_models.checkpoints_list) > 0: - shared.opts.set('sd_model_checkpoint', next(iter(sd_models.checkpoints_list.keys()))) + shared.opts.set('sd_model_checkpoint', next(iter(sd_models.checkpoints_list.values())).name) + + ui_forge_preset = gr.Radio(label="UI", value=lambda: shared.opts.forge_preset, choices=['sd', 'xl', 'flux', 'all']) sd_model_checkpoint_args = lambda: {"choices": shared_items.list_checkpoint_tiles(shared.opts.sd_checkpoint_dropdown_use_short)} ui_checkpoint = gr.Dropdown( - value=shared.opts.sd_model_checkpoint, + value=lambda: shared.opts.sd_model_checkpoint, label="Checkpoint", elem_classes=['model_selection'], **sd_model_checkpoint_args() @@ -52,30 +65,32 @@ def make_checkpoint_manager_ui(): sd_vae_args = lambda: {"choices": shared_items.sd_vae_items()} ui_vae = gr.Dropdown( - value=shared.opts.sd_vae, + value=lambda: shared.opts.sd_vae, label="VAE", **sd_vae_args() ) - ui_common.create_refresh_button(ui_vae, shared_items.refresh_vae_list, 
sd_vae_args, f"forge_refresh_vae") + ui_vae_refresh_button = ui_common.create_refresh_button(ui_vae, shared_items.refresh_vae_list, sd_vae_args, f"forge_refresh_vae") - ui_forge_unet_storage_dtype_options = gr.Radio(label="Diffusion in FP8", value=shared.opts.forge_unet_storage_dtype, choices=list(forge_unet_storage_dtype_options.keys())) + ui_forge_unet_storage_dtype_options = gr.Radio(label="Diffusion with Low Bits", value=lambda: shared.opts.forge_unet_storage_dtype, choices=list(forge_unet_storage_dtype_options.keys())) bind_to_opts(ui_forge_unet_storage_dtype_options, 'forge_unet_storage_dtype', save=True, callback=refresh_model_loading_parameters) - from backend.args import args as backend_args - - ui_forge_inference_memory = gr.Slider(label="Model Memory (MB)", value=total_vram - shared.opts.forge_inference_memory, minimum=0, maximum=int(memory_management.total_vram), step=1, visible=backend_args.i_am_lllyasviel) - ui_forge_async_loading = gr.Checkbox(label="Async Loader", value=shared.opts.forge_async_loading, visible=backend_args.i_am_lllyasviel) - ui_forge_pin_shared_memory = gr.Checkbox(label="Offload to Shared Memory", value=shared.opts.forge_pin_shared_memory, visible=backend_args.i_am_lllyasviel) + ui_forge_async_loading = gr.Radio(label="Swap Method", value=lambda: shared.opts.forge_async_loading, choices=['Queue', 'Async']) + ui_forge_pin_shared_memory = gr.Radio(label="Swap Location", value=lambda: shared.opts.forge_pin_shared_memory, choices=['CPU', 'Shared']) + ui_forge_inference_memory = gr.Slider(label="GPU Weights (MB)", value=lambda: total_vram - shared.opts.forge_inference_memory, minimum=0, maximum=int(memory_management.total_vram), step=1) mem_comps = [ui_forge_inference_memory, ui_forge_async_loading, ui_forge_pin_shared_memory] - ui_forge_inference_memory.change(refresh_memory_management_settings, inputs=mem_comps) - ui_forge_async_loading.change(refresh_memory_management_settings, inputs=mem_comps) - ui_forge_pin_shared_memory.change(refresh_memory_management_settings, inputs=mem_comps) + ui_forge_inference_memory.change(refresh_memory_management_settings, inputs=mem_comps, queue=False, show_progress=False) + ui_forge_async_loading.change(refresh_memory_management_settings, inputs=mem_comps, queue=False, show_progress=False) + ui_forge_pin_shared_memory.change(refresh_memory_management_settings, inputs=mem_comps, queue=False, show_progress=False) + Context.root_block.load(refresh_memory_management_settings, inputs=mem_comps, queue=False, show_progress=False) - ui_clip_skip = gr.Slider(label="Clip skip", value=shared.opts.CLIP_stop_at_last_layers, **{"minimum": 1, "maximum": 12, "step": 1}) + ui_clip_skip = gr.Slider(label="Clip skip", value=lambda: shared.opts.CLIP_stop_at_last_layers, **{"minimum": 1, "maximum": 12, "step": 1}) bind_to_opts(ui_clip_skip, 'CLIP_stop_at_last_layers', save=False) + ui_checkpoint.change(checkpoint_change, inputs=[ui_checkpoint], show_progress=False) + ui_vae.change(vae_change, inputs=[ui_vae], queue=False, show_progress=False) + return @@ -86,14 +101,17 @@ def refresh_memory_management_settings(model_memory, async_loading, pin_shared_m shared.opts.set('forge_inference_memory', inference_memory) shared.opts.set('forge_pin_shared_memory', pin_shared_memory) - stream.stream_activated = async_loading + stream.stream_activated = async_loading == 'Async' memory_management.current_inference_memory = inference_memory * 1024 * 1024 - memory_management.PIN_SHARED_MEMORY = pin_shared_memory + memory_management.PIN_SHARED_MEMORY = 
pin_shared_memory == 'Shared' - print(f'Stream Set to: {stream.stream_activated}') - print(f'Stream Used by CUDA: {stream.should_use_stream()}') - print(f'Current Inference Memory: {memory_management.minimum_inference_memory() / (1024 * 1024):.2f} MB') - print(f'PIN Shared Memory: {pin_shared_memory}') + log_dict = dict( + stream=stream.should_use_stream(), + inference_memory=memory_management.minimum_inference_memory() / (1024 * 1024), + pin_shared_memory=memory_management.PIN_SHARED_MEMORY + ) + + print(f'Environment vars changed: {log_dict}') processing.need_global_unload = True return @@ -111,7 +129,7 @@ def refresh_model_loading_parameters(): unet_storage_dtype=forge_unet_storage_dtype_options[shared.opts.forge_unet_storage_dtype] ) - print(f'Loading parameters: {model_data.forge_loading_parameters}') + print(f'Model selected: {model_data.forge_loading_parameters}') return @@ -131,10 +149,149 @@ def vae_change(vae_name): return -def forge_main_entry(): - ui_checkpoint.change(checkpoint_change, inputs=[ui_checkpoint], show_progress=False) - ui_vae.change(vae_change, inputs=[ui_vae], show_progress=False) +def get_a1111_ui_component(tab, label): + fields = infotext_utils.paste_fields[tab]['fields'] + for f in fields: + if f.label == label or f.api == label: + return f.component + + +def forge_main_entry(): + ui_txt2img_width = get_a1111_ui_component('txt2img', 'Size-1') + ui_txt2img_height = get_a1111_ui_component('txt2img', 'Size-2') + ui_txt2img_cfg = get_a1111_ui_component('txt2img', 'CFG scale') + ui_txt2img_distilled_cfg = get_a1111_ui_component('txt2img', 'Distilled CFG Scale') + ui_txt2img_sampler = get_a1111_ui_component('txt2img', 'sampler_name') + ui_txt2img_scheduler = get_a1111_ui_component('txt2img', 'scheduler') + + ui_img2img_width = get_a1111_ui_component('img2img', 'Size-1') + ui_img2img_height = get_a1111_ui_component('img2img', 'Size-2') + ui_img2img_cfg = get_a1111_ui_component('img2img', 'CFG scale') + ui_img2img_distilled_cfg = get_a1111_ui_component('img2img', 'Distilled CFG Scale') + ui_img2img_sampler = get_a1111_ui_component('img2img', 'sampler_name') + ui_img2img_scheduler = get_a1111_ui_component('img2img', 'scheduler') + + output_targets = [ + ui_vae, + ui_vae_refresh_button, + ui_clip_skip, + ui_forge_unet_storage_dtype_options, + ui_forge_async_loading, + ui_forge_pin_shared_memory, + ui_forge_inference_memory, + ui_txt2img_width, + ui_img2img_width, + ui_txt2img_height, + ui_img2img_height, + ui_txt2img_cfg, + ui_img2img_cfg, + ui_txt2img_distilled_cfg, + ui_img2img_distilled_cfg, + ui_txt2img_sampler, + ui_img2img_sampler, + ui_txt2img_scheduler, + ui_img2img_scheduler + ] + + ui_forge_preset.change(on_preset_change, inputs=[ui_forge_preset], outputs=output_targets, queue=False, show_progress=False) + Context.root_block.load(on_preset_change, inputs=None, outputs=output_targets, queue=False, show_progress=False) - # Load Model refresh_model_loading_parameters() return + + +def on_preset_change(preset=None): + if preset is not None: + shared.opts.set('forge_preset', preset) + shared.opts.save(shared.config_filename) + + if shared.opts.forge_preset == 'sd': + return [ + gr.update(visible=True, value='Automatic'), # ui_vae + gr.update(visible=True), # ui_vae_refresh_button + gr.update(visible=True, value=1), # ui_clip_skip + gr.update(visible=False, value='Auto'), # ui_forge_unet_storage_dtype_options + gr.update(visible=False, value='Queue'), # ui_forge_async_loading + gr.update(visible=False, value='CPU'), # ui_forge_pin_shared_memory + 
gr.update(visible=False, value=total_vram - 1024), # ui_forge_inference_memory + gr.update(value=512), # ui_txt2img_width + gr.update(value=512), # ui_img2img_width + gr.update(value=640), # ui_txt2img_height + gr.update(value=640), # ui_img2img_height + gr.update(value=7), # ui_txt2img_cfg + gr.update(value=7), # ui_img2img_cfg + gr.update(visible=False, value=3.5), # ui_txt2img_distilled_cfg + gr.update(visible=False, value=3.5), # ui_img2img_distilled_cfg + gr.update(value='Euler a'), # ui_txt2img_sampler + gr.update(value='Euler a'), # ui_img2img_sampler + gr.update(value='Automatic'), # ui_txt2img_scheduler + gr.update(value='Automatic'), # ui_img2img_scheduler + ] + + if shared.opts.forge_preset == 'xl': + return [ + gr.update(visible=False, value='Automatic'), # ui_vae + gr.update(visible=False), # ui_vae_refresh_button + gr.update(visible=False, value=1), # ui_clip_skip + gr.update(visible=True, value='Auto'), # ui_forge_unet_storage_dtype_options + gr.update(visible=False, value='Queue'), # ui_forge_async_loading + gr.update(visible=False, value='CPU'), # ui_forge_pin_shared_memory + gr.update(visible=False, value=total_vram - 1024), # ui_forge_inference_memory + gr.update(value=896), # ui_txt2img_width + gr.update(value=896), # ui_img2img_width + gr.update(value=1152), # ui_txt2img_height + gr.update(value=1152), # ui_img2img_height + gr.update(value=5), # ui_txt2img_cfg + gr.update(value=5), # ui_img2img_cfg + gr.update(visible=False, value=3.5), # ui_txt2img_distilled_cfg + gr.update(visible=False, value=3.5), # ui_img2img_distilled_cfg + gr.update(value='DPM++ 2M SDE'), # ui_txt2img_sampler + gr.update(value='DPM++ 2M SDE'), # ui_img2img_sampler + gr.update(value='Karras'), # ui_txt2img_scheduler + gr.update(value='Karras'), # ui_img2img_scheduler + ] + + if shared.opts.forge_preset == 'flux': + return [ + gr.update(visible=False, value='Automatic'), # ui_vae + gr.update(visible=False), # ui_vae_refresh_button + gr.update(visible=False, value=1), # ui_clip_skip + gr.update(visible=True, value='Auto'), # ui_forge_unet_storage_dtype_options + gr.update(visible=True, value='Queue'), # ui_forge_async_loading + gr.update(visible=True, value='CPU'), # ui_forge_pin_shared_memory + gr.update(visible=True, value=total_vram - 1024), # ui_forge_inference_memory + gr.update(value=896), # ui_txt2img_width + gr.update(value=896), # ui_img2img_width + gr.update(value=1152), # ui_txt2img_height + gr.update(value=1152), # ui_img2img_height + gr.update(value=1), # ui_txt2img_cfg + gr.update(value=1), # ui_img2img_cfg + gr.update(visible=True, value=3.5), # ui_txt2img_distilled_cfg + gr.update(visible=True, value=3.5), # ui_img2img_distilled_cfg + gr.update(value='Euler'), # ui_txt2img_sampler + gr.update(value='Euler'), # ui_img2img_sampler + gr.update(value='Simple'), # ui_txt2img_scheduler + gr.update(value='Simple'), # ui_img2img_scheduler + ] + + return [ + gr.update(visible=True, value='Automatic'), # ui_vae + gr.update(visible=True), # ui_vae_refresh_button + gr.update(visible=True, value=1), # ui_clip_skip + gr.update(visible=True, value='Auto'), # ui_forge_unet_storage_dtype_options + gr.update(visible=True, value='Queue'), # ui_forge_async_loading + gr.update(visible=True, value='CPU'), # ui_forge_pin_shared_memory + gr.update(visible=True, value=total_vram - 1024), # ui_forge_inference_memory + gr.update(value=896), # ui_txt2img_width + gr.update(value=896), # ui_img2img_width + gr.update(value=1152), # ui_txt2img_height + gr.update(value=1152), # ui_img2img_height + 
gr.update(value=7), # ui_txt2img_cfg + gr.update(value=7), # ui_img2img_cfg + gr.update(visible=True, value=3.5), # ui_txt2img_distilled_cfg + gr.update(visible=True, value=3.5), # ui_img2img_distilled_cfg + gr.update(value='DPM++ 2M'), # ui_txt2img_sampler + gr.update(value='DPM++ 2M'), # ui_img2img_sampler + gr.update(value='Automatic'), # ui_txt2img_scheduler + gr.update(value='Automatic'), # ui_img2img_scheduler + ] diff --git a/modules_forge/shared_options.py b/modules_forge/shared_options.py index 6c9a84eb..f88510fa 100644 --- a/modules_forge/shared_options.py +++ b/modules_forge/shared_options.py @@ -1,8 +1,9 @@ def register(options_templates, options_section, OptionInfo): options_templates.update(options_section((None, "Forge Hidden options"), { - "forge_unet_storage_dtype": OptionInfo('None'), + "forge_unet_storage_dtype": OptionInfo('Auto'), "forge_inference_memory": OptionInfo(1024), - "forge_async_loading": OptionInfo(False), - "forge_pin_shared_memory": OptionInfo(False), + "forge_async_loading": OptionInfo('Queue'), + "forge_pin_shared_memory": OptionInfo('CPU'), + "forge_preset": OptionInfo('sd'), })) diff --git a/style.css b/style.css index f0e867bd..680f46b1 100644 --- a/style.css +++ b/style.css @@ -8,7 +8,6 @@ --checkbox-label-gap: 0.25em 0.1em; --section-header-text-size: 12pt; --block-background-fill: transparent; - } .block.padded:not(.gradio-accordion) { @@ -427,6 +426,7 @@ div.toprow-compact-tools{ /* settings */ #quicksettings { + --checkbox-label-padding: 6px 6px; align-items: end; } diff --git a/styles_integrated.csv b/styles_integrated.csv new file mode 100644 index 00000000..463d0775 --- /dev/null +++ b/styles_integrated.csv @@ -0,0 +1,214 @@ +name,prompt,negative_prompt +Kamph_Default_Negative (NO),,"Watermark, Text, censored, deformed, bad anatomy, disfigured, poorly drawn face, mutated, extra limb, ugly, poorly drawn hands, missing limb, floating limbs, disconnected limbs, disconnected head, malformed hands, long neck, mutated hands and fingers, bad hands, missing fingers, cropped, worst quality, low quality, mutation, poorly drawn, huge calf, bad hands, fused hand, missing hand, disappearing arms, disappearing thigh, disappearing calf, disappearing legs, missing fingers, fused fingers, abnormal eye proportion, Abnormal hands, abnormal legs, abnormal feet, abnormal fingers" +Kamph_Default_Negative (sfw) (NO),,"NSFW, Cleavage, Pubic Hair, Nudity, Naked, Au naturel, Watermark, Text, censored, deformed, bad anatomy, disfigured, poorly drawn face, mutated, extra limb, ugly, poorly drawn hands, missing limb, floating limbs, disconnected limbs, disconnected head, malformed hands, long neck, mutated hands and fingers, bad hands, missing fingers, cropped, worst quality, low quality, mutation, poorly drawn, huge calf, bad hands, fused hand, missing hand, disappearing arms, disappearing thigh, disappearing calf, disappearing legs, missing fingers, fused fingers, abnormal eye proportion, Abnormal hands, abnormal legs, abnormal feet, abnormal fingers" +Kamph_Default_Negative (Low Token) (NO),,"lowres, bad anatomy, bad hands, text, error, missing fingers, extra digit, fewer digits, cropped, worst quality, low quality, normal quality, jpeg artifacts, signature, watermark, username, blurry" +Kamph_Skin Enhancer (PO),"detailed skin texture, (blush:0.5), (goosebumps:0.5), subsurface scattering", +Kamph_Skin Enhancer (clean)(PO),"detailed skin texture, (blush:0.2), (goosebumps:0.3), subsurface scattering", +Kamph_Ghibli (PO),"(Studio ghibli style, Art by Hayao Miyazaki:1.2), 
Anime Style, Manga Style, Hand drawn, cinematic, Sharp focus, humorous illustration, big depth of field, Masterpiece, concept art, trending on artstation, Vivid colors, Simplified style, trending on ArtStation, trending on CGSociety, Intricate, Vibrant colors, Soft Shading, Simplistic Features, Sharp Angles, Playful", +Kamph_Vector Illustrations (PO),"Vector art, Vivid colors, Clean lines, Sharp edges, Minimalist, Precise geometry, Simplistic, Smooth curves, Bold outlines, Crisp shapes, Flat colors, Illustration art piece, High contrast shadows, Technical illustration, Graphic design, Vector graphics, High contrast, Precision artwork, Linear compositions, Scalable artwork, Digital art", +Kamph_Digital Painting (PO),"glow effects, godrays, Hand drawn, render, 8k, octane render, cinema 4d, blender, dark, atmospheric 4k ultra detailed, cinematic, Sharp focus, big depth of field, Masterpiece, colors, 3d octane render, 4k, concept art, trending on artstation, hyperrealistic, Vivid colors, extremely detailed CG unity 8k wallpaper, trending on CGSociety, Intricate, High Detail, dramatic", +Kamph_Indie Game (PO),"Indie game art, Vector Art, Borderlands style, Arcane style, Cartoon style, Line art, Distinct features, Hand drawn, Technical illustration, Graphic design, Vector graphics, High contrast, Precision artwork, Linear compositions, Scalable artwork, Digital art, cinematic sensual, Sharp focus, humorous illustration, big depth of field, Masterpiece, trending on artstation, Vivid colors, trending on ArtStation, trending on CGSociety, Intricate, Low Detail, dramatic", +Kamph_Original Photo Style (PO),"Photorealistic, Hyperrealistic, Hyperdetailed, analog style, soft lighting, subsurface scattering, realistic, heavy shadow, masterpiece, best quality, ultra realistic, 8k, golden ratio, Intricate, High Detail, film photography, soft focus", +Kamph_Black and White Film Noir (PO),"(b&w, Monochromatic, Film Photography:1.3), Photorealistic, Hyperrealistic, Hyperdetailed, film noir, analog style, soft lighting, subsurface scattering, realistic, heavy shadow, masterpiece, best quality, ultra realistic, 8k, golden ratio, Intricate, High Detail, film photography, soft focus", +Kamph_Isometric Rooms (PO),"Tiny cute isometric in a cutaway box, soft smooth lighting, soft colors, 100mm lens, 3d blender render", +Kamph_Space Hologram (PO),"hologram floating in space, a vibrant digital illustration, dribbble, quantum wavetracing, black background, behance hd", +Kamph_Cute Creatures (PO),"3d fluffy, closeup cute and adorable, cute big circular reflective eyes, long fuzzy fur, Pixar render, unreal engine cinematic smooth, intricate detail, cinematic", +Kamph_Realistic Photo Portraits (PO),"RAW candid cinema, 16mm, color graded portra 400 film, remarkable color, ultra realistic, textured skin, remarkable detailed pupils, realistic dull skin noise, visible skin detail, skin fuzz, dry skin, shot with cinematic camera", +Kamph_Professional Scenic Photographs (PO),"long shot scenic professional photograph of {prompt}, perfect viewpoint, highly detailed, wide-angle lens, hyper realistic, with dramatic sky, polarizing filter, natural lighting, vivid colors, everything in sharp focus, HDR, UHD, 64K", +cinematic_still,"cinematic still {prompt} . 
emotional, harmonious, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy","anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured" +sai-3d-model,"professional 3d model {prompt} . octane render, highly detailed, volumetric, dramatic lighting","ugly, deformed, noisy, low poly, blurry, painting" +sai-analog film,"analog film photo {prompt} . faded film, desaturated, 35mm photo, grainy, vignette, vintage, Kodachrome, Lomography, stained, highly detailed, found footage","painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured" +sai-anime,"anime artwork {prompt} . anime style, key visual, vibrant, studio anime, highly detailed","photo, deformed, black and white, realism, disfigured, low contrast" +sai-cinematic,"cinematic film still {prompt} . shallow depth of field, vignette, highly detailed, high budget, bokeh, cinemascope, moody, epic, gorgeous, film grain, grainy","anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch, deformed, mutated, ugly, disfigured" +sai-comic book,"comic {prompt} . graphic illustration, comic art, graphic novel art, vibrant, highly detailed","photograph, deformed, glitch, noisy, realistic, stock photo" +sai-craft clay,"play-doh style {prompt} . sculpture, clay art, centered composition, Claymation","sloppy, messy, grainy, highly detailed, ultra textured, photo" +sai-digital art,"concept art {prompt} . digital artwork, illustrative, painterly, matte painting, highly detailed","photo, photorealistic, realism, ugly" +sai-enhance,"breathtaking {prompt} . award-winning, professional, highly detailed","ugly, deformed, noisy, blurry, distorted, grainy" +sai-fantasy art,"ethereal fantasy concept art of {prompt} . magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy","photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white" +sai-isometric,"isometric style {prompt} . vibrant, beautiful, crisp, detailed, ultra detailed, intricate","deformed, mutated, ugly, disfigured, blur, blurry, noise, noisy, realistic, photographic" +sai-line art,"line art drawing {prompt} . professional, sleek, modern, minimalist, graphic, line art, vector graphics","anime, photorealistic, 35mm film, deformed, glitch, blurry, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, mutated, realism, realistic, impressionism, expressionism, oil, acrylic" +sai-lowpoly,"low-poly style {prompt} . low-poly game art, polygon mesh, jagged, blocky, wireframe edges, centered composition","noisy, sloppy, messy, grainy, highly detailed, ultra textured, photo" +sai-neonpunk,"neonpunk style {prompt} . cyberpunk, vaporwave, neon, vibes, vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional","painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured" +sai-origami,"origami style {prompt} . paper art, pleated paper, folded, origami art, pleats, cut and fold, centered composition","noisy, sloppy, messy, grainy, highly detailed, ultra textured, photo" +sai-photographic,"cinematic photo {prompt} . 
35mm photograph, film, bokeh, professional, 4k, highly detailed","drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly" +sai-pixel art,"pixel-art {prompt} . low-res, blocky, pixel art style, 8-bit graphics","sloppy, messy, blurry, noisy, highly detailed, ultra textured, photo, realistic" +sai-texture,texture {prompt} top down close-up,"ugly, deformed, noisy, blurry" +ads-advertising,"Advertising poster style {prompt} . Professional, modern, product-focused, commercial, eye-catching, highly detailed","noisy, blurry, amateurish, sloppy, unattractive" +ads-automotive,"Automotive advertisement style {prompt} . Sleek, dynamic, professional, commercial, vehicle-focused, high-resolution, highly detailed","noisy, blurry, unattractive, sloppy, unprofessional" +ads-corporate,"Corporate branding style {prompt} . Professional, clean, modern, sleek, minimalist, business-oriented, highly detailed","noisy, blurry, grungy, sloppy, cluttered, disorganized" +ads-fashion editorial,"Fashion editorial style {prompt} . High fashion, trendy, stylish, editorial, magazine style, professional, highly detailed","outdated, blurry, noisy, unattractive, sloppy" +ads-food photography,"Food photography style {prompt} . Appetizing, professional, culinary, high-resolution, commercial, highly detailed","unappetizing, sloppy, unprofessional, noisy, blurry" +ads-luxury,"Luxury product style {prompt} . Elegant, sophisticated, high-end, luxurious, professional, highly detailed","cheap, noisy, blurry, unattractive, amateurish" +ads-real estate,"Real estate photography style {prompt} . Professional, inviting, well-lit, high-resolution, property-focused, commercial, highly detailed","dark, blurry, unappealing, noisy, unprofessional" +ads-retail,"Retail packaging style {prompt} . Vibrant, enticing, commercial, product-focused, eye-catching, professional, highly detailed","noisy, blurry, amateurish, sloppy, unattractive" +artstyle-abstract,"abstract style {prompt} . non-representational, colors and shapes, expression of feelings, imaginative, highly detailed","realistic, photographic, figurative, concrete" +artstyle-abstract expressionism,"abstract expressionist painting {prompt} . energetic brushwork, bold colors, abstract forms, expressive, emotional","realistic, photorealistic, low contrast, plain, simple, monochrome" +artstyle-art deco,"Art Deco style {prompt} . geometric shapes, bold colors, luxurious, elegant, decorative, symmetrical, ornate, detailed","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, modernist, minimalist" +artstyle-art nouveau,"Art Nouveau style {prompt} . elegant, decorative, curvilinear forms, nature-inspired, ornate, detailed","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, modernist, minimalist" +artstyle-constructivist,"constructivist style {prompt} . geometric shapes, bold colors, dynamic composition, propaganda art style","realistic, photorealistic, low contrast, plain, simple, abstract expressionism" +artstyle-cubist,"cubist artwork {prompt} . geometric shapes, abstract, innovative, revolutionary","anime, photorealistic, 35mm film, deformed, glitch, low contrast, noisy" +artstyle-expressionist,"expressionist {prompt} . raw, emotional, dynamic, distortion for emotional effect, vibrant, use of unusual colors, detailed","realism, symmetry, quiet, calm, photo" +artstyle-graffiti,"graffiti style {prompt} . 
street art, vibrant, urban, detailed, tag, mural","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic" +artstyle-hyperrealism,"hyperrealistic art {prompt} . extremely high-resolution details, photographic, realism pushed to extreme, fine texture, incredibly lifelike","simplified, abstract, unrealistic, impressionistic, low resolution" +artstyle-impressionist,"impressionist painting {prompt} . loose brushwork, vibrant color, light and shadow play, captures feeling over form","anime, photorealistic, 35mm film, deformed, glitch, low contrast, noisy" +artstyle-pointillism,"pointillism style {prompt} . composed entirely of small, distinct dots of color, vibrant, highly detailed","line drawing, smooth shading, large color fields, simplistic" +artstyle-pop art,"Pop Art style {prompt} . bright colors, bold outlines, popular culture themes, ironic or kitsch","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, minimalist" +artstyle-psychedelic,"psychedelic style {prompt} . vibrant colors, swirling patterns, abstract forms, surreal, trippy","monochrome, black and white, low contrast, realistic, photorealistic, plain, simple" +artstyle-renaissance,"Renaissance style {prompt} . realistic, perspective, light and shadow, religious or mythological themes, highly detailed","ugly, deformed, noisy, blurry, low contrast, modernist, minimalist, abstract" +artstyle-steampunk,"steampunk style {prompt} . antique, mechanical, brass and copper tones, gears, intricate, detailed","deformed, glitch, noisy, low contrast, anime, photorealistic" +artstyle-surrealist,"surrealist art {prompt} . dreamlike, mysterious, provocative, symbolic, intricate, detailed","anime, photorealistic, realistic, deformed, glitch, noisy, low contrast" +artstyle-typography,"typographic art {prompt} . stylized, intricate, detailed, artistic, text-based","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic" +artstyle-watercolor,"watercolor painting {prompt} . vibrant, beautiful, painterly, detailed, textural, artistic","anime, photorealistic, 35mm film, deformed, glitch, low contrast, noisy" +futuristic-biomechanical,"biomechanical style {prompt} . blend of organic and mechanical elements, futuristic, cybernetic, detailed, intricate","natural, rustic, primitive, organic, simplistic" +futuristic-biomechanical cyberpunk,"biomechanical cyberpunk {prompt} . cybernetics, human-machine fusion, dystopian, organic meets artificial, dark, intricate, highly detailed","natural, colorful, deformed, sketch, low contrast, watercolor" +futuristic-cybernetic,"cybernetic style {prompt} . futuristic, technological, cybernetic enhancements, robotics, artificial intelligence themes","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, historical, medieval" +futuristic-cybernetic robot,"cybernetic robot {prompt} . android, AI, machine, metal, wires, tech, futuristic, highly detailed","organic, natural, human, sketch, watercolor, low contrast" +futuristic-cyberpunk cityscape,"cyberpunk cityscape {prompt} . neon lights, dark alleys, skyscrapers, futuristic, vibrant colors, high contrast, highly detailed","natural, rural, deformed, low contrast, black and white, sketch, watercolor" +futuristic-futuristic,"futuristic style {prompt} . sleek, modern, ultramodern, high tech, detailed","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, vintage, antique" +futuristic-retro cyberpunk,"retro cyberpunk {prompt} . 
80's inspired, synthwave, neon, vibrant, detailed, retro futurism","modern, desaturated, black and white, realism, low contrast" +futuristic-retro futurism,"retro-futuristic {prompt} . vintage sci-fi, 50s and 60s style, atomic age, vibrant, highly detailed","contemporary, realistic, rustic, primitive" +futuristic-sci-fi,"sci-fi style {prompt} . futuristic, technological, alien worlds, space themes, advanced civilizations","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, historical, medieval" +futuristic-vaporwave,"vaporwave style {prompt} . retro aesthetic, cyberpunk, vibrant, neon colors, vintage 80s and 90s style, highly detailed","monochrome, muted colors, realism, rustic, minimalist, dark" +game-bubble bobble,"Bubble Bobble style {prompt} . 8-bit, cute, pixelated, fantasy, vibrant, reminiscent of Bubble Bobble game","realistic, modern, photorealistic, violent, horror" +game-cyberpunk game,"cyberpunk game style {prompt} . neon, dystopian, futuristic, digital, vibrant, detailed, high contrast, reminiscent of cyberpunk genre video games","historical, natural, rustic, low detailed" +game-fighting game,"fighting game style {prompt} . dynamic, vibrant, action-packed, detailed character design, reminiscent of fighting video games","peaceful, calm, minimalist, photorealistic" +game-gta,"GTA-style artwork {prompt} . satirical, exaggerated, pop art style, vibrant colors, iconic characters, action-packed","realistic, black and white, low contrast, impressionist, cubist, noisy, blurry, deformed" +game-mario,"Super Mario style {prompt} . vibrant, cute, cartoony, fantasy, playful, reminiscent of Super Mario series","realistic, modern, horror, dystopian, violent" +game-minecraft,"Minecraft style {prompt} . blocky, pixelated, vibrant colors, recognizable characters and objects, game assets","smooth, realistic, detailed, photorealistic, noise, blurry, deformed" +game-pokemon,"Pokémon style {prompt} . vibrant, cute, anime, fantasy, reminiscent of Pokémon series","realistic, modern, horror, dystopian, violent" +game-retro arcade,"retro arcade style {prompt} . 8-bit, pixelated, vibrant, classic video game, old school gaming, reminiscent of 80s and 90s arcade games","modern, ultra-high resolution, photorealistic, 3D" +game-retro game,"retro game art {prompt} . 16-bit, vibrant colors, pixelated, nostalgic, charming, fun","realistic, photorealistic, 35mm film, deformed, glitch, low contrast, noisy" +game-rpg fantasy game,"role-playing game (RPG) style fantasy {prompt} . detailed, vibrant, immersive, reminiscent of high fantasy RPG games","sci-fi, modern, urban, futuristic, low detailed" +game-strategy game,"strategy game style {prompt} . overhead view, detailed map, units, reminiscent of real-time strategy video games","first-person view, modern, photorealistic" +game-streetfighter,"Street Fighter style {prompt} . vibrant, dynamic, arcade, 2D fighting game, highly detailed, reminiscent of Street Fighter series","3D, realistic, modern, photorealistic, turn-based strategy" +game-zelda,"Legend of Zelda style {prompt} . vibrant, fantasy, detailed, epic, heroic, reminiscent of The Legend of Zelda series","sci-fi, modern, realistic, horror" +misc-architectural,"architectural style {prompt} . clean lines, geometric shapes, minimalist, modern, architectural drawing, highly detailed","curved lines, ornate, baroque, abstract, grunge" +misc-disco,"disco-themed {prompt} . 
vibrant, groovy, retro 70s style, shiny disco balls, neon lights, dance floor, highly detailed","minimalist, rustic, monochrome, contemporary, simplistic" +misc-dreamscape,"dreamscape {prompt} . surreal, ethereal, dreamy, mysterious, fantasy, highly detailed","realistic, concrete, ordinary, mundane" +misc-dystopian,"dystopian style {prompt} . bleak, post-apocalyptic, somber, dramatic, highly detailed","ugly, deformed, noisy, blurry, low contrast, cheerful, optimistic, vibrant, colorful" +misc-fairy tale,"fairy tale {prompt} . magical, fantastical, enchanting, storybook style, highly detailed","realistic, modern, ordinary, mundane" +misc-gothic,"gothic style {prompt} . dark, mysterious, haunting, dramatic, ornate, detailed","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, cheerful, optimistic" +misc-grunge,"grunge style {prompt} . textured, distressed, vintage, edgy, punk rock vibe, dirty, noisy","smooth, clean, minimalist, sleek, modern, photorealistic" +misc-horror,"horror-themed {prompt} . eerie, unsettling, dark, spooky, suspenseful, grim, highly detailed","cheerful, bright, vibrant, light-hearted, cute" +misc-kawaii,"kawaii style {prompt} . cute, adorable, brightly colored, cheerful, anime influence, highly detailed","dark, scary, realistic, monochrome, abstract" +misc-lovecraftian,"lovecraftian horror {prompt} . eldritch, cosmic horror, unknown, mysterious, surreal, highly detailed","light-hearted, mundane, familiar, simplistic, realistic" +misc-macabre,"macabre style {prompt} . dark, gothic, grim, haunting, highly detailed","bright, cheerful, light-hearted, cartoonish, cute" +misc-manga,"manga style {prompt} . vibrant, high-energy, detailed, iconic, Japanese comic style","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, Western comic style" +misc-metropolis,"metropolis-themed {prompt} . urban, cityscape, skyscrapers, modern, futuristic, highly detailed","rural, natural, rustic, historical, simple" +misc-minimalist,"minimalist style {prompt} . simple, clean, uncluttered, modern, elegant","ornate, complicated, highly detailed, cluttered, disordered, messy, noisy" +misc-monochrome,"monochrome {prompt} . black and white, contrast, tone, texture, detailed","colorful, vibrant, noisy, blurry, deformed" +misc-nautical,"nautical-themed {prompt} . sea, ocean, ships, maritime, beach, marine life, highly detailed","landlocked, desert, mountains, urban, rustic" +misc-space,"space-themed {prompt} . cosmic, celestial, stars, galaxies, nebulas, planets, science fiction, highly detailed","earthly, mundane, ground-based, realism" +misc-stained glass,"stained glass style {prompt} . vibrant, beautiful, translucent, intricate, detailed","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic" +misc-techwear fashion,"techwear fashion {prompt} . futuristic, cyberpunk, urban, tactical, sleek, dark, highly detailed","vintage, rural, colorful, low contrast, realism, sketch, watercolor" +misc-tribal,"tribal style {prompt} . indigenous, ethnic, traditional patterns, bold, natural colors, highly detailed","modern, futuristic, minimalist, pastel" +misc-zentangle,"zentangle {prompt} . intricate, abstract, monochrome, patterns, meditative, highly detailed","colorful, representative, simplistic, large fields of color" +papercraft-collage,"collage style {prompt} . mixed media, layered, textural, detailed, artistic","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic" +papercraft-flat papercut,"flat papercut style {prompt} . 
silhouette, clean cuts, paper, sharp edges, minimalist, color block","3D, high detail, noise, grainy, blurry, painting, drawing, photo, disfigured" +papercraft-kirigami,"kirigami representation of {prompt} . 3D, paper folding, paper cutting, Japanese, intricate, symmetrical, precision, clean lines","painting, drawing, 2D, noisy, blurry, deformed" +papercraft-paper mache,"paper mache representation of {prompt} . 3D, sculptural, textured, handmade, vibrant, fun","2D, flat, photo, sketch, digital art, deformed, noisy, blurry" +papercraft-paper quilling,"paper quilling art of {prompt} . intricate, delicate, curling, rolling, shaping, coiling, loops, 3D, dimensional, ornamental","photo, painting, drawing, 2D, flat, deformed, noisy, blurry" +papercraft-papercut collage,"papercut collage of {prompt} . mixed media, textured paper, overlapping, asymmetrical, abstract, vibrant","photo, 3D, realistic, drawing, painting, high detail, disfigured" +papercraft-papercut shadow box,"3D papercut shadow box of {prompt} . layered, dimensional, depth, silhouette, shadow, papercut, handmade, high contrast","painting, drawing, photo, 2D, flat, high detail, blurry, noisy, disfigured" +papercraft-stacked papercut,"stacked papercut art of {prompt} . 3D, layered, dimensional, depth, precision cut, stacked layers, papercut, high contrast","2D, flat, noisy, blurry, painting, drawing, photo, deformed" +papercraft-thick layered papercut,"thick layered papercut art of {prompt} . deep 3D, volumetric, dimensional, depth, thick paper, high stack, heavy texture, tangible layers","2D, flat, thin paper, low stack, smooth texture, painting, drawing, photo, deformed" +photo-alien,"alien-themed {prompt} . extraterrestrial, cosmic, otherworldly, mysterious, sci-fi, highly detailed","earthly, mundane, common, realistic, simple" +photo-film noir,"film noir style {prompt} . monochrome, high contrast, dramatic shadows, 1940s style, mysterious, cinematic","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic, vibrant, colorful" +photo-hdr,"HDR photo of {prompt} . High dynamic range, vivid, rich details, clear shadows and highlights, realistic, intense, enhanced contrast, highly detailed","flat, low contrast, oversaturated, underexposed, overexposed, blurred, noisy" +photo-long exposure,"long exposure photo of {prompt} . Blurred motion, streaks of light, surreal, dreamy, ghosting effect, highly detailed","static, noisy, deformed, shaky, abrupt, flat, low contrast" +photo-neon noir,"neon noir {prompt} . cyberpunk, dark, rainy streets, neon signs, high contrast, low light, vibrant, highly detailed","bright, sunny, daytime, low contrast, black and white, sketch, watercolor" +photo-silhouette,"silhouette style {prompt} . high contrast, minimalistic, black and white, stark, dramatic","ugly, deformed, noisy, blurry, low contrast, color, realism, photorealistic" +photo-tilt-shift,"tilt-shift photo of {prompt} . 
Selective focus, miniature effect, blurred background, highly detailed, vibrant, perspective control","blurry, noisy, deformed, flat, low contrast, unrealistic, oversaturated, underexposed" +cinematic-diva,"UHD, 8K, ultra detailed, a cinematic photograph of {prompt}, beautiful lighting, great composition","ugly, deformed, noisy, blurry, NSFW" +Abstract Expressionism,"Abstract Expressionism Art, {prompt}, High contrast, minimalistic, colorful, stark, dramatic, expressionism","ugly, deformed, noisy, blurry, low contrast, realism, photorealistic" +Academia,"Academia, {prompt}, preppy Ivy League style, stark, dramatic, chic boarding school, academia","ugly, deformed, noisy, blurry, low contrast, grunge, sloppy, unkempt" +Action Figure,"Action Figure, {prompt}, plastic collectable action figure, collectable toy action figure","ugly, deformed, noisy, blurry, low contrast" +Adorable 3D Character,"Adorable 3D Character, {prompt}, 3D render, adorable character, 3D art","ugly, deformed, noisy, blurry, low contrast, grunge, sloppy, unkempt, photograph, photo, realistic" +Adorable Kawaii,"Adorable Kawaii, {prompt}, pretty, cute, adorable, kawaii","ugly, deformed, noisy, blurry, low contrast, gothic, dark, moody, monochromatic" +Art Deco,"Art Deco, {prompt}, sleek, geometric forms, art deco style","ugly, deformed, noisy, blurry, low contrast" +Art Nouveau,"Art Nouveau, beautiful art, {prompt}, sleek, organic forms, long, sinuous, art nouveau style","ugly, deformed, noisy, blurry, low contrast, industrial, mechanical" +Astral Aura,"Astral Aura, {prompt}, astral, colorful aura, vibrant energy","ugly, deformed, noisy, blurry, low contrast" +Avant-garde,"Avant-garde, {prompt}, unusual, experimental, avant-garde art","ugly, deformed, noisy, blurry, low contrast" +Baroque,"Baroque, {prompt}, dramatic, exuberant, grandeur, baroque art","ugly, deformed, noisy, blurry, low contrast" +Bauhaus-Style Poster,"Bauhaus-Style Poster, {prompt}, simple geometric shapes, clean lines, primary colors, Bauhaus-Style Poster","ugly, deformed, noisy, blurry, low contrast" +Blueprint Schematic Drawing,"Blueprint Schematic Drawing, {prompt}, technical drawing, blueprint, schematic","ugly, deformed, noisy, blurry, low contrast" +Caricature,"Caricature, {prompt}, exaggerated, comical, caricature","ugly, deformed, noisy, blurry, low contrast, realistic" +Cel Shaded Art,"Cel Shaded Art, {prompt}, 2D, flat color, toon shading, cel shaded style","ugly, deformed, noisy, blurry, low contrast" +Character Design Sheet,"Character Design Sheet, {prompt}, character reference sheet, character turn around","ugly, deformed, noisy, blurry, low contrast" +Classicism Art,"Classicism Art, {prompt}, inspired by Roman and Greek culture, clarity, harmonious, classicism art","ugly, deformed, noisy, blurry, low contrast" +Color Field Painting,"Color Field Painting, {prompt}, abstract, simple, geometric, color field painting style","ugly, deformed, noisy, blurry, low contrast" +Colored Pencil Art,"Colored Pencil Art, {prompt}, colored pencil strokes, light color, visible paper texture, colored pencil art","ugly, deformed, noisy, blurry, low contrast" +Conceptual Art,"Conceptual Art, {prompt}, concept art","ugly, deformed, noisy, blurry, low contrast" +Constructivism,"Constructivism Art, {prompt}, minimalistic, geometric forms, constructivism art","ugly, deformed, noisy, blurry, low contrast" +Cubism,"Cubism Art, {prompt}, flat geometric forms, cubism art","ugly, deformed, noisy, blurry, low contrast" +Dadaism,"Dadaism Art, {prompt}, satirical, 
nonsensical, dadaism art","ugly, deformed, noisy, blurry, low contrast" +Dark Fantasy,"Dark Fantasy Art, {prompt}, dark, moody, dark fantasy style","ugly, deformed, noisy, blurry, low contrast, bright, sunny" +Dark Moody Atmosphere,"Dark Moody Atmosphere, {prompt}, dramatic, mysterious, dark moody atmosphere","ugly, deformed, noisy, blurry, low contrast, vibrant, colorful, bright" +DMT Art Style,"DMT Art Style, {prompt}, bright colors, surreal visuals, swirling patterns, DMT art style","ugly, deformed, noisy, blurry, low contrast" +Doodle Art,"Doodle Art Style, {prompt}, drawing, freeform, swirling patterns, doodle art style","ugly, deformed, noisy, blurry, low contrast" +Double Exposure,"Double Exposure Style, {prompt}, double image ghost effect, image combination, double exposure style","ugly, deformed, noisy, blurry, low contrast" +Dripping Paint Splatter Art,"Dripping Paint Splatter Art, {prompt}, dramatic, paint drips, splatters, dripping paint","ugly, deformed, noisy, blurry, low contrast" +Expressionism,"Expressionism Art Style, {prompt}, movement, contrast, emotional, exaggerated forms, expressionism art style","ugly, deformed, noisy, blurry, low contrast" +Faded Polaroid Photo,"Faded Polaroid Photo, {prompt}, analog, old faded photo, old polaroid","ugly, deformed, noisy, blurry, low contrast, vibrant, colorful" +Fauvism,"Fauvism Art, {prompt}, painterly, bold colors, textured brushwork, fauvism art","ugly, deformed, noisy, blurry, low contrast" +Flat 2D Art,"Flat 2D Art, {prompt}, simple flat color, 2-dimensional, Flat 2D Art Style","ugly, deformed, noisy, blurry, low contrast, 3D, photo, realistic" +Fortnite Art Style,"Fortnite Art Style, {prompt}, 3D cartoon, colorful, Fortnite Art Style","ugly, deformed, noisy, blurry, low contrast, photo, realistic" +Futurism,"Futurism Art Style, {prompt}, dynamic, dramatic, Futurism Art Style","ugly, deformed, noisy, blurry, low contrast" +Glitchcore,"Glitchcore Art Style, {prompt}, dynamic, dramatic, distorted, vibrant colors, glitchcore art style","ugly, deformed, noisy, blurry, low contrast" +Glo-fi,"Glo-fi Art Style, {prompt}, dynamic, dramatic, vibrant colors, glo-fi art style","ugly, deformed, noisy, blurry, low contrast" +Googie Art Style,"Googie Art Style, {prompt}, dynamic, dramatic, 1950's futurism, bold boomerang angles, Googie art style","ugly, deformed, noisy, blurry, low contrast" +Graffiti Art,"Graffiti Art Style, {prompt}, dynamic, dramatic, vibrant colors, graffiti art style","ugly, deformed, noisy, blurry, low contrast" +Harlem Renaissance Art,"Harlem Renaissance Art Style, {prompt}, dynamic, dramatic, 1920s African American culture, Harlem Renaissance art style","ugly, deformed, noisy, blurry, low contrast" +High Fashion,"High Fashion, {prompt}, dynamic, dramatic, haute couture, elegant, ornate clothing, High Fashion","ugly, deformed, noisy, blurry, low contrast" +Idyllic,"Idyllic, {prompt}, peaceful, happy, pleasant, happy, harmonious, picturesque, charming","ugly, deformed, noisy, blurry, low contrast" +Impressionism,"Impressionism, {prompt}, painterly, small brushstrokes, visible brushstrokes, impressionistic style","ugly, deformed, noisy, blurry, low contrast" +Infographic Drawing,"Infographic Drawing, {prompt}, diagram, infographic","ugly, deformed, noisy, blurry, low contrast" +Ink Dripping Drawing,"Ink Dripping Drawing, {prompt}, ink drawing, dripping ink","ugly, deformed, noisy, blurry, low contrast, colorful, vibrant" +Japanese Ink Drawing,"Japanese Ink Drawing, {prompt}, ink drawing, inkwash, Japanese Ink 
Drawing","ugly, deformed, noisy, blurry, low contrast, colorful, vibrant" +Knolling Photography,"Knolling Photography, {prompt}, flat lay photography, object arrangment, knolling photography","ugly, deformed, noisy, blurry, low contrast" +Light Cheery Atmosphere,"Light Cheery Atmosphere, {prompt}, happy, joyful, cheerful, carefree, gleeful, lighthearted, pleasant atmosphere","ugly, deformed, noisy, blurry, low contrast, monochromatic, dark, moody" +Logo Design,"Logo Design, {prompt}, dynamic graphic art, vector art, minimalist, professional logo design","ugly, deformed, noisy, blurry, low contrast" +Luxurious Elegance,"Luxurious Elegance, {prompt}, extravagant, ornate, designer, opulent, picturesque, lavish","ugly, deformed, noisy, blurry, low contrast" +Macro Photography,"Macro Photography, {prompt}, close-up, macro 100mm, macro photography","ugly, deformed, noisy, blurry, low contrast" +Mandola Art,"Mandola art style, {prompt}, complex, circular design, mandola","ugly, deformed, noisy, blurry, low contrast" +Marker Drawing,"Marker Drawing, {prompt}, bold marker lines, visibile paper texture, marker drawing","ugly, deformed, noisy, blurry, low contrast, photograph, realistic" +Medievalism,"Medievalism, {prompt}, inspired by The Middle Ages, medieval art, elaborate patterns and decoration, Medievalism","ugly, deformed, noisy, blurry, low contrast" +Minimalism,"Minimalism, {prompt}, abstract, simple geometic shapes, hard edges, sleek contours, Minimalism","ugly, deformed, noisy, blurry, low contrast" +Neo-Baroque,"Neo-Baroque, {prompt}, ornate and elaborate, dynaimc, Neo-Baroque","ugly, deformed, noisy, blurry, low contrast" +Neo-Byzantine,"Neo-Byzantine, {prompt}, grand decorative religious style, Orthodox Christian inspired, Neo-Byzantine","ugly, deformed, noisy, blurry, low contrast" +Neo-Futurism,"Neo-Futurism, {prompt}, high-tech, curves, spirals, flowing lines, idealistic future, Neo-Futurism","ugly, deformed, noisy, blurry, low contrast" +Neo-Impressionism,"Neo-Impressionism, {prompt}, tiny dabs of color, Pointillism, painterly, Neo-Impressionism","ugly, deformed, noisy, blurry, low contrast, photograph, realistic" +Neo-Rococo,"Neo-Rococo, {prompt}, curved forms, naturalistic ornamentation, elaborate, decorative, gaudy, Neo-Rococo","ugly, deformed, noisy, blurry, low contrast" +Neoclassicism,"Neoclassicism, {prompt}, ancient Rome and Greece inspired, idealic, sober colors, Neoclassicism","ugly, deformed, noisy, blurry, low contrast" +Op Art,"Op Art, {prompt}, optical illusion, abstract, geometric pattern, impression of movement, Op Art","ugly, deformed, noisy, blurry, low contrast" +Ornate and Intricate,"Ornate and Intricate, {prompt}, decorative, highly detailed, elaborate, ornate, intricate","ugly, deformed, noisy, blurry, low contrast" +Pencil Sketch Drawing,"Pencil Sketch Drawing, {prompt}, black and white drawing, graphite drawing","ugly, deformed, noisy, blurry, low contrast" +Pop Art 2,"Pop Art, {prompt}, vivid colors, flat color, 2D, strong lines, Pop Art","ugly, deformed, noisy, blurry, low contrast, photo, realistic" +Rococo,"Rococo, {prompt}, flamboyant, pastel colors, curved lines, elaborate detail, Rococo","ugly, deformed, noisy, blurry, low contrast" +Silhouette Art,"Silhouette Art, {prompt}, high contrast, well defined, Silhouette Art","ugly, deformed, noisy, blurry, low contrast" +Simple Vector Art,"Simple Vector Art, {prompt}, 2D flat, simple shapes, minimalistic, professional graphic, flat color, high contrast, Simple Vector Art","ugly, deformed, noisy, blurry, low 
contrast, 3D, photo, realistic" +Sketchup,"Sketchup, {prompt}, CAD, professional design, Sketchup","ugly, deformed, noisy, blurry, low contrast, photo, photograph" +Steampunk 2,"Steampunk, {prompt}, retrofuturistic science fantasy, steam-powered tech, vintage industry, gears, neo-victorian, steampunk","ugly, deformed, noisy, blurry, low contrast" +Surrealism,"Surrealism, {prompt}, expressive, dramatic, organic lines and forms, dreamlike and mysterious, Surrealism","ugly, deformed, noisy, blurry, low contrast, realistic" +Suprematism,"Suprematism, {prompt}, abstract, limited color palette, geometric forms, Suprematism","ugly, deformed, noisy, blurry, low contrast, realistic" +Terragen,"Terragen, {prompt}, beautiful massive landscape, epic scenery, Terragen","ugly, deformed, noisy, blurry, low contrast" +Tranquil Relaxing Atmosphere,"Tranquil Relaxing Atmosphere, {prompt}, calming style, soothing colors, peaceful, idealic, Tranquil Relaxing Atmosphere","ugly, deformed, noisy, blurry, low contrast, oversaturated" +Sticker Designs,"Vector Art Stickers, {prompt}, professional vector design, sticker designs, Sticker Sheet","ugly, deformed, noisy, blurry, low contrast" +Vibrant Rim Light,"Vibrant Rim Light, {prompt}, bright rim light, high contrast, bold edge light","ugly, deformed, noisy, blurry, low contrast" +Volumetric Lighting,"Volumetric Lighting, {prompt}, light depth, dramatic atmospheric lighting, Volumetric Lighting","ugly, deformed, noisy, blurry, low contrast" +Watercolor 2,"Watercolor style painting, {prompt}, visible paper texture, colorwash, watercolor","ugly, deformed, noisy, blurry, low contrast, photo, realistic" +Whimsical and Playful,"Whimsical and Playful, {prompt}, imaginative, fantastical, bight colors, stylized, happy, Whimsical and Playful","ugly, deformed, noisy, blurry, low contrast, drab, boring, moody" +"Kamph_3d-model","professional 3d model {prompt}, octane render, highly detailed, volumetric, dramatic lighting","ugly, deformed, noisy, low poly, blurry, painting", +"Kamph_Analog film","analog film photo {prompt}, faded film, desaturated, 35mm photo, grainy, vignette, vintage, Kodachrome, Lomography, stained, highly detailed, found footage","painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured", +"Kamph_Anime","anime artwork {prompt}, anime style, key visual, vibrant, studio anime, highly detailed","photo, deformed, black and white, realism, disfigured, low contrast", +"Kamph_Cinematic","cinematic film still {prompt}, shallow depth of field, vignette, highly detailed, high budget Hollywood film, bokeh, cinemascope, moody, epic, gorgeous, film grain","anime, cartoon, graphic, text, painting, crayon, graphite, abstract, glitch", +"Kamph_Comic book","comic {prompt}, graphic illustration, comic art, graphic novel art, vibrant, highly detailed","photograph, deformed, glitch, noisy, realistic, stock photo", +"Kamph_Digital art","concept art {prompt}, digital artwork, illustrative, painterly, matte painting, highly detailed","photo, photorealistic, realism, ugly", +"Kamph_Fantasy art","ethereal fantasy concept art of {prompt}, magnificent, celestial, ethereal, painterly, epic, majestic, magical, fantasy art, cover art, dreamy","photographic, realistic, realism, 35mm film, dslr, cropped, frame, text, deformed, glitch, noise, noisy, off-center, deformed, cross-eyed, closed eyes, bad anatomy, ugly, disfigured, sloppy, duplicate, mutated, black and white", +"Kamph_Cyberpunk","neonpunk style {prompt}, cyberpunk, vaporwave, neon, vibes, 
vibrant, stunningly beautiful, crisp, detailed, sleek, ultramodern, magenta highlights, dark purple shadows, high contrast, cinematic, ultra detailed, intricate, professional","painting, drawing, illustration, glitch, deformed, mutated, cross-eyed, ugly, disfigured", +"Kamph_Photo with Bokeh","cinematic photo {prompt}, 35mm photograph, film, bokeh, professional, 4k, highly detailed","drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly", +"Kamph_Photo without Bokeh","cinematic photo {prompt}, 35mm photograph, film, professional, 4k, highly detailed","bokeh, drawing, painting, crayon, sketch, graphite, impressionist, noisy, blurry, soft, deformed, ugly", +"Kamph_Enhancer","breathtaking {prompt}, masterpiece, award-winning, professional, highly detailed","ugly, deformed, noisy, blurry, distorted, grainy", +Bens_Red Dress (Tutorial Prompt),"a fashion shoot of a european woman with blonde hair in a maxi silk red dress with slit at front of dress from thigh to floor, (legs exposed:1.8), Red Patent Pointed Stiletto Heel Court Shoes, plain background, RAW candid cinema, 35mm, color graded portra 400 film, remarkable color, ultra realistic, textured skin, remarkable detailed pupils, realistic dull skin noise, visible skin detail, skin fuzz, dry skin, shot with cinematic camera", +Bens_Daguerreotype Photo 1920,"daguerreotype photo, {prompt}, 1920s photography, historical photo, aged photo, black & white photography, damaged photo","cartoon, illustration, painting, frame" +Bens Underwater and god rays,"deep underwater, {prompt} highly detailed, intricate lighting, god rays","ugly, deformed, noisy, blurry, low contrast,"
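
(For reference: each added row follows the three-column format name,"positive template","negative prompt", and the {prompt} placeholder marks where the user's text is substituted. The minimal Python sketch below shows one plausible way such a row is consumed; it is illustrative only. The file name sdxl_styles.csv and the apply_style helper are assumptions for this sketch, not code from this diff.)

import csv

def apply_style(row, user_prompt):
    # Hypothetical helper: fill the {prompt} placeholder in the positive
    # template and return the (positive, negative) prompt pair.
    positive = row[1]
    negative = row[2] if len(row) > 2 else ''
    if '{prompt}' in positive:
        positive = positive.replace('{prompt}', user_prompt)
    elif user_prompt:
        # Rows without a placeholder (e.g. the tutorial prompt above)
        # get the user text prepended instead.
        positive = f'{user_prompt}, {positive}'
    return positive, negative

# Assumed file name; a header row, if present, simply becomes one more entry.
with open('sdxl_styles.csv', newline='', encoding='utf-8') as f:
    styles = {row[0]: row for row in csv.reader(f) if row}

pos, neg = apply_style(styles['photo-hdr'], 'a mountain lake at dawn')
# pos == 'HDR photo of a mountain lake at dawn . High dynamic range, ...'
# neg == 'flat, low contrast, oversaturated, underexposed, overexposed, blurred, noisy'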