update reqs for vlm

maybleMyers 2025-12-08 17:37:59 -08:00
parent b92a9a951a
commit faeb7d16f6
2 changed files with 413 additions and 62 deletions

requirements_vlm.txt (new file, 47 lines)
@@ -0,0 +1,47 @@
# Requirements for vlm.py - Qwen3-VL Chat Interface
# Use a separate virtual environment to avoid conflicts with the main Forge app
#
# Setup:
# python -m venv venv_vlm
# venv_vlm\Scripts\activate (Windows)
# source venv_vlm/bin/activate (Linux/Mac)
# pip install -r requirements_vlm.txt
#
# Run:
# python vlm.py
# PyTorch - install first with CUDA support
# pip install torch torchvision --index-url https://download.pytorch.org/whl/cu128
torch>=2.4.0
torchvision
# vLLM for high-performance inference
vllm>=0.11.0
# Qwen VL utilities
qwen-vl-utils>=0.0.14
# Transformers (fallback backend)
transformers>=4.51.0
accelerate
safetensors
# Gradio UI
gradio>=5.0.0
gradio-client
# Image/Video processing
Pillow>=10.0.0
opencv-python
# Other dependencies
numpy
tqdm
pydantic>=2.0.0
huggingface-hub>=0.20.0
# Optional: Flash Attention 2 (for faster inference)
# pip install flash-attn --no-build-isolation
# Optional: bitsandbytes for quantization (transformers backend)
# pip install bitsandbytes
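
To confirm the new environment before launching the UI, a quick import check is enough. The snippet below is a minimal sketch and is not part of this commit; the file name check_vlm_env.py is only a suggestion.

    # check_vlm_env.py - hypothetical helper, not shipped with this commit.
    # Verifies that the packages from requirements_vlm.txt import cleanly.
    import importlib

    for name in ["torch", "torchvision", "vllm", "qwen_vl_utils",
                 "transformers", "gradio", "PIL", "cv2", "numpy"]:
        try:
            importlib.import_module(name)
            print(f"OK       {name}")
        except ImportError as exc:
            print(f"MISSING  {name}: {exc}")

    try:
        import torch
        print("CUDA available:", torch.cuda.is_available())
    except ImportError:
        print("torch missing, skipping CUDA check")
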

vlm.py (428 lines changed)

@@ -26,6 +26,27 @@ except ImportError:
CV2_AVAILABLE = False
print("Warning: opencv-python not installed. Video support will be limited.")
# Try to import vLLM for high-performance inference
try:
from vllm import LLM, SamplingParams
VLLM_AVAILABLE = True
print("vLLM loaded successfully")
except ImportError as e:
VLLM_AVAILABLE = False
print(f"Warning: vLLM not available. Install with: pip install vllm>=0.11.0")
print(f" Import error: {e}")
except Exception as e:
VLLM_AVAILABLE = False
print(f"Warning: vLLM import failed: {e}")
# Try to import qwen-vl-utils for image processing
try:
from qwen_vl_utils import process_vision_info
QWEN_VL_UTILS_AVAILABLE = True
except ImportError:
QWEN_VL_UTILS_AVAILABLE = False
print("Warning: qwen-vl-utils not installed. Install with: pip install qwen-vl-utils")
# Default model paths (relative to models/LLM)
DEFAULT_MODELS = {
"Qwen3-VL-8B-Caption-V4.5": "models/LLM/Qwen3-VL-8B-Caption-V4.5",
@@ -82,13 +103,36 @@ def extract_video_frames(video_path: str, max_frames: int = 8, target_size: Tupl
class VLMManager:
"""Manages Qwen VL model loading, inference, and memory."""
def __init__(self, low_vram: bool = False):
def __init__(self, low_vram: bool = False, backend: str = "auto"):
"""
Initialize VLM Manager.
Args:
low_vram: Enable low VRAM mode for transformers backend
backend: "vllm", "transformers", or "auto" (vLLM if available, else transformers)
"""
self.model = None
self.processor = None
self.model_name = None
self.low_vram = low_vram
self.device = self._get_device()
# vLLM specific attributes
self.vllm_model = None
self.model_path = None
# Determine backend
if backend == "auto":
self.backend = "vllm" if VLLM_AVAILABLE else "transformers"
else:
self.backend = backend
if self.backend == "vllm" and not VLLM_AVAILABLE:
print("Warning: vLLM requested but not available. Falling back to transformers.")
self.backend = "transformers"
print(f"VLM Backend: {self.backend}")
def _get_device(self) -> torch.device:
"""Get the best available device."""
if torch.cuda.is_available():
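
The backend resolution above can be exercised directly; a minimal usage sketch, assuming vlm.py is importable from the repository root (the import form is an assumption, not shown in this commit):

    # Usage sketch; importing VLMManager from vlm.py is an assumption.
    from vlm import VLMManager

    mgr = VLMManager(backend="auto")
    print(mgr.backend)   # "vllm" if the vLLM import succeeded, otherwise "transformers"

    # Requesting vLLM explicitly when it is not installed prints a warning
    # and falls back to the transformers backend.
    forced = VLMManager(backend="vllm")
    print(forced.backend)
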
@@ -148,6 +192,77 @@ class VLMManager:
return "qwen2_5_vl" # Default fallback
def _load_with_vllm(self, model_name: str, quantization: str = "none", progress=gr.Progress()) -> str:
"""Load model using vLLM backend for high-performance inference."""
if not VLLM_AVAILABLE:
return "vLLM is not available. Please install with: pip install vllm>=0.11.0"
# Check if already loaded
if self.vllm_model is not None and self.model_name == model_name:
return f"Model '{model_name}' is already loaded (vLLM)."
# Unload existing model first
if self.vllm_model is not None:
self.unload_model()
progress(0.1, desc="Loading model with vLLM...")
# Determine model path
if model_name in DEFAULT_MODELS:
model_path = DEFAULT_MODELS[model_name]
else:
model_path = f"models/LLM/{model_name}"
if not Path(model_path).exists():
return f"Model path not found: {model_path}"
try:
# Detect model type
model_type = self._detect_model_type(model_path)
print(f"Detected model type: {model_type}")
progress(0.3, desc="Initializing vLLM engine...")
# Configure vLLM loading options
vllm_kwargs = {
"model": model_path,
"trust_remote_code": True,
"dtype": "bfloat16",
"max_model_len": 4096, # Adjust based on your VRAM
"gpu_memory_utilization": 0.9,
}
# Handle quantization
if quantization == "4bit":
vllm_kwargs["quantization"] = "awq" # or "gptq" depending on model
print("Using AWQ 4-bit quantization with vLLM")
elif quantization == "8bit":
vllm_kwargs["quantization"] = "fp8"
print("Using FP8 quantization with vLLM")
# Enable multimodal for VL models
vllm_kwargs["limit_mm_per_prompt"] = {"image": 10, "video": 2}
progress(0.5, desc=f"Loading {model_type} with vLLM...")
self.vllm_model = LLM(**vllm_kwargs)
self.model_path = model_path
self.model_name = model_name
# Also load processor for chat template
from transformers import AutoProcessor
self.processor = AutoProcessor.from_pretrained(model_path)
progress(1.0, desc="Model loaded with vLLM!")
quant_info = f", {quantization}" if quantization != "none" else ""
return f"Successfully loaded '{model_name}' with vLLM ({model_type}{quant_info})"
except Exception as e:
import traceback
traceback.print_exc()
return f"Failed to load model with vLLM: {str(e)}"
def load_model(self, model_name: str, quantization: str = "none", use_flash_attn: bool = False, vram_buffer: int = 0, progress=gr.Progress()) -> str:
"""Load a Qwen VL model.
@@ -161,6 +276,10 @@ class VLMManager:
if model_name == "No models found":
return "No models available. Please download a model first."
# Use vLLM backend if selected
if self.backend == "vllm":
return self._load_with_vllm(model_name, quantization, progress)
# Check if already loaded
if self.model is not None and self.model_name == model_name:
return f"Model '{model_name}' is already loaded."
@@ -287,22 +406,36 @@ class VLMManager:
def unload_model(self) -> str:
"""Unload the current model to free memory."""
if self.model is None:
# Check if any model is loaded (either vLLM or transformers)
if self.model is None and self.vllm_model is None:
return "No model is currently loaded."
model_name = self.model_name
backend_used = "vLLM" if self.vllm_model is not None else "transformers"
# Unload vLLM model
if self.vllm_model is not None:
del self.vllm_model
self.vllm_model = None
self.model_path = None
# Unload transformers model
if self.model is not None:
del self.model
self.model = None
# Clean up processor
if self.processor is not None:
del self.processor
self.processor = None
del self.model
del self.processor
self.model = None
self.processor = None
self.model_name = None
gc.collect()
if torch.cuda.is_available():
torch.cuda.empty_cache()
return f"Unloaded '{model_name}' and freed memory."
return f"Unloaded '{model_name}' ({backend_used}) and freed memory."
def get_memory_info(self) -> str:
"""Get current GPU memory usage."""
@@ -315,6 +448,117 @@ class VLMManager:
return f"GPU Memory: {allocated:.1f}GB allocated, {reserved:.1f}GB reserved, {total:.1f}GB total"
def _generate_with_vllm(
self,
messages: List[Dict[str, Any]],
max_new_tokens: int = 512,
temperature: float = 0.7,
top_p: float = 0.9,
top_k: int = 50,
repetition_penalty: float = 1.1,
video_max_frames: int = 8,
) -> str:
"""Generate a response using vLLM backend."""
if self.vllm_model is None:
return "Error: No vLLM model loaded."
try:
# Process messages to extract images and prepare for vLLM
images = []
processed_messages = []
for msg in messages:
if isinstance(msg.get("content"), list):
new_content = []
for item in msg["content"]:
if item.get("type") == "image" and "image" in item:
img = item["image"]
images.append(img)
# For vLLM, use placeholder in text
new_content.append({"type": "image"})
elif item.get("type") == "video" and "video" in item:
# Process video into frames
video_path = item["video"]
if isinstance(video_path, str) and os.path.exists(video_path):
try:
frames = extract_video_frames(video_path, max_frames=video_max_frames)
for frame in frames:
images.append(frame)
new_content.append({"type": "image"})
if frames:
new_content.append({"type": "text", "text": f"[The above {len(frames)} images are frames extracted from a video]"})
except Exception as e:
new_content.append({"type": "text", "text": f"[Video processing error: {str(e)}]"})
elif item.get("type") == "text":
new_content.append(item)
else:
new_content.append(item)
processed_messages.append({"role": msg["role"], "content": new_content})
else:
processed_messages.append(msg)
# Apply chat template using processor
text_input = self.processor.apply_chat_template(
processed_messages,
tokenize=False,
add_generation_prompt=True,
)
print(f"[vLLM Debug] Prompt preview: {text_input[:500]}...")
print(f"[vLLM Debug] Number of images: {len(images)}")
# Configure sampling parameters
sampling_params = SamplingParams(
max_tokens=max_new_tokens,
temperature=temperature if temperature > 0 else 0.001,
top_p=top_p,
top_k=top_k if top_k > 0 else -1,
repetition_penalty=repetition_penalty,
)
# Prepare multimodal inputs for vLLM
if images:
# Convert PIL images to format vLLM expects
mm_data = {"image": images}
inputs = {
"prompt": text_input,
"multi_modal_data": mm_data,
}
else:
inputs = {"prompt": text_input}
# Generate with timing
if torch.cuda.is_available():
torch.cuda.synchronize()
start_time = time.perf_counter()
outputs = self.vllm_model.generate([inputs], sampling_params=sampling_params)
if torch.cuda.is_available():
torch.cuda.synchronize()
end_time = time.perf_counter()
# Extract response
response = outputs[0].outputs[0].text
# Calculate throughput
num_generated_tokens = len(outputs[0].outputs[0].token_ids)
generation_time = end_time - start_time
tokens_per_sec = num_generated_tokens / generation_time if generation_time > 0 else 0
print(f"[vLLM Inference] Generated {num_generated_tokens} tokens in {generation_time:.2f}s ({tokens_per_sec:.2f} tok/s)")
# Clean up thinking tags if present
if "</think>" in response:
response = response.split("</think>")[-1].strip()
return response
except Exception as e:
import traceback
traceback.print_exc()
return f"Error during vLLM generation: {str(e)}"
@torch.inference_mode()
def generate(
self,
@@ -327,6 +571,18 @@ class VLMManager:
video_max_frames: int = 8,
) -> str:
"""Generate a response from the model."""
# Use vLLM backend if loaded
if self.vllm_model is not None:
return self._generate_with_vllm(
messages=messages,
max_new_tokens=max_new_tokens,
temperature=temperature,
top_p=top_p,
top_k=top_k,
repetition_penalty=repetition_penalty,
video_max_frames=video_max_frames,
)
if self.model is None:
return "Error: No model loaded. Please load a model first."
@@ -442,10 +698,25 @@ class VLMManager:
vlm_manager: Optional[VLMManager] = None
def initialize_manager(low_vram: bool = False):
def initialize_manager(low_vram: bool = False, backend: str = "auto"):
"""Initialize the global VLM manager."""
global vlm_manager
vlm_manager = VLMManager(low_vram=low_vram)
vlm_manager = VLMManager(low_vram=low_vram, backend=backend)
def switch_backend_handler(backend: str):
"""Handle backend switching from UI."""
global vlm_manager
if vlm_manager is not None:
# Unload current model first
vlm_manager.unload_model()
# Get current low_vram setting
low_vram = vlm_manager.low_vram if vlm_manager else False
# Reinitialize with new backend
vlm_manager = VLMManager(low_vram=low_vram, backend=backend)
return f"Switched to {vlm_manager.backend} backend"
def load_model_handler(model_name: str, quantization: str, use_flash_attn: bool, vram_buffer: int, progress=gr.Progress()):
@@ -484,7 +755,8 @@ def chat_handler(
auto_unload: bool = False,
):
"""Handle chat messages from UI."""
if vlm_manager is None or vlm_manager.model is None:
# Check if any model is loaded (either transformers or vLLM)
if vlm_manager is None or (vlm_manager.model is None and vlm_manager.vllm_model is None):
return history + [(message, "Error: No model loaded. Please load a model first.")], ""
# Build messages list for the model
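
The messages list built here follows the same schema _generate_with_vllm consumes above: each turn has a role and a content list of typed items. A minimal sketch of one multimodal user turn (the image path is a placeholder, and vlm_manager is assumed to hold a loaded model):

    # Sketch of the message structure passed to vlm_manager.generate();
    # the image path is a placeholder and a model is assumed to be loaded.
    from PIL import Image

    messages = [
        {"role": "user",
         "content": [{"type": "image", "image": Image.open("photo.jpg")},
                     {"type": "text", "text": "What is in this picture?"}]},
    ]
    reply = vlm_manager.generate(messages=messages, max_new_tokens=512)
    print(reply)
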
@@ -657,56 +929,57 @@ def create_ui():
"""Create the Gradio interface."""
available_models = vlm_manager.get_available_models() if vlm_manager else ["Manager not initialized"]
with gr.Blocks(
title="Chromaforge VLM",
theme=themes.Default(
primary_hue=colors.Color(
name="custom",
c50="#E6F0FF",
c100="#CCE0FF",
c200="#99C1FF",
c300="#66A3FF",
c400="#3384FF",
c500="#0060df",
c600="#0052C2",
c700="#003D91",
c800="#002961",
c900="#001430",
c950="#000A18"
)
),
css="""
.gallery-item:first-child { border: 2px solid #4CAF50 !important; }
.gallery-item:first-child:hover { border-color: #45a049 !important; }
.green-btn {
background: linear-gradient(to bottom right, #2ecc71, #27ae60) !important;
color: white !important;
border: none !important;
}
.green-btn:hover {
background: linear-gradient(to bottom right, #27ae60, #219651) !important;
}
.refresh-btn {
max-width: 40px !important;
min-width: 40px !important;
height: 40px !important;
border-radius: 50% !important;
padding: 0 !important;
display: flex !important;
align-items: center !important;
justify-content: center !important;
}
.light-blue-btn {
background: linear-gradient(to bottom right, #AEC6CF, #9AB8C4) !important;
color: #333 !important;
border: 1px solid #9AB8C4 !important;
}
.light-blue-btn:hover {
background: linear-gradient(to bottom right, #9AB8C4, #8AA9B5) !important;
border-color: #8AA9B5 !important;
}
""",
) as demo:
# Theme for Gradio 6.x (passed to launch() instead of Blocks())
global vlm_theme, vlm_css
vlm_theme = themes.Default(
primary_hue=colors.Color(
name="custom",
c50="#E6F0FF",
c100="#CCE0FF",
c200="#99C1FF",
c300="#66A3FF",
c400="#3384FF",
c500="#0060df",
c600="#0052C2",
c700="#003D91",
c800="#002961",
c900="#001430",
c950="#000A18"
)
)
vlm_css = """
.gallery-item:first-child { border: 2px solid #4CAF50 !important; }
.gallery-item:first-child:hover { border-color: #45a049 !important; }
.green-btn {
background: linear-gradient(to bottom right, #2ecc71, #27ae60) !important;
color: white !important;
border: none !important;
}
.green-btn:hover {
background: linear-gradient(to bottom right, #27ae60, #219651) !important;
}
.refresh-btn {
max-width: 40px !important;
min-width: 40px !important;
height: 40px !important;
border-radius: 50% !important;
padding: 0 !important;
display: flex !important;
align-items: center !important;
justify-content: center !important;
}
.light-blue-btn {
background: linear-gradient(to bottom right, #AEC6CF, #9AB8C4) !important;
color: #333 !important;
border: 1px solid #9AB8C4 !important;
}
.light-blue-btn:hover {
background: linear-gradient(to bottom right, #9AB8C4, #8AA9B5) !important;
border-color: #8AA9B5 !important;
}
"""
with gr.Blocks(title="Chromaforge VLM") as demo:
with gr.Row():
# Left column - Settings (shared across tabs)
with gr.Column(scale=1):
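
The net effect of this hunk: the theme and CSS that used to be passed to gr.Blocks() become module-level values that launch() receives at the end of the diff. A minimal sketch of the pattern as this commit applies it (the Gradio 6.x launch() keywords are taken from the commit itself, not verified against the library):

    # Sketch of the theme/CSS relocation this commit makes for Gradio 6.x;
    # the launch() keywords mirror the call at the end of this diff.
    import gradio as gr

    app_theme = gr.themes.Default()
    app_css = ".green-btn { background: #2ecc71 !important; }"

    with gr.Blocks(title="Chromaforge VLM") as demo:
        gr.Markdown("UI layout goes here")

    demo.launch(theme=app_theme, css=app_css)
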
@@ -721,6 +994,21 @@ def create_ui():
refresh_models_btn = gr.Button("Refresh Model List", size="sm")
# Backend selection (vLLM or transformers)
backend_choices = ["auto", "vllm", "transformers"] if VLLM_AVAILABLE else ["transformers"]
backend_dropdown = gr.Dropdown(
choices=backend_choices,
value="auto" if VLLM_AVAILABLE else "transformers",
label="Backend",
info="vLLM: faster inference, transformers: more compatible",
interactive=True,
)
backend_status = gr.Textbox(
label="Backend Status",
value=f"Current: {vlm_manager.backend if vlm_manager else 'not initialized'}",
interactive=False,
)
quantization_dropdown = gr.Dropdown(
choices=["none", "4bit", "8bit"],
value="none",
@@ -830,7 +1118,6 @@ def create_ui():
chatbot = gr.Chatbot(
label="Conversation",
height=400,
show_copy_button=True,
)
with gr.Row():
@@ -908,6 +1195,13 @@ def create_ui():
outputs=[model_status],
)
# Backend switching handler
backend_dropdown.change(
fn=switch_backend_handler,
inputs=[backend_dropdown],
outputs=[backend_status],
)
unload_btn.click(
fn=unload_model_handler,
outputs=[model_status],
@@ -995,6 +1289,13 @@ def main():
action="store_true",
help="Enable low VRAM mode for smaller GPUs",
)
parser.add_argument(
"--backend",
type=str,
choices=["auto", "vllm", "transformers"],
default="auto",
help="Backend for model inference (default: auto - uses vLLM if available)",
)
args = parser.parse_args()
@@ -1005,13 +1306,14 @@ def main():
print("Chromaforge VLM Chat Interface")
print("=" * 60)
print(f"Low VRAM mode: {'enabled' if args.lowvram else 'disabled'}")
print(f"Backend: {args.backend}" + (" (vLLM available)" if VLLM_AVAILABLE else " (vLLM not available)"))
print(f"Server: http://{host}:{args.port}")
if args.listen:
print("LAN access: enabled (listening on 0.0.0.0)")
print("=" * 60)
# Initialize the manager
initialize_manager(low_vram=args.lowvram)
initialize_manager(low_vram=args.lowvram, backend=args.backend)
# Create and launch the UI
demo = create_ui()
@@ -1019,6 +1321,8 @@ def main():
server_name=host,
server_port=args.port,
share=args.share,
theme=vlm_theme,
css=vlm_css,
)