import os
import torch
import gradio as gr
from PIL import Image
from typing import List, Dict, Any
from transformers import AutoModel, AutoTokenizer

""" | |
Gradio app to run MiniCPM-V-4_5 int4 on CPU for image+text chat. | |
- Requires: pip install transformers accelerate gradio pillow | |
- Model: openbmb/MiniCPM-V-4_5-int4 (quantized, CPU-friendly) | |
- This script is self-contained and uses a simple multi-turn chat interface. | |
""" | |
MODEL_ID = os.environ.get("MINICPM_MODEL_ID", "openbmb/MiniCPM-V-4_5-int4")

# Global model/tokenizer, loaded once
model = None
tokenizer = None

def load_model():
    global model, tokenizer
    if model is not None and tokenizer is not None:
        return
    # For CPU inference, keep it simple and avoid .cuda() / bfloat16.
    # trust_remote_code is required because MiniCPM implements a custom .chat().
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, trust_remote_code=True)
    model = AutoModel.from_pretrained(
        MODEL_ID,
        trust_remote_code=True,
        attn_implementation="sdpa",  # SDPA is fine on CPU; avoid flash-attn on CPU
        torch_dtype=torch.float32,   # Safer default for CPU
        device_map="cpu",            # Ensure CPU execution
        quantization_config=None,
    )
    model.eval()
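
# Minimal smoke test outside Gradio (a sketch, not part of the app flow;
# "example.jpg" is a hypothetical local file, and the first load_model() call
# downloads the model weights):
#
#   load_model()
#   img = Image.open("example.jpg").convert("RGB")
#   msgs = [{"role": "user", "content": [img, "Describe this image."]}]
#   print(model.chat(msgs=msgs, tokenizer=tokenizer))
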
def build_messages(history: List[List[str]], image: Image.Image, user_input: str) -> List[Dict[str, Any]]:
    """
    Convert Gradio chat history + current inputs into the message format expected by MiniCPM's .chat().
    history: list of (user_text, assistant_text) pairs from gr.Chatbot (text-only transcript).
    image: PIL.Image or None for the current turn.
    user_input: current user text.
    Returns a msgs list with roles and content arrays [image?, text].
    """
    msgs = []
    # Reconstruct the multi-turn context by interleaving user/assistant turns.
    # We assume every user message and assistant reply in history is text-only;
    # only the current turn may carry an image.
    for user_text, assistant_text in history:
        if user_text is not None:
            msgs.append({"role": "user", "content": [user_text]})
        if assistant_text is not None:
            msgs.append({"role": "assistant", "content": [assistant_text]})
    # Append the current user turn (with optional image)
    content = []
    if image is not None:
        # Ensure RGB
        if image.mode != "RGB":
            image = image.convert("RGB")
        content.append(image)
    if user_input and user_input.strip():
        content.append(user_input.strip())
    else:
        # Ensure the content list is never empty
        content.append("")
    msgs.append({"role": "user", "content": content})
    return msgs
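
# Illustrative shape of the msgs list for a two-turn conversation (the
# <PIL.Image> placeholder stands for an actual PIL.Image.Image object):
#
#   [
#       {"role": "user", "content": ["What is in this photo?"]},
#       {"role": "assistant", "content": ["A cat on a sofa."]},
#       {"role": "user", "content": [<PIL.Image>, "And what breed is it?"]},
#   ]
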
def respond(user_text: str, image: Image.Image, chat_history: List[List[str]], enable_thinking: bool):
    """
    Inference handler for Gradio. Returns the updated chat history and clears the user textbox.
    """
    load_model()
    # Build MiniCPM messages
    msgs = build_messages(chat_history or [], image, user_text)
    # Run model.chat
    with torch.inference_mode():
        answer = model.chat(
            msgs=msgs,
            tokenizer=tokenizer,
            enable_thinking=enable_thinking,
        )
    # Update the history shown in the Chatbot: append (user_text, answer).
    # If user_text is empty but an image was provided, show a placeholder.
    shown_user_msg = user_text.strip() if (user_text and user_text.strip()) else "[Image]"
    chat_history = chat_history + [[shown_user_msg, answer]]
    return chat_history, ""
def clear_history():
    return [], None, ""
def demo_app():
    with gr.Blocks(title="MiniCPM-V-4_5-int4 (CPU) - Gradio", theme="soft") as demo:
        gr.Markdown("## MiniCPM-V-4_5-int4 (CPU) Demo\nUpload an image (optional) and ask a question.")
        with gr.Row():
            with gr.Column(scale=3):
                # Tuple-format history matches build_messages()/respond(), which
                # expect (user_text, assistant_text) pairs rather than message dicts.
                chatbot = gr.Chatbot(height=420, type="tuples", avatar_images=(None, None))
                with gr.Row():
                    img = gr.Image(type="pil", label="Image (optional)", height=240)
                    user_in = gr.Textbox(
                        label="Your message",
                        placeholder="Ask something about the image or chat without an image...",
                        lines=3,
                    )
                with gr.Row():
                    enable_thinking = gr.Checkbox(value=False, label="Enable thinking mode")
                    send_btn = gr.Button("Send", variant="primary")
                    clear_btn = gr.Button("Clear")
            with gr.Column(scale=1):
                gr.Markdown("### Model")
                gr.Markdown(f"- ID: `{MODEL_ID}`\n- Device: CPU\n- Quant: int4")
        # Events
        send_btn.click(
            fn=respond,
            inputs=[user_in, img, chatbot, enable_thinking],
            outputs=[chatbot, user_in],
        )
        user_in.submit(
            fn=respond,
            inputs=[user_in, img, chatbot, enable_thinking],
            outputs=[chatbot, user_in],
        )
        clear_btn.click(
            fn=clear_history,
            inputs=[],
            outputs=[chatbot, img, user_in],
        )
    return demo
if __name__ == "__main__":
    # Make sure we don't accidentally spawn a CUDA context
    os.environ.setdefault("CUDA_VISIBLE_DEVICES", "")
    demo = demo_app()
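    # Optional tweak (a suggestion, not in the original flow): enable Gradio's
    # request queue so long CPU generations don't time out:
    # demo = demo.queue()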
    demo.launch(server_name="0.0.0.0", server_port=int(os.environ.get("PORT", 7860)))