Spaces:

ByteDance-Seed
/

Seed1.5-VL

Paused

App Files Files Community

Seed1.5-VL / app.py

wondervictor

change styles

d8936c7 3 months ago

raw

history blame contribute delete

22.6 kB

	# Copyright (2025) [Seed-VL-Cookbook] Bytedance Seed
	import os
	import re
	import cv2
	import json
	import time
	import numpy as np
	import gradio as gr
	from infer import SeedVLInfer, ConversationModeI18N, ConversationModeCN
	from visualizer import draw_boxes_points_with_labels

	# Shared inference client used by the chat callbacks; the model id and API
	# key are taken from the MODEL_ID and API_KEY environment variables
	# (each is None when the variable is unset).
	infer = SeedVLInfer(
	    model_id=os.environ.get('MODEL_ID'),
	    api_key=os.environ.get('API_KEY'),
	)

	# UI strings keyed first by component name, then by the interface language
	# offered in the language dropdown ("English" / "中文").
	label_translations = {
	    # Chatbot panels and tab titles.
	    "gr_chatinterface_ofl": {"English": "Chatbot", "中文": "对话界面"},
	    "gr_chatinterface_ol": {"English": "Chatbot", "中文": "对话界面"},
	    "gr_tab_ol": {"English": "Online", "中文": "在线模式"},
	    "gr_tab_ofl": {"English": "Offline", "中文": "离线模式"},
	    # The thinking checkbox is labelled with the `D` conversation mode
	    # constants imported from `infer`.
	    "gr_thinking": {
	        "English": ConversationModeI18N.D,
	        "中文": ConversationModeCN.D,
	    },
	    "gr_temperature": {"English": "Temperature", "中文": "温度系数"},
	    # Webcam widgets of the Online tab.
	    "gr_webcam_image": {
	        "English": "🤳 Open Webcam",
	        "中文": "🤳 打开摄像头",
	    },
	    "gr_webcam_images": {
	        "English": "📹 Recorded Frames",
	        "中文": "📹 录制的视频帧",
	    },
	    # Textbox placeholders.
	    "gr_chatinterface_ofl.textbox.placeholder": {
	        "English":
	            "Ask me anything. You can also drop in images and .mp4 videos.",
	        "中文": "有什么想问的？支持上传图片和.mp4视频。",
	    },
	    "gr_chatinterface_ol.textbox.placeholder": {
	        "English": "Ask me anything...",
	        "中文": "有什么想问的？",
	    },
	    "gr_clear_button": {
	        "English": "🧹 Clear History",
	        "中文": "🧹 清除历史对话",
	    },
	}


	def add_escape(text: str) -> str:
	    r"""Backslash-escape angle brackets in ``text``.

	    Every ``<`` becomes ``\<`` and every ``>`` becomes ``\>``.  The
	    result is what gets placed into chat messages, so tags such as
	    ``<point>`` are shown literally.

	    Args:
	        text: Text to escape.

	    Returns:
	        The escaped copy of ``text``.
	    """
	    # The backslash is spelled '\\' explicitly: a bare '\<' is an invalid
	    # escape sequence that only works by accident and triggers a warning.
	    return text.replace('<', '\\<').replace('>', '\\>')


	def remove_escape(text: str) -> str:
	    r"""Undo the angle-bracket escaping: ``\<`` -> ``<`` and ``\>`` -> ``>``.

	    Args:
	        text: Text that may contain backslash-escaped angle brackets.

	    Returns:
	        ``text`` with every ``\<`` / ``\>`` replaced by the bare bracket.
	    """
	    # The backslash is spelled '\\' explicitly: a bare '\<' is an invalid
	    # escape sequence that only works by accident and triggers a warning.
	    return text.replace('\\<', '<').replace('\\>', '>')


	def plot_boxes_points_detections(image_path, message):
	    """Draw the boxes or points mentioned in ``message`` onto an image.

	    ``message`` is searched, in order of preference, for:

	    1. JSON detection lists such as
	       ``[{"category": "cup", "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]``;
	    2. bare ``<bbox>x1 y1 x2 y2</bbox>`` tags (only if step 1 found none);
	    3. ``<point>x y</point>`` tags (only if no boxes were found at all).

	    Coordinates are divided by 1000 and multiplied by the image width
	    (x values) or height (y values) before drawing.

	    Args:
	        image_path: Path of the image to load with ``cv2.imread``.
	        message: Model reply to scan for detections.

	    Returns:
	        The result of ``draw_boxes_points_with_labels``, or ``None`` when
	        ``message`` contains no boxes or points, or the image cannot be
	        read.

	    Raises:
	        json.JSONDecodeError, KeyError, ValueError: If a matched detection
	            list is not valid JSON or an entry is malformed.
	    """
	    # `\s*` and the lazy `.*?` let one match span a whole (possibly
	    # multi-line, multi-entry) list; the previous pattern allowed at most
	    # one character between the braces and so never matched real output.
	    detection_pattern = r'\[\s*\{.*?\}\s*\]'
	    bboxes, categories = [], []
	    for match in re.finditer(detection_pattern, message, flags=re.DOTALL):
	        for detection in json.loads(match.group(0)):
	            cat, bbox_str = detection['category'], detection['bbox']
	            # '</bbox' covers a closing tag that lost its final '>'.
	            for tag in ('<bbox>', '</bbox>', '</bbox'):
	                bbox_str = bbox_str.replace(tag, '')
	            # split() without an argument tolerates repeated or
	            # surrounding whitespace, unlike split(' ').
	            bboxes.append([float(value) for value in bbox_str.split()])
	            categories.append(cat)

	    if not bboxes:
	        box_pattern = (r'<bbox>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+'
	                       r'(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</bbox>')
	        bboxes = [[float(value) for value in match.groups()]
	                  for match in re.finditer(box_pattern, message)]

	    points = []
	    if not bboxes:
	        point_pattern = r'<point>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</point>'
	        points = [[float(value) for value in match.groups()]
	                  for match in re.finditer(point_pattern, message)]

	    if not bboxes and not points:
	        return None

	    bboxes = np.array(bboxes, dtype='float') / 1000
	    points = np.array(points, dtype='float') / 1000

	    image = cv2.imread(image_path)
	    if image is None:
	        # cv2.imread reports an unreadable file by returning None.
	        return None
	    h, w = image.shape[:2]
	    if bboxes.size:
	        bboxes[:, 0::2] *= w  # x1, x2
	        bboxes[:, 1::2] *= h  # y1, y2
	    if points.size:
	        points[:, 0] *= w
	        points[:, 1] *= h
	    return draw_boxes_points_with_labels(image, bboxes, points, categories)


	def general_chat(inputs: dict,
	                 gr_history: list,
	                 infer_history: list,
	                 if_thinking: bool,
	                 temperature: float,
	                 online: bool = False):
	    """Stream assistant messages for one user turn.

	    Args:
	        inputs: User turn; may hold ``'text'`` and ``'files'`` entries.
	            ``inputs['text']`` is un-escaped in place before inference.
	        gr_history: Chat history supplied by the UI; not used here.
	        infer_history: History forwarded to ``infer`` and replaced by the
	            history it yields.
	        if_thinking: Selects ``ConversationModeI18N.D`` (otherwise ``.G``)
	            and splits the reply into reasoning and answer at ``</think>``.
	        temperature: Forwarded to ``infer``.
	        online: Forwarded to ``infer``.

	    Yields:
	        ``(response_message, infer_history)`` for every chunk produced by
	        ``infer``.  ``response_message`` is a list of assistant message
	        dicts.  When the turn is finished and exactly one non-``.mp4`` file
	        was attached, an image message with the drawn boxes/points is
	        appended if any are found in the final answer.
	    """
	    think_open, think_close = '<think>', '</think>'
	    if 'text' in inputs:
	        inputs['text'] = remove_escape(inputs['text'])
	    mode = ConversationModeI18N.D if if_thinking else ConversationModeI18N.G
	    for response_text, infer_history, finished in infer(
	            inputs=inputs,
	            history=infer_history,
	            mode=mode,
	            temperature=temperature,
	            online=online):
	        if if_thinking:
	            # partition() cannot fail: while '</think>' has not arrived yet
	            # everything counts as reasoning and the answer is empty.
	            reasoning_text, _, answer_text = response_text.partition(
	                think_close)
	            # Drop the opening tag as a prefix.  str.lstrip('<think>')
	            # strips a *set of characters* and would also eat leading
	            # letters such as 't', 'h', 'i', 'n', 'k' of the reasoning.
	            if reasoning_text.startswith(think_open):
	                reasoning_text = reasoning_text[len(think_open):]
	            response_message = [{
	                "role": "assistant",
	                "content": add_escape(reasoning_text),
	                'metadata': {
	                    'title': '🤔 Thinking'
	                }
	            }, {
	                "role": "assistant",
	                "content": add_escape(answer_text)
	            }]
	        else:
	            response_message = [{
	                "role": "assistant",
	                "content": add_escape(response_text)
	            }]
	        files = inputs.get('files') or []
	        if finished and len(files) == 1 and not files[0].endswith('.mp4'):
	            image_path = files[0]
	            final_text = infer_history[-1]['content']
	            # Visualisation is best effort: a failure is printed and the
	            # text reply is still delivered.
	            try:
	                if if_thinking:
	                    _, found, tail = final_text.partition(think_close)
	                    if found:
	                        # Only the answer after the reasoning is parsed.
	                        final_text = tail
	                output_image = plot_boxes_points_detections(
	                    image_path, final_text)
	                if output_image is not None:
	                    response_message.append({
	                        "role": "assistant",
	                        "content": gr.Image(output_image),
	                    })
	            except Exception as e:
	                print(e)
	        yield response_message, infer_history


	def online_record_chat(text: str, gr_history: list, gr_webcam_images: list,
	                       gr_counter: int, infer_history: list, if_thinking: bool,
	                       temperature: float):
	    """Chat callback of the Online tab: send the not-yet-seen webcam frames.

	    Frames from index ``gr_counter`` onwards are treated as new.  The first
	    yield is a status line together with the advanced counter; every later
	    yield relays a ``general_chat`` message while leaving the counter
	    untouched (``gr.skip()``).

	    Yields:
	        ``(message, counter_update, infer_history)`` tuples.
	    """
	    recorded = gr_webcam_images if gr_webcam_images else []
	    fresh_frames = recorded[gr_counter:]
	    frame_total = len(fresh_frames)
	    # Gallery entries are unpacked as pairs; only the first element is sent.
	    inputs = {
	        'text': text,
	        'files': [frame_path for frame_path, _ in fresh_frames],
	    }
	    status_line = f'received {frame_total} new frames, processing...'
	    yield status_line, gr_counter + frame_total, infer_history

	    chat_stream = general_chat(inputs,
	                               gr_history,
	                               infer_history,
	                               if_thinking,
	                               temperature,
	                               online=True)
	    for response_message, infer_history in chat_stream:
	        yield response_message, gr.skip(), infer_history


	# Build the Gradio UI: a title/description header, a language selector and
	# the Offline / Online tabs.
	with gr.Blocks(theme=gr.themes.Soft()) as demo:
	with gr.Row():
	with gr.Column():
	gr_title = gr.Markdown('<h1>Seed1.5-VL</h1>')
	gr_desc = gr.Markdown('<h3>Advancing Multimodal Understanding and Reasoning.</h3>')

	# Interface language; its change event is wired to update_lang below.
	gr_lang_selector = gr.Dropdown(choices=["English", "中文"],
	value="English",
	label="🌐 English Interface/中文界面",
	interactive=True,
	min_width=400,
	scale=0)

	with gr.Tabs():
	with gr.Tab("Offline") as gr_tab_ofl:
	# Inference history handed to general_chat and written back by it.
	gr_infer_history = gr.State([])
	# Invisible checkbox/slider that the ChatInterface receives as additional
	# inputs; the visible controls created later copy their values into these.
	gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
	gr_temperature_hidden = gr.Slider(minimum=0.0,
	maximum=2.0,
	step=0.1,
	value=0.0,
	interactive=True,
	visible=False)
	# Multimodal chat driven by general_chat; the textbox accepts multiple
	# uploaded images and .mp4 files.
	gr_chatinterface_ofl = gr.ChatInterface(
	fn=general_chat,
	type="messages",
	multimodal=True,
	chatbot=gr.Chatbot(height=600),
	textbox=gr.MultimodalTextbox(
	file_count="multiple",
	file_types=["image", ".mp4"],
	sources=["upload"],
	stop_btn=True,
	placeholder=label_translations[
	'gr_chatinterface_ofl.textbox.placeholder']['English'],
	),
	additional_inputs=[
	gr_infer_history, gr_thinking_hidden, gr_temperature_hidden
	],
	additional_outputs=[gr_infer_history],
	)

	def add_escape_fn(inputs: dict):
	    """Escape angle brackets in ``inputs['text']`` in place and return ``inputs``.

	    ``inputs`` is returned untouched when it is empty/``None`` or has no
	    ``'text'`` entry.
	    """
	    if not inputs or 'text' not in inputs:
	        return inputs
	    inputs['text'] = add_escape(inputs['text'])
	    return inputs

	# On submit, run add_escape_fn over the ChatInterface's saved input and
	# write the result back to it.
	gr_chatinterface_ofl.textbox.submit(
	fn=add_escape_fn,
	inputs=[gr_chatinterface_ofl.saved_input],
	outputs=[gr_chatinterface_ofl.saved_input])
	# Clearing the chatbot also resets the inference history to an empty list.
	gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
	fn=lambda: [],
	outputs=[gr_infer_history])
	with gr.Row():
	# Visible thinking checkbox; every change is copied into the hidden
	# checkbox that the ChatInterface reads.
	gr_thinking_ofl = gr.Checkbox(
	value=True,
	label=label_translations['gr_thinking']['English'],
	)
	gr_thinking_ofl.change(lambda x: x,
	inputs=gr_thinking_ofl,
	outputs=gr_thinking_hidden)
	# Visible temperature slider, mirrored into the hidden slider the same way.
	gr_temperature_ofl = gr.Slider(
	minimum=0.0,
	maximum=2.0,
	step=0.1,
	value=0.0,
	label=label_translations['gr_temperature']['English'],
	interactive=True)
	gr_temperature_ofl.change(lambda x: x,
	inputs=gr_temperature_ofl,
	outputs=gr_temperature_hidden)
	# Button whose click handler is attached just below.
	gr_clear_button_ofl = gr.Button(
	value=label_translations['gr_clear_button']['English'])
	def clear_history_fn():
	    """Return the reset values for the Offline tab's clear button.

	    The five values are, in order: ``None`` for the conversation id, then
	    a fresh empty list each for the saved conversations, the chatbot, the
	    chatbot state and the inference history.
	    """
	    conversation_id = None
	    empty_lists = tuple([] for _ in range(4))
	    return (conversation_id,) + empty_lists

	# The clear button resets, in this order: conversation id, saved
	# conversations, chatbot, chatbot state and the inference history.  The
	# order must match the tuple returned by clear_history_fn.
	gr_clear_button_ofl.click(
	fn=clear_history_fn,
	outputs=[
	gr_chatinterface_ofl.conversation_id,
	gr_chatinterface_ofl.saved_conversations,
	gr_chatinterface_ofl.chatbot,
	gr_chatinterface_ofl.chatbot_state, gr_infer_history
	])
	# English example prompts; this column starts visible.
	with gr.Column(visible=True) as gr_examples_en:
	gr.Examples(
	label=
	'7 Examples: text, image, video, multiple images/videos, visual puzzle, points grounding, open-vocabulary detection.',
	examples=[{
	"text": "Who are you?",
	"files": []
	}, {
	"text": "Introduce this.",
	"files": ["examples/bancopy.jpg"]
	}, {
	"text":
	"""Find Curry's "Good Night" celebration time.""",
	"files": ["examples/I7pTpMjqNRM_1080p_small.mp4"]
	}, {
	"text":
	"Share your feelings.",
	"files":
	["examples/newyork.jpg", "examples/beijing.jpg"]
	}, {
	"text": "Look and answer.",
	"files": ["examples/puzzle.jpg"]
	}, {
	"text":
	"Please point out all the hats on people's heads in the image, output concatenated point coordinates like <point>x y</point><point>x y</point>",
	"files": ["examples/000000001000.jpeg"]
	}, {
	"text":
	"""Please detect all plate, photo, kid, cup in the image, and output all objects in the JSON format, which is a list of dict like [{"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
	"files": ["examples/000000018380.jpeg"]
	}],
	inputs=[gr_chatinterface_ofl.textbox],
	)
	# Chinese example prompts (same files as the English set); this column
	# starts hidden and update_lang toggles which of the two is visible.
	with gr.Column(visible=False) as gr_examples_cn:
	gr.Examples(
	label='七个示例：文本，图像，视频，多个图像/视频，视觉解谜，坐标定位，开放式物体检测。',
	examples=[{
	"text": "你是谁？",
	"files": []
	}, {
	"text": "介绍一下。",
	"files": ["examples/bancopy.jpg"]
	}, {
	"text": "找到库里的“晚安”庆祝时间段。",
	"files": ["examples/I7pTpMjqNRM_1080p_small.mp4"]
	}, {
	"text":
	"你有什么感想？",
	"files":
	["examples/newyork.jpg", "examples/beijing.jpg"]
	}, {
	"text": "看图回答。",
	"files": ["examples/puzzle.jpg"]
	}, {
	"text":
	"请点出图像中所有戴在头上的帽子, 输出串联的点坐标<point>x y</point><point>x y</point>",
	"files": ["examples/000000001000.jpeg"]
	}, {
	"text":
	"""请检测图像中所有的盘子、照片、小孩和杯子。请以JSON格式输出一个由字典组成的列表，就像：[{"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
	"files": ["examples/000000018380.jpeg"]
	}],
	inputs=[gr_chatinterface_ofl.textbox],
	)
	# Online tab: webcam frames are collected in a gallery and sent together
	# with the typed text through online_record_chat.
	with gr.Tab("Online") as gr_tab_ol:
	with gr.Row():
	with gr.Column(scale=1):
	# Inference history of the Online tab, separate from the Offline one.
	gr_infer_history_ol = gr.State([])
	# NOTE: these two assignments rebind gr_thinking_hidden and
	# gr_temperature_hidden to new components.  The Offline ChatInterface and
	# its change handlers keep the components created earlier, because they
	# were passed in before this point.
	gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
	gr_temperature_hidden = gr.Slider(minimum=0.0,
	maximum=2.0,
	step=0.1,
	value=1.0,
	interactive=True,
	visible=False)
	with gr.Row():
	with gr.Column(scale=1):
	# Webcam source; delivers frames as file paths (type='filepath').
	gr_webcam_image = gr.Image(
	label=label_translations['gr_webcam_image']
	['English'],
	sources="webcam",
	height=250,
	type='filepath')
	# Read-only gallery holding the frames recorded so far.
	gr_webcam_images = gr.Gallery(
	label=label_translations['gr_webcam_images']
	['English'],
	show_label=True,
	format='webp',
	columns=1,
	height=250,
	preview=True,
	interactive=False)
	# Hidden counter; online_record_chat uses it as the index of the first
	# gallery frame not yet sent and returns the advanced value.
	gr_counter = gr.Number(value=0, visible=False)
	with gr.Column(scale=3):
	# Text-only chat; frames, counter, history and the hidden controls are
	# passed as additional inputs.
	gr_chatinterface_ol = gr.ChatInterface(
	fn=online_record_chat,
	type="messages",
	multimodal=False,
	chatbot=gr.Chatbot(height=600),
	textbox=gr.
	Textbox(placeholder=label_translations[
	'gr_chatinterface_ol.textbox.placeholder']
	['English'],
	submit_btn=True,
	stop_btn=True),
	additional_inputs=[
	gr_webcam_images, gr_counter,
	gr_infer_history_ol, gr_thinking_hidden,
	gr_temperature_hidden
	],
	additional_outputs=[
	gr_counter, gr_infer_history_ol
	],
	)

	def cache_webcam(recorded_image: str, recorded_images: list):
	    """Return a new list: the frames recorded so far plus ``recorded_image``.

	    An empty or ``None`` ``recorded_images`` is treated as no frames; the
	    list passed in is not modified.
	    """
	    earlier_frames = recorded_images or []
	    return earlier_frames + [recorded_image]

	# Stream webcam frames into the gallery: cache_webcam receives the current
	# frame plus the gallery contents and returns the extended list.
	gr_webcam_image.stream(
	fn=cache_webcam,
	inputs=[gr_webcam_image, gr_webcam_images],
	outputs=[gr_webcam_images],
	stream_every=1,
	concurrency_limit=30,
	)
	with gr.Row():
	# Visible thinking checkbox; every change is copied into the Online tab's
	# hidden checkbox.
	gr_thinking_ol = gr.Checkbox(
	value=True,
	label=label_translations['gr_thinking']
	['English'],
	)
	gr_thinking_ol.change(
	lambda x: x,
	inputs=gr_thinking_ol,
	outputs=gr_thinking_hidden)
	# Visible temperature slider (default 1.0 here, 0.0 on the Offline tab),
	# mirrored into the hidden slider the same way.
	gr_temperature_ol = gr.Slider(
	minimum=0.0,
	maximum=2.0,
	step=0.1,
	value=1.0,
	label=label_translations['gr_temperature']
	['English'],
	interactive=True)
	gr_temperature_ol.change(
	lambda x: x,
	inputs=gr_temperature_ol,
	outputs=gr_temperature_hidden)
	# Button whose click handler is attached just below.
	gr_clear_button_ol = gr.Button(
	value=label_translations['gr_clear_button']
	['English'])

	def clear_history_fn():
	    """Return the reset values for the Online tab's clear button.

	    The five values are, in order: ``None`` for the conversation id, then
	    a fresh empty list each for the saved conversations, the chatbot, the
	    chatbot state and the Online inference history.
	    """
	    conversation_id = None
	    empty_lists = tuple([] for _ in range(4))
	    return (conversation_id,) + empty_lists

	# The clear button resets, in this order: conversation id, saved
	# conversations, chatbot, chatbot state and the Online inference history.
	# The order must match the tuple returned by clear_history_fn.
	gr_clear_button_ol.click(
	fn=clear_history_fn,
	outputs=[
	gr_chatinterface_ol.conversation_id,
	gr_chatinterface_ol.
	saved_conversations,
	gr_chatinterface_ol.chatbot,
	gr_chatinterface_ol.chatbot_state,
	gr_infer_history_ol
	])

	def update_lang(lang: str):
	    """Build the component updates that switch the interface to ``lang``.

	    ``lang`` must be a key of the ``label_translations`` entries
	    ("English" or "中文").  The 16 updates are returned in the order of the
	    ``outputs`` list of the language selector's change event.
	    """

	    def translated(key):
	        # Look up the UI string of one component in the chosen language.
	        return label_translations[key][lang]

	    english_selected = lang == 'English'
	    return (
	        # Chatbot labels (offline, online).
	        gr.update(label=translated('gr_chatinterface_ofl')),
	        gr.update(label=translated('gr_chatinterface_ol')),
	        # Textbox placeholders (offline, online).
	        gr.update(placeholder=translated(
	            'gr_chatinterface_ofl.textbox.placeholder')),
	        gr.update(placeholder=translated(
	            'gr_chatinterface_ol.textbox.placeholder')),
	        # Tab titles.
	        gr.update(label=translated('gr_tab_ofl')),
	        gr.update(label=translated('gr_tab_ol')),
	        # Thinking checkboxes (offline, online).
	        gr.update(label=translated('gr_thinking')),
	        gr.update(label=translated('gr_thinking')),
	        # Temperature sliders (offline, online).
	        gr.update(label=translated('gr_temperature')),
	        gr.update(label=translated('gr_temperature')),
	        # Exactly one of the two example columns is visible.
	        gr.update(visible=english_selected),
	        gr.update(visible=not english_selected),
	        # Webcam image and recorded-frames gallery.
	        gr.update(label=translated('gr_webcam_image')),
	        gr.update(label=translated('gr_webcam_images')),
	        # Clear buttons (offline, online).
	        gr.update(value=translated('gr_clear_button')),
	        gr.update(value=translated('gr_clear_button')),
	    )

	# Re-label the UI whenever the language changes.  The outputs below are
	# listed in the same order as the tuple returned by update_lang; keep the
	# two in sync when adding or removing a component.
	gr_lang_selector.change(fn=update_lang,
	inputs=[gr_lang_selector],
	outputs=[
	gr_chatinterface_ofl.chatbot,
	gr_chatinterface_ol.chatbot,
	gr_chatinterface_ofl.textbox,
	gr_chatinterface_ol.textbox,
	gr_tab_ofl,
	gr_tab_ol,
	gr_thinking_ofl,
	gr_thinking_ol,
	gr_temperature_ofl,
	gr_temperature_ol,
	gr_examples_en,
	gr_examples_cn,
	gr_webcam_image,
	gr_webcam_images,
	gr_clear_button_ofl,
	gr_clear_button_ol,
	])
	# Enable the request queue and start the app; this runs on import because
	# there is no `if __name__ == "__main__"` guard.
	demo.queue(default_concurrency_limit=100, max_size=100).launch(share=True,
	max_threads=100,
	ssr_mode=False)