|
|
|
import os |
|
import re |
|
import cv2 |
|
import json |
|
import time |
|
import numpy as np |
|
import gradio as gr |
|
from infer import SeedVLInfer, ConversationModeI18N, ConversationModeCN |
|
from visualizer import draw_boxes_points_with_labels |
|
|
|
# Module-level SeedVLInfer instance called by the chat handlers below; the
# model id and API key are read from the MODEL_ID / API_KEY environment
# variables (os.getenv yields None for a variable that is not set).
infer = SeedVLInfer(
    model_id=os.getenv('MODEL_ID'),
    api_key=os.getenv('API_KEY'),
)
|
|
|
# UI strings, keyed first by component name and then by interface language
# ("English" / "中文"). The language selector looks entries up by these keys.
label_translations = {
    "gr_chatinterface_ofl": {"English": "Chatbot", "中文": "对话界面"},
    "gr_chatinterface_ol": {"English": "Chatbot", "中文": "对话界面"},
    "gr_tab_ol": {"English": "Online", "中文": "在线模式"},
    "gr_tab_ofl": {"English": "Offline", "中文": "离线模式"},
    # The thinking-mode checkbox label comes from the conversation-mode enums.
    "gr_thinking": {
        "English": ConversationModeI18N.D,
        "中文": ConversationModeCN.D,
    },
    "gr_temperature": {"English": "Temperature", "中文": "温度系数"},
    "gr_webcam_image": {"English": "🤳 Open Webcam", "中文": "🤳 打开摄像头"},
    "gr_webcam_images": {"English": "📹 Recorded Frames", "中文": "📹 录制的视频帧"},
    "gr_chatinterface_ofl.textbox.placeholder": {
        "English":
        "Ask me anything. You can also drop in images and .mp4 videos.",
        "中文": "有什么想问的?支持上传图片和.mp4视频。",
    },
    "gr_chatinterface_ol.textbox.placeholder": {
        "English": "Ask me anything...",
        "中文": "有什么想问的?",
    },
    "gr_clear_button": {"English": "🧹 Clear History", "中文": "🧹 清除历史对话"},
}
|
|
|
|
|
def add_escape(text: str) -> str:
    """Return *text* with every ``<`` and ``>`` prefixed by a backslash.

    This is the inverse of ``remove_escape``.
    NOTE(review): presumably this keeps tags such as ``<point>`` visible in
    the chat window instead of being interpreted as markup - confirm.
    """
    # The backslash is written explicitly ('\\<'): a bare '\<' is an invalid
    # escape sequence and triggers a SyntaxWarning on recent Python versions.
    return text.replace('<', '\\<').replace('>', '\\>')
|
|
|
|
|
def remove_escape(text: str) -> str:
    """Return *text* with every ``\\<`` and ``\\>`` turned back into ``<`` / ``>``.

    This undoes ``add_escape``; angle brackets that are not preceded by a
    backslash are left untouched.
    """
    # The backslash is written explicitly ('\\<'): a bare '\<' is an invalid
    # escape sequence and triggers a SyntaxWarning on recent Python versions.
    return text.replace('\\<', '<').replace('\\>', '>')
|
|
|
|
|
def _parse_detection(detection):
    """Return ``(category, [x1, y1, x2, y2])`` for one JSON detection entry.

    Returns ``None`` when the entry is malformed: not a mapping, missing the
    ``'category'`` / ``'bbox'`` keys, or not holding exactly four numbers.
    """
    try:
        category, bbox_str = detection['category'], detection['bbox']
        # '</bbox' (no closing '>') also covers a truncated closing tag.
        for tag in ('<bbox>', '</bbox>', '</bbox'):
            bbox_str = bbox_str.replace(tag, '')
        # split() without an argument tolerates repeated or odd whitespace.
        bbox = [float(value) for value in bbox_str.split()]
    except (KeyError, TypeError, AttributeError, ValueError):
        return None
    if len(bbox) != 4:
        return None
    return category, bbox


def plot_boxes_points_detections(image_path, message):
    """Draw the boxes or points mentioned in a model reply onto an image.

    The reply is scanned, in this order of priority, for:

    1. JSON lists of detections such as
       ``[{"category": "cup", "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]``;
    2. bare ``<bbox>x1 y1 x2 y2</bbox>`` tags (only when step 1 found no box);
    3. ``<point>x y</point>`` tags (only when no box was found at all).

    Coordinates are divided by 1000 and multiplied by the image width
    (x values) or height (y values) before drawing.

    Args:
        image_path: Path of the image to annotate; read with ``cv2.imread``.
        message: Reply text to parse.

    Returns:
        The value returned by ``draw_boxes_points_with_labels``, or ``None``
        when the reply contains no boxes or points or the image cannot be
        read.
    """
    detection_pattern = r'\[\s*{.*?}\s*\]'
    bboxes, categories = [], []
    for match in re.finditer(detection_pattern, message, flags=re.DOTALL):
        try:
            detections = json.loads(match.group(0))
        except json.JSONDecodeError:
            # Bracketed text that only looks like a detection list; ignore it
            # so the remaining matches and the tag fallbacks still run.
            continue
        for detection in detections:
            parsed = _parse_detection(detection)
            if parsed is None:
                continue
            categories.append(parsed[0])
            bboxes.append(parsed[1])

    if not bboxes:
        box_pattern = r'<bbox>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</bbox>'
        bboxes = [[float(value) for value in match.groups()]
                  for match in re.finditer(box_pattern, message)]

    points = []
    if not bboxes:
        point_pattern = r'<point>(\d+(?:\.\d+)?)\s+(\d+(?:\.\d+)?)</point>'
        points = [[float(value) for value in match.groups()]
                  for match in re.finditer(point_pattern, message)]

    if not bboxes and not points:
        return None

    bboxes = np.array(bboxes, dtype='float') / 1000
    points = np.array(points, dtype='float') / 1000

    image = cv2.imread(image_path)
    if image is None:
        # cv2.imread returns None instead of raising for an unreadable file.
        return None
    h, w = image.shape[:2]
    if bboxes.size:
        bboxes[:, 0::2] *= w  # x1, x2
        bboxes[:, 1::2] *= h  # y1, y2
    if points.size:
        points[:, 0] *= w
        points[:, 1] *= h
    output_image = draw_boxes_points_with_labels(image, bboxes, points,
                                                 categories)
    return output_image
|
|
|
|
|
def _split_reasoning(text: str):
    """Split ``<think>reasoning</think>answer`` into ``(reasoning, answer)``.

    When ``</think>`` has not appeared (yet), the whole text is treated as
    reasoning and the answer is the empty string.
    """
    reasoning, _, answer = text.partition('</think>')
    # Remove the literal prefix only; str.lstrip('<think>') would strip any
    # leading run of the characters '<', 't', 'h', 'i', 'n', 'k', '>' and so
    # eat the first letters of the reasoning text itself.
    if reasoning.startswith('<think>'):
        reasoning = reasoning[len('<think>'):]
    return reasoning, answer


def general_chat(inputs: dict,
                 gr_history: list,
                 infer_history: list,
                 if_thinking: bool,
                 temperature: float,
                 online: bool = False):
    """Stream the model's reply for one chat turn as chat messages.

    Args:
        inputs: Message dict. Its ``'text'`` entry (if present) has the
            backslash escaping removed before it is passed to ``infer``; its
            ``'files'`` entry (if present) lists the attached file paths.
        gr_history: Chat history supplied by the UI; not used here.
        infer_history: Conversation history handed to ``infer``.
        if_thinking: When true, ``ConversationModeI18N.D`` is used and each
            reply is split at ``</think>`` into a "Thinking" message and an
            answer message; otherwise ``ConversationModeI18N.G`` is used.
        temperature: Forwarded to ``infer``.
        online: Forwarded to ``infer``.

    Yields:
        ``(response_message, infer_history)`` for every item produced by
        ``infer``. ``response_message`` is a list of assistant message dicts.
        On the final item, if exactly one non-``.mp4`` file was attached and
        the reply contains boxes or points, an annotated image message is
        appended.
    """
    if 'text' in inputs:
        # Build a new dict instead of mutating the one owned by the caller.
        inputs = {**inputs, 'text': remove_escape(inputs['text'])}
    mode = ConversationModeI18N.D if if_thinking else ConversationModeI18N.G
    for response_text, infer_history, finished in infer(
            inputs=inputs,
            history=infer_history,
            mode=mode,
            temperature=temperature,
            online=online):
        if if_thinking:
            # partition-based split: a partial reply that has not reached
            # '</think>' yet must not raise.
            reasoning_text, response_text = _split_reasoning(response_text)
            response_message = [{
                "role": "assistant",
                "content": add_escape(reasoning_text),
                'metadata': {
                    'title': '🤔 Thinking'
                }
            }, {
                "role": "assistant",
                "content": add_escape(response_text)
            }]
        else:
            response_message = [{
                "role": "assistant",
                "content": add_escape(response_text)
            }]

        files = inputs.get('files', [])
        if finished and len(files) == 1 and not files[0].endswith('.mp4'):
            image_path = files[0]
            response_text = infer_history[-1]['content']
            try:
                if if_thinking:
                    _, response_text = _split_reasoning(response_text)
                output_image = plot_boxes_points_detections(
                    image_path, response_text)
                if output_image is not None:
                    response_message.append({
                        "role": "assistant",
                        "content": gr.Image(output_image),
                    })
            except Exception as e:
                # Drawing is best-effort: a failure here must not lose the
                # text reply, so it is only reported.
                print(e)
        yield response_message, infer_history
|
|
|
|
|
def online_record_chat(text: str, gr_history: list, gr_webcam_images: list,
                       gr_counter: int, infer_history: list, if_thinking: bool,
                       temperature: float):
    """Chat about the webcam frames recorded since the previous turn.

    Args:
        text: The user's message.
        gr_history: Chat history supplied by the UI; forwarded to
            ``general_chat``.
        gr_webcam_images: Recorded frames as ``(path, caption)`` pairs, or a
            falsy value when nothing has been recorded.
        gr_counter: Number of frames already consumed by earlier turns.
        infer_history: Conversation history forwarded to ``general_chat``.
        if_thinking: Forwarded to ``general_chat``.
        temperature: Forwarded to ``general_chat``.

    Yields:
        First ``(status_text, new_counter, infer_history)``, then for every
        item from ``general_chat`` a tuple
        ``(response_message, gr.skip(), infer_history)``.
    """
    # The counter comes from a numeric UI field and may arrive as a float
    # (or None); a float is not a valid slice index.
    counter = int(gr_counter or 0)
    new_frames = (gr_webcam_images or [])[counter:]
    inputs = {'text': text, 'files': [webp for webp, _ in new_frames]}
    yield (f'received {len(new_frames)} new frames, processing...',
           counter + len(new_frames), infer_history)
    for response_message, infer_history in general_chat(inputs,
                                                        gr_history,
                                                        infer_history,
                                                        if_thinking,
                                                        temperature,
                                                        online=True):
        yield response_message, gr.skip(), infer_history
|
|
|
|
|
# Gradio UI: a language selector plus two tabs. "Offline" is a multimodal chat
# over uploaded images / .mp4 videos; "Online" chats about frames streamed
# from the webcam.
with gr.Blocks(theme=gr.themes.Soft()) as demo:
    with gr.Row():
        with gr.Column():
            gr_title = gr.Markdown('<h1>Seed1.5-VL</h1>')
            gr_desc = gr.Markdown(
                '<h3>Advancing Multimodal Understanding and Reasoning.</h3>')

        gr_lang_selector = gr.Dropdown(choices=["English", "中文"],
                                       value="English",
                                       label="🌐 English Interface/中文界面",
                                       interactive=True,
                                       min_width=400,
                                       scale=0)

    def clear_history_fn():
        """Return reset values for the five outputs of a clear button.

        Order: conversation id, saved conversations, chatbot, chatbot state,
        inference history. Shared by both tabs.
        """
        return None, [], [], [], []

    with gr.Tabs():
        with gr.Tab("Offline") as gr_tab_ofl:
            gr_infer_history = gr.State([])
            # Hidden copies of the visible thinking / temperature controls.
            # The ChatInterface takes these as inputs; the visible controls
            # are created after it and mirror their value in via .change().
            gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
            gr_temperature_hidden = gr.Slider(minimum=0.0,
                                              maximum=2.0,
                                              step=0.1,
                                              value=0.0,
                                              interactive=True,
                                              visible=False)
            gr_chatinterface_ofl = gr.ChatInterface(
                fn=general_chat,
                type="messages",
                multimodal=True,
                chatbot=gr.Chatbot(height=600),
                textbox=gr.MultimodalTextbox(
                    file_count="multiple",
                    file_types=["image", ".mp4"],
                    sources=["upload"],
                    stop_btn=True,
                    placeholder=label_translations[
                        'gr_chatinterface_ofl.textbox.placeholder']['English'],
                ),
                additional_inputs=[
                    gr_infer_history, gr_thinking_hidden, gr_temperature_hidden
                ],
                additional_outputs=[gr_infer_history],
            )

            def add_escape_fn(inputs: dict):
                """Escape angle brackets in the submitted message's text."""
                if inputs and 'text' in inputs:
                    inputs['text'] = add_escape(inputs['text'])
                return inputs

            gr_chatinterface_ofl.textbox.submit(
                fn=add_escape_fn,
                inputs=[gr_chatinterface_ofl.saved_input],
                outputs=[gr_chatinterface_ofl.saved_input])
            # Clearing the chat window also empties the inference history.
            gr.on(triggers=[gr_chatinterface_ofl.chatbot.clear],
                  fn=lambda: [],
                  outputs=[gr_infer_history])
            with gr.Row():
                gr_thinking_ofl = gr.Checkbox(
                    value=True,
                    label=label_translations['gr_thinking']['English'],
                )
                gr_thinking_ofl.change(lambda x: x,
                                       inputs=gr_thinking_ofl,
                                       outputs=gr_thinking_hidden)
                gr_temperature_ofl = gr.Slider(
                    minimum=0.0,
                    maximum=2.0,
                    step=0.1,
                    value=0.0,
                    label=label_translations['gr_temperature']['English'],
                    interactive=True)
                gr_temperature_ofl.change(lambda x: x,
                                          inputs=gr_temperature_ofl,
                                          outputs=gr_temperature_hidden)
                gr_clear_button_ofl = gr.Button(
                    value=label_translations['gr_clear_button']['English'])

                gr_clear_button_ofl.click(
                    fn=clear_history_fn,
                    outputs=[
                        gr_chatinterface_ofl.conversation_id,
                        gr_chatinterface_ofl.saved_conversations,
                        gr_chatinterface_ofl.chatbot,
                        gr_chatinterface_ofl.chatbot_state, gr_infer_history
                    ])
            # English and Chinese example sets; the language selector toggles
            # which of the two columns is visible.
            with gr.Column(visible=True) as gr_examples_en:
                gr.Examples(
                    label=
                    '7 Examples: text, image, video, multiple images/videos, visual puzzle, points grounding, open-vocabulary detection.',
                    examples=[{
                        "text": "Who are you?",
                        "files": []
                    }, {
                        "text": "Introduce this.",
                        "files": ["examples/bancopy.jpg"]
                    }, {
                        "text":
                        """Find Curry's "Good Night" celebration time.""",
                        "files": ["examples/I7pTpMjqNRM_1080p_small.mp4"]
                    }, {
                        "text":
                        "Share your feelings.",
                        "files":
                        ["examples/newyork.jpg", "examples/beijing.jpg"]
                    }, {
                        "text": "Look and answer.",
                        "files": ["examples/puzzle.jpg"]
                    }, {
                        "text":
                        "Please point out all the hats on people's heads in the image, output concatenated point coordinates like <point>x y</point><point>x y</point>",
                        "files": ["examples/000000001000.jpeg"]
                    }, {
                        "text":
                        """Please detect all plate, photo, kid, cup in the image, and output all objects in the JSON format, which is a list of dict like [{"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": category, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
                        "files": ["examples/000000018380.jpeg"]
                    }],
                    inputs=[gr_chatinterface_ofl.textbox],
                )
            with gr.Column(visible=False) as gr_examples_cn:
                gr.Examples(
                    label='七个示例:文本,图像,视频,多个图像/视频,视觉解谜,坐标定位,开放式物体检测。',
                    examples=[{
                        "text": "你是谁?",
                        "files": []
                    }, {
                        "text": "介绍一下。",
                        "files": ["examples/bancopy.jpg"]
                    }, {
                        "text": "找到库里的“晚安”庆祝时间段。",
                        "files": ["examples/I7pTpMjqNRM_1080p_small.mp4"]
                    }, {
                        "text":
                        "你有什么感想?",
                        "files":
                        ["examples/newyork.jpg", "examples/beijing.jpg"]
                    }, {
                        "text": "看图回答。",
                        "files": ["examples/puzzle.jpg"]
                    }, {
                        "text":
                        "请点出图像中所有戴在头上的帽子, 输出串联的点坐标<point>x y</point><point>x y</point>",
                        "files": ["examples/000000001000.jpeg"]
                    }, {
                        "text":
                        """请检测图像中所有的盘子、照片、小孩和杯子。请以JSON格式输出一个由字典组成的列表,就像:[{"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}, {"category": 类别, "bbox": "<bbox>x1 y1 x2 y2</bbox>"}]""",
                        "files": ["examples/000000018380.jpeg"]
                    }],
                    inputs=[gr_chatinterface_ofl.textbox],
                )
        with gr.Tab("Online") as gr_tab_ol:
            with gr.Row():
                with gr.Column(scale=1):
                    gr_infer_history_ol = gr.State([])
                    # These rebind the names used in the Offline tab. The
                    # Offline wiring above already holds references to its own
                    # components, so only the Online tab sees these.
                    gr_thinking_hidden = gr.Checkbox(value=True, visible=False)
                    gr_temperature_hidden = gr.Slider(minimum=0.0,
                                                      maximum=2.0,
                                                      step=0.1,
                                                      value=1.0,
                                                      interactive=True,
                                                      visible=False)
                    with gr.Row():
                        with gr.Column(scale=1):
                            gr_webcam_image = gr.Image(
                                label=label_translations['gr_webcam_image']
                                ['English'],
                                sources="webcam",
                                height=250,
                                type='filepath')
                            gr_webcam_images = gr.Gallery(
                                label=label_translations['gr_webcam_images']
                                ['English'],
                                show_label=True,
                                format='webp',
                                columns=1,
                                height=250,
                                preview=True,
                                interactive=False)
                            # Number of gallery frames already sent to the
                            # model; updated by online_record_chat.
                            gr_counter = gr.Number(value=0, visible=False)
                        with gr.Column(scale=3):
                            gr_chatinterface_ol = gr.ChatInterface(
                                fn=online_record_chat,
                                type="messages",
                                multimodal=False,
                                chatbot=gr.Chatbot(height=600),
                                textbox=gr.Textbox(
                                    placeholder=label_translations[
                                        'gr_chatinterface_ol.textbox.placeholder']
                                    ['English'],
                                    submit_btn=True,
                                    stop_btn=True),
                                additional_inputs=[
                                    gr_webcam_images, gr_counter,
                                    gr_infer_history_ol, gr_thinking_hidden,
                                    gr_temperature_hidden
                                ],
                                additional_outputs=[
                                    gr_counter, gr_infer_history_ol
                                ],
                            )
                            # Same as in the Offline tab: clearing the chat
                            # window also empties the inference history, so
                            # the two cannot drift apart.
                            gr.on(triggers=[gr_chatinterface_ol.chatbot.clear],
                                  fn=lambda: [],
                                  outputs=[gr_infer_history_ol])

                            def cache_webcam(recorded_image: str,
                                             recorded_images: list):
                                """Append the latest webcam frame to the gallery list."""
                                if not recorded_images:
                                    recorded_images = []
                                return recorded_images + [recorded_image]

                            gr_webcam_image.stream(
                                fn=cache_webcam,
                                inputs=[gr_webcam_image, gr_webcam_images],
                                outputs=[gr_webcam_images],
                                stream_every=1,
                                concurrency_limit=30,
                            )
                            with gr.Row():
                                gr_thinking_ol = gr.Checkbox(
                                    value=True,
                                    label=label_translations['gr_thinking']
                                    ['English'],
                                )
                                gr_thinking_ol.change(
                                    lambda x: x,
                                    inputs=gr_thinking_ol,
                                    outputs=gr_thinking_hidden)
                                gr_temperature_ol = gr.Slider(
                                    minimum=0.0,
                                    maximum=2.0,
                                    step=0.1,
                                    value=1.0,
                                    label=label_translations['gr_temperature']
                                    ['English'],
                                    interactive=True)
                                gr_temperature_ol.change(
                                    lambda x: x,
                                    inputs=gr_temperature_ol,
                                    outputs=gr_temperature_hidden)
                                gr_clear_button_ol = gr.Button(
                                    value=label_translations['gr_clear_button']
                                    ['English'])

                                gr_clear_button_ol.click(
                                    fn=clear_history_fn,
                                    outputs=[
                                        gr_chatinterface_ol.conversation_id,
                                        gr_chatinterface_ol.saved_conversations,
                                        gr_chatinterface_ol.chatbot,
                                        gr_chatinterface_ol.chatbot_state,
                                        gr_infer_history_ol
                                    ])

    def update_lang(lang: str):
        """Return the UI updates for switching the interface to *lang*.

        The tuple holds one ``gr.update`` per component listed in the
        ``outputs`` of ``gr_lang_selector.change`` below, in the same order.
        """
        return (
            gr.update(label=label_translations['gr_chatinterface_ofl'][lang]),
            gr.update(label=label_translations['gr_chatinterface_ol'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ofl.textbox.placeholder'][lang]),
            gr.update(placeholder=label_translations[
                'gr_chatinterface_ol.textbox.placeholder'][lang]),
            gr.update(label=label_translations['gr_tab_ofl'][lang]),
            gr.update(label=label_translations['gr_tab_ol'][lang]),
            gr.update(label=label_translations['gr_thinking'][lang]),
            gr.update(label=label_translations['gr_thinking'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            gr.update(label=label_translations['gr_temperature'][lang]),
            # Show exactly one of the two example columns.
            gr.update(visible=lang == 'English'),
            gr.update(visible=lang != 'English'),
            gr.update(label=label_translations['gr_webcam_image'][lang]),
            gr.update(label=label_translations['gr_webcam_images'][lang]),
            gr.update(value=label_translations['gr_clear_button'][lang]),
            gr.update(value=label_translations['gr_clear_button'][lang]),
        )

    gr_lang_selector.change(fn=update_lang,
                            inputs=[gr_lang_selector],
                            outputs=[
                                gr_chatinterface_ofl.chatbot,
                                gr_chatinterface_ol.chatbot,
                                gr_chatinterface_ofl.textbox,
                                gr_chatinterface_ol.textbox,
                                gr_tab_ofl,
                                gr_tab_ol,
                                gr_thinking_ofl,
                                gr_thinking_ol,
                                gr_temperature_ofl,
                                gr_temperature_ol,
                                gr_examples_en,
                                gr_examples_cn,
                                gr_webcam_image,
                                gr_webcam_images,
                                gr_clear_button_ofl,
                                gr_clear_button_ol,
                            ])
|
# Configure the request queue, then start the app with a public share link.
demo.queue(default_concurrency_limit=100, max_size=100).launch(
    share=True,
    max_threads=100,
    ssr_mode=False,
)
|
|