echarlaix (HF Staff) committed
Commit 3586102 · Parent: ce41e3e

add model choices

Files changed (1): app.py (+29 -6)
app.py CHANGED
@@ -12,12 +12,22 @@ from transformers import AutoModelForImageTextToText, AutoProcessor
 from transformers.generation.streamers import TextIteratorStreamer
 from optimum.intel import OVModelForVisualCausalLM
 
-model_id = "echarlaix/SmolVLM2-2.2B-Instruct-openvino"
-# model_id = "echarlaix/SmolVLM-256M-Instruct-openvino"
-# model_id = "echarlaix/SmolVLM2-500M-Video-Instruct-openvino"
 
-processor = AutoProcessor.from_pretrained(model_id)
-model = OVModelForVisualCausalLM.from_pretrained(model_id)
+default_model_id = "echarlaix/SmolVLM2-2.2B-Instruct-openvino"
+
+model_cache = {
+    "model_id" : default_model_id,
+    "processor" : AutoProcessor.from_pretrained(default_model_id),
+    "model" : OVModelForVisualCausalLM.from_pretrained(default_model_id),
+}
+
+def update_model(model_id):
+    if model_cache["model_id"] != model_id:
+        model_cache["model_id"] = model_id
+        model_cache["processor"] = AutoProcessor.from_pretrained(model_id)
+        model_cache["model"] = OVModelForVisualCausalLM.from_pretrained(model_id)
+
+
 
 IMAGE_FILE_TYPES = (".jpg", ".jpeg", ".png", ".webp")
 VIDEO_FILE_TYPES = (".mp4", ".mov", ".webm")
@@ -152,7 +162,12 @@ def process_history(history: list[dict]) -> list[dict]:
 
 
 @torch.inference_mode()
-def generate(message: dict, history: list[dict], system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
+def generate(message: dict, history: list[dict], model_id: str, system_prompt: str = "", max_new_tokens: int = 512) -> Iterator[str]:
+
+    update_model(model_id)
+    processor = model_cache["processor"]
+    model = model_cache["model"]
+
     if not validate_media_constraints(message):
         yield ""
         return
@@ -238,6 +253,13 @@ examples = [
     ],
 ]
 
+
+model_choices = [
+    "echarlaix/SmolVLM2-2.2B-Instruct-openvino",
+    "echarlaix/SmolVLM-256M-Instruct-openvino",
+    "echarlaix/SmolVLM2-500M-Video-Instruct-openvino",
+]
+
 demo = gr.ChatInterface(
     fn=generate,
     type="messages",
@@ -248,6 +270,7 @@ demo = gr.ChatInterface(
     ),
     multimodal=True,
     additional_inputs=[
+        gr.Dropdown(model_choices, value=model_choices[0], label="Model ID"),
        gr.Textbox(label="System Prompt", value="You are a helpful assistant."),
        gr.Slider(label="Max New Tokens", minimum=100, maximum=2000, step=10, value=700),
    ],
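
As context for the wiring above: gr.ChatInterface passes the current values of additional_inputs to fn in order, after (message, history), so the new Dropdown selection arrives as the third positional argument of generate, which is why the model_id parameter is inserted before system_prompt. The snippet below is a minimal, self-contained sketch of the same reload-on-change cache pattern; load_processor and load_model are hypothetical stand-ins for AutoProcessor.from_pretrained and OVModelForVisualCausalLM.from_pretrained so the example runs without downloading any weights.

# Minimal sketch of the reload-on-change cache pattern used in this commit.
# load_processor / load_model are hypothetical stand-ins for
# AutoProcessor.from_pretrained / OVModelForVisualCausalLM.from_pretrained.

def load_processor(model_id: str) -> str:
    return f"processor for {model_id}"

def load_model(model_id: str) -> str:
    return f"model for {model_id}"

default_model_id = "echarlaix/SmolVLM2-2.2B-Instruct-openvino"

# Single-entry cache: holds whichever model was selected last.
model_cache = {
    "model_id": default_model_id,
    "processor": load_processor(default_model_id),
    "model": load_model(default_model_id),
}

def update_model(model_id: str) -> None:
    # Reload only when the requested id differs from the cached one,
    # so repeated requests with the same dropdown selection cost nothing.
    if model_cache["model_id"] != model_id:
        model_cache["model_id"] = model_id
        model_cache["processor"] = load_processor(model_id)
        model_cache["model"] = load_model(model_id)

update_model("echarlaix/SmolVLM-256M-Instruct-openvino")
print(model_cache["model"])  # model for echarlaix/SmolVLM-256M-Instruct-openvino

Because the cache holds a single entry, selecting a different model in the dropdown reloads both the processor and the model and discards the previously loaded ones.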