reverting to chat.completion API
Files changed:
- README.md  +1 -1
- app.py     +47 -78
- gateway.py +26 -89
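For context on what "reverting to chat.completion API" means on the client side: the previous gateway consumed typed Responses-API stream events, while the new one iterates plain Chat Completions chunks. Below is a minimal sketch of the two styles using the official openai-python client; the endpoint, key, and prompt are placeholders, not values from this Space.

```python
from openai import OpenAI

# Placeholder endpoint/key; the Space reads these from API_ENDPOINT / API_KEY.
client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

# After the revert: Chat Completions streaming, as gateway.py below now does.
chat_stream = client.chat.completions.create(
    model="openai/gpt-oss-120b",
    messages=[{"role": "user", "content": "Hello"}],
    stream=True,
)
for chunk in chat_stream:
    print(chunk.choices[0].delta.content or "", end="")

# Before the revert: Responses API streaming, which yields typed events
# (e.g. "response.output_text.delta") instead of chat chunks.
event_stream = client.responses.create(
    model="openai/gpt-oss-120b",
    input="Hello",
    stream=True,
)
for event in event_stream:
    if event.type == "response.output_text.delta":
        print(event.delta, end="")
```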
README.md CHANGED

@@ -8,7 +8,7 @@ sdk_version: 5.36.2
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: 'gpt-oss-120b
+short_description: 'gpt-oss-120b on AMD MI300X GPUs'
 models:
 - openai/gpt-oss-120b
 ---
app.py CHANGED

The removed code tracked analysis/visible streaming state (in_analysis, in_visible, raw_started), rendered a live raw preview via make_raw_preview() plus a final pass through format_final(), and exposed reasoning_effort / enable_browsing parameters with matching "Reasoning Effort" radio and "Enable web browsing (web_search_preview)" checkbox inputs. The new version streams plain text deltas from request_generation and formats the collected output with format_analysis_response:

import os, re, logging, gradio as gr
from openai import OpenAI
from gateway import request_generation
from utils import LATEX_DELIMS

openai_api_key = os.getenv("API_KEY")
openai_api_base = os.getenv("API_ENDPOINT")
MODEL = os.getenv("MODEL_NAME", "")
client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 1024))
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", CONCURRENCY_LIMIT * 4))

logging.basicConfig(level=logging.INFO)

def format_analysis_response(text):
    # gpt-oss output looks like "analysis<reasoning>assistantfinal<answer>"; split the two parts.
    m = re.search(r"analysis(.*?)assistantfinal", text, re.DOTALL)
    if m:
        reasoning = m.group(1).strip()
        response = text.split("assistantfinal", 1)[-1].strip()
        return (
            f"**🤔 Analysis:**\n\n*{reasoning}*\n\n---\n\n"
            f"**💬 Response:**\n\n{response}"
        )
    return text.strip()

def generate(message, history,
             system_prompt, temperature,
             frequency_penalty, presence_penalty,
             max_new_tokens):
    if not message.strip():
        yield "Please enter a prompt."
        return

    msgs = []
    for h in history:
        if isinstance(h, dict):
            ...  # lines that populate u/a from the history entry are unchanged and omitted from the diff
        if u: msgs.append({"role": "user", "content": u})
        if a: msgs.append({"role": "assistant", "content": a})

    logging.info(f"[User] {message}")
    logging.info(f"[System] {system_prompt} | Temp={temperature}")

    collected, buffer = "", ""
    yielded_once = False

    try:
        for delta in request_generation(
            api_key=openai_api_key, api_base=openai_api_base,
            message=message, system_prompt=system_prompt,
            model_name=MODEL, chat_history=msgs,
            temperature=temperature,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            max_new_tokens=max_new_tokens,
        ):
            if not delta:
                continue

            collected += delta
            buffer += delta

            # Emit the first token immediately so the UI shows activity right away.
            if not yielded_once:
                yield delta
                buffer = ""
                yielded_once = True
                continue

            # Afterwards, re-render the full transcript on newlines or every ~150 characters.
            if "\n" in buffer or len(buffer) > 150:
                yield collected
                buffer = ""

        final = format_analysis_response(collected)
        # Close an unbalanced "$" so the chatbot's LaTeX rendering does not break.
        if final.count("$") % 2:
            final += "$"
        yield final

    except Exception as e:
        logging.exception("Stream failed")
        yield f"❌ Error: {e}"

chatbot_ui = gr.ChatInterface(
    fn=generate,
    type="messages",
    chatbot=gr.Chatbot(
        label="OSS vLLM Chatbot",
        type="messages",
        scale=2,
        height=600,
        latex_delimiters=LATEX_DELIMS,
    ),
    stop_btn=True,
    additional_inputs=[
        gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
        gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
    ],
    examples=[
        ["Explain the difference between supervised and unsupervised learning."],
        ["Summarize the plot of Inception in two sentences."],
        # … (unchanged example prompts omitted from the diff) …
        ["Derive the gradient of softmax cross-entropy loss."],
        ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
    ],
    # title="Open-source GPT-OSS-120B on AMD MI300X",
    title=" GPT-OSS-120B on AMD MI300X",
    description="This Space is an Alpha release that demonstrates gpt-oss-120b model running on AMD MI300 infrastructure. The space is built with Apache 2.0 License.",
)

if __name__ == "__main__":
    chatbot_ui.queue(max_size=QUEUE_SIZE,
                     default_concurrency_limit=CONCURRENCY_LIMIT).launch()
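For reference, a quick sketch of what format_analysis_response does to a collected gpt-oss stream. It assumes the helper defined in app.py above is in scope; the transcript is an invented example, not output captured from this Space.

```python
# Hypothetical collected stream: gpt-oss emits its reasoning after an
# "analysis" marker and the user-visible answer after "assistantfinal".
collected = (
    "analysisThe user asks for 2+2. Simple arithmetic."
    "assistantfinal2 + 2 = $4$"
)

formatted = format_analysis_response(collected)
# -> "**🤔 Analysis:**\n\n*The user asks for 2+2. Simple arithmetic.*\n\n---\n\n"
#    "**💬 Response:**\n\n2 + 2 = $4$"

# generate() additionally appends a "$" when the count is odd, so a reply that
# was cut off mid-formula still renders instead of leaking a bare "$".
if formatted.count("$") % 2:
    formatted += "$"
print(formatted)
```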
gateway.py CHANGED

The removed implementation drove the Responses API: it passed the system prompt as instructions, requested a detailed reasoning summary, emitted analysis/assistantfinal sentinels around the reasoning stream, handled response.output_text.delta, tool/function-call, and response.completed/response.error events, and fell back to a message describing the requested tool call (name plus JSON-encoded arguments) when no visible output was produced. The new implementation sends a single streaming chat.completions request and buffers the text it yields:

import logging
from openai import OpenAI
from typing import List, Generator, Optional

logging.basicConfig(level=logging.INFO)

def request_generation(
    api_key: str,
    # … api_base, message, system_prompt parameters unchanged (omitted from the diff) …
    model_name: str,
    chat_history: Optional[List[dict]] = None,
    temperature: float = 0.3,
    frequency_penalty: float = 0.0,
    presence_penalty: float = 0.0,
    max_new_tokens: int = 1024,
    tools: Optional[List[dict]] = None,
    tool_choice: Optional[str] = None,
) -> Generator[str, None, None]:
    """
    Sends a streaming chat request to an OpenAI-compatible backend using the official OpenAI client.
    Buffers output to improve LaTeX rendering.
    """
    client = OpenAI(api_key=api_key, base_url=api_base)

    messages = [{"role": "system", "content": system_prompt}]
    if chat_history:
        messages.extend(chat_history)
    messages.append({"role": "user", "content": message})

    request_args = {
        "model": model_name,
        "messages": messages,
        "temperature": temperature,
        "frequency_penalty": frequency_penalty,
        "presence_penalty": presence_penalty,
        "max_tokens": max_new_tokens,
        "stream": True,
    }

    if tools:
        request_args["tools"] = tools
    if tool_choice:
        request_args["tool_choice"] = tool_choice

    logging.info(f"[Gateway] Request to {api_base} | Model: {model_name}")

    try:
        stream = client.chat.completions.create(**request_args)

        collected = ""
        buffer = ""

        for chunk in stream:
            delta = chunk.choices[0].delta.content or ""
            collected += delta
            buffer += delta

            # Flush on newlines or every ~150 characters to keep LaTeX fragments intact.
            if "\n" in buffer or len(buffer) > 150:
                yield buffer
                buffer = ""

        if buffer:
            yield buffer

    except Exception as e:
        logging.exception("[Gateway] Streaming failed")
        yield f"Error: {e}"
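To sanity-check the gateway outside the Gradio UI, something like the following could work. It is a sketch, run from the repo root, assuming the same API_KEY / API_ENDPOINT / MODEL_NAME environment variables that app.py reads and an OpenAI-compatible endpoint serving the model; the prompt is just an example.

```python
import os
from gateway import request_generation

# Generator of text pieces streamed back from the gateway.
chunks = request_generation(
    api_key=os.getenv("API_KEY"),
    api_base=os.getenv("API_ENDPOINT"),
    message="State the quadratic formula.",
    system_prompt="You are a helpful assistant.",
    model_name=os.getenv("MODEL_NAME", ""),
    chat_history=[],
    temperature=0.7,
    frequency_penalty=0.0,
    presence_penalty=0.0,
    max_new_tokens=256,
)

# Each yielded piece ends at a newline or after ~150 characters, so printing
# them back to back reconstructs the full reply.
for piece in chunks:
    print(piece, end="", flush=True)
print()
```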