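# Gradio chat front end that streams completions from gpt-oss-120b served behind an
# OpenAI-compatible endpoint on AMD MI300X hardware. gateway.request_generation (not
# defined in this file) is expected to yield incremental text deltas from that endpoint.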
import logging
import os
import re

import gradio as gr
from openai import OpenAI

from gateway import request_generation
from utils import LATEX_DELIMS

openai_api_key = os.getenv("API_KEY")
openai_api_base = os.getenv("API_ENDPOINT")
MODEL = os.getenv("MODEL_NAME", "")

client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)

MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 1024))
CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", CONCURRENCY_LIMIT * 4))

logging.basicConfig(level=logging.INFO)
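
# The raw model output may arrive as a single string containing a reasoning channel
# ("analysis...") followed by the final answer ("assistantfinal..."); split it into a
# readable "Analysis" / "Response" layout for the chat window.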
def format_analysis_response(text):
    m = re.search(r"analysis(.*?)assistantfinal", text, re.DOTALL)
    if m:
        reasoning = m.group(1).strip()
        response = text.split("assistantfinal", 1)[-1].strip()
        return (
            f"**🤔 Analysis:**\n\n*{reasoning}*\n\n---\n\n"
            f"**💬 Response:**\n\n{response}"
        )
    return text.strip()
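
# Streaming handler for gr.ChatInterface: rebuilds the message list from `history`,
# forwards the request to the gateway, and yields progressively longer strings so the
# UI updates in place.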
def generate(message, history,
             system_prompt, temperature,
             frequency_penalty, presence_penalty,
             max_new_tokens):
    if not message.strip():
        yield "Please enter a prompt."
        return

    msgs = []
    for h in history:
        # History entries may be dicts (messages format) or (user, assistant) pairs.
        if isinstance(h, dict):
            msgs.append(h)
        elif isinstance(h, (list, tuple)) and len(h) == 2:
            u, a = h
            if u:
                msgs.append({"role": "user", "content": u})
            if a:
                msgs.append({"role": "assistant", "content": a})

    logging.info(f"[User] {message}")
    logging.info(f"[System] {system_prompt} | Temp={temperature}")

    collected, buffer = "", ""
    yielded_once = False
    try:
        for delta in request_generation(
            api_key=openai_api_key, api_base=openai_api_base,
            message=message, system_prompt=system_prompt,
            model_name=MODEL, chat_history=msgs,
            temperature=temperature,
            frequency_penalty=frequency_penalty,
            presence_penalty=presence_penalty,
            max_new_tokens=max_new_tokens,
        ):
            if not delta:
                continue
            collected += delta
            buffer += delta

            # Yield the very first delta immediately so the user sees output right away,
            # then batch later updates on newlines or roughly every 150 characters.
            if not yielded_once:
                yield delta
                buffer = ""
                yielded_once = True
                continue

            if "\n" in buffer or len(buffer) > 150:
                yield collected
                buffer = ""

        # Reformat the full text once streaming ends; close an unbalanced "$" so the
        # LaTeX renderer does not swallow the rest of the message.
        final = format_analysis_response(collected)
        if final.count("$") % 2:
            final += "$"
        yield final

    except Exception as e:
        logging.exception("Stream failed")
        yield f"❌ Error: {e}"
chatbot_ui = gr.ChatInterface(
    fn=generate,
    type="messages",
    chatbot=gr.Chatbot(
        label="OSS vLLM Chatbot",
        type="messages",
        scale=2,
        height=600,
        latex_delimiters=LATEX_DELIMS,
    ),
    stop_btn=True,
    additional_inputs=[
        gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
        gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
        # The sliders below cover generate()'s remaining parameters; their ranges and
        # defaults are assumed here, not taken from the original deployment.
        gr.Slider(label="Frequency penalty", minimum=-2.0, maximum=2.0, step=0.1, value=0.0),
        gr.Slider(label="Presence penalty", minimum=-2.0, maximum=2.0, step=0.1, value=0.0),
        gr.Slider(label="Max new tokens", minimum=1, maximum=MAX_NEW_TOKENS, step=1, value=MAX_NEW_TOKENS),
    ],
    examples=[
        ["Explain the difference between supervised and unsupervised learning."],
        ["Summarize the plot of Inception in two sentences."],
        ["Show me the LaTeX for the quadratic formula."],
        ["What are the advantages of the AMD Instinct MI300X GPU?"],
        ["Derive the gradient of the softmax cross-entropy loss."],
        ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
    ],
    title="GPT-OSS-120B on AMD MI300X",
    description="This Space is an alpha release demonstrating the gpt-oss-120b model running on AMD MI300 infrastructure. It is released under the Apache 2.0 License.",
)
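
# Bound the request queue and the number of concurrent generations so a single Space
# instance is not overwhelmed; both limits are configurable via the environment variables above.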
if __name__ == "__main__":
    chatbot_ui.queue(max_size=QUEUE_SIZE,
                     default_concurrency_limit=CONCURRENCY_LIMIT).launch()
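
# Example launch, assuming a local vLLM server exposing an OpenAI-compatible API
# (the endpoint, key, model name, and file name below are placeholders, not project defaults):
#   API_ENDPOINT=http://localhost:8000/v1 API_KEY=EMPTY MODEL_NAME=openai/gpt-oss-120b python app.py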
default_concurrency_limit=CONCURRENCY_LIMIT).launch() |