mahdicv committed
Commit: a93f838
Parent: cdbc1db

reverting to chat.completions API

Files changed (3):
  1. README.md +1 -1
  2. app.py +47 -78
  3. gateway.py +26 -89
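
In OpenAI-client terms, the revert amounts to swapping the call shape below. This is a condensed, hedged sketch using the names from gateway.py; the sample prompt, history, and env-var default are placeholders, not part of the commit:

```python
import os
from openai import OpenAI

client = OpenAI(api_key=os.getenv("API_KEY"), base_url=os.getenv("API_ENDPOINT"))
model_name = os.getenv("MODEL_NAME", "openai/gpt-oss-120b")  # illustrative default
system_prompt = "You are a helpful assistant."
chat_history = []                      # prior turns, if any (placeholder)
message = "Hello!"
max_new_tokens = 1024

# Before the revert: Responses API. The system prompt travels via
# `instructions`, and reasoning vs. visible text arrive as separate event
# streams ("response.reasoning_text.delta" / "response.output_text.delta").
stream = client.responses.create(
    model=model_name,
    input=[*chat_history, {"role": "user", "content": message}],
    instructions=system_prompt,
    max_output_tokens=max_new_tokens,
    stream=True,
)

# After the revert: Chat Completions. The system prompt is just the first
# message and everything arrives as one flat token stream; app.py now
# splits out the inline "analysis"/"assistantfinal" markers itself.
stream = client.chat.completions.create(
    model=model_name,
    messages=[{"role": "system", "content": system_prompt},
              *chat_history,
              {"role": "user", "content": message}],
    max_tokens=max_new_tokens,
    stream=True,
)
```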
README.md CHANGED
````diff
@@ -8,7 +8,7 @@ sdk_version: 5.36.2
 app_file: app.py
 pinned: false
 license: apache-2.0
-short_description: 'gpt-oss-120b + web browsing + reasoning on AMD MI300X GPUs'
+short_description: 'gpt-oss-120b on AMD MI300X GPUs'
 models:
 - openai/gpt-oss-120b
 ---
````
app.py CHANGED
````diff
@@ -1,12 +1,11 @@
-import os, logging, gradio as gr
-from pydoc import html
+import os, re, logging, gradio as gr
 from openai import OpenAI
 from gateway import request_generation
 from utils import LATEX_DELIMS
 
 openai_api_key = os.getenv("API_KEY")
 openai_api_base = os.getenv("API_ENDPOINT")
-model_name = os.getenv("MODEL_NAME")
+MODEL = os.getenv("MODEL_NAME", "")
 client = OpenAI(api_key=openai_api_key, base_url=openai_api_base)
 MAX_NEW_TOKENS = int(os.getenv("MAX_NEW_TOKENS", 1024))
 CONCURRENCY_LIMIT = int(os.getenv("CONCURRENCY_LIMIT", 20))
@@ -14,26 +13,26 @@ QUEUE_SIZE = int(os.getenv("QUEUE_SIZE", CONCURRENCY_LIMIT * 4))
 
 logging.basicConfig(level=logging.INFO)
 
-def format_final(analysis_text: str, visible_text: str) -> str:
-    """Render final message with collapsible analysis + normal Markdown answer."""
-    reasoning_safe = html.escape((analysis_text or "").strip())
-    response = (visible_text or "").strip()
-    # Collapsible analysis, normal markdown answer
-    return (
-        "<details><summary><strong>🤔 Analysis</strong></summary>\n"
-        "<pre style='white-space:pre-wrap;'>"
-        f"{reasoning_safe}"
-        "</pre>\n</details>\n\n"
-        "**💬 Response:**\n\n"
-        f"{response}"
-    )
+def format_analysis_response(text):
+    m = re.search(r"analysis(.*?)assistantfinal", text, re.DOTALL)
+    if m:
+        reasoning = m.group(1).strip()
+        response = text.split("assistantfinal", 1)[-1].strip()
+        return (
+            f"**🤔 Analysis:**\n\n*{reasoning}*\n\n---\n\n"
+            f"**💬 Response:**\n\n{response}"
+        )
+    return text.strip()
 
-def generate(message, history, system_prompt, temperature, reasoning_effort, enable_browsing, max_new_tokens):
+def generate(message, history,
+             system_prompt, temperature,
+             frequency_penalty, presence_penalty,
+             max_new_tokens):
     if not message.strip():
         yield "Please enter a prompt."
         return
 
-    # Flatten gradio history
     msgs = []
     for h in history:
         if isinstance(h, dict):
@@ -43,92 +42,62 @@ def generate(message, history, system_prompt, temperature, reasoning_effort, enable_browsing, max_new_tokens):
         if u: msgs.append({"role": "user", "content": u})
         if a: msgs.append({"role": "assistant", "content": a})
 
-    tools = [{"type": "web_search_preview"}] if enable_browsing else None
-    tool_choice = "auto" if enable_browsing else None
-
-    in_analysis = False
-    in_visible = False
-
-    raw_analysis = ""
-    raw_visible = ""
-
-    raw_started = False
-    last_flush_len = 0
-
-    def make_raw_preview() -> str:
-        return (
-            "```text\n"
-            "Analysis (live):\n"
-            f"{raw_analysis}\n\n"
-            "Response (draft):\n"
-            f"{raw_visible}\n"
-            "```"
-        )
+    logging.info(f"[User] {message}")
+    logging.info(f"[System] {system_prompt} | Temp={temperature}")
+
+    collected, buffer = "", ""
+    yielded_once = False
 
     try:
-        for chunk in request_generation(
+        for delta in request_generation(
             api_key=openai_api_key, api_base=openai_api_base,
             message=message, system_prompt=system_prompt,
-            model_name=model_name, chat_history=msgs,
-            temperature=temperature, reasoning_effort=reasoning_effort,
-            max_new_tokens=max_new_tokens, tools=tools, tool_choice=tool_choice,
+            model_name=MODEL, chat_history=msgs,
+            temperature=temperature,
+            frequency_penalty=frequency_penalty,
+            presence_penalty=presence_penalty,
+            max_new_tokens=max_new_tokens,
        ):
-            if chunk == "analysis":
-                in_analysis, in_visible = True, False
-                if not raw_started:
-                    raw_started = True
-                    yield make_raw_preview()
+            if not delta:
                 continue
 
-            if chunk == "assistantfinal":
-                in_analysis, in_visible = False, True
-                if not raw_started:
-                    raw_started = True
-                    yield make_raw_preview()
-                continue
-
-            if in_analysis:
-                raw_analysis += chunk
-            elif in_visible:
-                raw_visible += chunk
-            else:
-                raw_visible += chunk
+            collected += delta
+            buffer += delta
 
-            total_len = len(raw_analysis) + len(raw_visible)
-            if total_len - last_flush_len >= 120 or "\n" in chunk:
-                last_flush_len = total_len
-                yield make_raw_preview()
-
-        final_markdown = format_final(raw_analysis, raw_visible)
+            if not yielded_once:
+                yield delta
+                buffer = ""
+                yielded_once = True
+                continue
 
-        if final_markdown.count("$") % 2:
-            final_markdown += "$"
+            if "\n" in buffer or len(buffer) > 150:
+                yield collected
+                buffer = ""
 
-        # This replaces the raw preview in-place with the pretty final message
-        yield final_markdown
+        final = format_analysis_response(collected)
+        if final.count("$") % 2:
+            final += "$"
+        yield final
 
     except Exception as e:
         logging.exception("Stream failed")
         yield f"❌ Error: {e}"
 
-
 chatbot_ui = gr.ChatInterface(
     fn=generate,
     type="messages",
     chatbot=gr.Chatbot(
         label="OSS vLLM Chatbot",
         type="messages",
+        scale=2,
         height=600,
         latex_delimiters=LATEX_DELIMS,
     ),
-    additional_inputs_accordion=gr.Accordion("⚙️ Settings", open=True),
+    stop_btn=True,
     additional_inputs=[
         gr.Textbox(label="System prompt", value="You are a helpful assistant.", lines=2),
         gr.Slider(label="Temperature", minimum=0.0, maximum=1.0, step=0.1, value=0.7),
-        gr.Radio(label="Reasoning Effort", choices=["low","medium","high"], value="medium"),
-        gr.Checkbox(label="Enable web browsing (web_search_preview)", value=False),
     ],
-    stop_btn=True,
     examples=[
         ["Explain the difference between supervised and unsupervised learning."],
         ["Summarize the plot of Inception in two sentences."],
@@ -137,10 +106,10 @@ chatbot_ui = gr.ChatInterface(
         ["Derive the gradient of softmax cross-entropy loss."],
         ["Explain why ∂/∂x xⁿ = n·xⁿ⁻¹ holds."],
     ],
+    # title="Open-source GPT-OSS-120B on AMD MI300X",
     title=" GPT-OSS-120B on AMD MI300X",
-    description="This Space is an Alpha release that demonstrates gpt-oss-120b model running on AMD MI300 infrastructure. The space is built with Apache 2.0 License. ***DISCLAIMER:*** Analysis is provided along with final response to allow users to gain insight into model's chain of thought, but may contain content not deemed suitable to include in final response.",
+    description="This Space is an Alpha release that demonstrates gpt-oss-120b model running on AMD MI300 infrastructure. The space is built with Apache 2.0 License.",
 )
-
 if __name__ == "__main__":
     chatbot_ui.queue(max_size=QUEUE_SIZE,
                      default_concurrency_limit=CONCURRENCY_LIMIT).launch()
````
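
Because the Chat Completions stream carries gpt-oss reasoning inline rather than as separate events, app.py now recovers it textually once streaming ends. Here is a minimal, self-contained sketch of the new `format_analysis_response` at work; the `raw` string is invented for illustration:

```python
import re

def format_analysis_response(text):
    # gpt-oss raw output interleaves channels as "analysis<reasoning>assistantfinal<answer>"
    m = re.search(r"analysis(.*?)assistantfinal", text, re.DOTALL)
    if m:
        reasoning = m.group(1).strip()
        response = text.split("assistantfinal", 1)[-1].strip()
        return (
            f"**🤔 Analysis:**\n\n*{reasoning}*\n\n---\n\n"
            f"**💬 Response:**\n\n{response}"
        )
    return text.strip()  # no channel markers: pass the text through

# Invented raw stream, for illustration only:
raw = "analysisUser wants 2+2. Trivial arithmetic.assistantfinal2 + 2 = **4**"
final = format_analysis_response(raw)
if final.count("$") % 2:  # close a dangling LaTeX "$", as generate() does
    final += "$"
print(final)
```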
gateway.py CHANGED
````diff
@@ -1,7 +1,8 @@
-import json, logging
-from typing import List, Generator, Optional
+import logging
 from openai import OpenAI
+from typing import List, Generator, Optional
 
+logging.basicConfig(level=logging.INFO)
 
 def request_generation(
     api_key: str,
@@ -11,122 +12,58 @@ def request_generation(
     model_name: str,
     chat_history: Optional[List[dict]] = None,
     temperature: float = 0.3,
+    frequency_penalty: float = 0.0,
+    presence_penalty: float = 0.0,
     max_new_tokens: int = 1024,
-    reasoning_effort: str = "off",
     tools: Optional[List[dict]] = None,
     tool_choice: Optional[str] = None,
 ) -> Generator[str, None, None]:
     """
-    Streams Responses API events. Emits:
-      - "analysis" sentinel once, then raw reasoning deltas
-      - "assistantfinal" sentinel once, then visible output deltas
-    If no visible deltas, emits a tool-call fallback message.
+    Sends a streaming chat request to an OpenAI-compatible backend using the official OpenAI client.
+    Buffers output to improve LaTeX rendering.
     """
     client = OpenAI(api_key=api_key, base_url=api_base)
 
-    input_messages: List[dict] = []
+    messages = [{"role": "system", "content": system_prompt}]
     if chat_history:
-        input_messages.extend(m for m in chat_history if m.get("role") != "system")
-    input_messages.append({"role": "user", "content": message})
+        messages.extend(chat_history)
+    messages.append({"role": "user", "content": message})
 
     request_args = {
         "model": model_name,
-        "input": input_messages,
-        "instructions": system_prompt,
+        "messages": messages,
         "temperature": temperature,
-        "max_output_tokens": max_new_tokens,
-        "reasoning": {
-            "effort": reasoning_effort,
-            "generate_summary": "detailed",
-            "summary": "detailed",
-        },
+        "frequency_penalty": frequency_penalty,
+        "presence_penalty": presence_penalty,
+        "max_tokens": max_new_tokens,
         "stream": True,
     }
+
     if tools:
         request_args["tools"] = tools
     if tool_choice:
         request_args["tool_choice"] = tool_choice
 
-
-    raw_reasoning, raw_visible = [], []
+    logging.info(f"[Gateway] Request to {api_base} | Model: {model_name}")
 
     try:
-        stream = client.responses.create(**request_args)
+        stream = client.chat.completions.create(**request_args)
 
-        reasoning_started = False
-        reasoning_closed = False
-        saw_visible_output = False
-        last_tool_name = None
-        last_tool_args = None
+        collected = ""
         buffer = ""
 
-        for event in stream:
-            et = getattr(event, "type", "")
-
-            if et == "response.reasoning_text.delta":
-                if not reasoning_started:
-                    yield "analysis"
-                    reasoning_started = True
-                rdelta = getattr(event, "delta", "") or ""
-                if rdelta:
-                    raw_reasoning.append(rdelta)
-                    yield rdelta
-                continue
-
-            if et == "response.output_text.delta":
-                if reasoning_started and not reasoning_closed:
-                    yield "assistantfinal"
-                    reasoning_closed = True
-
-                saw_visible_output = True
-                delta = getattr(event, "delta", "") or ""
-                raw_visible.append(delta)
-                buffer += delta
-
-                if "\n" in buffer or len(buffer) > 150:
-                    yield buffer
-                    buffer = ""
-                continue
-
-            if et.startswith("response.tool") or et.startswith("response.function_call"):
-                name = getattr(event, "name", None)
-                args = getattr(event, "arguments", None)
-                if args is None:
-                    args = getattr(event, "args", None) or getattr(event, "delta", None) or getattr(event, "data", None)
-                if name:
-                    last_tool_name = name
-                if args is not None:
-                    last_tool_args = args
-                continue
-
-            if et in ("response.completed", "response.error"):
-                if buffer:
-                    yield buffer
-                    buffer = ""
-
-                if reasoning_started and not reasoning_closed:
-                    yield "assistantfinal"
-                    reasoning_closed = True
-
-                if not saw_visible_output:
-                    msg = "I attempted to call a tool, but tools aren't executed in this environment, so no final answer was produced."
-                    if last_tool_name:
-                        try:
-                            args_text = json.dumps(last_tool_args, ensure_ascii=False, default=str)
-                        except Exception:
-                            args_text = str(last_tool_args)
-                        msg += f"\n\n• Tool requested: **{last_tool_name}**\n• Arguments: `{args_text}`"
-                    yield msg
-
-                if et == "response.error":
-                    err = getattr(event, "error", None)
-                    emsg = getattr(err, "message", "") if err else "Unknown error"
-                    yield f"Error: {emsg}"
-                    break
+        for chunk in stream:
+            delta = chunk.choices[0].delta.content or ""
+            collected += delta
+            buffer += delta
+
+            if "\n" in buffer or len(buffer) > 150:
+                yield buffer
+                buffer = ""
 
         if buffer:
             yield buffer
 
     except Exception as e:
         logging.exception("[Gateway] Streaming failed")
         yield f"Error: {e}"
````
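
For quick local testing of the reverted gateway path, a minimal standalone sketch of the same pattern: open a chat.completions stream and flush on a newline or after ~150 characters, so LaTeX spans are less likely to be split across UI updates. The prompt is a placeholder and the `if not chunk.choices` guard is a defensive addition not present in gateway.py:

```python
import os
from openai import OpenAI

client = OpenAI(api_key=os.getenv("API_KEY"), base_url=os.getenv("API_ENDPOINT"))

stream = client.chat.completions.create(
    model=os.getenv("MODEL_NAME", "openai/gpt-oss-120b"),  # illustrative default
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "State the quadratic formula."},
    ],
    max_tokens=256,
    stream=True,
)

buffer = ""
for chunk in stream:
    if not chunk.choices:  # some backends send a trailing usage-only chunk
        continue
    delta = chunk.choices[0].delta.content or ""
    buffer += delta
    # Same flush heuristic as gateway.py: newline or ~150 chars.
    if "\n" in buffer or len(buffer) > 150:
        print(buffer, end="", flush=True)
        buffer = ""
if buffer:
    print(buffer, flush=True)
```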