# ask-baba-bhAIro / app.py
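# NOTE: the large commented-out blocks below are earlier iterations of this app that
# called the Hugging Face Inference API; the code that actually runs (further down)
# loads a local transformers model instead.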
# import gradio as gr
# from huggingface_hub import InferenceClient
# import spaces
# """
# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
# """
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
# @spaces.GPU
# def respond(
# message,
# history: list[tuple[str, str]],
# system_message,
# max_tokens,
# temperature,
# top_p,
# ):
# messages = [{"role": "system", "content": system_message}]
# for val in history:
# if val[0]:
# messages.append({"role": "user", "content": val[0]})
# if val[1]:
# messages.append({"role": "assistant", "content": val[1]})
# messages.append({"role": "user", "content": message})
# response = ""
# for message in client.chat_completion(
# messages,
# max_tokens=max_tokens,
# stream=True,
# temperature=temperature,
# top_p=top_p,
# ):
# token = message.choices[0].delta.content
# response += token
# yield response
# """
# For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
# """
# demo = gr.ChatInterface(
# respond,
# additional_inputs=[
# gr.Textbox(value="You are a Chatbot who only answers spiritual questions based on Indian scriptures and declines answering other questions.", label="System message"),
# gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
# gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
# gr.Slider(
# minimum=0.1,
# maximum=1.0,
# value=0.95,
# step=0.05,
# label="Top-p (nucleus sampling)",
# ),
# ],
# )
# if __name__ == "__main__":
# demo.launch()
# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError
# MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
# HF_TOKEN = os.getenv("HF_TOKEN") # ⚠️ set this in Spaces → Settings → Secrets
# client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)
# def _build_zephyr_prompt(system_message: str, history, user_msg: str) -> str:
# parts = []
# if system_message:
# parts.append(f"<|system|>\n{system_message}\n</s>")
# for u, a in (history or []):
# if u:
# parts.append(f"<|user|>\n{u}\n</s>")
# if a:
# parts.append(f"<|assistant|>\n{a}\n</s>")
# parts.append(f"<|user|>\n{user_msg}\n</s>\n<|assistant|>\n")
# return "\n".join(parts)
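# Illustrative example (made-up inputs): _build_zephyr_prompt("Be brief.", [], "Who is Arjuna?")
# returns the Zephyr-style chat string
#     <|system|>
#     Be brief.
#     </s>
#     <|user|>
#     Who is Arjuna?
#     </s>
#     <|assistant|>
# which text_generation then completes from the trailing <|assistant|> tag.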
# def respond(message, history, system_message, max_tokens, temperature, top_p):
# # Early guardrails for missing token
# if not HF_TOKEN:
# yield (
# "⚠️ Missing HF_TOKEN.\n\n"
# "Set a Hugging Face access token in your Space:\n"
# "Settings → Repository secrets → Add secret → Name: HF_TOKEN, Value: <your token>\n"
# "Token needs at least 'read' scope."
# )
# return
# # Try OpenAI-like chat completion first
# try:
# response_text = ""
# for chunk in client.chat_completion(
# messages=(
# [{"role": "system", "content": system_message}] if system_message else []
# )
# + [
# msg
# for pair in (history or [])
# for msg in (
# [{"role": "user", "content": pair[0]}] if pair and pair[0] else []
# )
# + (
# [{"role": "assistant", "content": pair[1]}]
# if pair and len(pair) > 1 and pair[1]
# else []
# )
# ]
# + [{"role": "user", "content": message}],
# max_tokens=max_tokens,
# temperature=temperature,
# top_p=top_p,
# stream=True,
# ):
# token = getattr(chunk.choices[0].delta, "content", None)
# if token:
# response_text += token
# yield response_text
# return
# except HfHubHTTPError as e:
# # Handle 401 explicitly with helpful guidance
# try:
# status = e.response.status_code
# except Exception:
# status = None
# if status == 401:
# yield (
# "❌ 401 Unauthorized from Hugging Face Inference API.\n\n"
# "Fix:\n"
# "1) Create a token at https://huggingface.co/settings/tokens with at least 'read' scope.\n"
# "2) In your Space, go to Settings → Repository secrets → Add secret\n"
# " Name: HF_TOKEN, Value: <your token>\n"
# "3) Restart the Space.\n"
# )
# return
# # Otherwise drop to fallback
# except Exception:
# pass
# # Fallback: raw text_generation with Zephyr chat format
# zephyr_prompt = _build_zephyr_prompt(system_message, history, message)
# try:
# response_text = ""
# # for tok in client.text_generation(
# # zephyr_prompt,
# # max_new_tokens=max_tokens,
# # temperature=temperature,
# # top_p=top_p,
# # stream=True,
# # stop=["</s>", "<|user|>", "<|assistant|>", "<|system|>"],
# # ):
# for tok in client.text_generation(
# zephyr_prompt,
# max_new_tokens=max_tokens,
# temperature=temperature,
# top_p=top_p,
# stream=True,
# ):
# if tok:
# response_text += tok
# yield response_text
# except HfHubHTTPError as e:
# try:
# status = e.response.status_code
# except Exception:
# status = None
# if status == 401:
# yield (
# "❌ 401 Unauthorized (text_generation fallback).\n\n"
# "Set HF_TOKEN in Space secrets (Settings → Repository secrets)."
# )
# else:
# yield f"[Inference error] {e}"
# except Exception as e:
# yield f"[Runtime error] {e}"
# demo = gr.ChatInterface(
# respond,
# additional_inputs=[
# gr.Textbox(
# value=(
# "You are a Chatbot who only answers spiritual questions based "
# "on Indian scriptures and declines answering other questions."
# ),
# label="System message",
# ),
# gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
# gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
# gr.Slider(
# minimum=0.1,
# maximum=1.0,
# value=0.95,
# step=0.05,
# label="Top-p (nucleus sampling)",
# ),
# ],
# )
# if __name__ == "__main__":
# demo.launch()
# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError # correct import for 0.22.x
# # You can override with a Space secret: MODEL_ID=<your preferred model>
# PREFERRED = os.getenv("MODEL_ID", "HuggingFaceH4/zephyr-7b-beta")
# # Accept either token name (matches your other Spaces)
# HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
# # If your preferred endpoint is down, we’ll try these in order:
# CANDIDATES = [
# PREFERRED,
# "google/gemma-2-2b-it",
# "Qwen/Qwen2.5-1.5B-Instruct",
# "tiiuae/falcon-7b-instruct",
# ]
# def _build_generic_prompt(system_message, history, user_msg):
# """
# Simple, model-agnostic chat prompt (works across many instruct models).
# """
# parts = []
# if system_message:
# parts.append(f"System: {system_message}")
# for u, a in (history or []):
# if u:
# parts.append(f"User: {u}")
# if a:
# parts.append(f"Assistant: {a}")
# parts.append(f"User: {user_msg}")
# parts.append("Assistant:")
# return "\n".join(parts)
# def _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
# """
# Try streaming via chat_completions; on failure, fall back to text_generation.
# Returns a generator that yields text chunks.
# Raises ValueError('NEXT') to indicate “try next model”.
# """
# client = InferenceClient(model=model_id, token=HF_TOKEN) if HF_TOKEN else InferenceClient(model=model_id)
# # 1) Try chat-completions (if supported by the backend)
# try:
# msgs = (
# [{"role": "system", "content": system_message}] if system_message else []
# )
# for u, a in (history or []):
# if u:
# msgs.append({"role": "user", "content": u})
# if a:
# msgs.append({"role": "assistant", "content": a})
# msgs.append({"role": "user", "content": message})
# def gen_chat():
# response_text = ""
# for chunk in client.chat_completion(
# messages=msgs,
# max_tokens=max_tokens,
# temperature=temperature,
# top_p=top_p,
# stream=True,
# ):
# token = getattr(chunk.choices[0].delta, "content", None)
# if token:
# response_text += token
# yield response_text
# # sanity probe: start the generator and yield progressively
# for out in gen_chat():
# yield out
# return
# except HfHubHTTPError as e:
# status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
# if status == 404:
# # Endpoint not available for this model → try next candidate
# raise ValueError("NEXT")
# if status == 401:
# yield (
# "❌ 401 Unauthorized from HF Inference API.\n\n"
# "Ensure a read-scoped token is set (HF_TOKEN or HUGGINGFACEHUB_API_TOKEN) "
# "in Space secrets, then restart."
# )
# return
# if status == 403:
# yield (
# "❌ 403 Forbidden from HF Inference API.\n\n"
# "This model likely requires Inference Providers + billing on your token. "
# "Either enable those or switch to a free hosted model using the MODEL_ID secret."
# )
# return
# # fall through to text_generation for other statuses
# except Exception:
# # fall through to text_generation
# pass
# # 2) Fallback: plain text_generation with a generic prompt
# prompt = _build_generic_prompt(system_message, history, message)
# try:
# response_text = ""
# for tok in client.text_generation(
# prompt,
# max_new_tokens=max_tokens,
# temperature=temperature,
# top_p=top_p,
# stream=True,
# ):
# # Manual stop filtering (since huggingface_hub==0.22.x lacks 'stop' kwarg)
# if any(s in tok for s in ["</s>", "<|user|>", "<|assistant|>", "<|system|>"]):
# break
# if tok:
# response_text += tok
# yield response_text
# except HfHubHTTPError as e:
# status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
# if status == 404:
# # Endpoint not available for this model → try next candidate
# raise ValueError("NEXT")
# if status == 401:
# yield (
# "❌ 401 Unauthorized (text-generation fallback).\n\n"
# "Set HF_TOKEN or HUGGINGFACEHUB_API_TOKEN in Space secrets and restart."
# )
# elif status == 403:
# yield (
# "❌ 403 Forbidden (text-generation fallback).\n\n"
# "Your token lacks 'Use Inference API/Providers' or billing is not enabled. "
# "Enable those or use a free hosted model via MODEL_ID."
# )
# else:
# yield f"[Inference error] {e}"
# except Exception as e:
# yield f"[Runtime error] {e}"
# def respond(message, history, system_message, max_tokens, temperature, top_p):
# last_error = None
# tried = []
# for model_id in [m for m in CANDIDATES if m]:
# tried.append(model_id)
# try:
# for chunk in _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
# yield chunk
# # If we streamed anything without raising, we’re done
# return
# except ValueError as ve:
# if str(ve) == "NEXT":
# last_error = f"Model `{model_id}` endpoint unavailable (404)."
# continue
# else:
# last_error = str(ve)
# except Exception as e:
# last_error = f"Unexpected error on `{model_id}`: {e}"
# # If we got here, all candidates failed
# tried_str = " → ".join(tried) if tried else "(none)"
# yield (
# "❌ All candidate models failed.\n\n"
# f"Tried: {tried_str}\n\n"
# f"Last error: {last_error or 'unknown'}\n\n"
# "Fixes:\n"
# "• Set MODEL_ID in Space secrets to a hosted model that’s online (e.g., google/gemma-2-2b-it, Qwen/Qwen2.5-1.5B-Instruct).\n"
# "• Or enable Inference Providers + billing on your HF token for models served via providers.\n"
# )
# demo = gr.ChatInterface(
# respond,
# additional_inputs=[
# gr.Textbox(
# value=("You are a Chatbot who only answers spiritual questions based on Indian scriptures "
# "and declines answering other questions."),
# label="System message",
# ),
# gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
# gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
# gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
# ],
# )
# if __name__ == "__main__":
# demo.launch(share=True)
import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
# You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")
# Load once at startup
print(f"🔧 Loading local model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
MODEL_ID,
torch_dtype=torch.float32, # CPU-friendly
)
model.eval()
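# Optional sketch (not wired in; assumes the runtime actually has a GPU): the same
# model could be loaded in half precision and moved to CUDA for faster generation:
#   device = "cuda" if torch.cuda.is_available() else "cpu"
#   model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float16).to(device)
# The live code above deliberately stays on CPU in float32.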
def build_prompt(system_message: str, history, user_msg: str) -> str:
"""Try to use the model's chat template if present; otherwise use a generic prompt."""
messages = []
if system_message:
messages.append({"role": "system", "content": system_message})
for u, a in (history or []):
if u:
messages.append({"role": "user", "content": u})
if a:
messages.append({"role": "assistant", "content": a})
messages.append({"role": "user", "content": user_msg})
# Use chat template when available
try:
if getattr(tokenizer, "chat_template", None):
return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
except Exception:
pass
# Fallback generic formatting
parts = []
if system_message:
parts.append(f"System: {system_message}")
for u, a in (history or []):
if u:
parts.append(f"User: {u}")
if a:
parts.append(f"Assistant: {a}")
parts.append(f"User: {user_msg}")
parts.append("Assistant:")
return "\n".join(parts)
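# Illustrative example (made-up inputs): with no chat template on the tokenizer,
# build_prompt("Be brief.", [("Hello", "Namaste")], "Who is Arjuna?") returns:
#     System: Be brief.
#     User: Hello
#     Assistant: Namaste
#     User: Who is Arjuna?
#     Assistant:
# TinyLlama-1.1B-Chat ships its own chat template, so in practice the
# apply_chat_template branch above is the one that usually runs.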
def respond(message, history, system_message, max_tokens, temperature, top_p):
prompt = build_prompt(system_message, history, message)
inputs = tokenizer(prompt, return_tensors="pt")
with torch.no_grad():
outputs = model.generate(
**inputs,
max_new_tokens=int(max_tokens),
do_sample=True,
temperature=float(temperature),
top_p=float(top_p),
pad_token_id=tokenizer.eos_token_id,
eos_token_id=tokenizer.eos_token_id,
)
# Decode only the newly generated portion
gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
text = tokenizer.decode(gen_ids, skip_special_tokens=True)
# Stream the text in chunks so the UI feels live
acc = ""
for i in range(0, len(text), 40):
acc += text[i:i+40]
yield acc
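# Note: the loop above only drip-feeds a string that has already been fully generated.
# True token-level streaming is possible with transformers' TextIteratorStreamer
# (sketch only, not wired into this app; sampling kwargs omitted for brevity):
#   from threading import Thread
#   from transformers import TextIteratorStreamer
#   streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#   Thread(target=model.generate,
#          kwargs={**inputs, "max_new_tokens": int(max_tokens), "streamer": streamer}).start()
#   acc = ""
#   for piece in streamer:
#       acc += piece
#       yield acc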
demo = gr.ChatInterface(
respond,
additional_inputs=[
gr.Textbox(
            value=("You are a spiritual assistant who only answers spiritual questions based on Indian Hindu scriptures, e.g. the Bhagavad Gita, and politely declines all other questions."),
label="System message",
),
gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
],
)
if __name__ == "__main__":
    # share=True creates a temporary public gradio.live link when running locally;
    # Hugging Face Spaces ignore it and serve the app at the Space URL.
demo.launch(share=True)
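# ----------------- Alternative backend (not executed) -----------------
# The commented-out block below is a llama.cpp (GGUF) variant of the same chat app,
# kept in the file but never run.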
# import os
# import gradio as gr
# # ---- llama.cpp backend (fast CPU) ----
# from llama_cpp import Llama
# # ---- to list files in a repo and pick a GGUF automatically ----
# from huggingface_hub import list_repo_files
# # ----------------- Config -----------------
# # You can override these via Space "Settings → Variables"
# # If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
# MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None
# # Known small GGUF chat repos (fast & lightweight). We'll try them in order.
# CANDIDATE_REPOS = [
# MODEL_REPO, # user-preferred first (may be None)
# "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
# "Qwen/Qwen2-0.5B-Instruct-GGUF",
# "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
# "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
# ]
# # Best-to-worst file name patterns to prefer when multiple GGUFs are present.
# PREFERRED_PATTERNS = [
# "q4_k_m.gguf", "Q4_K_M.gguf",
# "q4_0.gguf", "Q4_0.gguf",
# "q5_k_m.gguf", "Q5_K_M.gguf",
# ".gguf", # catch-all
# ]
# # Runtime knobs
# N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
# CTX = int(os.getenv("CTX", "2048"))
# SYSTEM_DEFAULT = (
#     "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
#     "and politely declines other questions."
# )
# # --------------- GGUF Picker ---------------
# def pick_repo_and_file():
# """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
# tried = []
# for repo in [r for r in CANDIDATE_REPOS if r]: # drop None
# try:
# files = list_repo_files(repo)
# except Exception:
# tried.append(f"{repo} (list failed)")
# continue
# ggufs = [f for f in files if f.lower().endswith(".gguf")]
# if not ggufs:
# tried.append(f"{repo} (no .gguf)")
# continue
# # pick by pattern preference
# for pat in PREFERRED_PATTERNS:
# for f in ggufs:
# if pat in f:
# return repo, f
# tried_str = " | ".join(tried) if tried else "(none)"
# raise RuntimeError(
# "No GGUF file found in any candidate repo.\n"
# f"Tried: {tried_str}\n"
# "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
# "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
# )
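# Illustrative example: if a repo lists ["model-Q5_K_M.gguf", "model-Q4_K_M.gguf"],
# the picker returns "model-Q4_K_M.gguf", because the Q4_K_M patterns come before
# the Q5_K_M ones in PREFERRED_PATTERNS; the bare ".gguf" entry is the catch-all.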
# REPO_ID, FILENAME = pick_repo_and_file()
# print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")
# llm = Llama.from_pretrained(
# repo_id=REPO_ID,
# filename=FILENAME,
# n_ctx=CTX,
# n_threads=N_THREADS,
# n_gpu_layers=0, # CPU only
# logits_all=False,
# verbose=False,
# )
# def respond(message, history, system_message, max_tokens, temperature, top_p):
# sysmsg = system_message or SYSTEM_DEFAULT
# msgs = [{"role": "system", "content": sysmsg}]
# for u, a in (history or []):
# if u:
# msgs.append({"role": "user", "content": u})
# if a:
# msgs.append({"role": "assistant", "content": a})
# msgs.append({"role": "user", "content": message})
# stream = llm.create_chat_completion(
# messages=msgs,
# temperature=float(temperature),
# top_p=float(top_p),
# max_tokens=int(max_tokens),
# stream=True,
# )
# acc = ""
# for chunk in stream:
# delta = chunk["choices"][0]["delta"]
# tok = delta.get("content", "")
# if tok:
# acc += tok
# yield acc
# demo = gr.ChatInterface(
# respond,
# additional_inputs=[
# gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
# gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
# gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
# gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
# ],
# )
# if __name__ == "__main__":
# print(f"🧵 Threads: {N_THREADS}")
# demo.launch(share=True)