# ----------------- Commented-out variant: hosted Inference API (stock gr.ChatInterface template) -----------------
# import gradio as gr
# from huggingface_hub import InferenceClient
# import spaces
#
# """
# For more information on `huggingface_hub` Inference API support, please check the docs: https://huggingface.co/docs/huggingface_hub/v0.22.2/en/guides/inference
# """
# client = InferenceClient("HuggingFaceH4/zephyr-7b-beta")
#
# @spaces.GPU
# def respond(
#     message,
#     history: list[tuple[str, str]],
#     system_message,
#     max_tokens,
#     temperature,
#     top_p,
# ):
#     messages = [{"role": "system", "content": system_message}]
#     for val in history:
#         if val[0]:
#             messages.append({"role": "user", "content": val[0]})
#         if val[1]:
#             messages.append({"role": "assistant", "content": val[1]})
#     messages.append({"role": "user", "content": message})
#
#     response = ""
#     for message in client.chat_completion(
#         messages,
#         max_tokens=max_tokens,
#         stream=True,
#         temperature=temperature,
#         top_p=top_p,
#     ):
#         token = message.choices[0].delta.content
#         response += token
#         yield response
#
# """
# For information on how to customize the ChatInterface, peruse the gradio docs: https://www.gradio.app/docs/chatinterface
# """
# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(value="You are a Chatbox who only answers spiritual questions based on Indian scriptures and declines answering other questions.", label="System message"),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],
# )
#
# if __name__ == "__main__":
#     demo.launch()
# ----------------- Commented-out variant: hosted Inference API with HF_TOKEN guardrails and a Zephyr-format text_generation fallback -----------------
# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError
#
# MODEL_ID = "HuggingFaceH4/zephyr-7b-beta"
# HF_TOKEN = os.getenv("HF_TOKEN")  # ⚠️ set this in Spaces → Settings → Secrets
#
# client = InferenceClient(model=MODEL_ID, token=HF_TOKEN)
#
# def _build_zephyr_prompt(system_message: str, history, user_msg: str) -> str:
#     parts = []
#     if system_message:
#         parts.append(f"<|system|>\n{system_message}\n</s>")
#     for u, a in (history or []):
#         if u:
#             parts.append(f"<|user|>\n{u}\n</s>")
#         if a:
#             parts.append(f"<|assistant|>\n{a}\n</s>")
#     parts.append(f"<|user|>\n{user_msg}\n</s>\n<|assistant|>\n")
#     return "\n".join(parts)
#
# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     # Early guardrails for missing token
#     if not HF_TOKEN:
#         yield (
#             "⚠️ Missing HF_TOKEN.\n\n"
#             "Set a Hugging Face access token in your Space:\n"
#             "Settings → Repository secrets → Add secret → Name: HF_TOKEN, Value: <your token>\n"
#             "Token needs at least 'read' scope."
#         )
#         return
#
#     # Try OpenAI-like chat completion first
#     try:
#         response_text = ""
#         for chunk in client.chat_completion(
#             messages=(
#                 [{"role": "system", "content": system_message}] if system_message else []
#             )
#             + [
#                 msg
#                 for pair in (history or [])
#                 for msg in (
#                     [{"role": "user", "content": pair[0]}] if pair and pair[0] else []
#                 )
#                 + (
#                     [{"role": "assistant", "content": pair[1]}]
#                     if pair and len(pair) > 1 and pair[1]
#                     else []
#                 )
#             ]
#             + [{"role": "user", "content": message}],
#             max_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             token = getattr(chunk.choices[0].delta, "content", None)
#             if token:
#                 response_text += token
#                 yield response_text
#         return
#     except HfHubHTTPError as e:
#         # Handle 401 explicitly with helpful guidance
#         try:
#             status = e.response.status_code
#         except Exception:
#             status = None
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized from Hugging Face Inference API.\n\n"
#                 "Fix:\n"
#                 "1) Create a token at https://huggingface.co/settings/tokens with at least 'read' scope.\n"
#                 "2) In your Space, go to Settings → Repository secrets → Add secret\n"
#                 "   Name: HF_TOKEN, Value: <your token>\n"
#                 "3) Restart the Space.\n"
#             )
#             return
#         # Otherwise drop to fallback
#     except Exception:
#         pass
#
#     # Fallback: raw text_generation with Zephyr chat format
#     zephyr_prompt = _build_zephyr_prompt(system_message, history, message)
#     try:
#         response_text = ""
#         # for tok in client.text_generation(
#         #     zephyr_prompt,
#         #     max_new_tokens=max_tokens,
#         #     temperature=temperature,
#         #     top_p=top_p,
#         #     stream=True,
#         #     stop=["</s>", "<|user|>", "<|assistant|>", "<|system|>"],
#         # ):
#         for tok in client.text_generation(
#             zephyr_prompt,
#             max_new_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             if tok:
#                 response_text += tok
#                 yield response_text
#     except HfHubHTTPError as e:
#         try:
#             status = e.response.status_code
#         except Exception:
#             status = None
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized (text_generation fallback).\n\n"
#                 "Set HF_TOKEN in Space secrets (Settings → Repository secrets)."
#             )
#         else:
#             yield f"[Inference error] {e}"
#     except Exception as e:
#         yield f"[Runtime error] {e}"
#
# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=(
#                 "You are a Chatbot who only answers spiritual questions based "
#                 "on Indian scriptures and declines answering other questions."
#             ),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(
#             minimum=0.1,
#             maximum=1.0,
#             value=0.95,
#             step=0.05,
#             label="Top-p (nucleus sampling)",
#         ),
#     ],
# )
#
# if __name__ == "__main__":
#     demo.launch()
# ----------------- Commented-out variant: hosted API with multiple candidate models and chat_completion → text_generation fallback -----------------
# import os
# import gradio as gr
# from huggingface_hub import InferenceClient
# from huggingface_hub.utils import HfHubHTTPError  # correct import for 0.22.x
#
# # You can override with a Space secret: MODEL_ID=<your preferred model>
# PREFERRED = os.getenv("MODEL_ID", "HuggingFaceH4/zephyr-7b-beta")
#
# # Accept either token name (matches your other Spaces)
# HF_TOKEN = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACEHUB_API_TOKEN")
#
# # If your preferred endpoint is down, we’ll try these in order:
# CANDIDATES = [
#     PREFERRED,
#     "google/gemma-2-2b-it",
#     "Qwen/Qwen2.5-1.5B-Instruct",
#     "tiiuae/falcon-7b-instruct",
# ]
#
# def _build_generic_prompt(system_message, history, user_msg):
#     """
#     Simple, model-agnostic chat prompt (works across many instruct models).
#     """
#     parts = []
#     if system_message:
#         parts.append(f"System: {system_message}")
#     for u, a in (history or []):
#         if u:
#             parts.append(f"User: {u}")
#         if a:
#             parts.append(f"Assistant: {a}")
#     parts.append(f"User: {user_msg}")
#     parts.append("Assistant:")
#     return "\n".join(parts)
#
# def _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
#     """
#     Try streaming via chat_completions; on failure, fall back to text_generation.
#     Returns a generator that yields text chunks.
#     Raises ValueError('NEXT') to indicate “try next model”.
#     """
#     client = InferenceClient(model=model_id, token=HF_TOKEN) if HF_TOKEN else InferenceClient(model=model_id)
#
#     # 1) Try chat-completions (if supported by the backend)
#     try:
#         msgs = (
#             [{"role": "system", "content": system_message}] if system_message else []
#         )
#         for u, a in (history or []):
#             if u:
#                 msgs.append({"role": "user", "content": u})
#             if a:
#                 msgs.append({"role": "assistant", "content": a})
#         msgs.append({"role": "user", "content": message})
#
#         def gen_chat():
#             response_text = ""
#             for chunk in client.chat_completion(
#                 messages=msgs,
#                 max_tokens=max_tokens,
#                 temperature=temperature,
#                 top_p=top_p,
#                 stream=True,
#             ):
#                 token = getattr(chunk.choices[0].delta, "content", None)
#                 if token:
#                     response_text += token
#                     yield response_text
#
#         # sanity probe: start the generator and yield progressively
#         for out in gen_chat():
#             yield out
#         return
#     except HfHubHTTPError as e:
#         status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
#         if status == 404:
#             # Endpoint not available for this model → try next candidate
#             raise ValueError("NEXT")
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized from HF Inference API.\n\n"
#                 "Ensure a read-scoped token is set (HF_TOKEN or HUGGINGFACEHUB_API_TOKEN) "
#                 "in Space secrets, then restart."
#             )
#             return
#         if status == 403:
#             yield (
#                 "❌ 403 Forbidden from HF Inference API.\n\n"
#                 "This model likely requires Inference Providers + billing on your token. "
#                 "Either enable those or switch to a free hosted model using the MODEL_ID secret."
#             )
#             return
#         # fall through to text_generation for other statuses
#     except Exception:
#         # fall through to text_generation
#         pass
#
#     # 2) Fallback: plain text_generation with a generic prompt
#     prompt = _build_generic_prompt(system_message, history, message)
#     try:
#         response_text = ""
#         for tok in client.text_generation(
#             prompt,
#             max_new_tokens=max_tokens,
#             temperature=temperature,
#             top_p=top_p,
#             stream=True,
#         ):
#             # Manual stop filtering (since huggingface_hub==0.22.x lacks 'stop' kwarg)
#             if any(s in tok for s in ["</s>", "<|user|>", "<|assistant|>", "<|system|>"]):
#                 break
#             if tok:
#                 response_text += tok
#                 yield response_text
#     except HfHubHTTPError as e:
#         status = getattr(e, "response", None).status_code if getattr(e, "response", None) else None
#         if status == 404:
#             # Endpoint not available for this model → try next candidate
#             raise ValueError("NEXT")
#         if status == 401:
#             yield (
#                 "❌ 401 Unauthorized (text-generation fallback).\n\n"
#                 "Set HF_TOKEN or HUGGINGFACEHUB_API_TOKEN in Space secrets and restart."
#             )
#         elif status == 403:
#             yield (
#                 "❌ 403 Forbidden (text-generation fallback).\n\n"
#                 "Your token lacks 'Use Inference API/Providers' or billing is not enabled. "
#                 "Enable those or use a free hosted model via MODEL_ID."
#             )
#         else:
#             yield f"[Inference error] {e}"
#     except Exception as e:
#         yield f"[Runtime error] {e}"
#
# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     last_error = None
#     tried = []
#     for model_id in [m for m in CANDIDATES if m]:
#         tried.append(model_id)
#         try:
#             for chunk in _try_model(model_id, system_message, history, message, max_tokens, temperature, top_p):
#                 yield chunk
#             # If we streamed anything without raising, we’re done
#             return
#         except ValueError as ve:
#             if str(ve) == "NEXT":
#                 last_error = f"Model `{model_id}` endpoint unavailable (404)."
#                 continue
#             else:
#                 last_error = str(ve)
#         except Exception as e:
#             last_error = f"Unexpected error on `{model_id}`: {e}"
#
#     # If we got here, all candidates failed
#     tried_str = " → ".join(tried) if tried else "(none)"
#     yield (
#         "❌ All candidate models failed.\n\n"
#         f"Tried: {tried_str}\n\n"
#         f"Last error: {last_error or 'unknown'}\n\n"
#         "Fixes:\n"
#         "• Set MODEL_ID in Space secrets to a hosted model that’s online (e.g., google/gemma-2-2b-it, Qwen/Qwen2.5-1.5B-Instruct).\n"
#         "• Or enable Inference Providers + billing on your HF token for models served via providers.\n"
#     )
#
# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(
#             value=("You are a Chatbot who only answers spiritual questions based on Indian scriptures "
#                    "and declines answering other questions."),
#             label="System message",
#         ),
#         gr.Slider(minimum=1, maximum=2048, value=512, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )
#
# if __name__ == "__main__":
#     demo.launch(share=True)
# ----------------- Active version: local inference with transformers (CPU) -----------------
import os
import gradio as gr
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# You can override this via Space secret: MODEL_ID=Qwen/Qwen2-0.5B-Instruct (etc.)
MODEL_ID = os.getenv("MODEL_ID", "TinyLlama/TinyLlama-1.1B-Chat-v1.0")

# Load once at startup
print(f"🔧 Loading local model: {MODEL_ID}")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.float32,  # CPU-friendly
)
model.eval()
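# Assumption (not part of the current setup): if this Space ever runs on a GPU, a hedged
# alternative load would use half precision with automatic device placement (needs `accelerate`):
# model = AutoModelForCausalLM.from_pretrained(
#     MODEL_ID,
#     torch_dtype=torch.float16,  # half precision on GPU
#     device_map="auto",          # requires the accelerate package
# )
# The inputs built in respond() would then need `.to(model.device)` before generate().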
def build_prompt(system_message: str, history, user_msg: str) -> str:
    """Try to use the model's chat template if present; otherwise use a generic prompt."""
    messages = []
    if system_message:
        messages.append({"role": "system", "content": system_message})
    for u, a in (history or []):
        if u:
            messages.append({"role": "user", "content": u})
        if a:
            messages.append({"role": "assistant", "content": a})
    messages.append({"role": "user", "content": user_msg})

    # Use chat template when available
    try:
        if getattr(tokenizer, "chat_template", None):
            return tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
    except Exception:
        pass

    # Fallback generic formatting
    parts = []
    if system_message:
        parts.append(f"System: {system_message}")
    for u, a in (history or []):
        if u:
            parts.append(f"User: {u}")
        if a:
            parts.append(f"Assistant: {a}")
    parts.append(f"User: {user_msg}")
    parts.append("Assistant:")
    return "\n".join(parts)


def respond(message, history, system_message, max_tokens, temperature, top_p):
    prompt = build_prompt(system_message, history, message)
    inputs = tokenizer(prompt, return_tensors="pt")

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=int(max_tokens),
            do_sample=True,
            temperature=float(temperature),
            top_p=float(top_p),
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode only the newly generated portion
    gen_ids = outputs[0][inputs["input_ids"].shape[1]:]
    text = tokenizer.decode(gen_ids, skip_special_tokens=True)

    # Stream the text in chunks so the UI feels live
    acc = ""
    for i in range(0, len(text), 40):
        acc += text[i:i+40]
        yield acc
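# Note: the loop above only simulates streaming; the full reply is generated first and then
# re-emitted in 40-character chunks. A minimal sketch of true token streaming with
# transformers' TextIteratorStreamer, assuming the same `tokenizer`/`model`/`build_prompt`
# globals (kept commented out, like the other alternatives in this file):
#
# from threading import Thread
# from transformers import TextIteratorStreamer
#
# def respond_streaming(message, history, system_message, max_tokens, temperature, top_p):
#     prompt = build_prompt(system_message, history, message)
#     inputs = tokenizer(prompt, return_tensors="pt")
#     # The streamer yields decoded text pieces as generate() produces tokens in a worker thread.
#     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True, skip_special_tokens=True)
#     gen_kwargs = dict(
#         **inputs,
#         max_new_tokens=int(max_tokens),
#         do_sample=True,
#         temperature=float(temperature),
#         top_p=float(top_p),
#         pad_token_id=tokenizer.eos_token_id,
#         streamer=streamer,
#     )
#     Thread(target=model.generate, kwargs=gen_kwargs, daemon=True).start()
#     acc = ""
#     for piece in streamer:
#         acc += piece
#         yield acc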
demo = gr.ChatInterface(
    respond,
    additional_inputs=[
        gr.Textbox(
            value=(
                "You are a spiritual assistant who only answers spiritual questions based on "
                "Indian Hindu scriptures (e.g., the Bhagavad Gita) and politely declines all other questions."
            ),
            label="System message",
        ),
        gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
        gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
        gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
    ],
)

if __name__ == "__main__":
    # share=True creates a public gradio.live link when running locally; a Space is already public
    demo.launch(share=True)
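# Rough requirements.txt for the active version above (an assumption; pin versions to match your Space):
#   gradio
#   torch
#   transformers
# The commented-out hosted-API variants also need huggingface_hub (~0.22.x per the notes above),
# and the llama.cpp variant below needs llama-cpp-python plus huggingface_hub.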
# ----------------- Commented-out variant: llama.cpp (GGUF) CPU backend -----------------
# import os
# import gradio as gr
#
# # ---- llama.cpp backend (fast CPU) ----
# from llama_cpp import Llama
#
# # ---- to list files in a repo and pick a GGUF automatically ----
# from huggingface_hub import list_repo_files
#
# # ----------------- Config -----------------
# # You can override these via Space "Settings → Variables"
# # If MODEL_REPO is set, it's tried first; otherwise we try the CANDIDATE_REPOS below.
# MODEL_REPO = os.getenv("MODEL_REPO", "").strip() or None
#
# # Known small GGUF chat repos (fast & lightweight). We'll try them in order.
# CANDIDATE_REPOS = [
#     MODEL_REPO,  # user-preferred first (may be None)
#     "Qwen/Qwen2.5-0.5B-Instruct-GGUF",
#     "Qwen/Qwen2-0.5B-Instruct-GGUF",
#     "TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF",
#     "bartowski/Qwen2.5-0.5B-Instruct-GGUF",
# ]
#
# # Best-to-worst file name patterns to prefer when multiple GGUFs are present.
# PREFERRED_PATTERNS = [
#     "q4_k_m.gguf", "Q4_K_M.gguf",
#     "q4_0.gguf", "Q4_0.gguf",
#     "q5_k_m.gguf", "Q5_K_M.gguf",
#     ".gguf",  # catch-all
# ]
#
# # Runtime knobs
# N_THREADS = int(os.getenv("N_THREADS", str(os.cpu_count() or 4)))
# CTX = int(os.getenv("CTX", "2048"))
#
# SYSTEM_DEFAULT = (
#     "You are a Chatbot who only answers spiritual questions based on Indian scriptures "
#     "and politely declines other questions."
# )
#
# # --------------- GGUF Picker ---------------
# def pick_repo_and_file():
#     """Return (repo_id, gguf_filename) by scanning candidate repos for a preferred GGUF."""
#     tried = []
#     for repo in [r for r in CANDIDATE_REPOS if r]:  # drop None
#         try:
#             files = list_repo_files(repo)
#         except Exception:
#             tried.append(f"{repo} (list failed)")
#             continue
#         ggufs = [f for f in files if f.lower().endswith(".gguf")]
#         if not ggufs:
#             tried.append(f"{repo} (no .gguf)")
#             continue
#         # pick by pattern preference
#         for pat in PREFERRED_PATTERNS:
#             for f in ggufs:
#                 if pat in f:
#                     return repo, f
#     tried_str = " | ".join(tried) if tried else "(none)"
#     raise RuntimeError(
#         "No GGUF file found in any candidate repo.\n"
#         f"Tried: {tried_str}\n"
#         "Tip: set MODEL_REPO to a GGUF repo like 'Qwen/Qwen2.5-0.5B-Instruct-GGUF' "
#         "or 'TheBloke/TinyLlama-1.1B-Chat-v1.0-GGUF'."
#     )
#
# REPO_ID, FILENAME = pick_repo_and_file()
# print(f"🔧 Loading GGUF from {REPO_ID}/{FILENAME} | threads={N_THREADS}, ctx={CTX}")
#
# llm = Llama.from_pretrained(
#     repo_id=REPO_ID,
#     filename=FILENAME,
#     n_ctx=CTX,
#     n_threads=N_THREADS,
#     n_gpu_layers=0,  # CPU only
#     logits_all=False,
#     verbose=False,
# )
#
# def respond(message, history, system_message, max_tokens, temperature, top_p):
#     sysmsg = system_message or SYSTEM_DEFAULT
#     msgs = [{"role": "system", "content": sysmsg}]
#     for u, a in (history or []):
#         if u:
#             msgs.append({"role": "user", "content": u})
#         if a:
#             msgs.append({"role": "assistant", "content": a})
#     msgs.append({"role": "user", "content": message})
#
#     stream = llm.create_chat_completion(
#         messages=msgs,
#         temperature=float(temperature),
#         top_p=float(top_p),
#         max_tokens=int(max_tokens),
#         stream=True,
#     )
#
#     acc = ""
#     for chunk in stream:
#         delta = chunk["choices"][0]["delta"]
#         tok = delta.get("content", "")
#         if tok:
#             acc += tok
#             yield acc
#
# demo = gr.ChatInterface(
#     respond,
#     additional_inputs=[
#         gr.Textbox(value=SYSTEM_DEFAULT, label="System message"),
#         gr.Slider(minimum=1, maximum=1024, value=256, step=1, label="Max new tokens"),
#         gr.Slider(minimum=0.1, maximum=4.0, value=0.7, step=0.1, label="Temperature"),
#         gr.Slider(minimum=0.1, maximum=1.0, value=0.95, step=0.05, label="Top-p (nucleus sampling)"),
#     ],
# )
#
# if __name__ == "__main__":
#     print(f"🧵 Threads: {N_THREADS}")
#     demo.launch(share=True)