"""
llm_provider.py
- Single provider: Fireworks (OpenAI-compatible) with Qwen3-Coder-30B-A3B-Instruct
- Optional fallback: Hugging Face Inference Router (provider="fireworks-ai") if HF_TOKEN is present
- No heartbeats (avoids rate-limit hits); conservative throttling & retries
"""
from __future__ import annotations
import os, time, random, threading, logging
from dotenv import load_dotenv
from langchain_core.language_models.chat_models import BaseChatModel
from langchain_core.messages import HumanMessage, SystemMessage, AIMessage
from langchain_core.outputs import ChatGeneration, ChatResult
# Fireworks (OpenAI-compatible)
from openai import OpenAI
from openai import RateLimitError
# HF Router (provider routing)
from huggingface_hub import InferenceClient
load_dotenv()
log = logging.getLogger("fraud-analyst")
logging.basicConfig(level=logging.INFO)
SUMMARY_NOTICE = "Please connect to an inference endpoint to generate the summary."
def _first_env(*names: str) -> str | None:
    for n in names:
        v = os.getenv(n)
        if v:
            return v
    return None
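# e.g. _first_env("FIREWORKS_API_KEY", "OPENAI_API_KEY") returns the value of the
# first variable that is set, or None when none of them are defined.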
# Secrets
FIREWORKS_API_KEY = _first_env(
    "fireworks_api_huggingface", "FIREWORKS_API_HUGGINGFACE",
    "FIREWORKS_API_KEY", "OPENAI_API_KEY"
)
HF_TOKEN = _first_env("HF_TOKEN", "HUGGINGFACE_TOKEN")
# Models (Qwen only)
FW_PRIMARY_MODEL = os.getenv("FW_PRIMARY_MODEL", "accounts/fireworks/models/qwen3-coder-30b-a3b-instruct")
HF_PRIMARY_MODEL = os.getenv("HF_PRIMARY_MODEL", "Qwen/Qwen3-Coder-30B-A3B-Instruct")
# Throttle / Retry (conservative for demo)
MAX_NEW_TOKENS = int(os.getenv("LLM_MAX_NEW_TOKENS", "200"))
TEMP = float(os.getenv("LLM_TEMPERATURE", "0.3"))
MAX_RETRIES = int(os.getenv("LLM_MAX_RETRIES", "2"))
MIN_INTERVAL_S = float(os.getenv("LLM_MIN_INTERVAL_S", "5.0"))
MAX_CONCURRENCY = int(os.getenv("LLM_MAX_CONCURRENCY", "3"))
# Ensure OpenAI SDK itself doesn't also retry
os.environ.setdefault("OPENAI_MAX_RETRIES", "0")
# Global throttle across all instances
_CALL_LOCK = threading.BoundedSemaphore(MAX_CONCURRENCY)
_last_call_ts = 0.0
_ts_lock = threading.Lock()
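# The semaphore caps concurrent in-flight requests across every model instance,
# while _pace() below enforces a minimum wall-clock gap between successive calls.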
def _pace():
    """Global pacing to avoid hitting 429s."""
    global _last_call_ts
    with _ts_lock:
        now = time.monotonic()
        dt = now - _last_call_ts
        if dt < MIN_INTERVAL_S:
            time.sleep(MIN_INTERVAL_S - dt + 0.5)
        _last_call_ts = time.monotonic()
def _with_retries(fn):
    for attempt in range(1, MAX_RETRIES + 1):
        try:
            return fn()
        except RateLimitError:
            if attempt >= MAX_RETRIES:
                raise
            time.sleep(5.0 * attempt + random.random() * 2.0)  # 5-7s on the first retry, longer after
        except Exception:
            if attempt >= MAX_RETRIES:
                raise
            time.sleep(1.0 * attempt)
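# Usage sketch: wrap the raw API call in a closure, e.g.
#   resp = _with_retries(lambda: client.chat.completions.create(...))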
# ========================== Fireworks (OpenAI-compatible) ==========================
FW_BASE = os.getenv("OPENAI_API_BASE", "https://api.fireworks.ai/inference/v1")
class FireworksOpenAIChat(BaseChatModel):
    """Qwen on Fireworks via /chat/completions."""
    model: str
    api_key: str | None = None
    temperature: float = TEMP
    max_new_tokens: int = MAX_NEW_TOKENS

    def __init__(self, **data):
        super().__init__(**data)
        self._client = OpenAI(base_url=FW_BASE, api_key=self.api_key, max_retries=0)

    @property
    def _llm_type(self) -> str:
        return "fireworks_openai_chat"

    def _convert(self, messages):
        out = []
        for m in messages:
            if isinstance(m, SystemMessage):
                out.append({"role": "system", "content": m.content})
            elif isinstance(m, HumanMessage):
                out.append({"role": "user", "content": m.content})
            elif isinstance(m, AIMessage):
                out.append({"role": "assistant", "content": m.content})
            else:
                out.append({"role": "user", "content": str(getattr(m, "content", m))})
        return out

    def _generate(self, messages, stop=None, run_manager=None, **kwargs) -> ChatResult:
        if not self.api_key:
            return ChatResult(generations=[ChatGeneration(message=AIMessage(content=""))],
                              llm_output={"error": "no_api_key"})

        def _call():
            with _CALL_LOCK:
                _pace()
                return self._client.chat.completions.create(
                    model=self.model,
                    messages=self._convert(messages),
                    temperature=kwargs.get("temperature", self.temperature),
                    max_tokens=kwargs.get("max_tokens", self.max_new_tokens),
                    stream=False,
                )

        try:
            resp = _with_retries(_call)
            text = ""
            if getattr(resp, "choices", None):
                ch = resp.choices[0]
                if getattr(ch, "message", None) and getattr(ch.message, "content", None):
                    text = ch.message.content
            return ChatResult(generations=[ChatGeneration(message=AIMessage(content=text or ""))],
                              llm_output={"model": self.model, "endpoint": "chat"})
        except Exception as e:
            # Return empty output; the UI shows SUMMARY_NOTICE when needed
            log.warning(f"Fireworks(Qwen) failed: {type(e).__name__}: {str(e)[:200]}")
            return ChatResult(generations=[ChatGeneration(message=AIMessage(content=""))],
                              llm_output={"error": str(e)})
# ========================== HF Router (provider="fireworks-ai") ==========================
class HFRouterChat(BaseChatModel):
    """Fallback used only when FIREWORKS_API_KEY is absent but HF_TOKEN is set."""
    model: str
    hf_token: str | None = None
    temperature: float = TEMP
    max_new_tokens: int = MAX_NEW_TOKENS

    def __init__(self, **data):
        super().__init__(**data)
        self._client = InferenceClient(provider="fireworks-ai", api_key=self.hf_token)

    @property
    def _llm_type(self) -> str:
        return "hf_router_fireworks"

    def _convert(self, messages):
        out = []
        for m in messages:
            if isinstance(m, SystemMessage):
                out.append({"role": "system", "content": m.content})
            elif isinstance(m, HumanMessage):
                out.append({"role": "user", "content": m.content})
            elif isinstance(m, AIMessage):
                out.append({"role": "assistant", "content": m.content})
            else:
                out.append({"role": "user", "content": str(getattr(m, "content", m))})
        return out

    def _generate(self, messages, stop=None, run_manager=None, **kwargs) -> ChatResult:
        if not self.hf_token:
            return ChatResult(generations=[ChatGeneration(message=AIMessage(content=""))],
                              llm_output={"error": "no_hf_token"})

        def _call():
            with _CALL_LOCK:
                _pace()
                return self._client.chat.completions.create(
                    model=self.model,  # "Qwen/Qwen3-Coder-30B-A3B-Instruct"
                    messages=self._convert(messages),
                    stream=False,
                    max_tokens=kwargs.get("max_tokens", self.max_new_tokens),
                    temperature=kwargs.get("temperature", self.temperature),
                )

        try:
            resp = _with_retries(_call)
            text = ""
            if getattr(resp, "choices", None):
                ch = resp.choices[0]
                if getattr(ch, "message", None) and getattr(ch.message, "content", None):
                    text = ch.message.content
                elif getattr(ch, "text", None):
                    text = ch.text
            return ChatResult(generations=[ChatGeneration(message=AIMessage(content=text or ""))],
                              llm_output={"model": self.model})
        except Exception as e:
            log.warning(f"HF Router(Qwen) failed: {type(e).__name__}: {str(e)[:200]}")
            return ChatResult(generations=[ChatGeneration(message=AIMessage(content=""))],
                              llm_output={"error": str(e)})
# =============================== Selection ===============================
def build_chat_llm():
    # Prefer Fireworks direct (Qwen)
    if FIREWORKS_API_KEY:
        log.info(f"Using Fireworks chat model: {FW_PRIMARY_MODEL}")
        return FireworksOpenAIChat(model=FW_PRIMARY_MODEL, api_key=FIREWORKS_API_KEY)
    # Fall back to the HF Router (if a token is provided)
    if HF_TOKEN:
        log.info(f"Using HF Router chat model: {HF_PRIMARY_MODEL}")
        return HFRouterChat(model=HF_PRIMARY_MODEL, hf_token=HF_TOKEN)
    log.warning("No working chat model; the connect notice will be shown instead.")
    return None
CHAT_LLM = build_chat_llm()
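
# A minimal smoke test: a sketch assuming one of the API keys above is set in the
# environment. Run `python llm_provider.py` to try the selected backend end to end.
if __name__ == "__main__":
    if CHAT_LLM is not None:
        reply = CHAT_LLM.invoke([HumanMessage(content="Reply with a one-line greeting.")])
        print(reply.content or SUMMARY_NOTICE)
    else:
        print(SUMMARY_NOTICE)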