use-cpu
- Dockerfile +2 -1
- app.py +131 -1
Dockerfile
CHANGED
@@ -24,4 +24,5 @@ RUN pip install --no-cache-dir -r requirements.txt
 
 USER user
 
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+EXPOSE 7860
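A quick way to confirm the container actually serves on the exposed port is to hit the root endpoint (added for Hugging Face health checks) once the Space is running, or locally after something like `docker run -p 7860:7860 <image>`. The base URL below is an assumption for a local run, not part of this commit:

import requests

# Hypothetical local check: the root health-check endpoint should answer
# on the port named in CMD/EXPOSE (7860 here).
r = requests.get("http://localhost:7860/", timeout=10)
print(r.status_code, r.text[:200])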
app.py
CHANGED
@@ -1,3 +1,11 @@
+
+# --- HF model lists ---
+THINKING_MODELS = ["Qwen/Qwen3-4B-Thinking-2507"]
+INSTRUCT_MODELS = ["Qwen/Qwen2.5-3B-Instruct", "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"]
+
+def _current_models():
+    return THINKING_MODELS if STATE.get("mode") == "thinking" else INSTRUCT_MODELS
+
 # app.py
 import os
 import json
@@ -7,6 +15,28 @@ import threading
 from pathlib import Path
 from typing import List, Dict, Any, Tuple
 
+
+# --- Model mode state (thinking/instruct) with simple persistence ---
+from pathlib import Path
+APP_DIR = Path(__file__).parent
+DATA_DIR = APP_DIR / "data"
+DATA_DIR.mkdir(parents=True, exist_ok=True)
+STATE_PATH = DATA_DIR / "state.json"
+
+def _load_state():
+    if STATE_PATH.exists():
+        try:
+            return json.loads(STATE_PATH.read_text(encoding="utf-8"))
+        except Exception:
+            pass
+    return {"mode": "instruct"}
+
+def _save_state(s: dict):
+    STATE_PATH.write_text(json.dumps(s, ensure_ascii=False, indent=2), encoding="utf-8")
+
+STATE = _load_state()
+
+
 import numpy as np
 import faiss
 import pickle
@@ -400,6 +430,15 @@ def build_chat_prompt(history: List[List[str]], message: str, extra: str = "") -
 # =========================
 # FastAPI
 # =========================
+
+# --- Warm-up best effort on startup ---
+try:
+    _ = _hf_call_single(_current_models()[0], "ping", 1, timeout_s=30)
+    logging.info("[HF] Warm-up OK for %s", _current_models()[0])
+except Exception as e:
+    logging.warning("[HF] Warm-up failed: %s", e)
+
+
 app = FastAPI(title="AI Code Analyst")
 
 # --- Root endpoint for Hugging Face health checks and simple UI ---
@@ -639,4 +678,95 @@ def chat(req: ChatRequest):
 
     updated = (history + [[req.message, response_text]])[-8:]
     save_history(req.session_id, updated)
-    return ChatResponse(response=response_text, updated_history=updated)
+    return ChatResponse(response=response_text, updated_history=updated)
+
+
+# --- Robust HF Inference with retries/fallback/warmup ---
+import requests, time
+
+HF_API_URL = "https://api-inference.huggingface.co/models/{model}"
+HF_TOKEN = os.getenv("HF_TOKEN","")
+
+def _hf_headers():
+    hdr = {"Accept":"application/json"}
+    if HF_TOKEN:
+        hdr["Authorization"] = f"Bearer {HF_TOKEN}"
+    return hdr
+
+def _hf_call_single(model: str, prompt: str, max_new_tokens: int = 256, timeout_s: int = 60) -> str:
+    url = HF_API_URL.format(model=model)
+    payload = {
+        "inputs": prompt,
+        "parameters": {"max_new_tokens": max_new_tokens, "temperature": 0.3},
+        "options": {"wait_for_model": True, "use_cache": True}
+    }
+    tries, backoff = 0, 2
+    while True:
+        tries += 1
+        try:
+            r = requests.post(url, headers=_hf_headers(), json=payload, timeout=timeout_s)
+            if r.status_code == 503:
+                try:
+                    eta = r.json().get("estimated_time", 8)
+                except Exception:
+                    eta = 8
+                time.sleep(min(30, max(2, int(eta))))
+                continue
+            r.raise_for_status()
+            data = r.json()
+            if isinstance(data, list):
+                if data and isinstance(data[0], dict) and "generated_text" in data[0]:
+                    return data[0]["generated_text"]
+                if data and isinstance(data[0], dict) and "content" in data[0]:
+                    return data[0]["content"]
+            if isinstance(data, dict) and "generated_text" in data:
+                return data["generated_text"]
+            return json.dumps(data, ensure_ascii=False)
+        except requests.HTTPError as e:
+            status = getattr(e.response, "status_code", None)
+            if status in (502, 503, 504, 429) and tries < 3:
+                time.sleep(backoff); backoff *= 2; continue
+            try:
+                text = e.response.text
+            except Exception:
+                text = ""
+            raise RuntimeError(f"HF error {status} on {model}: {text}") from e
+        except requests.RequestException as e:
+            if tries < 3:
+                time.sleep(backoff); backoff *= 2; continue
+            raise RuntimeError(f"HF request failed on {model}: {e}") from e
+
+def call_hf_inference_robust(prompt: str, max_new_tokens: int = 256) -> str:
+    last_err = None
+    for m in _current_models():
+        try:
+            return _hf_call_single(m, prompt, max_new_tokens)
+        except Exception as e:
+            logging.warning(f"[HF] model {m} failed: {e}")
+            last_err = e
+            continue
+    raise RuntimeError(f"All HF models failed. Last: {last_err}")
+
+
+
+from fastapi import Body
+from pydantic import BaseModel
+
+class _SetModelIn(BaseModel):
+    mode: str  # 'thinking' or 'instruct'
+
+@app.post("/set-model")
+def set_model_endpoint(body: _SetModelIn):
+    mode = (body.mode or "").lower().strip()
+    if mode not in ("thinking","instruct"):
+        raise HTTPException(400, "mode must be 'thinking' or 'instruct'")
+    STATE["mode"] = mode
+    _save_state(STATE)
+    # Try warm-up immediately to inform user about readiness
+    try:
+        _ = _hf_call_single(_current_models()[0], "ping", 1, timeout_s=30)
+        warmed = True
+    except Exception:
+        warmed = False
+    return {"ok": True, "mode": mode, "models": _current_models(), "warmed": warmed}
+
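For reference, a minimal client-side sketch of the new /set-model endpoint added in this commit. The base URL is a placeholder for wherever the Space is reachable; the request field ("mode") and response keys ("ok", "mode", "models", "warmed") come from _SetModelIn and set_model_endpoint above:

import requests

BASE = "http://localhost:7860"  # hypothetical base URL for the running Space

# Switch the backend to the "thinking" model list and report whether warm-up succeeded.
resp = requests.post(f"{BASE}/set-model", json={"mode": "thinking"}, timeout=60)
resp.raise_for_status()
info = resp.json()
print(info["mode"], info["models"], "warmed:", info["warmed"])

If the selected model is still loading on the Inference API, "warmed" comes back false; since _hf_call_single sends wait_for_model and retries on 503, the first real request afterwards may simply take longer rather than fail.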