AhmadA82 committed
Commit bf1e634 · verified · 1 Parent(s): 62b67dd
Files changed (2)
  1. Dockerfile +2 -1
  2. app.py +131 -1
Dockerfile CHANGED
@@ -24,4 +24,5 @@ RUN pip install --no-cache-dir -r requirements.txt
 
 USER user
 
-CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "7860"]
+EXPOSE 7860
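The CMD line is re-emitted unchanged; the substantive addition is EXPOSE 7860. EXPOSE is image metadata only: the port the app actually listens on comes from uvicorn's --port flag, which matches the port Hugging Face Spaces routes to by default. A quick liveness probe against a locally running container, as a minimal sketch (the image tag ai-code-analyst is hypothetical):

# Build and run first, e.g.:
#   docker build -t ai-code-analyst . && docker run -p 7860:7860 ai-code-analyst
import requests

# The root endpoint doubles as the health check (see app.py below).
resp = requests.get("http://localhost:7860/", timeout=10)
print(resp.status_code)  # expect 200 once uvicorn is up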
app.py CHANGED
@@ -1,3 +1,11 @@
+
+# --- HF model lists ---
+THINKING_MODELS = ["Qwen/Qwen3-4B-Thinking-2507"]
+INSTRUCT_MODELS = ["Qwen/Qwen2.5-3B-Instruct", "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"]
+
+def _current_models():
+    return THINKING_MODELS if STATE.get("mode") == "thinking" else INSTRUCT_MODELS
+
 # app.py
 import os
 import json
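The mode flag kept in STATE (defined in the next hunk) selects which fallback list the inference layer walks; any value other than "thinking" falls back to the instruct list. A standalone sketch of that mapping, duplicating the definitions above so it runs on its own:

# Standalone sketch of the mode -> model-list mapping introduced above.
THINKING_MODELS = ["Qwen/Qwen3-4B-Thinking-2507"]
INSTRUCT_MODELS = ["Qwen/Qwen2.5-3B-Instruct", "deepseek-ai/DeepSeek-Coder-V2-Lite-Instruct"]
STATE = {"mode": "thinking"}

def _current_models():
    return THINKING_MODELS if STATE.get("mode") == "thinking" else INSTRUCT_MODELS

assert _current_models() == THINKING_MODELS
STATE["mode"] = "instruct"
assert _current_models() == INSTRUCT_MODELS  # any non-"thinking" value means instruct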
 
@@ -7,6 +15,28 @@ import threading
 from pathlib import Path
 from typing import List, Dict, Any, Tuple
 
+
+# --- Model mode state (thinking/instruct) with simple persistence ---
+from pathlib import Path
+APP_DIR = Path(__file__).parent
+DATA_DIR = APP_DIR / "data"
+DATA_DIR.mkdir(parents=True, exist_ok=True)
+STATE_PATH = DATA_DIR / "state.json"
+
+def _load_state():
+    if STATE_PATH.exists():
+        try:
+            return json.loads(STATE_PATH.read_text(encoding="utf-8"))
+        except Exception:
+            pass
+    return {"mode": "instruct"}
+
+def _save_state(s: dict):
+    STATE_PATH.write_text(json.dumps(s, ensure_ascii=False, indent=2), encoding="utf-8")
+
+STATE = _load_state()
+
+
 import numpy as np
 import faiss
 import pickle
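state.json is a small JSON document, so the mode survives process restarts as long as data/ sits on persistent storage; on a Space without a persistent volume it resets on rebuild. A self-contained round-trip sketch of the two helpers, writing to a temporary directory instead of the app's data/:

import json, tempfile
from pathlib import Path

STATE_PATH = Path(tempfile.mkdtemp()) / "state.json"  # stand-in for data/state.json

def _load_state():
    if STATE_PATH.exists():
        try:
            return json.loads(STATE_PATH.read_text(encoding="utf-8"))
        except Exception:
            pass  # a corrupt file falls through to the default
    return {"mode": "instruct"}

def _save_state(s: dict):
    STATE_PATH.write_text(json.dumps(s, ensure_ascii=False, indent=2), encoding="utf-8")

assert _load_state() == {"mode": "instruct"}  # no file yet -> default
_save_state({"mode": "thinking"})
assert _load_state() == {"mode": "thinking"}  # choice persists across loads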
 
@@ -400,6 +430,15 @@ def build_chat_prompt(history: List[List[str]], message: str, extra: str = "") -
 # =========================
 # FastAPI
 # =========================
+
+# --- Warm-up best effort on startup ---
+try:
+    _ = _hf_call_single(_current_models()[0], "ping", 1, timeout_s=30)
+    logging.info("[HF] Warm-up OK for %s", _current_models()[0])
+except Exception as e:
+    logging.warning("[HF] Warm-up failed: %s", e)
+
+
 app = FastAPI(title="AI Code Analyst")
 
 # --- Root endpoint for Hugging Face health checks and simple UI ---
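One caveat: this block runs at import time, but _hf_call_single is defined only at line 696 of the new file, so as committed the call raises NameError, the except branch catches it, and every start logs "Warm-up failed". Deferring the ping to a FastAPI startup hook is one way to run it after all definitions exist; a minimal sketch, not part of the commit:

# Sketch: run the warm-up once the app starts, after _hf_call_single exists.
@app.on_event("startup")
def _warm_up() -> None:
    try:
        _hf_call_single(_current_models()[0], "ping", 1, timeout_s=30)
        logging.info("[HF] Warm-up OK for %s", _current_models()[0])
    except Exception as e:  # best effort: never block startup
        logging.warning("[HF] Warm-up failed: %s", e)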
 
@@ -639,4 +678,95 @@ def chat(req: ChatRequest):
 
     updated = (history + [[req.message, response_text]])[-8:]
     save_history(req.session_id, updated)
-    return ChatResponse(response=response_text, updated_history=updated)
+    return ChatResponse(response=response_text, updated_history=updated)
+
+
+# --- Robust HF Inference with retries/fallback/warmup ---
+import requests, time
+
+HF_API_URL = "https://api-inference.huggingface.co/models/{model}"
+HF_TOKEN = os.getenv("HF_TOKEN", "")
+
+def _hf_headers():
+    hdr = {"Accept": "application/json"}
+    if HF_TOKEN:
+        hdr["Authorization"] = f"Bearer {HF_TOKEN}"
+    return hdr
+
+def _hf_call_single(model: str, prompt: str, max_new_tokens: int = 256, timeout_s: int = 60) -> str:
+    url = HF_API_URL.format(model=model)
+    payload = {
+        "inputs": prompt,
+        "parameters": {"max_new_tokens": max_new_tokens, "temperature": 0.3},
+        "options": {"wait_for_model": True, "use_cache": True}
+    }
+    tries, backoff = 0, 2
+    while True:
+        tries += 1
+        try:
+            r = requests.post(url, headers=_hf_headers(), json=payload, timeout=timeout_s)
+            if r.status_code == 503:
+                try:
+                    eta = r.json().get("estimated_time", 8)
+                except Exception:
+                    eta = 8
+                time.sleep(min(30, max(2, int(eta))))
+                continue
+            r.raise_for_status()
+            data = r.json()
+            if isinstance(data, list):
+                if data and isinstance(data[0], dict) and "generated_text" in data[0]:
+                    return data[0]["generated_text"]
+                if data and isinstance(data[0], dict) and "content" in data[0]:
+                    return data[0]["content"]
+            if isinstance(data, dict) and "generated_text" in data:
+                return data["generated_text"]
+            return json.dumps(data, ensure_ascii=False)
+        except requests.HTTPError as e:
+            status = getattr(e.response, "status_code", None)
+            if status in (502, 503, 504, 429) and tries < 3:
+                time.sleep(backoff); backoff *= 2; continue
+            try:
+                text = e.response.text
+            except Exception:
+                text = ""
+            raise RuntimeError(f"HF error {status} on {model}: {text}") from e
+        except requests.RequestException as e:
+            if tries < 3:
+                time.sleep(backoff); backoff *= 2; continue
+            raise RuntimeError(f"HF request failed on {model}: {e}") from e
+
+def call_hf_inference_robust(prompt: str, max_new_tokens: int = 256) -> str:
+    last_err = None
+    for m in _current_models():
+        try:
+            return _hf_call_single(m, prompt, max_new_tokens)
+        except Exception as e:
+            logging.warning(f"[HF] model {m} failed: {e}")
+            last_err = e
+            continue
+    raise RuntimeError(f"All HF models failed. Last: {last_err}")
+
+
+
+from fastapi import Body
+from pydantic import BaseModel
+
+class _SetModelIn(BaseModel):
+    mode: str  # 'thinking' or 'instruct'
+
+@app.post("/set-model")
+def set_model_endpoint(body: _SetModelIn):
+    mode = (body.mode or "").lower().strip()
+    if mode not in ("thinking", "instruct"):
+        raise HTTPException(400, "mode must be 'thinking' or 'instruct'")
+    STATE["mode"] = mode
+    _save_state(STATE)
+    # Try warm-up immediately to inform user about readiness
+    try:
+        _ = _hf_call_single(_current_models()[0], "ping", 1, timeout_s=30)
+        warmed = True
+    except Exception:
+        warmed = False
+    return {"ok": True, "mode": mode, "models": _current_models(), "warmed": warmed}
+
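End to end, /set-model flips the persisted mode, rebuilds the active model list, and reports whether the first model answered a one-token ping. A client-side sketch of the call (SPACE_URL is a placeholder for the deployed base URL):

import requests

SPACE_URL = "http://localhost:7860"  # placeholder; substitute the Space URL

r = requests.post(f"{SPACE_URL}/set-model", json={"mode": "thinking"}, timeout=120)
r.raise_for_status()
info = r.json()
print(info["mode"], info["models"], info["warmed"])
# e.g.: thinking ['Qwen/Qwen3-4B-Thinking-2507'] True

A warmed value of False is not fatal: call_hf_inference_robust still retries each model up to three times with exponential backoff on 429/5xx and network errors, then falls through to the next model in the current list.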