AhmadA82 committed
Commit ac29781 · verified · 1 Parent(s): 17dca91

fast-response


n_ctx is now 4096, n_batch is now 32, and the thread count follows the number of CPU cores (the env-var override pattern is sketched right after these notes).
Lowered the default max_tokens to 256 in call_local_llm and call_llm.
Trimmed the saved chat history to the last 4 exchanges only.
Reduced the token limit on the /upload-files route to 256.
Try it now:
Restart the Space.
Test the response from the server interface and then from the local one; the reply should start sooner, with less pressure on the CPU.
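
For reference, the new defaults can still be raised or lowered per Space without touching the code, because app.py reads them through environment variables. A minimal sketch of that override pattern, using the variable names from the diff below (N_CTX, N_THREADS, N_BATCH); the values are illustrative only, set them as Space variables before restarting:

import os

# Same override pattern as in app.py: environment variables take precedence over the new defaults.
# The names come from the diff below; the concrete values here are only examples.
n_ctx = int(os.getenv("N_CTX", "4096"))                             # context window
n_threads = int(os.getenv("N_THREADS", str(os.cpu_count() or 2)))   # one thread per CPU core by default
n_batch = int(os.getenv("N_BATCH", "32"))                           # prompt-processing batch size

print(f"n_ctx={n_ctx}, n_threads={n_threads}, n_batch={n_batch}")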

Files changed (1)
app.py +14 -13
app.py CHANGED

@@ -90,8 +90,7 @@ LOCAL_GGUF_PATH = CACHE_DIR / LOCAL_GGUF_FILE
 MAX_FILE_BYTES = int(os.getenv("MAX_FILE_BYTES", str(10 * 1024 * 1024)))  # 10MB احتياطيًا
 
 SYSTEM_PROMPT = """<|im_start|>system
-You are a senior AI code analyst. Analyze projects with hybrid indexing (code graph + retrieval).
-Return structured, accurate, concise answers. Use Arabic + English labels in the final report.
+نفذ ما يطلبه المستخدم كاملا بدون توقف حتى تنهي الطلب.
 <|im_end|>"""
 
 # =========================
@@ -126,10 +125,10 @@ def load_local_model_if_configured():
         llm = Llama.from_pretrained(
             repo_id=LOCAL_GGUF_REPO,
             filename=LOCAL_GGUF_FILE,
-            # Llama params
-            n_ctx=int(os.getenv("N_CTX", "32768")),
-            n_threads=int(os.getenv("N_THREADS", "2")),
-            n_batch=int(os.getenv("N_BATCH", "64")),
+            # Llama params (tuned for HF Spaces CPU)
+            n_ctx=int(os.getenv("N_CTX", "4096")),
+            n_threads=int(os.getenv("N_THREADS", str(os.cpu_count() or 2))),
+            n_batch=int(os.getenv("N_BATCH", "32")),
             n_gpu_layers=int(os.getenv("N_GPU_LAYERS", "0")),
             use_mlock=False,
             verbose=False,
@@ -140,15 +139,17 @@ def load_local_model_if_configured():
         logger.error(f"❌ فشل تحميل/تشغيل GGUF: {e}")
 
 
-def call_local_llm(prompt: str, max_tokens: int = 800) -> str:
+def call_local_llm(prompt: str, max_tokens: int = 256) -> str:
     if llm is None:
         raise RuntimeError("النموذج المحلي غير محمل")
     try:
         res = llm(
             prompt,
-            max_tokens=max_tokens,
-            temperature=0.4,
+            max_tokens=min(max_tokens, 256),
+            temperature=0.5,
             top_p=0.9,
+            top_k=40,
+            repeat_penalty=1.1,
             stop=["<|im_end|>", "<|im_start|>"],
             echo=False
         )
@@ -211,7 +212,7 @@ def _call_hf_single_model(model_repo: str, prompt: str, max_new_tokens: int = 90
 def call_hf_inference(prompt: str, max_new_tokens: int = 900) -> str:
     raise RuntimeError("تم تعطيل HF Inference. النموذج المحلي مستخدم فقط.")
 
-def call_llm(prompt: str, max_tokens: int = 900) -> str:
+def call_llm(prompt: str, max_tokens: int = 256) -> str:
     return call_local_llm(prompt, max_tokens)
 
 # =========================
@@ -414,7 +415,7 @@ def upload_files(req: UploadFilesRequest):
             f"<|im_start|>user\nAnalyze these files and summarize key issues and functions.\n\n"
             + "\n\n".join(parts) + "\n<|im_end|>\n<|im_start|>assistant\n"
         )
-        summary = call_llm(prompt, max_tokens=900)
+        summary = call_llm(prompt, max_tokens=256)
         return {"status": "ok", "summary": summary, "files": [p.name for p in saved_paths]}
     finally:
         # حذف الملفات فورًا
@@ -438,11 +439,11 @@ def chat(req: ChatRequest):
     history = get_history(global_key)
     prompt = build_chat_prompt(history, req.message, "")
     try:
-        response_text = call_llm(prompt, max_tokens=700)
+        response_text = call_llm(prompt, max_tokens=256)
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"LLM error: {str(e)}")
 
-    updated = (history + [[req.message, response_text]])[-8:]
+    updated = (history + [[req.message, response_text]])[-4:]
     save_history(global_key, updated)
     return ChatResponse(response=response_text, updated_history=updated)
449