n_ctx is now 4096, n_batch is now 32, and the thread count now follows the number of CPU cores.
I lowered the default max_tokens to 256 in call_local_llm and call_llm.
I trimmed the saved chat history to the last 4 exchanges only.
I reduced the token limit on the /upload-files route to 256.
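If a larger Space needs different values later, the loader already reads them from environment variables (N_CTX, N_THREADS, N_BATCH, N_GPU_LAYERS), which on Hugging Face Spaces would normally be set as Space variables in the settings. A minimal in-process sketch of the same override, assuming it runs before load_local_model_if_configured(); the values here are only illustrative, not recommendations:

import os

# Hypothetical overrides of the tuned defaults; the variable names are the
# ones app.py reads, the values below are examples only.
os.environ.setdefault("N_CTX", "2048")       # smaller context window, less RAM per request
os.environ.setdefault("N_THREADS", "4")      # pin the thread count instead of using every core
os.environ.setdefault("N_BATCH", "64")       # a larger batch can speed up prompt processing
os.environ.setdefault("N_GPU_LAYERS", "0")   # keep 0 on a CPU-only Space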
Try it now:
Restart the Space.
Test the response from the server interface and then from the local interface. You should notice the reply starting sooner and less load on the CPU.
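For a quick end-to-end check from outside the UI, here is a minimal sketch with Python requests. It assumes the chat handler is mounted at /chat, that ChatRequest needs only the "message" field, and that the Space URL below is a placeholder:

import time
import requests

SPACE_URL = "https://your-space.hf.space"  # placeholder, replace with the real Space URL

t0 = time.time()
# /chat is assumed to be the route behind chat(req: ChatRequest).
r = requests.post(
    f"{SPACE_URL}/chat",
    json={"message": "Summarize the project status in two sentences."},
    timeout=120,
)
r.raise_for_status()
data = r.json()
print(f"round trip: {time.time() - t0:.1f}s")
print(data["response"])               # ChatResponse.response
print(len(data["updated_history"]))   # capped at 4 exchanges by the new code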
app.py (CHANGED)

@@ -90,8 +90,7 @@ LOCAL_GGUF_PATH = CACHE_DIR / LOCAL_GGUF_FILE
 MAX_FILE_BYTES = int(os.getenv("MAX_FILE_BYTES", str(10 * 1024 * 1024)))  # 10MB احتياطيًا

 SYSTEM_PROMPT = """<|im_start|>system
-
-Return structured, accurate, concise answers. Use Arabic + English labels in the final report.
+نفذ ما يطلبه المستخدم كاملا بدون توقف حتى تنهي الطلب.
 <|im_end|>"""

 # =========================

@@ -126,10 +125,10 @@ def load_local_model_if_configured():
         llm = Llama.from_pretrained(
             repo_id=LOCAL_GGUF_REPO,
             filename=LOCAL_GGUF_FILE,
-            # Llama params
-            n_ctx=int(os.getenv("N_CTX", "
-            n_threads=int(os.getenv("N_THREADS",
-            n_batch=int(os.getenv("N_BATCH", "
+            # Llama params (tuned for HF Spaces CPU)
+            n_ctx=int(os.getenv("N_CTX", "4096")),
+            n_threads=int(os.getenv("N_THREADS", str(os.cpu_count() or 2))),
+            n_batch=int(os.getenv("N_BATCH", "32")),
             n_gpu_layers=int(os.getenv("N_GPU_LAYERS", "0")),
             use_mlock=False,
             verbose=False,

@@ -140,15 +139,17 @@ def load_local_model_if_configured():
         logger.error(f"❌ فشل تحميل/تشغيل GGUF: {e}")


-def call_local_llm(prompt: str, max_tokens: int =
+def call_local_llm(prompt: str, max_tokens: int = 256) -> str:
     if llm is None:
         raise RuntimeError("النموذج المحلي غير محمل")
     try:
         res = llm(
             prompt,
-            max_tokens=max_tokens,
-            temperature=0.
+            max_tokens=min(max_tokens, 256),
+            temperature=0.5,
             top_p=0.9,
+            top_k=40,
+            repeat_penalty=1.1,
             stop=["<|im_end|>", "<|im_start|>"],
             echo=False
         )

@@ -211,7 +212,7 @@ def _call_hf_single_model(model_repo: str, prompt: str, max_new_tokens: int = 90
 def call_hf_inference(prompt: str, max_new_tokens: int = 900) -> str:
     raise RuntimeError("تم تعطيل HF Inference. النموذج المحلي مستخدم فقط.")

-def call_llm(prompt: str, max_tokens: int =
+def call_llm(prompt: str, max_tokens: int = 256) -> str:
     return call_local_llm(prompt, max_tokens)

 # =========================

@@ -414,7 +415,7 @@ def upload_files(req: UploadFilesRequest):
             f"<|im_start|>user\nAnalyze these files and summarize key issues and functions.\n\n"
             + "\n\n".join(parts) + "\n<|im_end|>\n<|im_start|>assistant\n"
         )
-        summary = call_llm(prompt, max_tokens=
+        summary = call_llm(prompt, max_tokens=256)
         return {"status": "ok", "summary": summary, "files": [p.name for p in saved_paths]}
     finally:
         # حذف الملفات فورًا

@@ -438,11 +439,11 @@ def chat(req: ChatRequest):
     history = get_history(global_key)
     prompt = build_chat_prompt(history, req.message, "")
     try:
-        response_text = call_llm(prompt, max_tokens=
+        response_text = call_llm(prompt, max_tokens=256)
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"LLM error: {str(e)}")

-    updated = (history + [[req.message, response_text]])[-
+    updated = (history + [[req.message, response_text]])[-4:]
     save_history(global_key, updated)
     return ChatResponse(response=response_text, updated_history=updated)

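One note on the history cap: each history entry is a [user, assistant] pair, so the [-4:] slice keeps the last four exchanges (eight messages), which helps keep the rebuilt prompt well inside the 4096-token n_ctx. A minimal sketch of the same slicing in isolation:

# Same trimming rule the /chat handler now applies.
history = [["q1", "a1"], ["q2", "a2"], ["q3", "a3"], ["q4", "a4"]]
updated = (history + [["q5", "a5"]])[-4:]
print(updated)  # [['q2', 'a2'], ['q3', 'a3'], ['q4', 'a4'], ['q5', 'a5']]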