AhmadA82 committed
Commit ac29781 · verified · 1 Parent(s): 17dca91

fast-response


n_ctx is now 4096, n_batch is now 32, and the thread count follows the number of CPU cores (the env-var override pattern is sketched right after these notes).
Lowered the default max_tokens to 256 in call_local_llm and call_llm.
Trimmed the saved chat history to the last 4 exchanges only.
Reduced the token limit on the /upload-files route to 256.
Try it now:
Restart the Space.
Test the response from the server interface and then from the local one; the reply should start sooner, with less pressure on the CPU.
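
For reference, the new defaults can still be raised or lowered per Space without touching the code, because app.py reads them through environment variables. A minimal sketch of that override pattern, using the variable names from the diff below (N_CTX, N_THREADS, N_BATCH); the values are illustrative only, set them as Space variables before restarting:

import os

# Same override pattern as in app.py: environment variables take precedence over the new defaults.
# The names come from the diff below; the concrete values here are only examples.
n_ctx = int(os.getenv("N_CTX", "4096"))                             # context window
n_threads = int(os.getenv("N_THREADS", str(os.cpu_count() or 2)))   # one thread per CPU core by default
n_batch = int(os.getenv("N_BATCH", "32"))                           # prompt-processing batch size

print(f"n_ctx={n_ctx}, n_threads={n_threads}, n_batch={n_batch}")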

Files changed (1)
app.py +14 -13
app.py CHANGED

@@ -90,8 +90,7 @@ LOCAL_GGUF_PATH = CACHE_DIR / LOCAL_GGUF_FILE
 MAX_FILE_BYTES = int(os.getenv("MAX_FILE_BYTES", str(10 * 1024 * 1024)))  # 10MB احتياطيًا
 
 SYSTEM_PROMPT = """<|im_start|>system
-You are a senior AI code analyst. Analyze projects with hybrid indexing (code graph + retrieval).
-Return structured, accurate, concise answers. Use Arabic + English labels in the final report.
+نفذ ما يطلبه المستخدم كاملا بدون توقف حتى تنهي الطلب.
 <|im_end|>"""
 
 # =========================
@@ -126,10 +125,10 @@ def load_local_model_if_configured():
         llm = Llama.from_pretrained(
             repo_id=LOCAL_GGUF_REPO,
             filename=LOCAL_GGUF_FILE,
-            # Llama params
-            n_ctx=int(os.getenv("N_CTX", "32768")),
-            n_threads=int(os.getenv("N_THREADS", "2")),
-            n_batch=int(os.getenv("N_BATCH", "64")),
+            # Llama params (tuned for HF Spaces CPU)
+            n_ctx=int(os.getenv("N_CTX", "4096")),
+            n_threads=int(os.getenv("N_THREADS", str(os.cpu_count() or 2))),
+            n_batch=int(os.getenv("N_BATCH", "32")),
             n_gpu_layers=int(os.getenv("N_GPU_LAYERS", "0")),
             use_mlock=False,
             verbose=False,
@@ -140,15 +139,17 @@ def load_local_model_if_configured():
         logger.error(f"❌ فشل تحميل/تشغيل GGUF: {e}")
 
 
-def call_local_llm(prompt: str, max_tokens: int = 800) -> str:
+def call_local_llm(prompt: str, max_tokens: int = 256) -> str:
     if llm is None:
         raise RuntimeError("النموذج المحلي غير محمل")
     try:
         res = llm(
             prompt,
-            max_tokens=max_tokens,
-            temperature=0.4,
+            max_tokens=min(max_tokens, 256),
+            temperature=0.5,
             top_p=0.9,
+            top_k=40,
+            repeat_penalty=1.1,
             stop=["<|im_end|>", "<|im_start|>"],
             echo=False
         )
@@ -211,7 +212,7 @@ def _call_hf_single_model(model_repo: str, prompt: str, max_new_tokens: int = 90
 def call_hf_inference(prompt: str, max_new_tokens: int = 900) -> str:
     raise RuntimeError("تم تعطيل HF Inference. النموذج المحلي مستخدم فقط.")
 
-def call_llm(prompt: str, max_tokens: int = 900) -> str:
+def call_llm(prompt: str, max_tokens: int = 256) -> str:
     return call_local_llm(prompt, max_tokens)
 
 # =========================
@@ -414,7 +415,7 @@ def upload_files(req: UploadFilesRequest):
             f"<|im_start|>user\nAnalyze these files and summarize key issues and functions.\n\n"
             + "\n\n".join(parts) + "\n<|im_end|>\n<|im_start|>assistant\n"
         )
-        summary = call_llm(prompt, max_tokens=900)
+        summary = call_llm(prompt, max_tokens=256)
         return {"status": "ok", "summary": summary, "files": [p.name for p in saved_paths]}
     finally:
         # حذف الملفات فورًا
@@ -438,11 +439,11 @@ def chat(req: ChatRequest):
     history = get_history(global_key)
     prompt = build_chat_prompt(history, req.message, "")
     try:
-        response_text = call_llm(prompt, max_tokens=700)
+        response_text = call_llm(prompt, max_tokens=256)
     except Exception as e:
         raise HTTPException(status_code=500, detail=f"LLM error: {str(e)}")
 
-    updated = (history + [[req.message, response_text]])[-8:]
+    updated = (history + [[req.message, response_text]])[-4:]
     save_history(global_key, updated)
     return ChatResponse(response=response_text, updated_history=updated)
449