dondoesstuff committed
Commit c8ae67b · verified · 1 Parent(s): 1c6b552

Update app.py

Files changed (1)
  1. app.py +251 -222
app.py CHANGED
@@ -1,222 +1,251 @@
- import os
- import time
- import uuid
- from typing import List, Optional, Dict, Any
-
- import torch
- from fastapi import FastAPI, HTTPException
- from fastapi.responses import RedirectResponse
- from pydantic import BaseModel, Field
- from transformers import AutoModelForCausalLM, AutoTokenizer
-
- MODEL_ID = os.getenv("MODEL_ID", "LiquidAI/LFM2-1.2B")
- DEFAULT_MAX_TOKENS = int(os.getenv("MAX_TOKENS", "256"))
-
- app = FastAPI(title="OpenAI-compatible API for LiquidAI/LFM2-1.2B")
-
- tokenizer = None
- model = None
-
-
- def get_dtype() -> torch.dtype:
-     if torch.cuda.is_available():
-         # Prefer bfloat16 if supported; else float16
-         if torch.cuda.is_bf16_supported():
-             return torch.bfloat16
-         return torch.float16
-     # CPU
-     return torch.float32
-
-
- @app.on_event("startup")
- def load_model():
-     global tokenizer, model
-     dtype = get_dtype()
-     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True, trust_remote_code=True)
-     model = AutoModelForCausalLM.from_pretrained(
-         MODEL_ID,
-         torch_dtype=dtype,
-         device_map="auto",
-         trust_remote_code=True,
-     )
-     # Ensure eos/bos tokens exist
-     if tokenizer.eos_token is None:
-         tokenizer.eos_token = tokenizer.sep_token or tokenizer.pad_token or "</s>"
-     if tokenizer.pad_token is None:
-         tokenizer.pad_token = tokenizer.eos_token
-
-
- class ChatMessage(BaseModel):
-     role: str
-     content: str
-
-
- class ChatCompletionRequest(BaseModel):
-     model: Optional[str] = Field(default=MODEL_ID)
-     messages: List[ChatMessage]
-     temperature: Optional[float] = 0.7
-     top_p: Optional[float] = 0.95
-     max_tokens: Optional[int] = None
-     stop: Optional[List[str] | str] = None
-     n: Optional[int] = 1
-
-
- class CompletionRequest(BaseModel):
-     model: Optional[str] = Field(default=MODEL_ID)
-     prompt: str | List[str]
-     temperature: Optional[float] = 0.7
-     top_p: Optional[float] = 0.95
-     max_tokens: Optional[int] = None
-     stop: Optional[List[str] | str] = None
-     n: Optional[int] = 1
-
-
- class Usage(BaseModel):
-     prompt_tokens: int
-     completion_tokens: int
-     total_tokens: int
-
-
- # Simple chat prompt formatter
-
- def build_chat_prompt(messages: List[ChatMessage]) -> str:
-     system_prefix = "You are a helpful assistant."
-     system_msgs = [m.content for m in messages if m.role == "system"]
-     if system_msgs:
-         system_prefix = system_msgs[-1]
-
-     conv: List[str] = [f"System: {system_prefix}"]
-     for m in messages:
-         if m.role == "system":
-             continue
-         role = "User" if m.role == "user" else ("Assistant" if m.role == "assistant" else m.role.capitalize())
-         conv.append(f"{role}: {m.content}")
-     conv.append("Assistant:")
-     return "\n".join(conv)
-
-
- def apply_stop_sequences(text: str, stop: Optional[List[str] | str]) -> str:
-     if stop is None:
-         return text
-     stops = stop if isinstance(stop, list) else [stop]
-     cut = len(text)
-     for s in stops:
-         if not s:
-             continue
-         idx = text.find(s)
-         if idx != -1:
-             cut = min(cut, idx)
-     return text[:cut]
-
-
- def generate_once(prompt: str, temperature: float, top_p: float, max_new_tokens: int) -> Dict[str, Any]:
-     assert tokenizer is not None and model is not None, "Model not loaded"
-
-     inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
-     gen_ids = model.generate(
-         **inputs,
-         max_new_tokens=max_new_tokens,
-         do_sample=True if temperature and temperature > 0 else False,
-         temperature=max(0.0, float(temperature or 0.0)),
-         top_p=max(0.0, float(top_p or 1.0)),
-         pad_token_id=tokenizer.pad_token_id,
-         eos_token_id=tokenizer.eos_token_id,
-     )
-     out = tokenizer.decode(gen_ids[0][inputs["input_ids"].shape[-1]:], skip_special_tokens=True)
-     return {
-         "text": out,
-         "prompt_tokens": inputs["input_ids"].numel(),
-         "completion_tokens": gen_ids[0].shape[0] - inputs["input_ids"].shape[-1],
-     }
-
-
- @app.get("/")
- def root():
-     return RedirectResponse(url="/docs")
-
-
- @app.get("/health")
- def health():
-     return {"status": "ok", "model": MODEL_ID}
-
-
- @app.post("/v1/chat/completions")
- def chat_completions(req: ChatCompletionRequest):
-     if req.n and req.n > 1:
-         raise HTTPException(status_code=400, detail="Only n=1 is supported in this simple server.")
-     max_new = req.max_tokens or DEFAULT_MAX_TOKENS
-
-     prompt = build_chat_prompt(req.messages)
-     g = generate_once(prompt, req.temperature or 0.7, req.top_p or 0.95, max_new)
-     text = apply_stop_sequences(g["text"], req.stop)
-
-     created = int(time.time())
-     comp_id = f"chatcmpl-{uuid.uuid4().hex[:24]}"
-
-     usage = Usage(
-         prompt_tokens=g["prompt_tokens"],
-         completion_tokens=g["completion_tokens"],
-         total_tokens=g["prompt_tokens"] + g["completion_tokens"],
-     )
-
-     return {
-         "id": comp_id,
-         "object": "chat.completion",
-         "created": created,
-         "model": req.model or MODEL_ID,
-         "choices": [
-             {
-                 "index": 0,
-                 "message": {"role": "assistant", "content": text},
-                 "finish_reason": "stop",
-             }
-         ],
-         "usage": usage.dict(),
-     }
-
-
- @app.post("/v1/completions")
- def completions(req: CompletionRequest):
-     if req.n and req.n > 1:
-         raise HTTPException(status_code=400, detail="Only n=1 is supported in this simple server.")
-
-     prompts = req.prompt if isinstance(req.prompt, list) else [req.prompt]
-     if len(prompts) != 1:
-         raise HTTPException(status_code=400, detail="Only a single prompt is supported in this simple server.")
-
-     max_new = req.max_tokens or DEFAULT_MAX_TOKENS
-
-     g = generate_once(prompts[0], req.temperature or 0.7, req.top_p or 0.95, max_new)
-     text = apply_stop_sequences(g["text"], req.stop)
-
-     created = int(time.time())
-     comp_id = f"cmpl-{uuid.uuid4().hex[:24]}"
-
-     usage = Usage(
-         prompt_tokens=g["prompt_tokens"],
-         completion_tokens=g["completion_tokens"],
-         total_tokens=g["prompt_tokens"] + g["completion_tokens"],
-     )
-
-     return {
-         "id": comp_id,
-         "object": "text_completion",
-         "created": created,
-         "model": req.model or MODEL_ID,
-         "choices": [
-             {
-                 "index": 0,
-                 "text": text,
-                 "finish_reason": "stop",
-                 "logprobs": None,
-             }
-         ],
-         "usage": usage.dict(),
-     }
-
-
- if __name__ == "__main__":
-     import uvicorn
-
-     port = int(os.getenv("PORT", "7860"))
-     uvicorn.run("app:app", host="0.0.0.0", port=port, reload=False)
+ """
+ Minimal OpenAI-compatible local server that serves LiquidAI/LFM2-1.2B via Hugging Face
+ Transformers on CPU and exposes a subset of the OpenAI REST API (chat/completions, models).
+
+ This file is the app module (app.py). Run with:
+     pip install -r requirements.txt
+     python app.py
+
+ Or run with uvicorn directly (recommended for production/dev):
+     uvicorn app:app --host 0.0.0.0 --port 7860
+
+ Requirements (requirements.txt):
+     fastapi
+     uvicorn[standard]
+     transformers
+     torch
+
+ Notes:
+ - CPU-only: model loads on CPU (may be slow for a 1.2B model depending on your machine).
+ - Model repo id used: "LiquidAI/LFM2-1.2B" — adjust if you have a different path or local copy.
+ - This provides a simplified compatibility layer. It is NOT feature-complete with OpenAI's API
+   but implements common fields: messages, max_tokens, temperature, top_p, n, stop, stream (basic).
+ """
+
+ from fastapi import FastAPI, Request, HTTPException
+ from fastapi.responses import JSONResponse, StreamingResponse, PlainTextResponse
+ from fastapi.middleware.cors import CORSMiddleware
+ from pydantic import BaseModel
+ from typing import List, Optional, Any, Dict
+ import torch
+ from transformers import AutoTokenizer, AutoModelForCausalLM
+ import time
+ import json
+ import uuid
+
+ # -----------------------------
+ # Configuration
+ # -----------------------------
+ MODEL_ID = "LiquidAI/LFM2-1.2B"  # change to your model location or HF repo
+ HOST = "0.0.0.0"
+ PORT = 7860
+ DEVICE = torch.device("cpu")  # CPU-only as requested
+ DEFAULT_MAX_TOKENS = 256
+
+ # -----------------------------
+ # Load model & tokenizer
+ # -----------------------------
+ print(f"Loading tokenizer and model '{MODEL_ID}' on device {DEVICE} (CPU-only)... this may take a while")
+ try:
+     tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, use_fast=True)
+     model = AutoModelForCausalLM.from_pretrained(MODEL_ID, torch_dtype=torch.float32)
+     model.to(DEVICE)
+     model.eval()
+ except Exception as e:
+     raise RuntimeError(f"Failed to load model/tokenizer for '{MODEL_ID}': {e}")
+
+ # If tokenizer has no pad/eos, try to set sensible defaults
+ if tokenizer.pad_token_id is None:
+     if tokenizer.eos_token_id is not None:
+         tokenizer.pad_token_id = tokenizer.eos_token_id
+
+ # -----------------------------
+ # FastAPI app
+ # -----------------------------
+ app = FastAPI(title="Local OpenAI-compatible server (transformers)", version="0.1")
+ app.add_middleware(
+     CORSMiddleware,
+     allow_origins=["*"],
+     allow_credentials=True,
+     allow_methods=["*"],
+     allow_headers=["*"],
+ )
+
+ # -----------------------------
+ # Pydantic models (request bodies)
+ # -----------------------------
+ class Message(BaseModel):
+     role: str
+     content: str
+
+ class ChatCompletionRequest(BaseModel):
+     model: Optional[str] = MODEL_ID
+     messages: List[Message]
+     max_tokens: Optional[int] = DEFAULT_MAX_TOKENS
+     temperature: Optional[float] = 0.0
+     top_p: Optional[float] = 1.0
+     n: Optional[int] = 1
+     stop: Optional[List[str]] = None
+     stream: Optional[bool] = False
+
+ # -----------------------------
+ # Helpers
+ # -----------------------------
+ def build_prompt_from_messages(messages: List[Dict[str, Any]]) -> str:
+     # Simple conversational prompt formatting. Adjust to suit model's expected format.
+     parts = []
+     for m in messages:
+         role = m.get("role", "user")
+         content = m.get("content", "")
+         if role == "system":
+             parts.append(f"<|system|> {content}\n")
+         elif role == "user":
+             parts.append(f"User: {content}\n")
+         elif role == "assistant":
+             parts.append(f"Assistant: {content}\n")
+         else:
+             parts.append(f"{role}: {content}\n")
+     parts.append("Assistant: ")
+     return "".join(parts)
+
+
+ def apply_stop_sequences(text: str, stops: Optional[List[str]]) -> str:
+     if not stops:
+         return text
+     idx = None
+     for s in stops:
+         if s == "":
+             continue
+         pos = text.find(s)
+         if pos != -1:
+             if idx is None or pos < idx:
+                 idx = pos
+     if idx is not None:
+         return text[:idx]
+     return text
+
+ # -----------------------------
+ # Endpoints
+ # -----------------------------
+ @app.get("/", response_class=PlainTextResponse)
+ async def root():
+     return "Local OpenAI-compatible server running. Use /v1/chat/completions or /v1/models"
+
+ @app.get("/v1/models")
+ async def list_models():
+     return {"data": [{"id": MODEL_ID, "object": "model"}], "object": "list"}
+
+ @app.post("/v1/chat/completions")
+ async def chat_completions(request: Request, body: ChatCompletionRequest):
+     # Basic validation
+     if body.model is None or body.model != MODEL_ID:
+         # Reject requests for any model other than the one loaded on this server
+         raise HTTPException(status_code=400, detail={"error": "invalid_model", "message": f"Only model {MODEL_ID} is available on this server."})
+
+     prompt = build_prompt_from_messages([m.dict() for m in body.messages])
+
+     # Tokenize
+     inputs = tokenizer(prompt, return_tensors="pt")
+     input_ids = inputs["input_ids"].to(DEVICE)
+     input_len = input_ids.shape[-1]
+
+     # Generation settings
+     gen_kwargs = {
+         "max_new_tokens": body.max_tokens,
+         "do_sample": bool(body.temperature and body.temperature > 0.0),
+         "temperature": float(body.temperature or 0.0),
+         "top_p": float(body.top_p or 1.0),
+         "num_return_sequences": int(body.n or 1),
+         "pad_token_id": tokenizer.pad_token_id or tokenizer.eos_token_id,
+         # note: on CPU large models may be slow
+     }
+
+     # Synchronous generation
+     with torch.no_grad():
+         outputs = model.generate(input_ids, **gen_kwargs)
+
+     choices = []
+     for i, out_ids in enumerate(outputs):
+         full_text = tokenizer.decode(out_ids, skip_special_tokens=True)
+         # Attempt to strip the prompt prefix to return only the generated reply
+         # (best-effort: exact prompt match first, token-count slice as fallback)
+         stripped = full_text
+         try:
+             # prefer exact match; fallback to trimming by token count
+             if prompt.strip() and prompt in full_text:
+                 stripped = full_text.split(prompt, 1)[1]
+             else:
+                 # fallback: drop the first input_len tokens and decode only the generated part
+                 gen_only_ids = out_ids[input_len:]
+                 # (slicing by token count avoids relying on the prompt text surviving decoding exactly)
+                 stripped = tokenizer.decode(gen_only_ids, skip_special_tokens=True)
+         except Exception:
+             stripped = full_text
+
+         # apply stop sequences
+         stripped = apply_stop_sequences(stripped, body.stop)
+
+         # build choice structure similar to OpenAI
+         choice = {
+             "index": i,
+             "message": {"role": "assistant", "content": stripped},
+             "finish_reason": "stop" if body.stop else "length",
+         }
+         choices.append(choice)
+
+     # approximate token usage
+     completion_tokens = max(0, (outputs.shape[-1] - input_len) if outputs is not None else 0)
+     usage = {"prompt_tokens": int(input_len), "completion_tokens": int(completion_tokens), "total_tokens": int(input_len + completion_tokens)}
+
+     response = {
+         "id": str(uuid.uuid4()),
+         "object": "chat.completion",
+         "created": int(time.time()),
+         "model": body.model,
+         "choices": choices,
+         "usage": usage,
+     }
+
+     # Streaming: rudimentary implementation that streams chunks of the final text as SSE
+     if body.stream:
+         # Only support streaming a single response (n > 1 will still stream the first)
+         text_to_stream = choices[0]["message"]["content"]
+         def event_stream():
+             # send a few small chunks
+             chunk_size = 128
+             for start in range(0, len(text_to_stream), chunk_size):
+                 chunk = text_to_stream[start:start+chunk_size]
+                 payload = {"id": response["id"], "object": "chat.completion.chunk", "choices": [{"delta": {"content": chunk}, "index": 0}]}
+                 yield f"data: {json.dumps(payload)}\n\n"
+             # final done message
+             done_payload = {"id": response["id"], "object": "chat.completion.chunk", "choices": [{"delta": {}, "index": 0}], "done": True}
+             yield f"data: {json.dumps(done_payload)}\n\n"
+         return StreamingResponse(event_stream(), media_type="text/event-stream")
+
+     return JSONResponse(response)
+
+ # A convenience POST /v1/completions that accepts 'prompt' (legacy completions API)
+ class CompletionRequest(BaseModel):
+     model: Optional[str] = MODEL_ID
+     prompt: Optional[str] = ""
+     max_tokens: Optional[int] = DEFAULT_MAX_TOKENS
+     temperature: Optional[float] = 0.0
+     top_p: Optional[float] = 1.0
+     n: Optional[int] = 1
+     stop: Optional[List[str]] = None
+     stream: Optional[bool] = False
+
+ @app.post("/v1/completions")
+ async def completions(req: CompletionRequest):
+     # wrap prompt into the chat format for our generator
+     messages = [Message(role="user", content=req.prompt)]
+     chat_req = ChatCompletionRequest(model=req.model, messages=messages, max_tokens=req.max_tokens, temperature=req.temperature, top_p=req.top_p, n=req.n, stop=req.stop, stream=req.stream)
+     # call the chat_completions handler directly (a minimal valid ASGI scope is needed to construct a Request)
+     return await chat_completions(Request(scope={"type": "http"}), chat_req)
+
+ # -----------------------------
+ # If executed directly, run uvicorn
+ # -----------------------------
+ if __name__ == "__main__":
+     import uvicorn
+     uvicorn.run("app:app", host=HOST, port=PORT, log_level="info")
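For reference, a request against the updated server might look like the sketch below. It is a minimal, illustrative example and not part of the commit: it assumes the server from this app.py is running locally on the configured host/port (0.0.0.0:7860) with the default MODEL_ID, it uses only the Python standard library, and the prompt text and sampling parameters are placeholders.

    import json
    import urllib.request

    # Assumed local endpoint; matches HOST/PORT configured in app.py above.
    url = "http://localhost:7860/v1/chat/completions"
    payload = {
        "model": "LiquidAI/LFM2-1.2B",  # must match MODEL_ID or the server returns 400
        "messages": [{"role": "user", "content": "Say hello in one sentence."}],
        "max_tokens": 64,
        "temperature": 0.7,
    }
    req = urllib.request.Request(
        url,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req) as resp:
        body = json.loads(resp.read().decode("utf-8"))

    # The response mirrors the OpenAI-style shape built in chat_completions above.
    print(body["choices"][0]["message"]["content"])
    print(body["usage"])

When "stream": true is sent instead, the same endpoint responds with text/event-stream data in the chat.completion.chunk shape produced by event_stream, rather than a single JSON body.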