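# NOTE: residue from the web file viewer this source was copied from, not program code:
# uploader avatar text "Kakashi-hatake's picture", commit 56d2f3b (verified).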
from io import BytesIO
import json, re
import os
import base64
import requests
from fastapi import FastAPI, UploadFile, File
from fastapi.responses import JSONResponse
from fastapi.middleware.cors import CORSMiddleware
from PIL import Image
# FastAPI application instance; the route decorators in this file register on it.
app = FastAPI(title="GLM-4.1V-9B-Thinking")
# Enable CORS for frontend interaction (Gradio/Spaces UI).
# Every origin, HTTP method and request header is allowed ("*").
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_methods=["*"],
    allow_headers=["*"],
)
# Chat-completions endpoint of the Hugging Face inference router.
API_URL = "https://router.huggingface.co/v1/chat/completions"
# Request headers for API_URL. The bearer token is read from the `access_token`
# environment variable when this module is imported, so a missing variable
# raises KeyError at import time rather than on the first request.
HEADERS = {
    "Authorization": f"Bearer {os.environ['access_token']}",
    "Content-Type": "application/json"
}
# Instruction text sent as the text part of the user message, alongside the image.
# It asks the model for a bare JSON array of {"item", "price"} objects; the
# /extract/ handler searches the model's reply for exactly that kind of array.
# This is runtime text: editing it changes what the model is asked to do.
PROMPT = """
You are an AI assistant. Extract item names and their prices from the following image.
Your task is to extract item names and their corresponding prices from the image provided.
Return ONLY a clean JSON array in this format:
[
{"item": "<item_name>", "price": "<price>"},
...
]
⚠️ Guidelines:
- Do not include any explanation or text before/after the JSON.
- Include only entries that have both item and price.
- Preserve original spellings and formatting from the image.
- If prices are written in ₹, Rs., or INR, keep the symbol as is.
- Handle both packaged labels (like chips or snacks) and printed/handwritten menus.
- If there are duplicates or unclear text, skip them.
Only return the final JSON output, No explanation.
Make sure each entry has both item and price, and preserve the original spelling.
"""
def resize_image(image: Image.Image, max_size=(1024, 1024)) -> Image.Image:
    """Shrink ``image`` so it fits inside ``max_size`` and hand it back.

    The work is delegated to the image's own ``thumbnail`` method, which is
    called on the object that was passed in; that very same object is
    returned, so the caller's reference is modified as well.

    Args:
        image: Image to shrink.
        max_size: Bounding box forwarded to ``thumbnail``.

    Returns:
        The same ``image`` object that was passed in.
    """
    # No copy is made on purpose: the caller reads attributes of the returned
    # object (e.g. its format) and expects them to still be populated.
    image.thumbnail(max_size)
    return image
async def encode_image_to_data_url(file: UploadFile = File(...)) -> str:
    """Downscale an uploaded image and return it as a base64 ``data:`` URL.

    The upload is decoded with Pillow, shrunk via ``resize_image``, re-encoded
    in the format it was uploaded in (``quality=80`` is passed to the encoder)
    and wrapped as ``data:<mime>;base64,<payload>``.

    The MIME type is derived from the format the bytes are actually encoded
    in. The client-supplied ``Content-Type`` is only a fallback, because it
    may be missing or may not match the real image data.

    Args:
        file: The uploaded image.

    Returns:
        A ``data:`` URL containing the re-encoded image.

    Raises:
        Exception: Whatever Pillow raises for unreadable or unsupported image
            data propagates to the caller unchanged.
    """
    image = Image.open(BytesIO(await file.read()))

    # Capture the format before resizing: only images produced by Image.open
    # carry it, so it would be lost if the resize step ever returned a new
    # image object. PNG is a lossless fallback for the (unexpected) None case.
    image_format = image.format or "PNG"

    # Preprocessing
    image = resize_image(image)

    # Compress and convert to bytes
    buffered = BytesIO()
    image.save(buffered, format=image_format, quality=80)

    # Encode to base64 (getvalue() returns the whole buffer regardless of the
    # current stream position, so no seek is required).
    base64_image = base64.b64encode(buffered.getvalue()).decode("utf-8")

    # Prefer the MIME type Pillow has registered for the encoded format.
    mime_type = (
        Image.MIME.get(image_format)
        or file.content_type
        or "application/octet-stream"
    )
    return f"data:{mime_type};base64,{base64_image}"
@app.get("/")
def root():
    """Liveness endpoint: returns a fixed message saying the API is running."""
    status = {"message": "GLM 4.1V API for menu extraction is running."}
    return status
# (connect, read) timeouts in seconds for the upstream chat-completion call.
# The read timeout is generous because the model has to process an image.
_UPSTREAM_TIMEOUT = (10, 120)

# First JSON array of objects in the model's reply (non-greedy, spans newlines).
_JSON_ARRAY_RE = re.compile(r"\[\s*{.*?}\s*\]", re.DOTALL)


@app.post("/extract/")
async def extract(file: UploadFile = File(...)):
    """Extract item/price pairs from an uploaded menu or label image.

    The image is downscaled and embedded as a data URL in a chat-completion
    request to ``API_URL``. The first JSON array of objects found in the
    model's reply is parsed and returned.

    Args:
        file: The uploaded image.

    Returns:
        JSONResponse with one of:
            * 200 ``{"menu_items": [...]}`` on success.
            * 400 ``{"error": ...}`` when the image cannot be processed or
              the upstream request fails (network error, timeout, non-2xx
              status, unexpected response shape).
            * 404 ``{"error": ..., "model_response": ...}`` when the reply
              contains no JSON array.
            * 500 ``{"error": ..., "raw": ...}`` when the matched text is not
              valid JSON.
    """
    # Imported here so the handler is self-contained with respect to the
    # module's import block.
    from fastapi.concurrency import run_in_threadpool

    try:
        # Convert uploaded image to base64 URL format
        image_data_url = await encode_image_to_data_url(file)
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)

    # Create chat-style payload.
    # Other vision models tried with this endpoint:
    #   meta-llama/Llama-3.2-11B-Vision-Instruct:together
    #   meta-llama/Llama-4-Scout-17B-16E-Instruct:novita
    #   llama3.2-vision:11b
    payload = {
        "model": "zai-org/GLM-4.1V-9B-Thinking:novita",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": PROMPT},
                    {"type": "image_url", "image_url": {"url": image_data_url}},
                ],
            }
        ],
    }

    try:
        # `requests` is blocking; run it in a worker thread so the event loop
        # keeps serving other requests, and bound the wait with a timeout so a
        # stalled upstream cannot hang this request forever.
        response = await run_in_threadpool(
            requests.post,
            API_URL,
            headers=HEADERS,
            json=payload,
            timeout=_UPSTREAM_TIMEOUT,
        )
        if not response.ok:
            # Report the upstream failure explicitly instead of letting it
            # surface later as a KeyError on the missing "choices" field.
            return JSONResponse(
                status_code=400,
                content={
                    "error": f"Upstream API returned HTTP {response.status_code}",
                    "details": response.text,
                },
            )
        result = response.json()
        print("result :", result)
        reply = result["choices"][0]["message"]["content"]
    except Exception as e:
        return JSONResponse(content={"error": str(e)}, status_code=400)

    # The message content may be missing or not a string (e.g. null); treat
    # that like a reply without JSON rather than crashing in the regex search.
    match = _JSON_ARRAY_RE.search(reply) if isinstance(reply, str) else None
    if match is None:
        return JSONResponse(
            status_code=404,
            content={"error": "No JSON array found in response", "model_response": reply},
        )

    json_str = match.group(0)
    try:
        items = json.loads(json_str)
    except json.JSONDecodeError:
        return JSONResponse(
            status_code=500,
            content={"error": "Failed to parse JSON", "raw": json_str},
        )
    return JSONResponse(content={"menu_items": items})