import gradio as gr
from unsloth import FastLanguageModel
from peft import PeftModel
import torch
# Load the base model and tokenizer
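# max_seq_length caps the context window; dtype=None lets Unsloth auto-detect
# the compute dtype (bfloat16 on Ampere+ GPUs, float16 otherwise);
# load_in_4bit loads the 4-bit-quantized weights so the 3B model fits on a small GPU.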
max_seq_length = 4096
dtype = None
load_in_4bit = True
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Llama-3.2-3B-Instruct-bnb-4bit",
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)
# Load the LoRA adapters
LORA_ADAPTER_PATH = "Sumit404/Llama-3.2-3B-Instruct-bnb-4bit-finetuned" # Replace with your repo ID
model = PeftModel.from_pretrained(model, LORA_ADAPTER_PATH)
# Set tokenizer and model for inference
from unsloth.chat_templates import get_chat_template
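# Attach the Llama-3.2 chat prompt format (header/eot special tokens) to the tokenizer.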
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3.2",
)
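# Llama-3.2 ships without a dedicated pad token, so reuse EOS for padding.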
tokenizer.pad_token = tokenizer.eos_token
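# Switch Unsloth into inference mode (enables its faster generation path).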
FastLanguageModel.for_inference(model)
def generate_text(prompt):
    messages = [{"role": "user", "content": prompt}]
    inputs = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt",
    ).to("cuda")
    # Single prompt with no padding, so the attention mask is all ones.
    # (Comparing against pad_token_id would wrongly mask <|eot_id|> tokens,
    # since pad_token was set to eos_token above.)
    attention_mask = torch.ones_like(inputs)
    outputs = model.generate(
        input_ids=inputs,
        attention_mask=attention_mask,
        max_new_tokens=128,  # room for longer answers
        use_cache=True,
        do_sample=True,      # required for temperature/min_p to take effect
        temperature=0.6,
        min_p=0.1,
    )
    # Decode only the newly generated tokens, i.e. the assistant's reply.
    # (skip_special_tokens=True strips the <|start_header_id|> markers, so
    # searching the full decoded text for them would never match.)
    generated = outputs[0][inputs.shape[-1]:]
    return tokenizer.decode(generated, skip_special_tokens=True).strip()
# Create the Gradio interface
interface = gr.Interface(
    fn=generate_text,
    inputs=gr.Textbox(lines=2, placeholder="Enter your prompt here..."),
    outputs="text",
    title="Fine-tuned Llama-3.2 Instruct Model",
    description="Ask a question to the fine-tuned model.",
)
# share=True creates a public link, useful when running from Colab.
interface.launch(share=True)