Spaces:

MrUnknown420
/

my-ai-model-builder

Sleeping

App Files Community

Update app.py

by MrUnknown420 - opened 20 days ago

base: refs/heads/main

←

from: refs/pr/3

Discussion Files changed

+106

-79

Files changed (1) hide show

app.py +106 -79

app.py CHANGED Viewed

@@ -1,111 +1,138 @@
 import gradio as gr
-from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments
 from datasets import load_dataset, Dataset
-# --------------------------
-# Dataset Loader
-# --------------------------
 def get_dataset(dataset_name, config_name=None, user_file=None):
-    if user_file is not None:
-        with open(user_file, "r", encoding="utf-8") as f:
-            text_data = f.read().splitlines()
-        return Dataset.from_dict({"text": text_data})
-    if config_name:
-        return load_dataset(dataset_name, config_name, split="train")
-    else:
-        return load_dataset(dataset_name, split="train")
-# --------------------------
-# Training Function
-# --------------------------
-def train_model(model_name, dataset_name, config_name, user_file, output_dir, epochs=1):
     try:
-        dataset = get_dataset(dataset_name, config_name, user_file)
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        def tokenize_function(examples):
-            return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)
-        tokenized_dataset = dataset.map(tokenize_function, batched=True)
-        model = AutoModelForCausalLM.from_pretrained(model_name)
-        training_args = TrainingArguments(
-            output_dir=output_dir,
-            overwrite_output_dir=True,
-            per_device_train_batch_size=2,
-            num_train_epochs=epochs,
-            save_strategy="epoch",
-            logging_dir="./logs",
-            logging_steps=10,
-        )
-        trainer = Trainer(
-            model=model,
-            args=training_args,
-            train_dataset=tokenized_dataset,
-        )
         trainer.train()
         model.save_pretrained(output_dir)
         tokenizer.save_pretrained(output_dir)
-        return f"✅ Training complete! Model saved to {output_dir}"
-    except Exception as e:
-        return f"❌ Error: {str(e)}"
-# --------------------------
-# Chatbot with trained model
-# --------------------------
-def chat_with_model(user_input, model_name="custom_model"):
     try:
-        tokenizer = AutoTokenizer.from_pretrained(model_name)
-        model = AutoModelForCausalLM.from_pretrained(model_name)
-        inputs = tokenizer(user_input, return_tensors="pt")
-        outputs = model.generate(**inputs, max_length=200)
-        return tokenizer.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
-        return f"⚠️ Model not ready yet. Error: {str(e)}"
-# --------------------------
 # Gradio UI
-# --------------------------
 with gr.Blocks() as demo:
-    gr.Markdown("## 🧠 Custom AI Model Trainer + Chatbot")
-    with gr.Tab("Train a Model"):
         model_name = gr.Textbox(label="Base Model (e.g. gpt2, distilgpt2)", value="gpt2")
-        dataset_name = gr.Textbox(label="Dataset (e.g. wikitext)", value="wikitext")
-        config_name = gr.Dropdown(
-            label="Dataset Config (if needed)",
-            choices=["", "wikitext-103-raw-v1", "wikitext-103-v1", "wikitext-2-raw-v1", "wikitext-2-v1"],
-            value=""
-        )
-        user_file = gr.File(label="Upload TXT Dataset", file_types=[".txt"], type="filepath")
-        output_dir = gr.Textbox(label="Output Directory", value="custom_model")
-        epochs = gr.Slider(1, 5, value=1, step=1, label="Epochs")
-        train_button = gr.Button("🚀 Train Model")
         train_output = gr.Textbox(label="Training Logs / Status")
         train_button.click(
             fn=train_model,
-            inputs=[model_name, dataset_name, config_name, user_file, output_dir, epochs],
-            outputs=train_output
         )
-    with gr.Tab("Chat with Your Model"):
-        user_input = gr.Textbox(label="Your Message")
-        chat_output = gr.Textbox(label="Model Response")
-        chat_button = gr.Button("💬 Chat")
         chat_button.click(
             fn=chat_with_model,
-            inputs=[user_input],
             outputs=chat_output
         )
-if __name__ == "__main__":
-    demo.launch()

 import gradio as gr
 from datasets import load_dataset, Dataset
+from transformers import (
+    AutoTokenizer,
+    AutoModelForCausalLM,
+    Trainer,
+    TrainingArguments,
+    pipeline
+)
+import os
+# -------------------------
+# Helpers
+# -------------------------
 def get_dataset(dataset_name, config_name=None, user_file=None):
+    """Load training data from an uploaded text file or the Hugging Face Hub.
+
+    An uploaded file takes priority over ``dataset_name``. Each line of the
+    file becomes one example in a single ``"text"`` column.
+
+    Args:
+        dataset_name: Hub dataset identifier (e.g. ``"wikitext"``); ignored
+            when ``user_file`` is given.
+        config_name: Optional dataset configuration. An empty or
+            whitespace-only value is treated as "no configuration".
+        user_file: Path to a UTF-8 text file, or ``None``.
+
+    Returns:
+        The loaded ``train`` split, or ``None`` if nothing could be loaded.
+        The underlying error is printed rather than raised.
+    """
     try:
+        if user_file is not None:
+            with open(user_file, "r", encoding="utf-8") as f:
+                text_data = f.read().splitlines()
+            return Dataset.from_dict({"text": text_data})
+        if dataset_name and dataset_name.strip():
+            # An empty string is not a usable config name; pass None instead.
+            config = config_name.strip() if config_name else None
+            return load_dataset(dataset_name.strip(), config or None, split="train")
+    except Exception as e:
+        # Keep the "None on failure" contract, but leave a trace of the cause
+        # in the process output instead of discarding it.
+        print(f"get_dataset failed: {e!r}")
+    return None
+def train_model(model_name, dataset_name, config_name, user_file, output_dir, epochs, lr):
+    """Fine-tune a causal language model and save it to ``output_dir``.
+
+    Args:
+        model_name: Base checkpoint to fine-tune (e.g. ``"gpt2"``).
+        dataset_name: Hub dataset identifier, forwarded to ``get_dataset``.
+        config_name: Dataset configuration, forwarded to ``get_dataset``.
+        user_file: Path to an uploaded text file, forwarded to ``get_dataset``.
+        output_dir: Directory that receives checkpoints, logs and the final
+            model and tokenizer.
+        epochs: Number of training epochs; converted with ``int()``.
+        lr: Learning rate; converted with ``float()``.
+
+    Returns:
+        A ``(status_message, output_dir)`` tuple. ``output_dir`` is returned
+        unchanged on failure as well, so a UI field bound to the second value
+        is not blanked by a failed run.
+    """
+    # Local import keeps this function self-contained; the file already
+    # depends on transformers.
+    from transformers import DataCollatorForLanguageModeling
+    dataset = get_dataset(dataset_name, config_name, user_file)
+    if dataset is None:
+        return "❌ Error: Could not load dataset. Check name or file.", output_dir
+    # Everything below can fail on bad user input (unknown model name,
+    # non-numeric learning rate, ...), so it all reports through one path.
+    try:
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        # GPT-2 style tokenizers have no pad token; add one so that
+        # padding="max_length" works.
+        if tokenizer.pad_token is None:
+            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
+        columns = dataset.column_names
+        text_key = "text" if "text" in columns else columns[0]
+        # Blank lines tokenize to pure padding, i.e. rows with nothing to
+        # learn from, so drop them before tokenizing.
+        dataset = dataset.filter(
+            lambda row: bool(row[text_key] and str(row[text_key]).strip())
+        )
+        if len(dataset) == 0:
+            return "❌ Error: Dataset contains no non-empty text.", output_dir
+        def tokenize_function(examples):
+            return tokenizer(examples[text_key],
+                             truncation=True,
+                             padding="max_length",
+                             max_length=128)
+        # Drop the raw columns so only tokenizer outputs reach the collator.
+        tokenized_dataset = dataset.map(
+            tokenize_function, batched=True, remove_columns=columns
+        )
+        model = AutoModelForCausalLM.from_pretrained(model_name)
+        # Account for the pad token that may have been added above.
+        model.resize_token_embeddings(len(tokenizer))
+        # The evaluation strategy is left at its default ("no"): the keyword
+        # was renamed between transformers releases, so omitting it avoids a
+        # version-dependent TypeError.
+        training_args = TrainingArguments(
+            output_dir=output_dir,
+            learning_rate=float(lr),
+            per_device_train_batch_size=2,
+            num_train_epochs=int(epochs),
+            weight_decay=0.01,
+            save_strategy="epoch",
+            logging_dir=os.path.join(output_dir, "logs"),
+            push_to_hub=False,
+        )
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=tokenized_dataset,
+            # With mlm=False the collator builds causal-LM labels from
+            # input_ids; without labels the model returns no loss and
+            # Trainer cannot train.
+            data_collator=DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False),
+        )
         trainer.train()
         model.save_pretrained(output_dir)
         tokenizer.save_pretrained(output_dir)
+        return f"✅ Training complete! Model saved to `{output_dir}`", output_dir
+    except Exception as e:
+        return f"❌ Training failed: {str(e)}", output_dir
+# -------------------------
+# Chat interface
+# -------------------------
+# Module-level conversation memory as (user_message, ai_reply) pairs.
+# NOTE(review): being module-level, this list is shared by every caller in
+# the process; per-session state would need a signature change.
+chat_history = []
+# Only this many recent turns are kept and replayed in the prompt, so the
+# prompt and the list cannot grow without bound.
+MAX_HISTORY_TURNS = 5
+def chat_with_model(user_input, model_dir):
+    """Generate a reply to ``user_input`` using the model saved in ``model_dir``.
+
+    The recent conversation is replayed as ``User:`` / ``AI:`` lines ahead of
+    the new message, and the reply is appended to ``chat_history``.
+
+    Args:
+        user_input: The user's new message.
+        model_dir: Directory holding a saved model and tokenizer.
+
+    Returns:
+        The model's reply, or a human-readable warning or error message if
+        the directory is missing or generation fails.
+    """
+    if not model_dir or not os.path.exists(model_dir):
+        return "⚠️ No trained model found. Please train first."
     try:
+        # NOTE(review): the pipeline is rebuilt on every message. Caching it
+        # would be faster but must be invalidated when the model is retrained.
+        generator = pipeline("text-generation", model=model_dir)
+        turns = [f"User: {u}\nAI: {a}" for u, a in chat_history[-MAX_HISTORY_TURNS:]]
+        turns.append(f"User: {user_input}\nAI:")
+        prompt = "\n".join(turns)
+        # max_new_tokens bounds only the reply; max_length would also count
+        # the prompt and leave no room once the history grows.
+        result = generator(
+            prompt,
+            max_new_tokens=100,
+            num_return_sequences=1,
+            return_full_text=False,
+        )
+        continuation = result[0]["generated_text"]
+        # The model may keep writing further turns; keep only its own reply.
+        ai_reply = continuation.split("User:")[0].strip()
+        chat_history.append((user_input, ai_reply))
+        # Trim in place so the stored history stays bounded.
+        del chat_history[:-MAX_HISTORY_TURNS]
+        return ai_reply
     except Exception as e:
+        return f"❌ Chat error: {str(e)}"
+# -------------------------
 # Gradio UI
+# -------------------------
 with gr.Blocks() as demo:
+    gr.Markdown("# 🧠 Personal AI Model Builder\nTrain + Chat with your own AI assistant.")
+    # Tab 1: collect the training settings and run train_model.
+    with gr.Tab("1️⃣ Train Model"):
         model_name = gr.Textbox(label="Base Model (e.g. gpt2, distilgpt2)", value="gpt2")
+        dataset_name = gr.Textbox(label="HuggingFace Dataset (optional, e.g. wikitext)", value="wikitext")
+        config_name = gr.Textbox(label="Dataset Config (e.g. wikitext-2-raw-v1)", value="wikitext-2-raw-v1")
+        user_file = gr.File(label="Or Upload Your Own TXT Dataset", file_types=[".txt"], type="filepath")
+        output_dir = gr.Textbox(label="Output Directory", value="./custom_model")
+        epochs = gr.Number(label="Epochs", value=1, precision=0)
+        lr = gr.Textbox(label="Learning Rate", value="5e-5")
+        train_button = gr.Button("🚀 Train My Model")
         train_output = gr.Textbox(label="Training Logs / Status")
+        # train_model returns (status, directory); the second value is written
+        # back into the Output Directory textbox, which the chat tab reads.
         train_button.click(
             fn=train_model,
+            inputs=[model_name, dataset_name, config_name, user_file, output_dir, epochs, lr],
+            outputs=[train_output, output_dir]
         )
+    # Tab 2: chat with whatever model is saved in the Output Directory.
+    with gr.Tab("2️⃣ Chat With Model"):
+        chat_input = gr.Textbox(label="Your Message")
+        chat_button = gr.Button("💬 Send")
+        chat_output = gr.Textbox(label="AI Reply")
         chat_button.click(
             fn=chat_with_model,
+            inputs=[chat_input, output_dir],
             outputs=chat_output
         )
+# Launch only when run as a script, so importing this module does not start
+# a server.
+if __name__ == "__main__":
+    demo.launch()