Spaces:

MrUnknown420
/

my-ai-model-builder

Sleeping

App Files Files Community

Update app.py

by MrUnknown420 - opened 20 days ago

base: refs/heads/main

←

from: refs/pr/4

Discussion Files changed

+103

-117

Files changed (1) hide show

app.py +103 -117

app.py CHANGED Viewed

@@ -1,138 +1,124 @@
 import gradio as gr
-from datasets import load_dataset, Dataset
 from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
     Trainer,
     TrainingArguments,
-    pipeline
 )
-import os
-# -------------------------
-# Helpers
-# -------------------------
-def get_dataset(dataset_name, config_name=None, user_file=None):
-    # Build the training dataset. An uploaded file takes priority: each line of
-    # the file becomes one row of a single "text" column. Otherwise the named
-    # Hub dataset's "train" split is loaded with the given config.
-    # Returns None when neither source is provided or when loading raises.
     try:
-        if user_file is not None:
-            with open(user_file, "r", encoding="utf-8") as f:
-                text_data = f.read().splitlines()
-            return Dataset.from_dict({"text": text_data})
-        elif dataset_name:
-            return load_dataset(dataset_name, config_name, split="train")
-    # Any failure is collapsed to None; the exception itself is discarded, so
-    # the caller cannot tell why loading failed.
-    except Exception as e:
-        return None
-    return None
-def train_model(model_name, dataset_name, config_name, user_file, output_dir, epochs, lr):
-    # Fine-tune `model_name` on the dataset returned by get_dataset and write
-    # the model and tokenizer to `output_dir`.
-    # Returns a (status message, output_dir) pair; the second element is None
-    # when the dataset could not be loaded or training raised.
-    dataset = get_dataset(dataset_name, config_name, user_file)
-    if dataset is None:
-        return "❌ Error: Could not load dataset. Check name or file.", None
-    # Tokenizer
-    tokenizer = AutoTokenizer.from_pretrained(model_name)
-    # Fix GPT-2 style models (no pad token)
-    if tokenizer.pad_token is None:
-        tokenizer.add_special_tokens({'pad_token': '[PAD]'})
-    def tokenize_function(examples):
-        # Use the "text" column when present, otherwise fall back to the first column.
-        text_key = "text" if "text" in examples else list(examples.keys())[0]
-        return tokenizer(examples[text_key],
-                         truncation=True,
-                         padding="max_length",
-                         max_length=128)
-    tokenized_dataset = dataset.map(tokenize_function, batched=True)
-    # Model
-    model = AutoModelForCausalLM.from_pretrained(model_name)
-    # Resize the embeddings to the tokenizer length, which may have grown by the added [PAD] token.
-    model.resize_token_embeddings(len(tokenizer))
-    # Training args
-    # `lr` and `epochs` are coerced here (float / int) before being handed to TrainingArguments.
-    training_args = TrainingArguments(
-        output_dir=output_dir,
-        evaluation_strategy="no",
-        learning_rate=float(lr),
-        per_device_train_batch_size=2,
-        num_train_epochs=int(epochs),
-        weight_decay=0.01,
-        save_strategy="epoch",
-        logging_dir=os.path.join(output_dir, "logs"),
-        push_to_hub=False
-    )
-    # NOTE(review): no data collator and no labels column are configured here;
-    # confirm the Trainer can compute a loss from the tokenized rows alone.
-    trainer = Trainer(
-        model=model,
-        args=training_args,
-        train_dataset=tokenized_dataset
-    )
-    try:
         trainer.train()
-        model.save_pretrained(output_dir)
-        tokenizer.save_pretrained(output_dir)
-        return f"✅ Training complete! Model saved to `{output_dir}`", output_dir
-    except Exception as e:
-        return f"❌ Training failed: {str(e)}", None
-# -------------------------
-# Chat interface
-# -------------------------
-# Module-level conversation log of (user message, AI reply) pairs; every call
-# to chat_with_model reads and appends to this same list.
-chat_history = []
-def chat_with_model(user_input, model_dir):
-    # Generate a reply from the model stored in `model_dir`, using the logged
-    # conversation as context. Returns the reply text, or a warning / error
-    # message string when no model directory exists or generation raises.
-    global chat_history
-    if not model_dir or not os.path.exists(model_dir):
-        return "⚠️ No trained model found. Please train first."
     try:
-        # The text-generation pipeline is rebuilt from `model_dir` on every call.
-        generator = pipeline("text-generation", model=model_dir)
-        # Prior turns are flattened into one prompt that ends with an open "AI:" cue.
-        conversation = " ".join([f"User: {u}\nAI: {a}" for u, a in chat_history])
-        prompt = f"{conversation}\nUser: {user_input}\nAI:"
-        response = generator(prompt, max_length=200, num_return_sequences=1)[0]["generated_text"]
-        # Extract AI response after last "AI:"
-        ai_reply = response.split("AI:")[-1].strip()
-        chat_history.append((user_input, ai_reply))
-        return ai_reply
     except Exception as e:
         return f"❌ Chat error: {str(e)}"
-# -------------------------
 # Gradio UI
-# -------------------------
 with gr.Blocks() as demo:
-    # Two-tab layout: tab 1 collects training settings, tab 2 chats with the result.
-    gr.Markdown("# 🧠 Personal AI Model Builder\nTrain + Chat with your own AI assistant.")
-    with gr.Tab("1️⃣ Train Model"):
-        model_name = gr.Textbox(label="Base Model (e.g. gpt2, distilgpt2)", value="gpt2")
-        dataset_name = gr.Textbox(label="HuggingFace Dataset (optional, e.g. wikitext)", value="wikitext")
-        config_name = gr.Textbox(label="Dataset Config (e.g. wikitext-2-raw-v1)", value="wikitext-2-raw-v1")
-        user_file = gr.File(label="Or Upload Your Own TXT Dataset", file_types=[".txt"], type="filepath")
-        output_dir = gr.Textbox(label="Output Directory", value="./custom_model")
-        epochs = gr.Number(label="Epochs", value=1, precision=0)
-        lr = gr.Textbox(label="Learning Rate", value="5e-5")
-        train_button = gr.Button("🚀 Train My Model")
-        train_output = gr.Textbox(label="Training Logs / Status")
-        # `output_dir` is both an input and an output of the click handler:
-        # train_model's second return value is written back into this textbox,
-        # and that value is None on failure.
-        train_button.click(
-            fn=train_model,
-            inputs=[model_name, dataset_name, config_name, user_file, output_dir, epochs, lr],
-            outputs=[train_output, output_dir]
-        )
-    with gr.Tab("2️⃣ Chat With Model"):
-        chat_input = gr.Textbox(label="Your Message")
-        chat_button = gr.Button("💬 Send")
-        chat_output = gr.Textbox(label="AI Reply")
-        # The chat handler reads the model location from the training tab's
-        # `output_dir` textbox.
-        chat_button.click(
-            fn=chat_with_model,
-            inputs=[chat_input, output_dir],
-            outputs=chat_output
         )
-demo.launch()

 import gradio as gr
+from datasets import load_dataset
 from transformers import (
     AutoTokenizer,
     AutoModelForCausalLM,
     Trainer,
     TrainingArguments,
+    DataCollatorForLanguageModeling,
 )
+import torch
+# Map specialization → dataset + base model
+SPECIALIZATIONS = {
+    # Each entry names a Hub dataset and a base checkpoint. Optional keys:
+    #   "config"      - dataset configuration name passed to load_dataset
+    #   "text_column" - column holding the raw text (defaults to "text")
+    "Coding Assistant": {
+        # NOTE(review): this dataset is very large and, per its dataset card,
+        # stores source in a "code" column with only a train split - confirm
+        # it is usable on this hardware before enabling it for users.
+        "dataset": "codeparrot/github-code",
+        "text_column": "code",
+        "model": "EleutherAI/gpt-neo-125M",
+    },
+    "Cybersecurity Helper": {
+        # placeholder dataset, replace with cybersecurity text later
+        "dataset": "wikitext",
+        "config": "wikitext-2-raw-v1",
+        "model": "distilgpt2",
+    },
+    "App/Web Developer": {
+        "dataset": "wikitext",
+        "config": "wikitext-2-raw-v1",
+        "model": "gpt2",
+    },
+    "General Problem Solver": {
+        "dataset": "wikitext",
+        "config": "wikitext-2-raw-v1",
+        "model": "gpt2",
+    },
+}
+# Fallback entry used when an unknown specialization name is received.
+DEFAULT_SPECIALIZATION = "General Problem Solver"
+# Token budget per training example and per generated reply.
+MAX_SEQ_LENGTH = 128
+MAX_NEW_TOKENS = 200
+def _resolve_spec(specialization):
+    """Return ``(name, spec)`` for *specialization*, falling back to the default entry."""
+    if specialization in SPECIALIZATIONS:
+        return specialization, SPECIALIZATIONS[specialization]
+    return DEFAULT_SPECIALIZATION, SPECIALIZATIONS[DEFAULT_SPECIALIZATION]
+def _model_dir(name):
+    """Return the directory where the fine-tuned model for *name* is stored."""
+    import os
+    import re
+    # Names contain spaces and "/" ("App/Web Developer"), so reduce them to a safe slug.
+    slug = re.sub(r"[^a-z0-9]+", "-", str(name).lower()).strip("-")
+    return os.path.join("./results", slug or "default")
+def _has_saved_model(model_dir):
+    """Return True when *model_dir* contains a saved model config."""
+    import os
+    # The directory alone is not enough: it may exist after a failed run.
+    return os.path.isfile(os.path.join(model_dir, "config.json"))
+def _ensure_pad_token(tokenizer):
+    """Give GPT-2 style tokenizers a pad token so ``padding="max_length"`` works."""
+    if tokenizer.pad_token is None:
+        # Reusing EOS avoids growing the vocabulary / resizing the embeddings.
+        tokenizer.pad_token = tokenizer.eos_token
+def train_model(specialization, epochs, lr):
+    """Fine-tune the base model of *specialization* and save it for the chat tab.
+
+    Args:
+        specialization: Key of ``SPECIALIZATIONS``; unknown keys use the default entry.
+        epochs: Number of training epochs (coerced to ``int``).
+        lr: Learning rate (coerced to ``float``).
+
+    Returns:
+        A status string: a success message, or ``"❌ Error: ..."`` when any step raises.
+    """
     try:
+        name, spec = _resolve_spec(specialization)
+        model_name = spec["model"]
+        text_column = spec.get("text_column", "text")
+        # UI widgets may deliver numbers as float or str; normalise once here.
+        epochs = int(epochs)
+        lr = float(lr)
+        # Load dataset (multi-config datasets such as wikitext need an explicit config name)
+        dataset = load_dataset(spec["dataset"], name=spec.get("config"))
+        # Blank rows would tokenize to all-padding examples with no usable labels.
+        dataset = dataset.filter(lambda example: bool((example[text_column] or "").strip()))
+        # Load tokenizer & model
+        tokenizer = AutoTokenizer.from_pretrained(model_name)
+        _ensure_pad_token(tokenizer)
+        model = AutoModelForCausalLM.from_pretrained(model_name)
+        def tokenize_function(examples):
+            return tokenizer(
+                examples[text_column],
+                truncation=True,
+                padding="max_length",
+                max_length=MAX_SEQ_LENGTH,
+            )
+        # Drop every raw column so only tokenizer outputs reach the collator.
+        tokenized_datasets = dataset.map(
+            tokenize_function,
+            batched=True,
+            remove_columns=dataset["train"].column_names,
+        )
+        # Not every dataset ships a validation split; evaluate only when one exists.
+        eval_dataset = tokenized_datasets["validation"] if "validation" in tokenized_datasets else None
+        # Data collator
+        data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
+        # Training args
+        output_dir = _model_dir(name)
+        training_args = TrainingArguments(
+            output_dir=output_dir,
+            eval_strategy="epoch" if eval_dataset is not None else "no",
+            learning_rate=lr,
+            per_device_train_batch_size=2,
+            per_device_eval_batch_size=2,
+            num_train_epochs=epochs,
+            weight_decay=0.01,
+            save_strategy="no",
+            logging_dir="./logs",
+            logging_steps=10,
+        )
+        trainer = Trainer(
+            model=model,
+            args=training_args,
+            train_dataset=tokenized_datasets["train"],
+            eval_dataset=eval_dataset,
+            tokenizer=tokenizer,
+            data_collator=data_collator,
+        )
         trainer.train()
+        # Persist the result; without this the fine-tuned weights are lost
+        # when the function returns and the chat tab can only use the base model.
+        trainer.save_model(output_dir)
+        tokenizer.save_pretrained(output_dir)
+        return f"✅ Training complete for {specialization} model ({model_name}) with {epochs} epochs, lr={lr}"
+    except Exception as e:
+        return f"❌ Error: {str(e)}"
+# Inference / Chat Function
+def chat_fn(prompt, specialization):
+    """Generate a reply to *prompt* with the model for *specialization*.
+
+    Uses the fine-tuned model saved by ``train_model`` when one exists,
+    otherwise the base checkpoint listed in ``SPECIALIZATIONS``.
+
+    Returns:
+        The decoded generation (prompt included), a notice for an empty
+        prompt, or ``"❌ Chat error: ..."`` when loading or generation raises.
+    """
     try:
+        if not prompt or not prompt.strip():
+            return "⚠️ Please enter a prompt."
+        name, spec = _resolve_spec(specialization)
+        trained_dir = _model_dir(name)
+        source = trained_dir if _has_saved_model(trained_dir) else spec["model"]
+        tokenizer = AutoTokenizer.from_pretrained(source)
+        model = AutoModelForCausalLM.from_pretrained(source)
+        inputs = tokenizer(prompt, return_tensors="pt")
+        # max_new_tokens bounds only the reply, so a long prompt no longer
+        # eats the whole budget; pad_token_id is set because GPT-2 style
+        # models define no pad token of their own.
+        outputs = model.generate(
+            **inputs,
+            max_new_tokens=MAX_NEW_TOKENS,
+            pad_token_id=tokenizer.eos_token_id,
+        )
+        return tokenizer.decode(outputs[0], skip_special_tokens=True)
     except Exception as e:
         return f"❌ Chat error: {str(e)}"
 # Gradio UI
 with gr.Blocks() as demo:
+    # Two-tab layout: tab 1 starts a training run, tab 2 sends prompts to chat_fn.
+    gr.Markdown("# 🚀 Custom AI Model Builder & Assistant")
+    with gr.Tab("1️⃣ Train Custom Model"):
+        # Choices come straight from the SPECIALIZATIONS keys, so adding an
+        # entry there makes it selectable here.
+        specialization = gr.Radio(
+            list(SPECIALIZATIONS.keys()),
+            label="What do you want your AI to specialize in?",
+            value="General Problem Solver",
         )
+        epochs = gr.Slider(1, 10, value=1, step=1, label="Training Epochs")
+        lr = gr.Slider(1e-6, 5e-4, value=5e-5, step=1e-6, label="Learning Rate")
+        train_button = gr.Button("🚀 Start Training")
+        output_log = gr.Textbox(label="Training Log")
+        # train_model returns a single status string, shown in the log textbox.
+        train_button.click(train_model, inputs=[specialization, epochs, lr], outputs=output_log)
+    with gr.Tab("2️⃣ Chat with Your Model"):
+        # The chat tab has its own specialization selector, independent of the training tab's choice.
+        chat_specialization = gr.Dropdown(list(SPECIALIZATIONS.keys()), value="General Problem Solver", label="Model Type")
+        prompt = gr.Textbox(label="Ask me anything", placeholder="Type your question here...")
+        chat_button = gr.Button("💬 Generate Response")
+        chat_output = gr.Textbox(label="Response")
+        chat_button.click(chat_fn, inputs=[prompt, chat_specialization], outputs=chat_output)
+# Listen on all interfaces at port 7860.
+demo.launch(server_name="0.0.0.0", server_port=7860)