Create gpt_min.py
gpt_min.py  +102 -0
gpt_min.py
ADDED
@@ -0,0 +1,102 @@
import math
import torch
import torch.nn as nn
import torch.nn.functional as F
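
# Multi-head self-attention with a causal mask built from an upper-triangular
# matrix, so each position can attend only to itself and earlier positions.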
class CausalSelfAttention(nn.Module):
    def __init__(self, n_embd, n_head, dropout=0.1):
        super().__init__()
        assert n_embd % n_head == 0
        self.n_embd = n_embd
        self.n_head = n_head
        self.head_dim = n_embd // n_head
        self.query = nn.Linear(n_embd, n_embd, bias=False)
        self.key = nn.Linear(n_embd, n_embd, bias=False)
        self.value = nn.Linear(n_embd, n_embd, bias=False)
        self.output = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        q = self.query(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        k = self.key(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        v = self.value(x).view(B, T, self.n_head, self.head_dim).transpose(1, 2)
        scores = torch.matmul(q, k.transpose(-2, -1)) / math.sqrt(self.head_dim)
        mask = torch.triu(torch.ones(T, T, device=x.device), diagonal=1).bool()
        scores = scores.masked_fill(mask, float('-inf'))
        attn = F.softmax(scores, dim=-1)
        attn = self.dropout(attn)
        out = torch.matmul(attn, v)
        out = out.transpose(1, 2).contiguous().view(B, T, C)
        return self.output(out)
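
# Position-wise feed-forward network: expand to 4x the embedding width, apply
# GELU, project back down, with dropout after each of the two stages.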
class MLP(nn.Module):
    def __init__(self, n_embd, dropout=0.1):
        super().__init__()
        self.fc1 = nn.Linear(n_embd, 4 * n_embd)
        self.fc2 = nn.Linear(4 * n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        x = F.gelu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        return self.dropout(x)
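
# Pre-norm transformer block: LayerNorm feeds each sub-layer, and both the
# attention and MLP outputs are added back through residual connections.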
class TransformerBlock(nn.Module):
    def __init__(self, n_embd, n_head, dropout=0.1):
        super().__init__()
        self.attention = CausalSelfAttention(n_embd, n_head, dropout)
        self.mlp = MLP(n_embd, dropout)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.attention(self.ln1(x))
        x = x + self.mlp(self.ln2(x))
        return x
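
# Decoder-only GPT-style model: token plus learned positional embeddings, a
# stack of transformer blocks, a final LayerNorm, and a linear head over the
# vocabulary.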
class GPTModel(nn.Module):
    def __init__(self, vocab_size, n_embd, n_head, n_layer, chunk_size, dropout=0.1):
        super().__init__()
        self.vocab_size = vocab_size
        self.n_embd = n_embd
        self.chunk_size = chunk_size
        self.token_embeddings = nn.Embedding(vocab_size, n_embd)
        self.position_embeddings = nn.Embedding(chunk_size, n_embd)
        self.dropout = nn.Dropout(dropout)
        self.blocks = nn.ModuleList([TransformerBlock(n_embd, n_head, dropout) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd)
        self.output_projection = nn.Linear(n_embd, vocab_size, bias=False)
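
    # Maps (B, T) token ids to (B, T, vocab_size) logits; T must not exceed
    # chunk_size because positional embeddings are only defined up to that length.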
    def forward(self, input_tokens):
        B, T = input_tokens.shape
        assert T <= self.chunk_size, f"Input length {T} > chunk_size {self.chunk_size}"
        tok = self.token_embeddings(input_tokens)
        pos = self.position_embeddings(torch.arange(T, device=input_tokens.device))
        x = self.dropout(tok + pos)
        for block in self.blocks:
            x = block(x)
        x = self.ln_f(x)
        logits = self.output_projection(x)
        return logits
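
    # Autoregressive sampling: repeatedly feed the last chunk_size tokens,
    # rescale the final-step logits by temperature, keep only the top_k
    # candidates, and sample the next id from the resulting distribution.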
    def generate(self, context_ids, max_tokens, temperature=0.7, top_k=50):
        self.eval()
        generated = list(context_ids)
        device = next(self.parameters()).device
        with torch.no_grad():
            for _ in range(max_tokens):
                inp = torch.tensor(generated[-self.chunk_size:], dtype=torch.long, device=device).unsqueeze(0)
                logits = self.forward(inp)[0, -1, :]
                if temperature and temperature > 0:
                    logits = logits / temperature
                if top_k and top_k > 0:
                    tk_vals, tk_idx = torch.topk(logits, min(top_k, logits.size(-1)))
                    filtered = torch.full_like(logits, float('-inf'))
                    filtered[tk_idx] = tk_vals
                    logits = filtered
                probs = torch.softmax(logits, dim=-1)
                next_id = torch.multinomial(probs, 1).item()
                generated.append(next_id)
        return generated
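
Not part of this commit, but as a quick reference, a minimal usage sketch for the file above (the hyperparameter values are illustrative, not taken from the Space):

import torch
from gpt_min import GPTModel

# Toy configuration; any values with n_embd divisible by n_head work.
model = GPTModel(vocab_size=256, n_embd=128, n_head=4, n_layer=2, chunk_size=64)

# Forward pass: (batch, time) token ids -> (batch, time, vocab_size) logits.
tokens = torch.randint(0, 256, (2, 16))
logits = model(tokens)
print(logits.shape)  # torch.Size([2, 16, 256])

# Sample 20 token ids from the (untrained) model, seeded with a short prompt.
out = model.generate([1, 2, 3], max_tokens=20, temperature=0.7, top_k=50)
print(out)  # the prompt ids followed by 20 sampled ids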