BeardedMonster
/

SabiYarn-125M

Text Generation

Transformers

Safetensors

nanogpt-j

custom_code

Model card Files Files and versions Community

BeardedMonster committed on Jul 7

Commit

93fd473

•

1 Parent(s): 4f4f5b6

Upload GPTJXForCausalLM

Browse files

Files changed (1) hide show

pretrained_model.py +144 -47

pretrained_model.py CHANGED Viewed

@@ -1,7 +1,7 @@
 from transformers import AutoConfig, PreTrainedModel, AutoModelForCausalLM
 from typing import List, Optional
 from torch import nn
-from model import LayerNorm, BlockJ
 from transformers.modeling_outputs import CausalLMOutputWithPast
 import torch
 import math
@@ -10,6 +10,103 @@ from transformers import AutoConfig, AutoModel
 from .pretrained_config import *
 class GPTJXForCausalLM(PreTrainedModel):
     config_class = GPTJXConfig
@@ -117,36 +214,36 @@ class GPTJXForCausalLM(PreTrainedModel):
         return model_inputs
-    @torch.no_grad()
-    def stream(self, idx, max_new_tokens, temperature=1.0, top_k=None,gen_mode="greedy"):
-        """
-        Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
-        the sequence max_new_tokens times, feeding the predictions back into the model each time.
-        Most likely you'll want to make sure to be in model.eval() mode of operation for this.
-        """
-        for _ in range(max_new_tokens):
-            # if the sequence context is growing too long we must crop it at block_size
-            idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
-            # forward the model to get the logits for the index in the sequence
-            logits, _ = self(idx_cond, eval=True)
-            # pluck the logits at the final step and scale by desired temperature
-            logits = logits[:, -1, :] / temperature
-            # optionally crop the logits to only the top k options
-            if top_k is not None:
-                v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
-                logits[logits < v[:, [-1]]] = -float('Inf')
-            # apply softmax to convert logits to (normalized) probabilities
-            probs = F.softmax(logits, dim=-1)
-            # sample from the distribution
-            if gen_mode == 'greedy':
-                idx_next = torch.argmax(probs, dim=-1).unsqueeze(0)
-            else:
-                idx_next = torch.multinomial(probs, num_samples=1)
-            # print(idx_next.shape, idx.shape)
-            idx = torch.cat((idx, idx_next), dim=1)
-            # append sampled index to the running sequence and continue
-            yield idx_next
     def crop_block_size(self, block_size):
@@ -166,23 +263,23 @@ AutoModel.register(GPTJXConfig,GPTJXForCausalLM)
 AutoModelForCausalLM.register(GPTJXConfig, GPTJXForCausalLM)
-if __name__ == '__main__':
-    from transformers import AutoTokenizer
-    tokenizer = AutoTokenizer.from_pretrained("BeardedMonster/SabiYarn")
-    input_ids = tokenizer("ba wo ni?", return_tensors="pt")["input_ids"]
-    targets = input_ids
-    # config  = GPTJConfig()
-    # config.save_pretrained("gptj-config")
-    # new_config = GPTJ.from_pretrained("gptj-config")
-    # model = GPTJ(config)
-    # state_dict = torch.load('model.pt', map_location="cpu")
-    # model.load_state_dict(state_dict)
-    model = GPTJXForCausalLM.from_pretrained("/pretrainedmodel")
-    # model.save_pretrained("/pretrainedmodel")
-    outputs = model(input_ids, targets)
-    print(outputs)
-    output = model.generate(input_ids, max_new_tokens=100)
-    print(tokenizer.decode(output[0]))
     # print(new_config)

 from transformers import AutoConfig, PreTrainedModel, AutoModelForCausalLM
 from typing import List, Optional
 from torch import nn
+# from model import LayerNorm, BlockJ
 from transformers.modeling_outputs import CausalLMOutputWithPast
 import torch
 import math
 from .pretrained_config import *
class LayerNorm(nn.Module):
    """Layer normalization whose additive bias term can be switched off.

    Args:
        ndim: Number of features in the trailing dimension being normalized.
        bias: When truthy, a learnable zero-initialized bias of size ``ndim``
            is created; otherwise ``self.bias`` is ``None``.
    """

    def __init__(self, ndim, bias):
        super().__init__()
        # Scale starts at one so the freshly built module is a pure normalizer.
        self.weight = nn.Parameter(torch.ones(ndim))
        if bias:
            self.bias = nn.Parameter(torch.zeros(ndim))
        else:
            self.bias = None

    def forward(self, input):
        """Normalize ``input`` over the shape of ``self.weight`` with eps 1e-5."""
        return F.layer_norm(
            input,
            self.weight.shape,
            weight=self.weight,
            bias=self.bias,
            eps=1e-5,
        )
class CausalSelfAttention(nn.Module):
    """Multi-head self-attention with a causal mask or a caller-supplied mask.

    ``config`` must expose ``n_embd``, ``n_head``, ``bias``, ``dropout`` and
    ``block_size``; ``n_embd`` has to be divisible by ``n_head``.

    When ``forward`` receives no ``attn_mask`` the attention is causal.
    When an ``attn_mask`` is given it is used *instead of* the causal mask,
    on both the fused and the manual code path, so the two paths agree.
    """

    def __init__(self, config):
        super().__init__()
        assert config.n_embd % config.n_head == 0
        # key, query, value projections for all heads, computed in one matmul
        self.c_attn = nn.Linear(config.n_embd, 3 * config.n_embd, bias=config.bias)
        # output projection
        self.c_proj = nn.Linear(config.n_embd, config.n_embd, bias=config.bias)
        # regularization
        self.attn_dropout = nn.Dropout(config.dropout)
        self.resid_dropout = nn.Dropout(config.dropout)
        self.n_head = config.n_head
        self.n_embd = config.n_embd
        self.dropout = config.dropout
        # The fused kernel only exists in PyTorch >= 2.0; otherwise fall back
        # to the manual implementation in forward().
        self.flash = hasattr(torch.nn.functional, 'scaled_dot_product_attention')
        # Lower-triangular causal mask used by the manual path. It is
        # registered unconditionally so the module's buffers stay the same
        # regardless of which attention path is active.
        self.register_buffer(
            "bias",
            torch.tril(torch.ones(config.block_size, config.block_size))
            .view(1, 1, config.block_size, config.block_size),
        )

    def forward(self, x, attn_mask=None):
        """Apply self-attention to ``x`` of shape (B, T, C).

        Args:
            x: Input of shape (batch, sequence length, ``n_embd``).
            attn_mask: Optional mask broadcastable to (B, n_head, T, T).
                A boolean mask marks positions that may be attended to with
                ``True``; any other dtype is added to the attention scores.
                When ``None`` a causal mask is applied.

        Returns:
            Tensor of shape (B, T, C) after the output projection and
            residual dropout.
        """
        B, T, C = x.size()  # batch size, sequence length, embedding dim (n_embd)
        head_dim = C // self.n_head

        # Project once, then split into per-head query/key/value: (B, nh, T, hs)
        q, k, v = self.c_attn(x).split(self.n_embd, dim=2)
        q = q.view(B, T, self.n_head, head_dim).transpose(1, 2)
        k = k.view(B, T, self.n_head, head_dim).transpose(1, 2)
        v = v.view(B, T, self.n_head, head_dim).transpose(1, 2)

        if self.flash:
            # Fused attention; causal masking is requested only when the
            # caller did not supply an explicit mask.
            y = torch.nn.functional.scaled_dot_product_attention(
                q,
                k,
                v,
                attn_mask=attn_mask,
                dropout_p=self.dropout if self.training else 0.0,
                is_causal=attn_mask is None,
            )
        else:
            # Manual attention: (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
            att = (q @ k.transpose(-2, -1)) * (1.0 / math.sqrt(k.size(-1)))
            if attn_mask is None:
                att = att.masked_fill(self.bias[:, :, :T, :T] == 0, float('-inf'))
            elif attn_mask.dtype == torch.bool:
                # Same convention as the fused kernel: True means "attend".
                att = att.masked_fill(~attn_mask, float('-inf'))
            else:
                # Non-boolean masks are additive biases on the scores.
                att = att + attn_mask
            att = torch.nn.functional.softmax(att, dim=-1)
            att = self.attn_dropout(att)
            y = att @ v  # (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)

        # Re-assemble all head outputs side by side.
        y = y.transpose(1, 2).contiguous().view(B, T, C)
        # Output projection followed by residual dropout.
        return self.resid_dropout(self.c_proj(y))
class MLP(nn.Module):
    """Position-wise feed-forward block: expand 4x, GELU, project back, dropout.

    ``config`` must expose ``n_embd``, ``bias`` and ``dropout``.
    """

    def __init__(self, config):
        super().__init__()
        width = config.n_embd
        hidden = 4 * width
        self.c_fc = nn.Linear(width, hidden, bias=config.bias)
        self.gelu = nn.GELU()
        self.c_proj = nn.Linear(hidden, width, bias=config.bias)
        self.dropout = nn.Dropout(config.dropout)

    def forward(self, x):
        """Return ``dropout(c_proj(gelu(c_fc(x))))``."""
        return self.dropout(self.c_proj(self.gelu(self.c_fc(x))))
class BlockJ(nn.Module):
    """Transformer block with an extra parallel LayerNorm branch ``j``.

    ``config`` must expose ``n_embd`` and ``bias`` plus whatever
    ``CausalSelfAttention`` and ``MLP`` read from it.

    The first residual update adds two branches computed from the same
    normalized input: the attention output and ``j`` applied to that
    normalized input. The second update is the usual pre-norm MLP residual.
    """

    def __init__(self, config):
        super().__init__()
        self.ln_1 = LayerNorm(config.n_embd, bias=config.bias)
        # The original passed ``config.n_embd`` positionally as the bias flag,
        # which is truthy for any positive width, so ``j`` always had a bias
        # irrespective of ``config.bias``. Spell that out explicitly; the
        # resulting parameters are unchanged.
        self.j = LayerNorm(config.n_embd, bias=True)
        self.attn = CausalSelfAttention(config)
        self.ln_2 = LayerNorm(config.n_embd, bias=config.bias)
        self.mlp = MLP(config)

    def forward(self, x, attn_mask=None):
        """Run the block on ``x``; ``attn_mask`` is forwarded to attention."""
        normed = self.ln_1(x)
        # Both parallel branches read the same normalized activations.
        x = x + self.attn(normed, attn_mask) + self.j(normed)
        return x + self.mlp(self.ln_2(x))
 class GPTJXForCausalLM(PreTrainedModel):
     config_class = GPTJXConfig
         return model_inputs
+    # @torch.no_grad()
+    # def stream(self, idx, max_new_tokens, temperature=1.0, top_k=None,gen_mode="greedy"):
+    #     """
+    #     Take a conditioning sequence of indices idx (LongTensor of shape (b,t)) and complete
+    #     the sequence max_new_tokens times, feeding the predictions back into the model each time.
+    #     Most likely you'll want to make sure to be in model.eval() mode of operation for this.
+    #     """
+    #     for _ in range(max_new_tokens):
+    #         # if the sequence context is growing too long we must crop it at block_size
+    #         idx_cond = idx if idx.size(1) <= self.config.block_size else idx[:, -self.config.block_size:]
+    #         # forward the model to get the logits for the index in the sequence
+    #         logits, _ = self(idx_cond, eval=True)
+    #         # pluck the logits at the final step and scale by desired temperature
+    #         logits = logits[:, -1, :] / temperature
+    #         # optionally crop the logits to only the top k options
+    #         if top_k is not None:
+    #             v, _ = torch.topk(logits, min(top_k, logits.size(-1)))
+    #             logits[logits < v[:, [-1]]] = -float('Inf')
+    #         # apply softmax to convert logits to (normalized) probabilities
+    #         probs = F.softmax(logits, dim=-1)
+    #         # sample from the distribution
+    #         if gen_mode == 'greedy':
+    #             idx_next = torch.argmax(probs, dim=-1).unsqueeze(0)
+    #         else:
+    #             idx_next = torch.multinomial(probs, num_samples=1)
+    #         # print(idx_next.shape, idx.shape)
+    #         idx = torch.cat((idx, idx_next), dim=1)
+    #         # append sampled index to the running sequence and continue
+    #         yield idx_next
     def crop_block_size(self, block_size):
 AutoModelForCausalLM.register(GPTJXConfig, GPTJXForCausalLM)
+# if __name__ == '__main__':
+#     from transformers import AutoTokenizer
+#     tokenizer = AutoTokenizer.from_pretrained("BeardedMonster/SabiYarn")
+#     input_ids = tokenizer("Awọn eeyan Cairo, ni Egypt ti bẹrẹ si n to lawọn ileesẹ to n ṣe burẹdi bayii.", return_tensors="pt")["input_ids"]
+#     targets = input_ids
+#     # config  = GPTJConfig()
+#     # config.save_pretrained("gptj-config")
+#     # new_config = GPTJ.from_pretrained("gptj-config")
+#     # model = GPTJ(config)
+#     # state_dict = torch.load('model.pt', map_location="cpu")
+#     # model.load_state_dict(state_dict)
+#     model = GPTJXForCausalLM.from_pretrained("/pretrainedmodel")
+#     # model.save_pretrained("/pretrainedmodel")
+#     # outputs = model(input_ids, targets)
+#     # print(outputs)
+#     output = model.generate(input_ids, max_new_tokens=50)
+#     print(tokenizer.decode(output[0]))
     # print(new_config)