Commit 60995d1
allow for smaller model dimensions for the finer hierarchical stages
1 parent 2355831 commit 60995d1

File tree

4 files changed: +30 -17 lines

MEGABYTE_pytorch/megabyte.py
README.md
setup.py
train.py


MEGABYTE_pytorch/megabyte.py

Lines changed: 26 additions & 13 deletions
@@ -1,5 +1,6 @@
 import math
 import functools
+from itertools import zip_longest
 
 import torch
 import torch.nn.functional as F
@@ -204,8 +205,8 @@ def __init__(
         *,
         num_tokens,
         dim,
-        depth,
-        max_seq_len,
+        depth: tuple,
+        max_seq_len: tuple,
         dim_head = 64,
         heads = 8,
         attn_dropout = 0.,
@@ -225,26 +226,32 @@ def __init__(
         assert len(depth) == len(max_seq_len)
 
         self.stages = len(depth)
+        dim = cast_tuple(dim, self.stages)
 
-        self.token_emb = nn.Embedding(num_tokens, dim)
-        self.start_tokens = nn.Parameter(torch.randn(dim))
+        assert len(dim) == self.stages
+
+        coarsest_dim, *_, fine_dim = dim
+
+        self.token_emb = nn.Embedding(num_tokens, fine_dim)
+        self.start_tokens = nn.Parameter(torch.randn(coarsest_dim))
 
         self.max_seq_len = max_seq_len
 
-        self.pos_embs = nn.ModuleList([nn.Embedding(seq_len, dim) for seq_len in max_seq_len])
+        self.pos_embs = nn.ModuleList([nn.Embedding(seq_len, h_dim) for h_dim, seq_len in zip(dim, max_seq_len)])
 
         self.patch_embedders = nn.ModuleList([nn.Sequential(
             Rearrange('... r d -> ... (r d)'),
-            nn.LayerNorm(seq_len * dim),
-            nn.Linear(seq_len * dim, dim),
-            nn.LayerNorm(dim)
-        ) for seq_len in self.max_seq_len[1:]])
+            nn.LayerNorm(seq_len * dim_in),
+            nn.Linear(seq_len * dim_in, dim_out),
+            nn.LayerNorm(dim_out)
+        ) for dim_in, dim_out, seq_len in zip(dim[1:], dim[:-1], max_seq_len[1:])])
 
         self.transformers = nn.ModuleList([])
+        self.to_next_transformer_projections = nn.ModuleList([])
 
-        for stage_depth in depth:
+        for h_dim, next_h_dim, stage_depth in zip_longest(dim, dim[1:], depth):
             self.transformers.append(Transformer(
-                dim = dim,
+                dim = h_dim,
                 layers = stage_depth,
                 dim_head = dim_head,
                 heads = heads,
@@ -255,7 +262,10 @@ def __init__(
                 flash_attn = flash_attn
             ))
 
-        self.to_logits = nn.Linear(dim, num_tokens)
+            proj = nn.Linear(h_dim, next_h_dim) if exists(next_h_dim) and next_h_dim != dim else nn.Identity()
+            self.to_next_transformer_projections.append(proj)
+
+        self.to_logits = nn.Linear(fine_dim, num_tokens)
         self.pad_id = pad_id
 
     def generate(self, prime = None, filter_thres = 0.9, temperature = 1., default_batch_size = 1):
@@ -339,7 +349,7 @@ def forward(self, ids, return_loss = False):
 
         # spatial tokens is tokens with depth pos reduced along depth dimension + spatial positions
 
-        for ind, (stage_tokens, transformer) in enumerate(zip(tokens_at_stages, self.transformers)):
+        for ind, (stage_tokens, transformer, proj) in enumerate(zip(tokens_at_stages, self.transformers, self.to_next_transformer_projections)):
             is_last = ind == (self.stages - 1)
 
             stage_tokens = torch.cat((
@@ -348,7 +358,10 @@ def forward(self, ids, return_loss = False):
             ), dim = -2)
 
             stage_tokens, ps = pack_one(stage_tokens, '* n d')
+
             attended = transformer(stage_tokens)
+            attended = proj(attended)
+
             attended = unpack_one(attended, ps, '* n d')
 
             start_tokens = rearrange(attended[..., :-1, :], '... n d -> ... n 1 d')
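With this change, `dim` can be given as a tuple holding one model dimension per hierarchical stage (coarsest first); a plain int still works, since `cast_tuple` broadcasts it to every stage, and a learned `nn.Linear` projection now bridges consecutive stages whose dimensions differ. A minimal usage sketch with illustrative hyperparameters (not taken from this commit), assuming the constructor arguments shown in the README and train.py diffs below, and assuming `forward` accepts flat token ids whose length equals the product of `max_seq_len`, as train.py feeds them:

```python
import torch
from MEGABYTE_pytorch import MEGABYTE

# three hierarchical stages, coarsest first; the finer (local) stages get smaller dims
model = MEGABYTE(
    num_tokens = 256,
    dim = (768, 512, 256),      # one dimension per stage; a plain int is still broadcast to all stages
    depth = (6, 4, 2),
    max_seq_len = (512, 4, 4),  # product of these is the flat sequence length
    flash_attn = False
)

# flat byte ids of length 512 * 4 * 4 = 8192
ids = torch.randint(0, 256, (1, 8192))

loss = model(ids, return_loss = True)
loss.backward()
```

Shrinking the width of the finer stages is the point of the commit: the local transformers run over far more positions than the global one, so giving them a smaller dimension keeps their cost down while the coarsest stage keeps the full model dimension.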

README.md

Lines changed: 2 additions & 2 deletions
@@ -24,7 +24,7 @@ from MEGABYTE_pytorch import MEGABYTE
 
 model = MEGABYTE(
     num_tokens = 16000,       # number of tokens
-    dim = 512,                # transformer model dimension
+    dim = (512, 256),         # transformer model dimension (512 for coarsest, 256 for fine in this example)
     max_seq_len = (1024, 4),  # sequence length for global and then local. this can be more than 2
     depth = (6, 4),           # number of layers for global and then local. this can be more than 2, but length must match the max_seq_len's
     dim_head = 64,            # dimension per head
@@ -49,7 +49,7 @@ sampled = model.generate(temperature = 0.9, filter_thres = 0.9) # (1, 1024, 4)
 
 ## Test
 
-Train on character-level enwik8 with patches of size 4
+Train on character-level enwik8 with patches of size 4 - length 4096
 
 ```bash
 $ python train.py

setup.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 setup(
   name = 'MEGABYTE-pytorch',
   packages = find_packages(),
-  version = '0.0.7',
+  version = '0.0.9',
   license='MIT',
   description = 'MEGABYTE - Pytorch',
   long_description_content_type = 'text/markdown',

train.py

Lines changed: 1 addition & 1 deletion
@@ -38,7 +38,7 @@ def decode_tokens(tokens):
 
 model = MEGABYTE(
     num_tokens = 256,
-    dim = 512,
+    dim = (512, 512),
     depth = (6, 2),
     max_seq_len = (1024, 4),
     flash_attn = True
