
Commit 055f6d7

make tree attention decoding work with triton flash attention forward

1 parent d42c0c5

4 files changed: 58 additions & 23 deletions

assert_tree_attn.py

Lines changed: 6 additions & 4 deletions

@@ -57,9 +57,9 @@ def start(
 
     # inputs
 
-    q = torch.randn(batch, heads, 1, dim)
-    k = torch.randn(batch, heads, seq_len, dim)
-    v = torch.randn(batch, heads, seq_len, dim)
+    q = torch.randn(batch, heads, 1, dim).half()
+    k = torch.randn(batch, heads, seq_len, dim).half()
+    v = torch.randn(batch, heads, seq_len, dim).half()
 
     if use_cuda:
         q, k, v = tuple(t.cuda(rank) for t in (q, k, v))
@@ -75,6 +75,8 @@ def start(
     out = regular_decode(q, k, v)
     tree_out = tree_attn_decode(q, k, v)
 
+    out = out.to(tree_out.dtype)
+
     # if not main early return
 
     if not is_main:
@@ -95,7 +97,7 @@ def start(
 
 @click.command()
 @click.option('--world-size', default = 8, help = 'number of machines / processes')
-@click.option('--dim', default = 512, help = 'dimension')
+@click.option('--dim', default = 64, help = 'dimension')
 @click.option('--heads', default = 8, help = 'dimension')
 @click.option('--batch', default = 1, help = 'dimension')
 @click.option('--use-cuda', is_flag = True, help = 'whether to test with CUDA and NCCL')
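The inputs are now fp16 while the triton decode path may hand back a different dtype, hence the added cast of the reference output before comparison. Below is a minimal sketch of the kind of check the script performs; the shapes and tolerance are illustrative assumptions, and `tree_out` is stood in by a cast since the real value comes from `tree_attn_decode`:

    import torch

    # fp16 inputs, mirroring the updated test defaults (batch = 1, heads = 8, dim = 64; seq_len here is illustrative)
    q = torch.randn(1, 8, 1, 64).half()
    k = torch.randn(1, 8, 256, 64).half()
    v = torch.randn(1, 8, 256, 64).half()

    # single-device reference: plain softmax attention for one decoding step, computed in fp32
    scale = q.shape[-1] ** -0.5
    sim = torch.einsum('b h i d, b h j d -> b h i j', q.float(), k.float()) * scale
    out = torch.einsum('b h i j, b h j d -> b h i d', sim.softmax(dim = -1), v.float())

    # stand-in for tree_attn_decode(q, k, v), which may return a different dtype than the reference
    tree_out = out.half()

    # cast the reference to the tree-decoded dtype before comparing, as the diff now does
    out = out.to(tree_out.dtype)
    assert torch.allclose(out, tree_out, atol = 1e-2)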

ring_attention_pytorch/tree_attn_decoding.py

Lines changed: 34 additions & 16 deletions

@@ -4,14 +4,22 @@
 
 from ring_attention_pytorch.distributed import get_rank, get_world_size
 
+# functions
+
 def exists(v):
     return v is not None
 
+def default(v, d):
+    return v if exists(v) else d
+
+# main function
+
 @torch.no_grad()
 def tree_attn_decode(
     q, k, v,
     eps = 1e-8,
-    shard_kv_seq = False
+    shard_kv_seq = False,
+    use_triton = None
 ):
 
     assert k.shape[:-1] == v.shape[:-1]
@@ -23,34 +31,44 @@ def tree_attn_decode(
     https://arxiv.org/abs/2408.04093
     """
 
-    device, dim_v = q.device, v.shape[-1]
-
-    rank = get_rank()
-    world_size = get_world_size()
-
-    # scale queries
-
-    scale = q.shape[-1] ** -0.5
-    q *= scale
+    dim_v = v.shape[-1]
 
     # each machine (rank) takes care of a chunk of kv sequence within the world of many machines
 
     if shard_kv_seq:
+        rank, world_size = get_rank(), get_world_size()
         k = k.chunk(world_size, dim = -2)
         v = v.chunk(world_size, dim = -2)
+
         k, v = (k[rank], v[rank]) if rank < len(k) else (None, None)
 
     if exists(k) and exists(v):
         # calculate local output and derive numerator and denominator
 
-        sim = einsum('... i d, ... j d -> ... i j', q, k)
+        use_triton = default(use_triton, q.is_cuda)
+
+        if use_triton and q.is_cuda:
+            from ring_attention_pytorch.triton_flash_attn import flash_attn_forward
+
+            out, local_max, lse = flash_attn_forward(
+                q, k, v,
+                causal = False,
+                return_normalized_output = True,
+                load_accumulated = False,
+                head_first_dim = True,
+                remove_padding = True
+            )
+
+        else:
+            scale = q.shape[-1] ** -0.5
+            sim = einsum('... i d, ... j d -> ... i j', q, k) * scale
 
-        local_max = sim.amax(dim = -1, keepdim = True)
-        sim -= local_max
-        lse = sim.logsumexp(dim = -1, keepdim = True)
+            local_max = sim.amax(dim = -1, keepdim = True)
+            sim -= local_max
+            lse = sim.logsumexp(dim = -1, keepdim = True)
 
-        attn = sim.softmax(dim = -1)
-        out = einsum('... i j, ... j d -> ... i d', attn, v)
+            attn = sim.softmax(dim = -1)
+            out = einsum('... i j, ... j d -> ... i d', attn, v)
 
         den = lse.exp()
         num = out * den
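For context only, since the combine step is not touched by this diff: whichever branch runs, each rank ends up with a local numerator, denominator, and running max, and tree attention decoding then merges those partials across ranks with a max all-reduce followed by sum all-reduces, per the referenced paper. A rough sketch of that merge, with hypothetical names:

    import torch
    import torch.distributed as dist

    def combine_partials(num, den, local_max, eps = 1e-8):
        # global running max across ranks (single all-reduce with MAX)
        global_max = local_max.clone()
        dist.all_reduce(global_max, op = dist.ReduceOp.MAX)

        # rescale each rank's partial numerator / denominator from its local max to the global max
        renorm = (local_max - global_max).exp()
        num = num * renorm
        den = den * renorm

        # sum the rescaled partials across ranks (two all-reduces with SUM)
        dist.all_reduce(num)
        dist.all_reduce(den)

        # final attention output for the decoded token
        return num / den.clamp(min = eps)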

ring_attention_pytorch/triton_flash_attn.py

Lines changed: 17 additions & 2 deletions

@@ -8,7 +8,7 @@
 import torch
 from torch import Tensor
 
-from einops import repeat
+from einops import repeat, rearrange
 
 def exists(v):
     return v is not None
@@ -315,10 +315,18 @@ def flash_attn_forward(
     return_normalized_output = False,
     load_accumulated = True,
     softclamp_qk_sim = False,
-    softclamp_value = 50.
+    softclamp_value = 50.,
+    head_first_dim = False,
+    remove_padding = False
 ):
     q, k, v = [x if is_contiguous(x) else x.contiguous() for x in (q, k, v)]
 
+    if head_first_dim:
+        q, k, v = tuple(rearrange(t, 'b n h d -> b h n d') for t in (q, k, v))
+
+        if exists(o):
+            o = rearrange(o, 'b n h d -> b h n d')
+
     batch, seqlen_q, nheads, d = q.shape
     _, seqlen_k, _, _ = k.shape
 
@@ -412,6 +420,13 @@ def flash_attn_forward(
         num_stages = 1,
     )
 
+    if head_first_dim:
+        o = rearrange(o, 'b h n d -> b n h d')
+
+    if remove_padding:
+        m = m[..., :seqlen_q]
+        lse = lse[..., :seqlen_q]
+
     return o, m, lse
 
 @triton.jit
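The two new flags exist so a caller like `tree_attn_decode`, which keeps heads before the sequence dimension, can call the kernel directly. In plain PyTorch terms they amount to roughly the following; the shapes and padded length are illustrative assumptions, and the rearrange is simply a transpose of dims 1 and 2, so the axis labels in the pattern do not change what it does:

    import torch
    from einops import rearrange

    # caller layout when head_first_dim = True: (batch, heads, seq, dim)
    q = torch.randn(2, 8, 1, 64)

    # swap into the (batch, seq, heads, dim) layout the kernel indexes internally
    q_kernel = rearrange(q, 'b h n d -> b n h d')
    assert q_kernel.shape == (2, 1, 8, 64)

    # remove_padding = True: the row-max and logsumexp buffers are typically allocated at a
    # sequence length rounded up to a block multiple, so they get sliced back to seqlen_q
    seqlen_q, padded_len = 1, 128
    lse_padded = torch.randn(2, 8, padded_len)   # hypothetical padded buffer
    lse = lse_padded[..., :seqlen_q]
    assert lse.shape == (2, 8, 1)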

setup.py

Lines changed: 1 addition & 1 deletion

@@ -3,7 +3,7 @@
 setup(
   name = 'ring-attention-pytorch',
   packages = find_packages(exclude=[]),
-  version = '0.5.6',
+  version = '0.5.8',
   license='MIT',
   description = 'Ring Attention - Pytorch',
   author = 'Phil Wang',
