
Commit 95502af

offer a naive unoptimized attention w/ stop graddable queries, keys, values, and fix the self reasoning transformer to only stop grad keys and values for the reasoning tokens
1 parent 3c6a5e7 commit 95502af

4 files changed, +160 -15 lines changed
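For orientation, a sketch of the gradients the new attention file computes by hand (notation assumed here, per attention head; not part of the commit): with $S = QK^\top / \sqrt{d}$, $P = \operatorname{softmax}(S)$ taken row-wise, and $O = PV$, the backward pass is

$$D_i = \sum_d dO_{id}\, O_{id}, \qquad dV = P^\top dO, \qquad dP = dO\, V^\top, \qquad dS = P \odot (dP - D),$$

$$dQ = dS\, K / \sqrt{d}, \qquad dK = dS^\top Q / \sqrt{d}.$$

The stop-grad masks zero entries of $P$ before forming $dV$, and entries of $dS$ before forming $dQ$ or $dK$, so no gradient reaches the queries, keys, or values through the masked attention entries.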
Lines changed: 5 additions & 0 deletions
@@ -1,3 +1,8 @@
 from self_reasoning_tokens_pytorch.self_reasoning_tokens import (
     Transformer
 )
+
+from self_reasoning_tokens_pytorch.attention_with_stop_graddable_qkv import (
+    stop_graddable_attn_,
+    stop_graddable_attn
+)
Lines changed: 131 additions & 0 deletions
@@ -0,0 +1,131 @@
import torch
from torch.autograd.function import Function

from einops import einsum, rearrange

def exists(val):
    return val is not None

# custom function

class StopGraddableAttentionFunction(Function):

    @staticmethod
    @torch.no_grad()
    def forward(
        ctx,
        q,
        k,
        v,
        mask,
        attn_mask,
        causal: bool,
        q_stop_grad_mask,
        k_stop_grad_mask,
        v_stop_grad_mask,
    ):
        scale = q.shape[-1] ** -0.5

        sim = einsum(q, k, 'b h i d, b h j d -> b h i j') * scale

        max_neg_value = -torch.finfo(sim.dtype).max

        if exists(mask):
            mask = rearrange(mask, 'b j -> b 1 1 j')
            sim.masked_fill_(~mask, max_neg_value)

        if exists(attn_mask):
            sim.masked_fill_(~attn_mask, max_neg_value)

        if causal:
            i, j = sim.shape[-2:]
            causal_mask = torch.ones((i, j), dtype = torch.bool, device = sim.device).triu(j - i + 1)
            sim = sim.masked_fill(causal_mask, max_neg_value)

        attn = sim.softmax(dim = -1)

        out = einsum(attn, v, 'b h i j, b h j d -> b h i d')

        ctx.args = (
            causal,
            scale,
            mask,
            q_stop_grad_mask,
            k_stop_grad_mask,
            v_stop_grad_mask
        )

        ctx.save_for_backward(
            q, k, v,
            attn,
            out
        )

        return out

    @staticmethod
    @torch.no_grad()
    def backward(ctx, do):

        (
            causal,
            scale,
            mask,
            q_stop_grad_mask,
            k_stop_grad_mask,
            v_stop_grad_mask
        ) = ctx.args

        q, k, v, p, o = ctx.saved_tensors

        # softmax D

        D = (do * o).sum(dim = -1, keepdims = True)

        # stop grad for values

        p_v = p

        if exists(v_stop_grad_mask):
            p_v = p_v.masked_fill(v_stop_grad_mask, 0.)

        # dv

        dv = einsum(p_v, do, 'b h i j, b h i d -> b h j d')

        # prep for dq and dk

        dp = einsum(do, v, 'b h i d, b h j d -> b h i j')
        ds = p * scale * (dp - D)

        # handle stop grad masking for queries and keys

        ds_q = ds_k = ds

        if exists(q_stop_grad_mask):
            ds_q = ds_q.masked_fill(q_stop_grad_mask, 0.)

        if exists(k_stop_grad_mask):
            ds_k = ds_k.masked_fill(k_stop_grad_mask, 0.)

        # dq and dk

        dq = einsum(ds_q, k, 'b h i j, b h j d -> b h i d')
        dk = einsum(ds_k, q, 'b h i j, b h i d -> b h j d')

        return dq, dk, dv, None, None, None, None, None, None

# convenience method with defaults

stop_graddable_attn_ = StopGraddableAttentionFunction.apply

def stop_graddable_attn(
    q, k, v,
    mask = None,
    attn_mask = None,
    causal = False,
    q_stop_grad_mask = None,
    k_stop_grad_mask = None,
    v_stop_grad_mask = None
):
    return stop_graddable_attn_(q, k, v, mask, attn_mask, causal, q_stop_grad_mask, k_stop_grad_mask, v_stop_grad_mask)
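
As a sanity check of the new function, here is a minimal usage sketch (not part of the commit; the tensor sizes and the mask, which treats the last four positions as reasoning tokens, are assumptions for illustration). Keys and values at masked attention entries should come back with exactly zero gradient:

import torch
from self_reasoning_tokens_pytorch.attention_with_stop_graddable_qkv import stop_graddable_attn

b, h, n, d = 1, 2, 8, 16

q = torch.randn(b, h, n, d, requires_grad = True)
k = torch.randn(b, h, n, d, requires_grad = True)
v = torch.randn(b, h, n, d, requires_grad = True)

# hypothetical mask over attention entries: block gradients into the
# keys / values of the last 4 positions (the "reasoning" tokens)
stop_grad_mask = torch.zeros(b, h, n, n, dtype = torch.bool)
stop_grad_mask[..., 4:] = True

out = stop_graddable_attn(
    q, k, v,
    causal = True,
    k_stop_grad_mask = stop_grad_mask,
    v_stop_grad_mask = stop_grad_mask
)

out.sum().backward()

# gradients for the masked key / value positions are exactly zero,
# while the queries still receive gradient everywhere
assert torch.all(k.grad[..., 4:, :] == 0)
assert torch.all(v.grad[..., 4:, :] == 0)
assert q.grad.abs().sum() > 0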

self_reasoning_tokens_pytorch/self_reasoning_tokens.py

Lines changed: 23 additions & 14 deletions
@@ -11,6 +11,10 @@
     FeedForward
 )
 
+from self_reasoning_tokens_pytorch.attention_with_stop_graddable_qkv import (
+    stop_graddable_attn
+)
+
 # helper functions
 
 def exists(v):
@@ -53,27 +57,32 @@ def forward(
 
         q, k, v = self.to_qkv(x)
 
-        q = q * self.scale
-        sim = einsum(q, k, 'b h i d, b h j d -> b h i j')
+        if exists(stop_grad_attn_mask):
+            out = stop_graddable_attn(
+                q, k, v,
+                attn_mask = attn_mask,
+                k_stop_grad_mask = stop_grad_attn_mask,
+                v_stop_grad_mask = stop_grad_attn_mask
+            )
+
+        else:
+            q = q * self.scale
+            sim = einsum(q, k, 'b h i d, b h j d -> b h i j')
 
-        causal_mask = torch.ones((seq, seq), device = device, dtype = torch.bool).triu(1)
+            causal_mask = torch.ones((seq, seq), device = device, dtype = torch.bool).triu(1)
 
-        mask_value = -torch.finfo(sim.dtype).max
-        sim = sim.masked_fill(causal_mask, mask_value)
+            mask_value = -torch.finfo(sim.dtype).max
+            sim = sim.masked_fill(causal_mask, mask_value)
 
-        if exists(stop_grad_attn_mask):
-            # this approach isn't quite right, as the values are not stop gradient
-            # but will run some experiments just to see
+            if exists(attn_mask):
+                sim = sim.masked_fill(~attn_mask, mask_value)
 
-            detached_sim = sim.detach()
-            sim = torch.where(stop_grad_attn_mask, detached_sim, sim)
+            attn = sim.softmax(dim = -1)
 
-        if exists(attn_mask):
-            sim = sim.masked_fill(~attn_mask, mask_value)
+            out = einsum(attn, v, 'b h i j, b h j d -> b h i d')
 
-        attn = sim.softmax(dim = -1)
+        # combine heads
 
-        out = einsum(attn, v, 'b h i j, b h j d -> b h i d')
         return self.to_out(out)
 
 # transformer
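
The removed comment above ("the values are not stop gradient") is the motivation for the switch. A minimal sketch of the difference (mask and tensor sizes are assumptions for illustration), contrasting the old detach-the-logits trick with the new function:

import torch
from einops import einsum
from self_reasoning_tokens_pytorch.attention_with_stop_graddable_qkv import stop_graddable_attn

b, h, n, d = 1, 1, 6, 8
q = torch.randn(b, h, n, d)
k = torch.randn(b, h, n, d)
v = torch.randn(b, h, n, d, requires_grad = True)

# hypothetical mask: the last two positions are reasoning tokens
stop_grad_mask = torch.zeros(b, h, n, n, dtype = torch.bool)
stop_grad_mask[..., -2:] = True

# old approach (removed above): detach only the similarity logits
sim = einsum(q, k, 'b h i d, b h j d -> b h i j') * d ** -0.5
sim = torch.where(stop_grad_mask, sim.detach(), sim)
out = einsum(sim.softmax(dim = -1), v, 'b h i j, b h j d -> b h i d')
out.sum().backward()
print(v.grad[..., -2:, :].abs().sum())  # > 0: gradient still leaks into the values

v.grad = None

# new approach: the values at masked entries are stop gradded as well
out = stop_graddable_attn(q, k, v, v_stop_grad_mask = stop_grad_mask)
out.sum().backward()
print(v.grad[..., -2:, :].abs().sum())  # 0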

setup.py

Lines changed: 1 addition & 1 deletion
@@ -3,7 +3,7 @@
 setup(
   name = 'self-reasoning-tokens-pytorch',
   packages = find_packages(exclude = []),
-  version = '0.0.1',
+  version = '0.0.2',
   license='MIT',
   description = 'Self Reasoning Tokens',
   author = 'Phil Wang',
