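# asserts that zig zag sharded attention produces the same output and input gradients
# as regular (unsharded) attention, run across multiple processes with torch.distributed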
import os
import click
from math import ceil

import torch
import torch.multiprocessing as mp
import torch.distributed as dist
from torch.amp import autocast
from torch.nn.parallel import DistributedDataParallel as DDP

from ring_attention_pytorch import RingAttention
from ring_attention_pytorch.distributed import all_gather_variable_dim

from einops import rearrange

from ring_attention_pytorch.ring_attention import apply_rotary_pos_emb

from ring_attention_pytorch.zig_zag_attention import (
    zig_zag_pad_seq,
    zig_zag_attn,
    zig_zag_shard
)

def abs_diff(x, y):
    return (x - y).abs().amax()

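# distributed setup - initializes the default process group (gloo on cpu, nccl on cuda)
# and pins each rank to its own device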
def setup(
    rank,
    world_size,
    use_cuda
):
    os.environ['MASTER_ADDR'] = 'localhost'
    os.environ['MASTER_PORT'] = '12355'

    backend = "gloo" if not use_cuda else "nccl"
    dist.init_process_group(backend, rank = rank, world_size = world_size)

    if use_cuda:
        torch.cuda.set_device(rank)

def cleanup():
    dist.destroy_process_group()

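# worker entrypoint, spawned once per rank - builds a non-ring RingAttention module
# and compares a regular forward / backward against a zig zag sharded one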
def start(
    rank,
    world_size,
    batch_size,
    batch_size_var_len,
    seq_len,
    num_sharded_batches,
    dim,
    heads,
    num_grouped_query_heads,
    dim_head,
    use_cuda,
    rotary
):
    setup(rank, world_size, use_cuda)

    attention = RingAttention(
        dim = dim,
        dim_head = dim_head,
        heads = heads,
        num_grouped_query_heads = num_grouped_query_heads,
        causal = True,
        rotary_embed = rotary,
        ring_attn = False,
        use_cuda_kernel = use_cuda
    )

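    # optionally vary the batch size per rank, to exercise gathering of unequal batch sizes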
    if batch_size_var_len:
        batch_size = batch_size + rank

    seq = torch.randn(batch_size, seq_len, dim)

    # move to cuda if needed

    if use_cuda:
        seq = seq.cuda(rank)
        attention.cuda(rank)

    # separate inputs for regular vs zig zag attention

    regular_input = seq.clone().requires_grad_()
    zig_zag_input = seq.clone().requires_grad_()

    # wrap

    ddp_attention = DDP(attention)

    # regular

    out = ddp_attention(regular_input)

    out.mean().backward()

    # zig zag
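    # the sequence is padded, then split across ranks in the zig zag pattern
    # (each rank takes a chunk from the front plus its mirrored chunk from the back),
    # which should balance the causal attention workload across ranks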

    padded_inp, remove_pad = zig_zag_pad_seq(zig_zag_input)
    (padded_inp, q_indices, kv_indices), gather_seq = zig_zag_shard(padded_inp, all_gather_batch = True)

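    # reproduce the attention forward manually on the shard, reusing the module's own
    # projections (grouped query / key / value heads are split out by qkv_head_breakdown)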
    qkv = attention.to_qkv(padded_inp)

    q, k, v = rearrange(qkv, 'b n (h d) -> b h n d', d = dim_head).split(attention.qkv_head_breakdown, dim = -3)

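    # rotary embeddings are derived from the shard's query positions (q_indices),
    # so the rotations match those of the unsharded sequence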
    if rotary:
        pos_emb = attention.rotary_embed(q_indices)

        q = apply_rotary_pos_emb(pos_emb, q, head_dim_first = True)
        k = apply_rotary_pos_emb(pos_emb, k, head_dim_first = True)

    # causal mask
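    # built from the global query / key positions, so causality is preserved
    # even though each rank holds a permuted slice of the sequence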

    causal_mask = q_indices[:, None] >= kv_indices[None, :]

    # attention

    o = zig_zag_attn(
        q, k, v,
        attn_mask = causal_mask
    )

    o = rearrange(o, 'b h n d -> b n (h d)')

    padded_out = attention.to_out(o)

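    # gather the sharded outputs back into the full sequence and strip the zig zag padding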
    padded_out = gather_seq(padded_out)

    zig_zag_out = remove_pad(padded_out)

    zig_zag_out.mean().backward()

    # validate the output is the same whether or not the sequence was split across machines

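    # compare on rank 0 only; a looser tolerance is used when the cuda kernels are in play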
    if rank == 0:
        out = out.cpu()
        zig_zag_out = zig_zag_out.cpu()

        output_atol = 1e-2 if use_cuda else 1e-6

        assert torch.allclose(out, zig_zag_out, atol = output_atol), 'output is not the same'

        # validate the gradients are the same

        regular_input_grad = regular_input.grad
        zig_zag_input_grad = zig_zag_input.grad

        assert torch.allclose(
            regular_input_grad,
            zig_zag_input_grad,
            atol = 1e-2
        ), 'grad is not the same'

        print('✅ outputs and gradients are the same between zig zag attention and regular attention')

    cleanup()

@click.command()
@click.option('--world-size', default = 8, help = 'number of machines / processes')
@click.option('--batch-size', default = 2, help = 'test batch size')
@click.option('--num-sharded-batches', default = 1, help = 'number of sharded batches')
@click.option('--batch-size-var-len', is_flag = True, help = 'test variable length batch sizes')
@click.option('--use-cuda', is_flag = True, help = 'whether to test with CUDA and NCCL')
@click.option('--rotary', is_flag = True, help = 'whether to test with rotary embeddings')
@click.option('--seq-len', default = 31, help = 'sequence length to test')
@click.option('--model-dim', default = 8, help = 'model dimensions for testing')
@click.option('--heads', default = 8, help = 'number of query attention heads')
@click.option('--num-grouped-query-heads', default = 2, help = 'number of query attention head groups')
@click.option('--dim-head', default = 16, help = 'attention head dimension for testing')
def test(
    world_size: int,
    batch_size: int,
    num_sharded_batches: int,
    batch_size_var_len: bool,
    use_cuda: bool,
    rotary: bool,
    seq_len: int,
    model_dim: int,
    heads: int,
    num_grouped_query_heads: int,
    dim_head: int,
):
    assert not use_cuda or world_size <= torch.cuda.device_count(), f'world size {world_size} must not exceed the number of cuda devices ({torch.cuda.device_count()})'

    mp.spawn(
        start,
        args = (
            world_size,
            batch_size,
            batch_size_var_len,
            seq_len,
            num_sharded_batches,
            model_dim,
            heads,
            num_grouped_query_heads,
            dim_head,
            use_cuda,
            rotary
        ),
        nprocs = world_size,
        join = True
    )

if __name__ == '__main__':
    test()
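
# example invocation (hypothetical script name - use whatever this file is saved as):
# $ python assert_zig_zag.py --world-size 2 --rotary
# $ python assert_zig_zag.py --world-size 8 --use-cuda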